upload to github

2020-01-20 17:26:06 +01:00
parent 5ad82539c8
commit 170dae5344
6 changed files with 320 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,6 @@ fabric.properties
 .idea/**/markdown-navigator/

 # End of https://www.gitignore.io/api/intellij
+.idea/artifacts/WebCrawler_jar.xml
+.idea/modules.xml
+.idea/misc.xml
--- a/WebCrawler.iml
+++ b/WebCrawler.iml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="jsoup-1.12.1" level="project" />
+  </component>
+</module>
--- a/src/main/java/webcrawler/CrawlBranch.java
+++ b/src/main/java/webcrawler/CrawlBranch.java
@@ -0,0 +1,85 @@
+package main.java.webcrawler;
+
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Fetches one web page with jsoup, stores the links found on it and counts word hits in its body text. */
+public class CrawlBranch {
+    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
+    private final List<String> links = new LinkedList<>();
+    private Document htmlDocument;
+
+    /**
+     * fetches the page at the given url and stores the links that are found on it
+     * @param url the url of the page to fetch
+     * @return <code>true</code> if the page was fetched successfully, <code>false</code> otherwise
+     */
+    public boolean crawl(String url) {
+        try {
+            Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
+            Document fetched = connection.get();
+            if (connection.response().statusCode() != 200) {
+                System.out.println("FAIL -- recieved something else than a web page");
+                return false;
+            }
+            System.out.println("VISITING -- Recieved web page at " + url);
+            // the document is kept only after the status check, so a rejected page is never searched
+            this.htmlDocument = fetched;
+
+            Elements linksOnPage = fetched.select("a[href]");
+            System.out.println("FOUND (" + linksOnPage.size() + ") links");
+            for (Element link : linksOnPage) {
+                // absUrl gives "" when the href can't be made absolute, such a link can't be crawled
+                String target = link.absUrl("href");
+                if (!target.isEmpty()) this.links.add(target);
+            }
+            return true;
+        } catch (Exception e) {
+            // broad on purpose: a bad url or a network failure should not abort the whole crawl
+            System.out.println("ERROR -- error in out http request : " + e);
+            return false;
+        }
+    }
+
+    /**
+     * counts the space separated tokens of the page body that contain a word, ignoring case
+     * @param word the word to look for
+     * @return the amount of tokens that contain the word (0 if there are none),
+     *         -1 if no page was crawled successfully before
+     */
+    public int searchForWord(String word) {
+        if (this.htmlDocument == null) {
+            System.out.println("ERROR -- call crawl before searhing");
+            return -1;
+        }
+        System.out.printf("Searching for %s...", word);
+        System.out.println();
+        Element body = this.htmlDocument.body();
+        if (body == null || word == null || word.isEmpty()) {
+            return 0; // no body to search in, or no real word to search for
+        }
+        java.util.Locale root = java.util.Locale.ROOT; // case folding that ignores the default locale
+        return count(body.text().toLowerCase(root), word.toLowerCase(root));
+    }
+
+    /** counts how many of the space separated tokens of <code>text</code> contain <code>word</code> */
+    private int count(String text, String word) {
+        int amount = 0;
+        for (String token : text.split(" ")) {
+            if (token.contains(word)) amount++;
+        }
+        return amount;
+    }
+
+    /** gets the absolute urls of the links that were found by the crawl calls on this object */
+    public List<String> getLinks() {
+        return this.links;
+    }
+}
--- a/src/main/java/webcrawler/Main.java
+++ b/src/main/java/webcrawler/Main.java
@@ -0,0 +1,23 @@
+package main.java.webcrawler;
+
+import java.util.Scanner;
+
+/** Console entry point: asks for the crawl settings on standard input and starts the crawler. */
+public class Main {
+    public static void main(String[] args) {
+        Scanner scanner = new Scanner(System.in);
+        System.out.print("Enter a starting URL : ");
+        String startUrl = scanner.nextLine().trim();
+        System.out.print("Enter a word to search for : ");
+        String word = scanner.nextLine().trim();
+        System.out.print("Enter the maximum amount of pages the crawler should visit : ");
+        String amountInput = scanner.nextLine().trim();
+        if (!amountInput.matches("[+-]?\\d{1,9}")) { // at most nine digits, so parseInt below can't fail
+            System.out.println("ERROR -- the maximum amount of pages must be a whole number of at most nine digits");
+            return;
+        }
+        System.out.print("Should the crawler save the links with hits? (Y/N) : ");
+        boolean save = scanner.nextLine().trim().equalsIgnoreCase("y"); // every other answer means no
+        new WebCrawler(Integer.parseInt(amountInput), save).search(startUrl, word);
+    }
+}
--- a/src/main/java/webcrawler/WebCrawler.java
+++ b/src/main/java/webcrawler/WebCrawler.java
@@ -0,0 +1,194 @@
+package main.java.webcrawler;
+
+import java.util.*;
+
+/**
+ * Crawls the web breadth-first by following the hyperlinks on a start page and keeps track of
+ * the pages on which a search word was found.
+ * <p>
+ * The visited pages, the pending links and the hit statistics are kept between calls to
+ * {@link #search(String, String)}; call {@link #clear()} to start from scratch.
+ */
+public class WebCrawler {
+    /** the maximum amount of pages that may be visited */
+    private int amountOfPages;
+    /** every url that was handed out for crawling, whether the request succeeded or not */
+    private final Set<String> pagesVisited;
+    /** the urls that still have to be crawled, in the order in which they were found */
+    private final Deque<String> pagesPending;
+    /** the urls with one or more hits, only filled when hit links are saved */
+    private final ArrayList<String> resultPages;
+    /** the amount of hits per url with one or more hits, only filled when hit links are saved */
+    private final Map<String,Integer> urlHits;
+    /** the total amount of hits on all searched pages */
+    private int amountFound = 0;
+    /** the amount of pages with one or more hits */
+    private int successPages = 0;
+    /** if the urls with hits should be saved and listed at the end of a search */
+    private boolean shouldSaveHitLinks;
+
+    /**
+     * creates a new WebCrawler object with standard values
+     */
+    public WebCrawler() {
+        this(50, true);
+    }
+
+    /**
+     * creates a new WebCrawler object with the given amount of max pages
+     * @param maxPages the max amount of pages the crawler should visit
+     */
+    public WebCrawler(int maxPages) {
+        this(maxPages, true);
+    }
+
+    /**
+     * creates a new WebCrawler object with the given amount of max pages, and if it should save the hit URLs
+     * @param maxPages the max amount of pages the crawler should visit
+     * @param shouldSaveHitLinks if the crawler should save the links that have one or more hits
+     */
+    public WebCrawler(int maxPages, boolean shouldSaveHitLinks) {
+        this.amountOfPages = maxPages;
+        this.shouldSaveHitLinks = shouldSaveHitLinks;
+        this.pagesVisited = new HashSet<>();
+        this.pagesPending = new ArrayDeque<>();
+        this.resultPages = new ArrayList<>();
+        this.urlHits = new HashMap<>();
+    }
+
+    /**
+     * takes the next url that was not visited before from the pending urls and marks it as visited
+     * @return the next url to crawl, <code>null</code> if there are no unvisited urls left
+     */
+    private String nextUrl() {
+        String next;
+        while ((next = this.pagesPending.poll()) != null) {
+            // Set.add returns false for a url that was visited before, such a url is skipped
+            if (this.pagesVisited.add(next)) return next;
+        }
+        return null;
+    }
+
+    /**
+     * searches for a word by crawling the web through hyperlinks and prints a summary at the end,
+     * the search stops when the max amount of pages is visited or when no unvisited links are left
+     * @param url the url to start searching from
+     * @param searchWord the word to search for
+     */
+    public void search(String url, String searchWord) {
+        // the start url is crawled first, nextUrl skips it if an earlier search visited it already
+        if (url != null) this.pagesPending.addFirst(url);
+        while (this.pagesVisited.size() < amountOfPages) {
+            String curUrl = this.nextUrl();
+            if (curUrl == null) {
+                // out of links: stop here instead of crawling the start url over and over
+                break;
+            }
+            CrawlBranch branch = new CrawlBranch();
+            if (!branch.crawl(curUrl)) {
+                // the page could not be fetched, so there is nothing to search in or to follow
+                continue;
+            }
+
+            int amount = branch.searchForWord(searchWord);
+            if (amount > 0) {
+                System.out.printf("SUCCESS -- word %s found at %s %s times", searchWord, curUrl, amount);
+                System.out.println();
+                successPages++;
+                amountFound += amount;
+                if (shouldSaveHitLinks) {
+                    resultPages.add(curUrl);
+                    urlHits.put(curUrl, amount);
+                }
+            }
+            for (String link : branch.getLinks()) {
+                // empty urls can't be fetched and visited urls are never crawled twice
+                boolean usable = link != null && !link.isEmpty() && !this.pagesVisited.contains(link);
+                if (usable) this.pagesPending.add(link);
+            }
+        }
+        System.out.println("========================");
+        System.out.printf("DONE -- Visited %s webpages\nHits: %s\nAmount of pages with hits: %s\n", this.pagesVisited.size(), amountFound, successPages);
+        if (shouldSaveHitLinks) {
+            System.out.printf("Successful pages: \n%s", showCombinations(urlHits));
+        }
+    }
+
+    /**
+     * puts every entry of a list on its own line
+     * @param list the entries to show
+     * @return the entries, each followed by a line break
+     */
+    private String display(List<String> list) {
+        StringBuilder res = new StringBuilder();
+        for (String entry : list) {
+            res.append(entry).append("\n");
+        }
+        return res.toString();
+    }
+
+    /**
+     * shows every url together with its amount of hits
+     * @param urls the urls with their amount of hits
+     * @return a line of the form <code>url (n hits)</code> for every url
+     */
+    private String showCombinations(Map<String, Integer> urls) {
+        StringBuilder res = new StringBuilder();
+        for (Map.Entry<String, Integer> hit : urls.entrySet()) {
+            res.append(hit.getKey()).append(" (").append(hit.getValue()).append(" hits)\n");
+        }
+        return res.toString();
+    }
+
+    /**
+     * sets the amount of max pages
+     * @param amount the amount of pages
+     */
+    public void setAmountOfPages(int amount) {
+        this.amountOfPages = amount;
+    }
+
+    /**
+     * sets if the crawler should save the links that have one or more hits
+     * @param shouldSaveHitLinks <code>true</code> to save the links, <code>false</code> otherwise
+     */
+    public void setShouldSaveHitLinks(boolean shouldSaveHitLinks) {
+        this.shouldSaveHitLinks = shouldSaveHitLinks;
+    }
+
+    /**
+     * gets the urls with one or more hits
+     * @return the saved urls, empty if hit links are not saved
+     */
+    public ArrayList<String> getResultPages() {
+        return this.resultPages;
+    }
+
+    /**
+     * gets the amount of hits per url
+     * @return the saved urls with their amount of hits, empty if hit links are not saved
+     */
+    public Map<String, Integer> getUrlHits() {
+        return urlHits;
+    }
+
+    /**
+     * gets the total amount of hits
+     * @return the amount of hits on all searched pages
+     */
+    public int getAmountFound() {
+        return amountFound;
+    }
+
+    /**
+     * clears the crawler: forgets the visited and pending urls and resets the hit statistics
+     */
+    public void clear() {
+        this.urlHits.clear();
+        this.resultPages.clear();
+        this.pagesPending.clear();
+        this.pagesVisited.clear();
+        this.successPages = 0;
+        this.amountFound = 0;
+    }
+}
--- a/src/main/resources/META-INF/MANIFEST.MF
+++ b/src/main/resources/META-INF/MANIFEST.MF
@@ -0,0 +1,3 @@
+Manifest-Version: 1.0
+Main-Class: main.java.webcrawler.Main
+