fixed bug

Merge remote-tracking branch 'origin/master'
fixed counter
2020-03-03 18:55:29 +01:00 · 2020-03-03 17:40:20 +01:00 · 2020-03-03 17:39:42 +01:00 · 2020-03-03 16:50:38 +01:00 · 2020-03-03 16:38:36 +01:00 · 2020-03-03 16:33:12 +01:00
4 changed files with 73 additions and 70 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,6 @@
+# WebCrawler
+A small web crawler that searches for a word you give it.
+
+## Usage
+To run the jar from the command line:
+`java -jar <JAR LOCATION>/WebCrawler.jar`
--- a/src/main/java/webcrawler/CrawlBranch.java
+++ b/src/main/java/webcrawler/CrawlBranch.java
@@ -13,10 +13,20 @@ import java.util.List;
 public class CrawlBranch {
    private List<String> links = new LinkedList<>();
    private Document htmlDocument;
+    private boolean debug;
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";

+    public CrawlBranch() {
+        this(false);
+    }
+
+    public CrawlBranch(boolean debug) {
+        this.debug = debug;
+    }
+
    /**
     * crawls the links in its current list of links
+     *
     * @param url the url to start from
     * @return <code>true</code> if the search was successful, <code>false</code> otherwise
     */
@@ -26,42 +36,43 @@ public class CrawlBranch {
            this.htmlDocument = connection.get();

            if (connection.response().statusCode() == 200) {
-                System.out.println("VISITING -- Recieved web page at " + url);
+                print("VISITING -- Recieved web page at " + url);
            } else {
-                System.out.println("FAIL -- recieved something else than a web page");
+                print("FAIL -- recieved something else than a web page");
                return false;
            }

            Elements linksOnPage = htmlDocument.select("a[href]");
-            System.out.println("FOUND (" + linksOnPage.size() + ") links");
+            print("FOUND (" + linksOnPage.size() + ") links");
            for (Element link : linksOnPage) {
                this.links.add(link.absUrl("href"));
            }
            return true;
        } catch (Exception e) {
-            System.out.println("ERROR -- error in out http request : " + e);
+            //System.out.println("ERROR -- error in out http request : " + e);
            return false;
        }
    }

    /**
     * searches how many times a word occurs in a page
+     *
     * @param word the word to look for
     * @return the amount of occurrences in the web page, -1 if the word is not found
     */
    public int searchForWord(String word) {
-        if (this.htmlDocument == null){
-            System.out.println("ERROR -- call crawl before searhing");
+        if (this.htmlDocument == null) {
+            //System.out.println("ERROR -- call crawl before searhing");
            return -1;
        }
-        System.out.printf("Searching for %s...", word);
-        System.out.println();
+        print(String.format("Searching for %s...\n", word));
        String bodyText = this.htmlDocument.body().text();
-        return count(bodyText.toLowerCase(),word.toLowerCase());
+        return count(bodyText.toLowerCase(), word.toLowerCase());
    }

    /**
     * counts how many times a word occurs in a string
+     *
     * @param text the string to search in for the word
     * @param word the word to search for
     * @return the amount of times the given word was found in the string
@@ -77,9 +88,14 @@ public class CrawlBranch {

    /**
     * gets the links
+     *
     * @return the links
     */
    public List<String> getLinks() {
        return this.links;
    }
+
+    private void print(String text) {
+        if (debug) System.out.println(text);
+    }
 }
--- a/src/main/java/webcrawler/Main.java
+++ b/src/main/java/webcrawler/Main.java
@@ -12,12 +12,16 @@ public class Main {
        System.out.print("Enter the maximum amount of pages the crawler should visit : ");
        int amount = Integer.parseInt(scanner.nextLine().trim());
        System.out.print("Should the crawler save the links with hits? (Y/N) : ");
-        String choice = scanner.nextLine().toLowerCase().trim();
-        boolean save;
-        if (choice.equals("y")) save = true;
-        else if (choice.equals("n")) save = false;
-        else save = false;
-        WebCrawler crawler = new WebCrawler(amount,save);
+        boolean save = getChoice(scanner.nextLine());
+        System.out.print("Do you want to enable debug mode? (Y/N) : ");
+        boolean debug = getChoice(scanner.nextLine());
+        if (debug) System.out.println("[INFO] - Debug mode enabled");
+        WebCrawler crawler = new WebCrawler(amount,save,debug);
        crawler.search(startUrl,word);
    }
+
+    private static boolean getChoice(String choice) {
+        if (choice.trim().toLowerCase().equals("y")) return true;
+        else return false;
+    }
 }
--- a/src/main/java/webcrawler/WebCrawler.java
+++ b/src/main/java/webcrawler/WebCrawler.java
@@ -7,10 +7,11 @@ public class WebCrawler {
    private Set<String> pagesVisited;
    private List<String> pagesPending;
    private ArrayList<String> resultPages;
-    private Map<String,Integer> urlHits;
+    private Map<String, Integer> urlHits;
    private int amountFound = 0;
    private int successPages = 0;
    private boolean shouldSaveHitLinks;
+    private boolean debug;

    /**
     * creates a new WebCrawler object with standard values
@@ -21,6 +22,7 @@ public class WebCrawler {

    /**
     * creates a new WebCrawler object with the given amount of max pages
+     *
     * @param maxPages the max amount of pages the crawler should visit
     */
    public WebCrawler(int maxPages) {
@@ -29,21 +31,28 @@ public class WebCrawler {

    /**
     * creates a new WebCrawler object with the given amount of max pages, and if it should save the hit URLs
-     * @param maxPages the max amount of pages the crawler should visit
+     *
+     * @param maxPages           the max amount of pages the crawler should visit
     * @param shouldSaveHitLinks if the crawler should save the links that have one or more hits
     */
    public WebCrawler(int maxPages, boolean shouldSaveHitLinks) {
+        this(maxPages, shouldSaveHitLinks, false);
+    }
+
+    public WebCrawler(int maxPages, boolean shouldSaveHitLinks, boolean debug) {
        this.amountOfPages = maxPages;
        this.shouldSaveHitLinks = shouldSaveHitLinks;
        this.pagesVisited = new HashSet<>();
        this.pagesPending = new LinkedList<>();
        this.resultPages = new ArrayList<>();
        this.urlHits = new HashMap<>();
+        this.debug = debug;
    }


    /**
     * gets the next url in the list
+     *
     * @return the next url in the list
     */
    private String nextUrl() {
@@ -57,30 +66,33 @@ public class WebCrawler {

    /**
     * searches for a word by crawling the web through hyperlinks
-     * @param url the url to start searching from
+     *
+     * @param url        the url to start searching from
     * @param searchWord the word to search for
     */
    public void search(String url, String searchWord) {
+        int counter = 0;
        while (this.pagesVisited.size() < amountOfPages) {
            String curUrl;
-            CrawlBranch branch = new CrawlBranch();
+            CrawlBranch branch = new CrawlBranch(debug);
            if (this.pagesPending.isEmpty()) {
                curUrl = url;
                this.pagesVisited.add(url);
            } else {
                curUrl = this.nextUrl();
+                counter++;
+                System.out.println(String.format("visiting page %s / %s",counter,amountOfPages));
            }
            branch.crawl(curUrl);

            int amount = branch.searchForWord(searchWord);
            if (amount > 0) {
-                System.out.printf("SUCCESS -- word %s found at %s %s times", searchWord, curUrl, amount);
-                System.out.println();
+                print(String.format("SUCCESS -- word %s found at %s %s times\n", searchWord, curUrl, amount));
                successPages++;
                amountFound += amount;
                if (shouldSaveHitLinks)
-                resultPages.add(curUrl);
-                urlHits.put(curUrl,amount);
+                    resultPages.add(curUrl);
+                urlHits.put(curUrl, amount);
            }
            this.pagesPending.addAll(branch.getLinks());
        }
@@ -110,6 +122,7 @@ public class WebCrawler {

    /**
     * sets the amount of max pages
+     *
     * @param amount the amount of pages
     */
    public void setAmountOfPages(int amount) {
@@ -132,6 +145,14 @@ public class WebCrawler {
        return amountFound;
    }

+    public boolean usesDebug() {
+        return debug;
+    }
+
+    public void setDebug(boolean debug) {
+        this.debug = debug;
+    }
+
    /**
     * clears the crawler
     */
@@ -144,51 +165,7 @@ public class WebCrawler {
        this.amountFound = 0;
    }

-    //    public static void main(String[] args) {
-//        Scanner input = new Scanner(System.in);
-//        System.out.println("Enter a URL : ");
-//        String urlInput = input.nextLine().trim();
-//        crawler(urlInput);
-//
-//    }
-//
-//    public static void crawler(String startUrl) {
-//        ArrayList<String> pending = new ArrayList<>();
-//        ArrayList<String> traversed = new ArrayList<>();
-//
-//        pending.add(startUrl);
-//        while (!pending.isEmpty() && traversed.size() <= 100) {
-//            String tempUrl = pending.remove(0);
-//            if (!traversed.contains(tempUrl)) {
-//                traversed.add(tempUrl);
-//                System.out.println("crawling: " + tempUrl);
-//
-//                for (String s : getSubURLs(tempUrl)) {
-//                    if (!traversed.contains(s)) pending.add(s);
-//                }
-//            }
-//        }
-//    }
-//
-//    public static ArrayList<String> getSubURLs(String urlString) {
-//        ArrayList<String> subUrls = new ArrayList<>();
-//
-//        try {
-//            URL url = new URL(urlString);
-//            Scanner urlScanner = new Scanner(url.openStream());
-//            int cur = 0;
-//            while (urlScanner.hasNext()) {
-//                String input = urlScanner.nextLine();
-//                cur = input.indexOf("http:", cur);
-//                while (cur > 0) {
-//                    int endIndex = input.indexOf("\"", cur);
-//                    cur = endIndex > 0 ? input.indexOf("http:", endIndex) : -1;
-//                }
-//            }
-//        } catch (IOException e) {
-//            e.printStackTrace();
-//        }
-//
-//        return subUrls;
-//    }
+    private void print(String text) {
+        if (debug) System.out.println(text);
+    }
 }
Author	SHA1	Message	Date
Sem van der Hoeven	5f79fb06f7	fixed bug	2020-03-03 18:55:29 +01:00
Sem van der Hoeven	996e6abc4b	Merge remote-tracking branch 'origin/master'	2020-03-03 17:40:20 +01:00
Sem van der Hoeven	9946dc9d12	fixed counter	2020-03-03 17:39:42 +01:00
Sem van der Hoeven	a922615e96	added debug mode	2020-03-03 16:50:38 +01:00
Sem van der Hoeven	eb12a813b0	added debug boolean and get choice method	2020-03-03 16:38:36 +01:00
Sem van der Hoeven	38b0524b0d	removed error message spam	2020-03-03 16:33:12 +01:00
SemvdH	e2ce5cac3b	Update README.md	2020-01-20 17:33:38 +01:00
SemvdH	cb163f8ac7	Create README.md	2020-01-20 17:31:57 +01:00