From a922615e963bc56f5393817d382e8a81fd716e04 Mon Sep 17 00:00:00 2001 From: Sem van der Hoeven Date: Tue, 3 Mar 2020 16:50:38 +0100 Subject: [PATCH] added debug mode --- src/main/java/webcrawler/CrawlBranch.java | 30 +++++++++++++++++------ src/main/java/webcrawler/Main.java | 7 +++--- src/main/java/webcrawler/WebCrawler.java | 27 ++++++++++++++------ 3 files changed, 45 insertions(+), 19 deletions(-) diff --git a/src/main/java/webcrawler/CrawlBranch.java b/src/main/java/webcrawler/CrawlBranch.java index 90a5bbc..7fd33e3 100644 --- a/src/main/java/webcrawler/CrawlBranch.java +++ b/src/main/java/webcrawler/CrawlBranch.java @@ -13,10 +13,20 @@ import java.util.List; public class CrawlBranch { private List links = new LinkedList<>(); private Document htmlDocument; + private boolean debug; private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1"; + public CrawlBranch() { + this(false); + } + + public CrawlBranch(boolean debug) { + this.debug = debug; + } + /** * crawls the links in it's current arrayList of links + * * @param url the url to start from * @return true if the search was successful, false otherwise */ @@ -26,14 +36,14 @@ public class CrawlBranch { this.htmlDocument = connection.get(); if (connection.response().statusCode() == 200) { - System.out.println("VISITING -- Recieved web page at " + url); + print("VISITING -- Recieved web page at " + url); } else { - System.out.println("FAIL -- recieved something else than a web page"); + print("FAIL -- recieved something else than a web page"); return false; } Elements linksOnPage = htmlDocument.select("a[href]"); - System.out.println("FOUND (" + linksOnPage.size() + ") links"); + print("FOUND (" + linksOnPage.size() + ") links"); for (Element link : linksOnPage) { this.links.add(link.absUrl("href")); } @@ -46,22 +56,23 @@ public class CrawlBranch { /** * searches how many times a word occurs in a page + * * @param word the word to look for * @return the amount of occurrences in the web page, -1 if the word is not found */ public int searchForWord(String word) { - if (this.htmlDocument == null){ + if (this.htmlDocument == null) { //System.out.println("ERROR -- call crawl before searhing"); return -1; } - System.out.printf("Searching for %s...", word); - System.out.println(); + print(String.format("Searching for %s...\n", word)); String bodyText = this.htmlDocument.body().text(); - return count(bodyText.toLowerCase(),word.toLowerCase()); + return count(bodyText.toLowerCase(), word.toLowerCase()); } /** * counts how many times a word occurs in a string + * * @param text the string to search in for the word * @param word the word to search for * @return the amount of times the given word was found in the string @@ -77,9 +88,14 @@ public class CrawlBranch { /** * gets the links + * * @return the links */ public List getLinks() { return this.links; } + + private void print(String text) { + if (debug) System.out.println(text); + } } diff --git a/src/main/java/webcrawler/Main.java b/src/main/java/webcrawler/Main.java index 18b11a4..11bef7f 100644 --- a/src/main/java/webcrawler/Main.java +++ b/src/main/java/webcrawler/Main.java @@ -12,11 +12,10 @@ public class Main { System.out.print("Enter the maximum amount of pages the crawler should visit : "); int amount = Integer.parseInt(scanner.nextLine().trim()); System.out.print("Should the crawler save the links with hits? (Y/N) : "); - String choice = scanner.nextLine().toLowerCase().trim(); - boolean save = getChoice(choice); - + boolean save = getChoice(scanner.nextLine()); System.out.print("Do you want to enable debug mode? (Y/N) : "); - WebCrawler crawler = new WebCrawler(amount,save); + boolean debug = getChoice(scanner.nextLine()); + WebCrawler crawler = new WebCrawler(amount,save,debug); crawler.search(startUrl,word); } diff --git a/src/main/java/webcrawler/WebCrawler.java b/src/main/java/webcrawler/WebCrawler.java index 6d8c5a5..ae2ee17 100644 --- a/src/main/java/webcrawler/WebCrawler.java +++ b/src/main/java/webcrawler/WebCrawler.java @@ -7,7 +7,7 @@ public class WebCrawler { private Set pagesVisited; private List pagesPending; private ArrayList resultPages; - private Map urlHits; + private Map urlHits; private int amountFound = 0; private int successPages = 0; private boolean shouldSaveHitLinks; @@ -22,6 +22,7 @@ public class WebCrawler { /** * creates a new WebCrawler object with the given amount of max pages + * * @param maxPages the max amount of pages the crawler should visit */ public WebCrawler(int maxPages) { @@ -30,11 +31,12 @@ public class WebCrawler { /** * creates a new WebCrawler object with the given amount of max pages, and if it should save the hit URLs - * @param maxPages the max amount of pages the crawler should visit + * + * @param maxPages the max amount of pages the crawler should visit * @param shouldSaveHitLinks if the crawler should save the links that have one or more hits */ public WebCrawler(int maxPages, boolean shouldSaveHitLinks) { - this(maxPages,shouldSaveHitLinks,false); + this(maxPages, shouldSaveHitLinks, false); } public WebCrawler(int maxPages, boolean shouldSaveHitLinks, boolean debug) { @@ -50,6 +52,7 @@ public class WebCrawler { /** * gets the next url in the list + * * @return the next url in the list */ private String nextUrl() { @@ -63,10 +66,12 @@ public class WebCrawler { /** * searches for a word by crawling the web through hyperlinks - * @param url the url to start searching from + * + * @param url the url to start searching from * @param searchWord the word to search for */ public void search(String url, String searchWord) { + int counter = 1; while (this.pagesVisited.size() < amountOfPages) { String curUrl; CrawlBranch branch = new CrawlBranch(); @@ -75,18 +80,19 @@ public class WebCrawler { this.pagesVisited.add(url); } else { curUrl = this.nextUrl(); + System.out.println(String.format("visiting page %s / %s",counter,amountOfPages)); + counter++; } branch.crawl(curUrl); int amount = branch.searchForWord(searchWord); if (amount > 0) { - System.out.printf("SUCCESS -- word %s found at %s %s times", searchWord, curUrl, amount); - System.out.println(); + print(String.format("SUCCESS -- word %s found at %s %s times\n", searchWord, curUrl, amount)); successPages++; amountFound += amount; if (shouldSaveHitLinks) - resultPages.add(curUrl); - urlHits.put(curUrl,amount); + resultPages.add(curUrl); + urlHits.put(curUrl, amount); } this.pagesPending.addAll(branch.getLinks()); } @@ -116,6 +122,7 @@ public class WebCrawler { /** * sets the amount of max pages + * * @param amount the amount of pages */ public void setAmountOfPages(int amount) { @@ -157,4 +164,8 @@ public class WebCrawler { this.successPages = 0; this.amountFound = 0; } + + private void print(String text) { + if (debug) System.out.println(text); + } }