added debug mode

Sem van der Hoeven
2020-03-03 16:50:38 +01:00
parent eb12a813b0
commit a922615e96
3 changed files with 45 additions and 19 deletions

CrawlBranch.java

@@ -13,10 +13,20 @@ import java.util.List;
public class CrawlBranch {
private List<String> links = new LinkedList<>();
private Document htmlDocument;
+ private boolean debug;
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
+ public CrawlBranch() {
+ this(false);
+ }
+ public CrawlBranch(boolean debug) {
+ this.debug = debug;
+ }
/**
* crawls the links in its current list of links
*
* @param url the url to start from
* @return <code>true</code> if the search was successful, <code>false</code> otherwise
*/
@@ -26,14 +36,14 @@ public class CrawlBranch {
this.htmlDocument = connection.get();
if (connection.response().statusCode() == 200) {
System.out.println("VISITING -- Recieved web page at " + url);
print("VISITING -- Recieved web page at " + url);
} else {
System.out.println("FAIL -- recieved something else than a web page");
print("FAIL -- recieved something else than a web page");
return false;
}
Elements linksOnPage = htmlDocument.select("a[href]");
System.out.println("FOUND (" + linksOnPage.size() + ") links");
print("FOUND (" + linksOnPage.size() + ") links");
for (Element link : linksOnPage) {
this.links.add(link.absUrl("href"));
}
@@ -46,22 +56,23 @@ public class CrawlBranch {
/**
* counts how many times a word occurs in a page
*
* @param word the word to look for
* @return the number of occurrences in the web page, -1 if the word is not found
*/
public int searchForWord(String word) {
- if (this.htmlDocument == null){
+ if (this.htmlDocument == null) {
//System.out.println("ERROR -- call crawl before searching");
return -1;
}
- System.out.printf("Searching for %s...", word);
- System.out.println();
+ print(String.format("Searching for %s...\n", word));
String bodyText = this.htmlDocument.body().text();
- return count(bodyText.toLowerCase(),word.toLowerCase());
+ return count(bodyText.toLowerCase(), word.toLowerCase());
}
/**
* counts how many times a word occurs in a string
*
* @param text the string to search in for the word
* @param word the word to search for
* @return the number of times the given word was found in the string
@@ -77,9 +88,14 @@ public class CrawlBranch {
/**
* gets the links
*
* @return the links
*/
public List<String> getLinks() {
return this.links;
}
+ private void print(String text) {
+ if (debug) System.out.println(text);
+ }
}
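The body of the count helper is collapsed between the hunks above. A minimal sketch of a substring counter consistent with its javadoc, not necessarily the committed implementation:

    private int count(String text, String word) {
        // walk the text, advancing past each match, and tally the hits
        int occurrences = 0;
        int index = text.indexOf(word);
        while (index != -1) {
            occurrences++;
            index = text.indexOf(word, index + word.length());
        }
        return occurrences;
    }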

Main.java

@@ -12,11 +12,10 @@ public class Main {
System.out.print("Enter the maximum amount of pages the crawler should visit : ");
int amount = Integer.parseInt(scanner.nextLine().trim());
System.out.print("Should the crawler save the links with hits? (Y/N) : ");
- String choice = scanner.nextLine().toLowerCase().trim();
- boolean save = getChoice(choice);
+ boolean save = getChoice(scanner.nextLine());
+ System.out.print("Do you want to enable debug mode? (Y/N) : ");
- WebCrawler crawler = new WebCrawler(amount,save);
+ boolean debug = getChoice(scanner.nextLine());
+ WebCrawler crawler = new WebCrawler(amount,save,debug);
crawler.search(startUrl,word);
}
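Note that the removed lines lowercased and trimmed the input before calling getChoice, while the new calls pass the raw scanner line. For the behavior to stay the same, getChoice itself must normalize its argument. getChoice is not shown in this diff; a hypothetical implementation consistent with that assumption:

    private static boolean getChoice(String choice) {
        // normalize here, since callers now pass the raw input (assumption)
        return choice.trim().toLowerCase().startsWith("y");
    }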

WebCrawler.java

@@ -7,7 +7,7 @@ public class WebCrawler {
private Set<String> pagesVisited;
private List<String> pagesPending;
private ArrayList<String> resultPages;
- private Map<String,Integer> urlHits;
+ private Map<String, Integer> urlHits;
private int amountFound = 0;
private int successPages = 0;
private boolean shouldSaveHitLinks;
@@ -22,6 +22,7 @@ public class WebCrawler {
/**
* creates a new WebCrawler object with the given amount of max pages
+ *
* @param maxPages the max amount of pages the crawler should visit
*/
public WebCrawler(int maxPages) {
@@ -30,11 +31,12 @@ public class WebCrawler {
/**
* creates a new WebCrawler object with the given amount of max pages, and whether it should save the hit URLs
- * @param maxPages the max amount of pages the crawler should visit
+ *
+ * @param maxPages the max amount of pages the crawler should visit
* @param shouldSaveHitLinks if the crawler should save the links that have one or more hits
*/
public WebCrawler(int maxPages, boolean shouldSaveHitLinks) {
- this(maxPages,shouldSaveHitLinks,false);
+ this(maxPages, shouldSaveHitLinks, false);
}
public WebCrawler(int maxPages, boolean shouldSaveHitLinks, boolean debug) {
@@ -50,6 +52,7 @@ public class WebCrawler {
/**
* gets the next url in the list
+ *
* @return the next url in the list
*/
private String nextUrl() {
@@ -63,10 +66,12 @@ public class WebCrawler {
/**
* searches for a word by crawling the web through hyperlinks
- * @param url the url to start searching from
+ *
+ * @param url the url to start searching from
* @param searchWord the word to search for
*/
public void search(String url, String searchWord) {
+ int counter = 1;
while (this.pagesVisited.size() < amountOfPages) {
String curUrl;
CrawlBranch branch = new CrawlBranch();
@@ -75,18 +80,19 @@ public class WebCrawler {
this.pagesVisited.add(url);
} else {
curUrl = this.nextUrl();
+ System.out.println(String.format("visiting page %s / %s",counter,amountOfPages));
+ counter++;
}
branch.crawl(curUrl);
int amount = branch.searchForWord(searchWord);
if (amount > 0) {
System.out.printf("SUCCESS -- word %s found at %s %s times", searchWord, curUrl, amount);
System.out.println();
print(String.format("SUCCESS -- word %s found at %s %s times\n", searchWord, curUrl, amount));
successPages++;
amountFound += amount;
if (shouldSaveHitLinks)
- resultPages.add(curUrl);
- urlHits.put(curUrl,amount);
+ resultPages.add(curUrl);
+ urlHits.put(curUrl, amount);
}
this.pagesPending.addAll(branch.getLinks());
}
@@ -116,6 +122,7 @@ public class WebCrawler {
/**
* sets the amount of max pages
+ *
* @param amount the amount of pages
*/
public void setAmountOfPages(int amount) {
@@ -157,4 +164,8 @@ public class WebCrawler {
this.successPages = 0;
this.amountFound = 0;
}
+ private void print(String text) {
+ if (debug) System.out.println(text);
+ }
}
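Taken together, the commit threads the flag from the prompt in Main into WebCrawler. Note that search, as shown in the hunk above, still constructs branches with new CrawlBranch(), so the new CrawlBranch debug constructor is not yet exercised on this path. An illustrative end-to-end use of the new three-argument constructor (the limit, URL, and search word below are made up for the example):

    WebCrawler crawler = new WebCrawler(50, true, true); // maxPages, saveHits, debug
    crawler.search("https://example.com", "crawler");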