package main.java.webcrawler.crawler; import main.java.webcrawler.crawler.CrawlBranch; import main.java.webcrawler.visualiser.Visualiser; import java.util.*; public class WebCrawler { private int amountOfPages; private Set pagesVisited; private List pagesPending; private ArrayList resultPages; private Map urlHits; private Visualiser logger; private int amountFound = 0; private int successPages = 0; private boolean shouldSaveHitLinks; private boolean debug; private boolean done = false; public LinkedList messages; /** * creates a new WebCrawler object with standard values */ public WebCrawler() { this(50, true); } /** * creates a new WebCrawler object with the given amount of max pages * * @param maxPages the max amount of pages the crawler should visit */ public WebCrawler(int maxPages) { this(maxPages, true); } /** * creates a new WebCrawler object with the given amount of max pages, and if it should save the hit URLs * * @param maxPages the max amount of pages the crawler should visit * @param shouldSaveHitLinks if the crawler should save the links that have one or more hits */ public WebCrawler(int maxPages, boolean shouldSaveHitLinks) { this(maxPages, shouldSaveHitLinks, false, null); } public WebCrawler(int maxPages, boolean shouldSaveHitLinks, boolean debug, Visualiser logger) { this.amountOfPages = maxPages; this.shouldSaveHitLinks = shouldSaveHitLinks; this.pagesVisited = new HashSet<>(); this.pagesPending = new LinkedList<>(); this.resultPages = new ArrayList<>(); this.urlHits = new HashMap<>(); this.debug = debug; this.logger = logger; this.messages = new LinkedList<>(); } /** * gets the next url in the list * * @return the next url in the list */ private String nextUrl() { String next; do { next = this.pagesPending.remove(0); } while (this.pagesVisited.contains(next)); this.pagesVisited.add(next); return next; } /** * searches for a word by crawling the web through hyperlinks * * @param url the url to start searching from * @param searchWord the word to search for */ public void search(String url, String searchWord) { // System.out.println("searching for " + searchWord + " in " + url); int counter = 0; while (this.pagesVisited.size() < amountOfPages) { String curUrl; CrawlBranch branch = new CrawlBranch(debug, this, logger); if (this.pagesPending.isEmpty()) { curUrl = url; this.pagesVisited.add(url); } else { curUrl = this.nextUrl(); counter++; // print(String.format("visiting page %s / %s",counter,amountOfPages)); System.out.println(String.format("visiting page %s / %s", counter, amountOfPages)); addMessage(String.format("visiting page %s / %s", counter, amountOfPages)); } branch.crawl(curUrl); int amount = branch.searchForWord(searchWord); if (amount > 0) { // print(String.format("SUCCESS -- word %s found at %s %s times\n", searchWord, curUrl, amount)); addMessage(String.format("SUCCESS -- word %s found at %s %s times\n", searchWord, curUrl, amount)); successPages++; amountFound += amount; if (shouldSaveHitLinks) resultPages.add(curUrl); urlHits.put(curUrl, amount); } this.pagesPending.addAll(branch.getLinks()); } System.out.println("done searching"); // print("========================"); addMessage("========================"); // print(String.format("DONE -- Visited %s webpages\nHits: %s\nAmount of pages with hits: %s\n", this.pagesVisited.size(), amountFound, successPages)); addMessage(String.format("DONE -- Visited %s webpages\nHits: %s\nAmount of pages with hits: %s\n", this.pagesVisited.size(), amountFound, successPages)); if (shouldSaveHitLinks) { // print(String.format("Successful pages: \n%s", showCombinations(urlHits))); addMessage(String.format("Successful pages: \n%s", showCombinations(urlHits))); } done = true; } private String display(List list) { StringBuilder res = new StringBuilder(); for (int i = 0; i < list.size(); i++) { res.append(list.get(i)).append("\n"); } return res.toString(); } private String showCombinations(Map urls) { StringBuilder res = new StringBuilder(); Set keys = urls.keySet(); for (String url : keys) { res.append(url).append(" (").append(urls.get(url)).append(" hits)\n"); } return res.toString(); } /** * sets the amount of max pages * * @param amount the amount of pages */ public void setAmountOfPages(int amount) { this.amountOfPages = amount; } public void setShouldSaveHitLinks(boolean shouldSaveHitLinks) { this.shouldSaveHitLinks = shouldSaveHitLinks; } public ArrayList getResultPages() { return this.resultPages; } public Map getUrlHits() { return urlHits; } public int getAmountFound() { return amountFound; } public boolean usesDebug() { return debug; } public void setDebug(boolean debug) { this.debug = debug; } /** * clears the crawler */ public void clear() { this.urlHits.clear(); this.resultPages.clear(); this.pagesPending.clear(); this.pagesVisited.clear(); this.successPages = 0; this.amountFound = 0; this.done = false; } private void print(String text) { if (debug) System.out.println(text); } public void addMessage(String message) { if (!this.messages.contains(message)) { this.messages.add(message); } // System.out.println("ADDED MESSAGE " + message); } public LinkedList getMessages() { return this.messages; } public void clearMessages() { this.messages.clear(); } public boolean isDone() { return done; } }