8 Commits

Author SHA1 Message Date
Sem van der Hoeven
5f79fb06f7 fixed bug 2020-03-03 18:55:29 +01:00
Sem van der Hoeven
996e6abc4b Merge remote-tracking branch 'origin/master' 2020-03-03 17:40:20 +01:00
Sem van der Hoeven
9946dc9d12 fixed counter 2020-03-03 17:39:42 +01:00
Sem van der Hoeven
a922615e96 added debug mode 2020-03-03 16:50:38 +01:00
Sem van der Hoeven
eb12a813b0 added debug boolean and get choice method 2020-03-03 16:38:36 +01:00
Sem van der Hoeven
38b0524b0d removed error message spam 2020-03-03 16:33:12 +01:00
SemvdH
e2ce5cac3b Update README.md 2020-01-20 17:33:38 +01:00
SemvdH
cb163f8ac7 Create README.md 2020-01-20 17:31:57 +01:00
4 changed files with 73 additions and 70 deletions

6
README.md Normal file
View File

@@ -0,0 +1,6 @@
# WebCrawler
a small web crawler that searches for a word you give it
## Usage
to run the jar via the command line:
`java -jar <JAR LOCATION>/WebCrawler.jar`

View File

@@ -13,10 +13,20 @@ import java.util.List;
public class CrawlBranch { public class CrawlBranch {
private List<String> links = new LinkedList<>(); private List<String> links = new LinkedList<>();
private Document htmlDocument; private Document htmlDocument;
private boolean debug;
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1"; private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
public CrawlBranch() {
this(false);
}
public CrawlBranch(boolean debug) {
this.debug = debug;
}
/** /**
* crawls the links in it's current arrayList of links * crawls the links in it's current arrayList of links
*
* @param url the url to start from * @param url the url to start from
* @return <code>true</code> if the search was successful, <code>false otherwise</code> * @return <code>true</code> if the search was successful, <code>false otherwise</code>
*/ */
@@ -26,42 +36,43 @@ public class CrawlBranch {
this.htmlDocument = connection.get(); this.htmlDocument = connection.get();
if (connection.response().statusCode() == 200) { if (connection.response().statusCode() == 200) {
System.out.println("VISITING -- Recieved web page at " + url); print("VISITING -- Recieved web page at " + url);
} else { } else {
System.out.println("FAIL -- recieved something else than a web page"); print("FAIL -- recieved something else than a web page");
return false; return false;
} }
Elements linksOnPage = htmlDocument.select("a[href]"); Elements linksOnPage = htmlDocument.select("a[href]");
System.out.println("FOUND (" + linksOnPage.size() + ") links"); print("FOUND (" + linksOnPage.size() + ") links");
for (Element link : linksOnPage) { for (Element link : linksOnPage) {
this.links.add(link.absUrl("href")); this.links.add(link.absUrl("href"));
} }
return true; return true;
} catch (Exception e) { } catch (Exception e) {
System.out.println("ERROR -- error in out http request : " + e); //System.out.println("ERROR -- error in out http request : " + e);
return false; return false;
} }
} }
/** /**
* searches how many times a word occurs in a page * searches how many times a word occurs in a page
*
* @param word the word to look for * @param word the word to look for
* @return the amount of occurrences in the web page, -1 if the word is not found * @return the amount of occurrences in the web page, -1 if the word is not found
*/ */
public int searchForWord(String word) { public int searchForWord(String word) {
if (this.htmlDocument == null){ if (this.htmlDocument == null) {
System.out.println("ERROR -- call crawl before searhing"); //System.out.println("ERROR -- call crawl before searhing");
return -1; return -1;
} }
System.out.printf("Searching for %s...", word); print(String.format("Searching for %s...\n", word));
System.out.println();
String bodyText = this.htmlDocument.body().text(); String bodyText = this.htmlDocument.body().text();
return count(bodyText.toLowerCase(),word.toLowerCase()); return count(bodyText.toLowerCase(), word.toLowerCase());
} }
/** /**
* counts how many times a word occurs in a string * counts how many times a word occurs in a string
*
* @param text the string to search in for the word * @param text the string to search in for the word
* @param word the word to search for * @param word the word to search for
* @return the amount of times the given word was found in the string * @return the amount of times the given word was found in the string
@@ -77,9 +88,14 @@ public class CrawlBranch {
/** /**
* gets the links * gets the links
*
* @return the links * @return the links
*/ */
public List<String> getLinks() { public List<String> getLinks() {
return this.links; return this.links;
} }
private void print(String text) {
if (debug) System.out.println(text);
}
} }

View File

@@ -12,12 +12,16 @@ public class Main {
System.out.print("Enter the maximum amount of pages the crawler should visit : "); System.out.print("Enter the maximum amount of pages the crawler should visit : ");
int amount = Integer.parseInt(scanner.nextLine().trim()); int amount = Integer.parseInt(scanner.nextLine().trim());
System.out.print("Should the crawler save the links with hits? (Y/N) : "); System.out.print("Should the crawler save the links with hits? (Y/N) : ");
String choice = scanner.nextLine().toLowerCase().trim(); boolean save = getChoice(scanner.nextLine());
boolean save; System.out.print("Do you want to enable debug mode? (Y/N) : ");
if (choice.equals("y")) save = true; boolean debug = getChoice(scanner.nextLine());
else if (choice.equals("n")) save = false; if (debug) System.out.println("[INFO] - Debug mode enabled");
else save = false; WebCrawler crawler = new WebCrawler(amount,save,debug);
WebCrawler crawler = new WebCrawler(amount,save);
crawler.search(startUrl,word); crawler.search(startUrl,word);
} }
private static boolean getChoice(String choice) {
if (choice.trim().toLowerCase().equals("y")) return true;
else return false;
}
} }

View File

@@ -7,10 +7,11 @@ public class WebCrawler {
private Set<String> pagesVisited; private Set<String> pagesVisited;
private List<String> pagesPending; private List<String> pagesPending;
private ArrayList<String> resultPages; private ArrayList<String> resultPages;
private Map<String,Integer> urlHits; private Map<String, Integer> urlHits;
private int amountFound = 0; private int amountFound = 0;
private int successPages = 0; private int successPages = 0;
private boolean shouldSaveHitLinks; private boolean shouldSaveHitLinks;
private boolean debug;
/** /**
* creates a new WebCrawler object with standard values * creates a new WebCrawler object with standard values
@@ -21,6 +22,7 @@ public class WebCrawler {
/** /**
* creates a new WebCrawler object with the given amount of max pages * creates a new WebCrawler object with the given amount of max pages
*
* @param maxPages the max amount of pages the crawler should visit * @param maxPages the max amount of pages the crawler should visit
*/ */
public WebCrawler(int maxPages) { public WebCrawler(int maxPages) {
@@ -29,21 +31,28 @@ public class WebCrawler {
/** /**
* creates a new WebCrawler object with the given amount of max pages, and if it should save the hit URLs * creates a new WebCrawler object with the given amount of max pages, and if it should save the hit URLs
*
* @param maxPages the max amount of pages the crawler should visit * @param maxPages the max amount of pages the crawler should visit
* @param shouldSaveHitLinks if the crawler should save the links that have one or more hits * @param shouldSaveHitLinks if the crawler should save the links that have one or more hits
*/ */
public WebCrawler(int maxPages, boolean shouldSaveHitLinks) { public WebCrawler(int maxPages, boolean shouldSaveHitLinks) {
this(maxPages, shouldSaveHitLinks, false);
}
public WebCrawler(int maxPages, boolean shouldSaveHitLinks, boolean debug) {
this.amountOfPages = maxPages; this.amountOfPages = maxPages;
this.shouldSaveHitLinks = shouldSaveHitLinks; this.shouldSaveHitLinks = shouldSaveHitLinks;
this.pagesVisited = new HashSet<>(); this.pagesVisited = new HashSet<>();
this.pagesPending = new LinkedList<>(); this.pagesPending = new LinkedList<>();
this.resultPages = new ArrayList<>(); this.resultPages = new ArrayList<>();
this.urlHits = new HashMap<>(); this.urlHits = new HashMap<>();
this.debug = debug;
} }
/** /**
* gets the next url in the list * gets the next url in the list
*
* @return the next url in the list * @return the next url in the list
*/ */
private String nextUrl() { private String nextUrl() {
@@ -57,30 +66,33 @@ public class WebCrawler {
/** /**
* searches for a word by crawling the web through hyperlinks * searches for a word by crawling the web through hyperlinks
*
* @param url the url to start searching from * @param url the url to start searching from
* @param searchWord the word to search for * @param searchWord the word to search for
*/ */
public void search(String url, String searchWord) { public void search(String url, String searchWord) {
int counter = 0;
while (this.pagesVisited.size() < amountOfPages) { while (this.pagesVisited.size() < amountOfPages) {
String curUrl; String curUrl;
CrawlBranch branch = new CrawlBranch(); CrawlBranch branch = new CrawlBranch(debug);
if (this.pagesPending.isEmpty()) { if (this.pagesPending.isEmpty()) {
curUrl = url; curUrl = url;
this.pagesVisited.add(url); this.pagesVisited.add(url);
} else { } else {
curUrl = this.nextUrl(); curUrl = this.nextUrl();
counter++;
System.out.println(String.format("visiting page %s / %s",counter,amountOfPages));
} }
branch.crawl(curUrl); branch.crawl(curUrl);
int amount = branch.searchForWord(searchWord); int amount = branch.searchForWord(searchWord);
if (amount > 0) { if (amount > 0) {
System.out.printf("SUCCESS -- word %s found at %s %s times", searchWord, curUrl, amount); print(String.format("SUCCESS -- word %s found at %s %s times\n", searchWord, curUrl, amount));
System.out.println();
successPages++; successPages++;
amountFound += amount; amountFound += amount;
if (shouldSaveHitLinks) if (shouldSaveHitLinks)
resultPages.add(curUrl); resultPages.add(curUrl);
urlHits.put(curUrl,amount); urlHits.put(curUrl, amount);
} }
this.pagesPending.addAll(branch.getLinks()); this.pagesPending.addAll(branch.getLinks());
} }
@@ -110,6 +122,7 @@ public class WebCrawler {
/** /**
* sets the amount of max pages * sets the amount of max pages
*
* @param amount the amount of pages * @param amount the amount of pages
*/ */
public void setAmountOfPages(int amount) { public void setAmountOfPages(int amount) {
@@ -132,6 +145,14 @@ public class WebCrawler {
return amountFound; return amountFound;
} }
public boolean usesDebug() {
return debug;
}
public void setDebug(boolean debug) {
this.debug = debug;
}
/** /**
* clears the crawler * clears the crawler
*/ */
@@ -144,51 +165,7 @@ public class WebCrawler {
this.amountFound = 0; this.amountFound = 0;
} }
// public static void main(String[] args) { private void print(String text) {
// Scanner input = new Scanner(System.in); if (debug) System.out.println(text);
// System.out.println("Enter a URL : "); }
// String urlInput = input.nextLine().trim();
// crawler(urlInput);
//
// }
//
// public static void crawler(String startUrl) {
// ArrayList<String> pending = new ArrayList<>();
// ArrayList<String> traversed = new ArrayList<>();
//
// pending.add(startUrl);
// while (!pending.isEmpty() && traversed.size() <= 100) {
// String tempUrl = pending.remove(0);
// if (!traversed.contains(tempUrl)) {
// traversed.add(tempUrl);
// System.out.println("crawling: " + tempUrl);
//
// for (String s : getSubURLs(tempUrl)) {
// if (!traversed.contains(s)) pending.add(s);
// }
// }
// }
// }
//
// public static ArrayList<String> getSubURLs(String urlString) {
// ArrayList<String> subUrls = new ArrayList<>();
//
// try {
// URL url = new URL(urlString);
// Scanner urlScanner = new Scanner(url.openStream());
// int cur = 0;
// while (urlScanner.hasNext()) {
// String input = urlScanner.nextLine();
// cur = input.indexOf("http:", cur);
// while (cur > 0) {
// int endIndex = input.indexOf("\"", cur);
// cur = endIndex > 0 ? input.indexOf("http:", endIndex) : -1;
// }
// }
// } catch (IOException e) {
// e.printStackTrace();
// }
//
// return subUrls;
// }
} }