Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5f79fb06f7 | ||
|
|
996e6abc4b | ||
|
|
9946dc9d12 | ||
|
|
a922615e96 | ||
|
|
eb12a813b0 | ||
|
|
38b0524b0d | ||
|
|
e2ce5cac3b | ||
|
|
cb163f8ac7 |
6
README.md
Normal file
6
README.md
Normal file
@@ -0,0 +1,6 @@
|
||||
# WebCrawler
|
||||
A small web crawler that searches for a word you give it.
|
||||
|
||||
## Usage
|
||||
to run the jar via the command line:
|
||||
`java -jar <JAR LOCATION>/WebCrawler.jar`
|
||||
@@ -13,10 +13,20 @@ import java.util.List;
|
||||
public class CrawlBranch {
|
||||
private List<String> links = new LinkedList<>();
|
||||
private Document htmlDocument;
|
||||
private boolean debug;
|
||||
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
|
||||
|
||||
public CrawlBranch() {
|
||||
this(false);
|
||||
}
|
||||
|
||||
public CrawlBranch(boolean debug) {
|
||||
this.debug = debug;
|
||||
}
|
||||
|
||||
/**
|
||||
* crawls the given url and stores the links found on that page in its list of links
|
||||
*
|
||||
* @param url the url to start from
|
||||
* @return <code>true</code> if the search was successful, <code>false</code> otherwise
|
||||
*/
|
||||
@@ -26,42 +36,43 @@ public class CrawlBranch {
|
||||
this.htmlDocument = connection.get();
|
||||
|
||||
if (connection.response().statusCode() == 200) {
|
||||
System.out.println("VISITING -- Recieved web page at " + url);
|
||||
print("VISITING -- Recieved web page at " + url);
|
||||
} else {
|
||||
System.out.println("FAIL -- recieved something else than a web page");
|
||||
print("FAIL -- recieved something else than a web page");
|
||||
return false;
|
||||
}
|
||||
|
||||
Elements linksOnPage = htmlDocument.select("a[href]");
|
||||
System.out.println("FOUND (" + linksOnPage.size() + ") links");
|
||||
print("FOUND (" + linksOnPage.size() + ") links");
|
||||
for (Element link : linksOnPage) {
|
||||
this.links.add(link.absUrl("href"));
|
||||
}
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
System.out.println("ERROR -- error in out http request : " + e);
|
||||
//System.out.println("ERROR -- error in out http request : " + e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* searches how many times a word occurs in a page
|
||||
*
|
||||
* @param word the word to look for
|
||||
* @return the amount of occurrences in the web page, -1 if no page has been loaded yet
|
||||
*/
|
||||
public int searchForWord(String word) {
|
||||
if (this.htmlDocument == null){
|
||||
System.out.println("ERROR -- call crawl before searhing");
|
||||
if (this.htmlDocument == null) {
|
||||
//System.out.println("ERROR -- call crawl before searhing");
|
||||
return -1;
|
||||
}
|
||||
System.out.printf("Searching for %s...", word);
|
||||
System.out.println();
|
||||
print(String.format("Searching for %s...\n", word));
|
||||
String bodyText = this.htmlDocument.body().text();
|
||||
return count(bodyText.toLowerCase(),word.toLowerCase());
|
||||
return count(bodyText.toLowerCase(), word.toLowerCase());
|
||||
}
|
||||
|
||||
/**
|
||||
* counts how many times a word occurs in a string
|
||||
*
|
||||
* @param text the string to search in for the word
|
||||
* @param word the word to search for
|
||||
* @return the amount of times the given word was found in the string
|
||||
@@ -77,9 +88,14 @@ public class CrawlBranch {
|
||||
|
||||
/**
|
||||
* gets the links
|
||||
*
|
||||
* @return the links
|
||||
*/
|
||||
public List<String> getLinks() {
|
||||
return this.links;
|
||||
}
|
||||
|
||||
private void print(String text) {
|
||||
if (debug) System.out.println(text);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,12 +12,16 @@ public class Main {
|
||||
System.out.print("Enter the maximum amount of pages the crawler should visit : ");
|
||||
int amount = Integer.parseInt(scanner.nextLine().trim());
|
||||
System.out.print("Should the crawler save the links with hits? (Y/N) : ");
|
||||
String choice = scanner.nextLine().toLowerCase().trim();
|
||||
boolean save;
|
||||
if (choice.equals("y")) save = true;
|
||||
else if (choice.equals("n")) save = false;
|
||||
else save = false;
|
||||
WebCrawler crawler = new WebCrawler(amount,save);
|
||||
boolean save = getChoice(scanner.nextLine());
|
||||
System.out.print("Do you want to enable debug mode? (Y/N) : ");
|
||||
boolean debug = getChoice(scanner.nextLine());
|
||||
if (debug) System.out.println("[INFO] - Debug mode enabled");
|
||||
WebCrawler crawler = new WebCrawler(amount,save,debug);
|
||||
crawler.search(startUrl,word);
|
||||
}
|
||||
|
||||
private static boolean getChoice(String choice) {
|
||||
if (choice.trim().toLowerCase().equals("y")) return true;
|
||||
else return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,10 +7,11 @@ public class WebCrawler {
|
||||
private Set<String> pagesVisited;
|
||||
private List<String> pagesPending;
|
||||
private ArrayList<String> resultPages;
|
||||
private Map<String,Integer> urlHits;
|
||||
private Map<String, Integer> urlHits;
|
||||
private int amountFound = 0;
|
||||
private int successPages = 0;
|
||||
private boolean shouldSaveHitLinks;
|
||||
private boolean debug;
|
||||
|
||||
/**
|
||||
* creates a new WebCrawler object with standard values
|
||||
@@ -21,6 +22,7 @@ public class WebCrawler {
|
||||
|
||||
/**
|
||||
* creates a new WebCrawler object with the given amount of max pages
|
||||
*
|
||||
* @param maxPages the max amount of pages the crawler should visit
|
||||
*/
|
||||
public WebCrawler(int maxPages) {
|
||||
@@ -29,21 +31,28 @@ public class WebCrawler {
|
||||
|
||||
/**
|
||||
* creates a new WebCrawler object with the given amount of max pages, and if it should save the hit URLs
|
||||
* @param maxPages the max amount of pages the crawler should visit
|
||||
*
|
||||
* @param maxPages the max amount of pages the crawler should visit
|
||||
* @param shouldSaveHitLinks if the crawler should save the links that have one or more hits
|
||||
*/
|
||||
public WebCrawler(int maxPages, boolean shouldSaveHitLinks) {
|
||||
this(maxPages, shouldSaveHitLinks, false);
|
||||
}
|
||||
|
||||
public WebCrawler(int maxPages, boolean shouldSaveHitLinks, boolean debug) {
|
||||
this.amountOfPages = maxPages;
|
||||
this.shouldSaveHitLinks = shouldSaveHitLinks;
|
||||
this.pagesVisited = new HashSet<>();
|
||||
this.pagesPending = new LinkedList<>();
|
||||
this.resultPages = new ArrayList<>();
|
||||
this.urlHits = new HashMap<>();
|
||||
this.debug = debug;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* gets the next url in the list
|
||||
*
|
||||
* @return the next url in the list
|
||||
*/
|
||||
private String nextUrl() {
|
||||
@@ -57,30 +66,33 @@ public class WebCrawler {
|
||||
|
||||
/**
|
||||
* searches for a word by crawling the web through hyperlinks
|
||||
* @param url the url to start searching from
|
||||
*
|
||||
* @param url the url to start searching from
|
||||
* @param searchWord the word to search for
|
||||
*/
|
||||
public void search(String url, String searchWord) {
|
||||
int counter = 0;
|
||||
while (this.pagesVisited.size() < amountOfPages) {
|
||||
String curUrl;
|
||||
CrawlBranch branch = new CrawlBranch();
|
||||
CrawlBranch branch = new CrawlBranch(debug);
|
||||
if (this.pagesPending.isEmpty()) {
|
||||
curUrl = url;
|
||||
this.pagesVisited.add(url);
|
||||
} else {
|
||||
curUrl = this.nextUrl();
|
||||
counter++;
|
||||
System.out.println(String.format("visiting page %s / %s",counter,amountOfPages));
|
||||
}
|
||||
branch.crawl(curUrl);
|
||||
|
||||
int amount = branch.searchForWord(searchWord);
|
||||
if (amount > 0) {
|
||||
System.out.printf("SUCCESS -- word %s found at %s %s times", searchWord, curUrl, amount);
|
||||
System.out.println();
|
||||
print(String.format("SUCCESS -- word %s found at %s %s times\n", searchWord, curUrl, amount));
|
||||
successPages++;
|
||||
amountFound += amount;
|
||||
if (shouldSaveHitLinks)
|
||||
resultPages.add(curUrl);
|
||||
urlHits.put(curUrl,amount);
|
||||
resultPages.add(curUrl);
|
||||
urlHits.put(curUrl, amount);
|
||||
}
|
||||
this.pagesPending.addAll(branch.getLinks());
|
||||
}
|
||||
@@ -110,6 +122,7 @@ public class WebCrawler {
|
||||
|
||||
/**
|
||||
* sets the amount of max pages
|
||||
*
|
||||
* @param amount the amount of pages
|
||||
*/
|
||||
public void setAmountOfPages(int amount) {
|
||||
@@ -132,6 +145,14 @@ public class WebCrawler {
|
||||
return amountFound;
|
||||
}
|
||||
|
||||
public boolean usesDebug() {
|
||||
return debug;
|
||||
}
|
||||
|
||||
public void setDebug(boolean debug) {
|
||||
this.debug = debug;
|
||||
}
|
||||
|
||||
/**
|
||||
* clears the crawler
|
||||
*/
|
||||
@@ -144,51 +165,7 @@ public class WebCrawler {
|
||||
this.amountFound = 0;
|
||||
}
|
||||
|
||||
// public static void main(String[] args) {
|
||||
// Scanner input = new Scanner(System.in);
|
||||
// System.out.println("Enter a URL : ");
|
||||
// String urlInput = input.nextLine().trim();
|
||||
// crawler(urlInput);
|
||||
//
|
||||
// }
|
||||
//
|
||||
// public static void crawler(String startUrl) {
|
||||
// ArrayList<String> pending = new ArrayList<>();
|
||||
// ArrayList<String> traversed = new ArrayList<>();
|
||||
//
|
||||
// pending.add(startUrl);
|
||||
// while (!pending.isEmpty() && traversed.size() <= 100) {
|
||||
// String tempUrl = pending.remove(0);
|
||||
// if (!traversed.contains(tempUrl)) {
|
||||
// traversed.add(tempUrl);
|
||||
// System.out.println("crawling: " + tempUrl);
|
||||
//
|
||||
// for (String s : getSubURLs(tempUrl)) {
|
||||
// if (!traversed.contains(s)) pending.add(s);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// public static ArrayList<String> getSubURLs(String urlString) {
|
||||
// ArrayList<String> subUrls = new ArrayList<>();
|
||||
//
|
||||
// try {
|
||||
// URL url = new URL(urlString);
|
||||
// Scanner urlScanner = new Scanner(url.openStream());
|
||||
// int cur = 0;
|
||||
// while (urlScanner.hasNext()) {
|
||||
// String input = urlScanner.nextLine();
|
||||
// cur = input.indexOf("http:", cur);
|
||||
// while (cur > 0) {
|
||||
// int endIndex = input.indexOf("\"", cur);
|
||||
// cur = endIndex > 0 ? input.indexOf("http:", endIndex) : -1;
|
||||
// }
|
||||
// }
|
||||
// } catch (IOException e) {
|
||||
// e.printStackTrace();
|
||||
// }
|
||||
//
|
||||
// return subUrls;
|
||||
// }
|
||||
private void print(String text) {
|
||||
if (debug) System.out.println(text);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user