upload to github
.gitignore (vendored, +3 lines)
@@ -114,3 +114,6 @@ fabric.properties
 .idea/**/markdown-navigator/

 # End of https://www.gitignore.io/api/intellij
+.idea/artifacts/WebCrawler_jar.xml
+.idea/modules.xml
+.idea/misc.xml
WebCrawler.iml (new file, 12 lines)
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" name="jsoup-1.12.1" level="project" />
  </component>
</module>
src/main/java/webcrawler/CrawlBranch.java (new file, 85 lines)
@@ -0,0 +1,85 @@
package main.java.webcrawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.LinkedList;
import java.util.List;

public class CrawlBranch {
    private List<String> links = new LinkedList<>();
    private Document htmlDocument;
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";

    /**
     * fetches the given URL and collects the links found on the page
     * @param url the URL to start from
     * @return <code>true</code> if the page was fetched successfully, <code>false</code> otherwise
     */
    public boolean crawl(String url) {
        try {
            Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
            this.htmlDocument = connection.get();

            if (connection.response().statusCode() == 200) {
                System.out.println("VISITING -- Received web page at " + url);
            } else {
                System.out.println("FAIL -- received something other than a web page");
                return false;
            }

            Elements linksOnPage = htmlDocument.select("a[href]");
            System.out.println("FOUND (" + linksOnPage.size() + ") links");
            for (Element link : linksOnPage) {
                this.links.add(link.absUrl("href"));
            }
            return true;
        } catch (Exception e) {
            System.out.println("ERROR -- error in our HTTP request: " + e);
            return false;
        }
    }

    /**
     * searches how many times a word occurs in the page fetched by {@link #crawl(String)}
     * @param word the word to look for
     * @return the number of occurrences in the web page, -1 if no page has been crawled yet
     */
    public int searchForWord(String word) {
        if (this.htmlDocument == null) {
            System.out.println("ERROR -- call crawl before searching");
            return -1;
        }
        System.out.printf("Searching for %s...%n", word);
        String bodyText = this.htmlDocument.body().text();
        return count(bodyText.toLowerCase(), word.toLowerCase());
    }

    /**
     * counts how many whitespace-separated tokens of a string contain the given word
     * (substring match, so "crawling" also counts as a hit for "crawl")
     * @param text the string to search in
     * @param word the word to search for
     * @return the number of tokens that contain the given word
     */
    private int count(String text, String word) {
        int amount = 0;
        String[] words = text.split(" ");
        for (int i = 0; i < words.length; i++) {
            if (words[i].contains(word)) amount++;
        }
        return amount;
    }

    /**
     * gets the links collected by the last crawl
     * @return the links
     */
    public List<String> getLinks() {
        return this.links;
    }
}
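A minimal usage sketch for CrawlBranch (illustrative only, not part of the commit; the seed URL and search word are made up):

package main.java.webcrawler;

// Illustrative driver: fetch one page, count hits for a word,
// then inspect the outgoing links collected during the crawl.
public class CrawlBranchDemo {
    public static void main(String[] args) {
        CrawlBranch branch = new CrawlBranch();
        if (branch.crawl("https://example.com")) {       // hypothetical seed URL
            int hits = branch.searchForWord("example");  // token-substring match, see count()
            System.out.println("hits: " + hits);
            System.out.println("links collected: " + branch.getLinks().size());
        }
    }
}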
src/main/java/webcrawler/Main.java (new file, 23 lines)
@@ -0,0 +1,23 @@
package main.java.webcrawler;

import java.util.Scanner;

public class Main {
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        System.out.print("Enter a starting URL : ");
        String startUrl = scanner.nextLine().trim();
        System.out.print("Enter a word to search for : ");
        String word = scanner.nextLine().trim();
        System.out.print("Enter the maximum amount of pages the crawler should visit : ");
        int amount = Integer.parseInt(scanner.nextLine().trim());
        System.out.print("Should the crawler save the links with hits? (Y/N) : ");
        String choice = scanner.nextLine().toLowerCase().trim();
        // any answer other than "y" is treated as "n"
        boolean save = choice.equals("y");

        WebCrawler crawler = new WebCrawler(amount, save);
        crawler.search(startUrl, word);
    }
}
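One caveat in Main: Integer.parseInt throws NumberFormatException on non-numeric input, so a typo at the page-count prompt aborts the run. A sketch of a more forgiving reader that could slot into Main (the helper name readPositiveInt is made up for illustration; Scanner is already imported there):

// Hypothetical helper, not in the commit: re-prompt until the user
// enters a valid positive integer instead of crashing on bad input.
private static int readPositiveInt(Scanner scanner, String prompt) {
    while (true) {
        System.out.print(prompt);
        try {
            int value = Integer.parseInt(scanner.nextLine().trim());
            if (value > 0) return value;
        } catch (NumberFormatException ignored) {
            // fall through and ask again
        }
        System.out.println("Please enter a positive whole number.");
    }
}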
src/main/java/webcrawler/WebCrawler.java (new file, 194 lines)
@@ -0,0 +1,194 @@
package main.java.webcrawler;

import java.util.*;

public class WebCrawler {
    private int amountOfPages;
    private Set<String> pagesVisited;
    private List<String> pagesPending;
    private ArrayList<String> resultPages;
    private Map<String, Integer> urlHits;
    private int amountFound = 0;
    private int successPages = 0;
    private boolean shouldSaveHitLinks;

    /**
     * creates a new WebCrawler object with default values
     */
    public WebCrawler() {
        this(50, true);
    }

    /**
     * creates a new WebCrawler object with the given maximum number of pages
     * @param maxPages the maximum number of pages the crawler should visit
     */
    public WebCrawler(int maxPages) {
        this(maxPages, true);
    }

    /**
     * creates a new WebCrawler object with the given maximum number of pages, and whether it should save the hit URLs
     * @param maxPages the maximum number of pages the crawler should visit
     * @param shouldSaveHitLinks whether the crawler should save the links that have one or more hits
     */
    public WebCrawler(int maxPages, boolean shouldSaveHitLinks) {
        this.amountOfPages = maxPages;
        this.shouldSaveHitLinks = shouldSaveHitLinks;
        this.pagesVisited = new HashSet<>();
        this.pagesPending = new LinkedList<>();
        this.resultPages = new ArrayList<>();
        this.urlHits = new HashMap<>();
    }

    /**
     * gets the next unvisited URL in the pending list and marks it as visited
     * (assumes the pending list still holds at least one unvisited URL)
     * @return the next URL to crawl
     */
    private String nextUrl() {
        String next;
        do {
            next = this.pagesPending.remove(0);
        } while (this.pagesVisited.contains(next));
        this.pagesVisited.add(next);
        return next;
    }

    /**
     * searches for a word by crawling the web through hyperlinks
     * @param url the URL to start searching from
     * @param searchWord the word to search for
     */
    public void search(String url, String searchWord) {
        while (this.pagesVisited.size() < amountOfPages) {
            String curUrl;
            CrawlBranch branch = new CrawlBranch();
            if (this.pagesPending.isEmpty()) {
                curUrl = url;
                this.pagesVisited.add(url);
            } else {
                curUrl = this.nextUrl();
            }
            branch.crawl(curUrl);

            int amount = branch.searchForWord(searchWord);
            if (amount > 0) {
                System.out.printf("SUCCESS -- word %s found at %s %s times%n", searchWord, curUrl, amount);
                successPages++;
                amountFound += amount;
                if (shouldSaveHitLinks) {
                    resultPages.add(curUrl);
                    urlHits.put(curUrl, amount);
                }
            }
            this.pagesPending.addAll(branch.getLinks());
        }
        System.out.println("========================");
        System.out.printf("DONE -- Visited %s webpages\nHits: %s\nAmount of pages with hits: %s\n", this.pagesVisited.size(), amountFound, successPages);
        if (shouldSaveHitLinks) {
            System.out.printf("Successful pages: \n%s", showCombinations(urlHits));
        }
    }

    /**
     * joins a list of URLs into one newline-separated string
     */
    private String display(List<String> list) {
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < list.size(); i++) {
            res.append(list.get(i)).append("\n");
        }
        return res.toString();
    }

    /**
     * formats each URL together with its hit count, one per line
     */
    private String showCombinations(Map<String, Integer> urls) {
        StringBuilder res = new StringBuilder();
        for (String url : urls.keySet()) {
            res.append(url).append(" (").append(urls.get(url)).append(" hits)\n");
        }
        return res.toString();
    }

    /**
     * sets the maximum number of pages
     * @param amount the number of pages
     */
    public void setAmountOfPages(int amount) {
        this.amountOfPages = amount;
    }

    public void setShouldSaveHitLinks(boolean shouldSaveHitLinks) {
        this.shouldSaveHitLinks = shouldSaveHitLinks;
    }

    public ArrayList<String> getResultPages() {
        return this.resultPages;
    }

    public Map<String, Integer> getUrlHits() {
        return urlHits;
    }

    public int getAmountFound() {
        return amountFound;
    }

    /**
     * resets the crawler so it can be reused for a new search
     */
    public void clear() {
        this.urlHits.clear();
        this.resultPages.clear();
        this.pagesPending.clear();
        this.pagesVisited.clear();
        this.successPages = 0;
        this.amountFound = 0;
    }

    // Earlier scanner-based prototype, kept commented out for reference:
    // public static void main(String[] args) {
    //     Scanner input = new Scanner(System.in);
    //     System.out.println("Enter a URL : ");
    //     String urlInput = input.nextLine().trim();
    //     crawler(urlInput);
    // }
    //
    // public static void crawler(String startUrl) {
    //     ArrayList<String> pending = new ArrayList<>();
    //     ArrayList<String> traversed = new ArrayList<>();
    //
    //     pending.add(startUrl);
    //     while (!pending.isEmpty() && traversed.size() <= 100) {
    //         String tempUrl = pending.remove(0);
    //         if (!traversed.contains(tempUrl)) {
    //             traversed.add(tempUrl);
    //             System.out.println("crawling: " + tempUrl);
    //
    //             for (String s : getSubURLs(tempUrl)) {
    //                 if (!traversed.contains(s)) pending.add(s);
    //             }
    //         }
    //     }
    // }
    //
    // public static ArrayList<String> getSubURLs(String urlString) {
    //     ArrayList<String> subUrls = new ArrayList<>();
    //
    //     try {
    //         URL url = new URL(urlString);
    //         Scanner urlScanner = new Scanner(url.openStream());
    //         int cur = 0;
    //         while (urlScanner.hasNext()) {
    //             String input = urlScanner.nextLine();
    //             cur = input.indexOf("http:", cur);
    //             while (cur > 0) {
    //                 int endIndex = input.indexOf("\"", cur);
    //                 cur = endIndex > 0 ? input.indexOf("http:", endIndex) : -1;
    //             }
    //         }
    //     } catch (IOException e) {
    //         e.printStackTrace();
    //     }
    //
    //     return subUrls;
    // }
}
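Main only exercises the interactive path; a sketch of driving WebCrawler programmatically (illustrative only, not part of the commit; the URL and word are made up):

package main.java.webcrawler;

import java.util.Map;

// Illustrative driver: run one bounded search, read the per-URL hit
// counts back through the getters, then reset the crawler for reuse.
public class WebCrawlerDemo {
    public static void main(String[] args) {
        WebCrawler crawler = new WebCrawler(10, true);      // visit at most 10 pages
        crawler.search("https://example.com", "jsoup");     // hypothetical seed URL and word

        Map<String, Integer> hits = crawler.getUrlHits();
        hits.forEach((url, count) -> System.out.println(url + " -> " + count + " hits"));
        System.out.println("total hits: " + crawler.getAmountFound());

        crawler.clear(); // ready for another search with the same settings
    }
}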
src/main/resources/META-INF/MANIFEST.MF (new file, 3 lines)
@@ -0,0 +1,3 @@
Manifest-Version: 1.0
Main-Class: main.java.webcrawler.Main
