diff --git a/.gitignore b/.gitignore
index ab32039..e6ceafd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,6 @@ fabric.properties
 .idea/**/markdown-navigator/
 # End of https://www.gitignore.io/api/intellij
+.idea/artifacts/WebCrawler_jar.xml
+.idea/modules.xml
+.idea/misc.xml
diff --git a/WebCrawler.iml b/WebCrawler.iml
new file mode 100644
index 0000000..f47cdd6
--- /dev/null
+++ b/WebCrawler.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/main/java/webcrawler/CrawlBranch.java b/src/main/java/webcrawler/CrawlBranch.java
new file mode 100644
index 0000000..cc253c9
--- /dev/null
+++ b/src/main/java/webcrawler/CrawlBranch.java
@@ -0,0 +1,85 @@
+package main.java.webcrawler;
+
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+public class CrawlBranch {
+    private List<String> links = new LinkedList<>();
+    private Document htmlDocument;
+    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
+
+    /**
+     * crawls the given url and stores the links found on that page
+     * @param url the url to crawl
+     * @return true if the page was fetched successfully, false otherwise
+     */
+    public boolean crawl(String url) {
+        try {
+            Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
+            this.htmlDocument = connection.get();
+
+            if (connection.response().statusCode() == 200) {
+                System.out.println("VISITING -- Received web page at " + url);
+            } else {
+                System.out.println("FAIL -- received something other than a web page");
+                return false;
+            }
+
+            Elements linksOnPage = htmlDocument.select("a[href]");
+            System.out.println("FOUND (" + linksOnPage.size() + ") links");
+            for (Element link : linksOnPage) {
+                this.links.add(link.absUrl("href"));
+            }
+            return true;
+        } catch (Exception e) {
+            System.out.println("ERROR -- error in our HTTP request : " + e);
+            return false;
+        }
+    }
+
+    /**
+     * counts how many times a word occurs in the crawled page
+     * @param word the word to look for
+     * @return the amount of occurrences in the web page, -1 if no page has been crawled yet
+     */
+    public int searchForWord(String word) {
+        if (this.htmlDocument == null) {
+            System.out.println("ERROR -- call crawl before searching");
+            return -1;
+        }
+        System.out.printf("Searching for %s...", word);
+        System.out.println();
+        String bodyText = this.htmlDocument.body().text();
+        return count(bodyText.toLowerCase(), word.toLowerCase());
+    }
+
+    /**
+     * counts how many times a word occurs in a string
+     * @param text the string to search in for the word
+     * @param word the word to search for
+     * @return the amount of times the given word was found in the string
+     */
+    private int count(String text, String word) {
+        int amount = 0;
+        String[] words = text.split(" ");
+        for (int i = 0; i < words.length; i++) {
+            if (words[i].contains(word)) amount++;
+        }
+        return amount;
+    }
+
+    /**
+     * gets the links found on the crawled page
+     * @return the links
+     */
+    public List<String> getLinks() {
+        return this.links;
+    }
+}
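For reference, CrawlBranch can be exercised on its own before wiring it into the crawler. The sketch below is illustrative only and not part of this change: the demo class name, URL, and search word are made up, and it assumes jsoup is available on the classpath.

```java
// Hypothetical demo class, not part of this change.
// Assumes jsoup is on the classpath; the URL and search word are placeholders.
package main.java.webcrawler;

public class CrawlBranchDemo {
    public static void main(String[] args) {
        CrawlBranch branch = new CrawlBranch();
        if (branch.crawl("https://example.com")) {
            // searchForWord returns -1 only when no page has been crawled yet
            int hits = branch.searchForWord("example");
            System.out.println("hits on page: " + hits);
            System.out.println("links found: " + branch.getLinks().size());
        }
    }
}
```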
diff --git a/src/main/java/webcrawler/Main.java b/src/main/java/webcrawler/Main.java
new file mode 100644
index 0000000..3940852
--- /dev/null
+++ b/src/main/java/webcrawler/Main.java
@@ -0,0 +1,23 @@
+package main.java.webcrawler;
+
+import java.util.Scanner;
+
+public class Main {
+    public static void main(String[] args) {
+        Scanner scanner = new Scanner(System.in);
+        System.out.print("Enter a starting URL : ");
+        String startUrl = scanner.nextLine().trim();
+        System.out.print("Enter a word to search for : ");
+        String word = scanner.nextLine().trim();
+        System.out.print("Enter the maximum amount of pages the crawler should visit : ");
+        int amount = Integer.parseInt(scanner.nextLine().trim());
+        System.out.print("Should the crawler save the links with hits? (Y/N) : ");
+        String choice = scanner.nextLine().toLowerCase().trim();
+        boolean save = choice.equals("y");
+        WebCrawler crawler = new WebCrawler(amount, save);
+        crawler.search(startUrl, word);
+    }
+}
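One caveat in Main: Integer.parseInt throws a NumberFormatException if the page count entered is not a number. A possible guard is sketched below with a hypothetical helper class that is not part of this change; it simply falls back to a default limit.

```java
// Hypothetical helper, not part of this change: read the page limit from the
// scanner and fall back to a default when the input is not a valid integer.
import java.util.Scanner;

public class SafeInput {
    public static int readPageLimit(Scanner scanner, int fallback) {
        String line = scanner.nextLine().trim();
        try {
            return Integer.parseInt(line);
        } catch (NumberFormatException e) {
            System.out.println("Not a number, using the default of " + fallback + " pages.");
            return fallback;
        }
    }
}
```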
diff --git a/src/main/java/webcrawler/WebCrawler.java b/src/main/java/webcrawler/WebCrawler.java
new file mode 100644
index 0000000..4eb9bb1
--- /dev/null
+++ b/src/main/java/webcrawler/WebCrawler.java
@@ -0,0 +1,194 @@
+package main.java.webcrawler;
+
+import java.util.*;
+
+public class WebCrawler {
+    private int amountOfPages;
+    private Set<String> pagesVisited;
+    private List<String> pagesPending;
+    private ArrayList<String> resultPages;
+    private Map<String, Integer> urlHits;
+    private int amountFound = 0;
+    private int successPages = 0;
+    private boolean shouldSaveHitLinks;
+
+    /**
+     * creates a new WebCrawler object with default values
+     */
+    public WebCrawler() {
+        this(50, true);
+    }
+
+    /**
+     * creates a new WebCrawler object with the given maximum amount of pages
+     * @param maxPages the max amount of pages the crawler should visit
+     */
+    public WebCrawler(int maxPages) {
+        this(maxPages, true);
+    }
+
+    /**
+     * creates a new WebCrawler object with the given maximum amount of pages, and whether it should save the hit URLs
+     * @param maxPages the max amount of pages the crawler should visit
+     * @param shouldSaveHitLinks if the crawler should save the links that have one or more hits
+     */
+    public WebCrawler(int maxPages, boolean shouldSaveHitLinks) {
+        this.amountOfPages = maxPages;
+        this.shouldSaveHitLinks = shouldSaveHitLinks;
+        this.pagesVisited = new HashSet<>();
+        this.pagesPending = new LinkedList<>();
+        this.resultPages = new ArrayList<>();
+        this.urlHits = new HashMap<>();
+    }
+
+    /**
+     * gets the next unvisited url from the pending list
+     * @return the next url in the list
+     */
+    private String nextUrl() {
+        String next;
+        do {
+            next = this.pagesPending.remove(0);
+        } while (this.pagesVisited.contains(next));
+        this.pagesVisited.add(next);
+        return next;
+    }
+
+    /**
+     * searches for a word by crawling the web through hyperlinks
+     * @param url the url to start searching from
+     * @param searchWord the word to search for
+     */
+    public void search(String url, String searchWord) {
+        while (this.pagesVisited.size() < amountOfPages) {
+            String curUrl;
+            CrawlBranch branch = new CrawlBranch();
+            if (this.pagesPending.isEmpty()) {
+                curUrl = url;
+                this.pagesVisited.add(url);
+            } else {
+                curUrl = this.nextUrl();
+            }
+            branch.crawl(curUrl);
+
+            int amount = branch.searchForWord(searchWord);
+            if (amount > 0) {
+                System.out.printf("SUCCESS -- word %s found at %s %s times", searchWord, curUrl, amount);
+                System.out.println();
+                successPages++;
+                amountFound += amount;
+                if (shouldSaveHitLinks)
+                    resultPages.add(curUrl);
+                urlHits.put(curUrl, amount);
+            }
+            this.pagesPending.addAll(branch.getLinks());
+        }
+        System.out.println("========================");
+        System.out.printf("DONE -- Visited %s webpages\nHits: %s\nAmount of pages with hits: %s\n", this.pagesVisited.size(), amountFound, successPages);
+        if (shouldSaveHitLinks) {
+            System.out.printf("Successful pages: \n%s", showCombinations(urlHits));
+        }
+    }
+
+    private String display(List<String> list) {
+        StringBuilder res = new StringBuilder();
+        for (int i = 0; i < list.size(); i++) {
+            res.append(list.get(i)).append("\n");
+        }
+        return res.toString();
+    }
+
+    private String showCombinations(Map<String, Integer> urls) {
+        StringBuilder res = new StringBuilder();
+        Set<String> keys = urls.keySet();
+        for (String url : keys) {
+            res.append(url).append(" (").append(urls.get(url)).append(" hits)\n");
+        }
+        return res.toString();
+    }
+
+    /**
+     * sets the maximum amount of pages
+     * @param amount the amount of pages
+     */
+    public void setAmountOfPages(int amount) {
+        this.amountOfPages = amount;
+    }
+
+    public void setShouldSaveHitLinks(boolean shouldSaveHitLinks) {
+        this.shouldSaveHitLinks = shouldSaveHitLinks;
+    }
+
+    public ArrayList<String> getResultPages() {
+        return this.resultPages;
+    }
+
+    public Map<String, Integer> getUrlHits() {
+        return urlHits;
+    }
+
+    public int getAmountFound() {
+        return amountFound;
+    }
+
+    /**
+     * clears the crawler
+     */
+    public void clear() {
+        this.urlHits.clear();
+        this.resultPages.clear();
+        this.pagesPending.clear();
+        this.pagesVisited.clear();
+        this.successPages = 0;
+        this.amountFound = 0;
+    }
+
+//    public static void main(String[] args) {
+//        Scanner input = new Scanner(System.in);
+//        System.out.println("Enter a URL : ");
+//        String urlInput = input.nextLine().trim();
+//        crawler(urlInput);
+//
+//    }
+//
+//    public static void crawler(String startUrl) {
+//        ArrayList<String> pending = new ArrayList<>();
+//        ArrayList<String> traversed = new ArrayList<>();
+//
+//        pending.add(startUrl);
+//        while (!pending.isEmpty() && traversed.size() <= 100) {
+//            String tempUrl = pending.remove(0);
+//            if (!traversed.contains(tempUrl)) {
+//                traversed.add(tempUrl);
+//                System.out.println("crawling: " + tempUrl);
+//
+//                for (String s : getSubURLs(tempUrl)) {
+//                    if (!traversed.contains(s)) pending.add(s);
+//                }
+//            }
+//        }
+//    }
+//
+//    public static ArrayList<String> getSubURLs(String urlString) {
+//        ArrayList<String> subUrls = new ArrayList<>();
+//
+//        try {
+//            URL url = new URL(urlString);
+//            Scanner urlScanner = new Scanner(url.openStream());
+//            int cur = 0;
+//            while (urlScanner.hasNext()) {
+//                String input = urlScanner.nextLine();
+//                cur = input.indexOf("http:", cur);
+//                while (cur > 0) {
+//                    int endIndex = input.indexOf("\"", cur);
+//                    cur = endIndex > 0 ? input.indexOf("http:", endIndex) : -1;
+//                }
+//            }
+//        } catch (IOException e) {
+//            e.printStackTrace();
+//        }
+//
+//        return subUrls;
+//    }
+}
diff --git a/src/main/resources/META-INF/MANIFEST.MF b/src/main/resources/META-INF/MANIFEST.MF
new file mode 100644
index 0000000..239918c
--- /dev/null
+++ b/src/main/resources/META-INF/MANIFEST.MF
@@ -0,0 +1,3 @@
+Manifest-Version: 1.0
+Main-Class: main.java.webcrawler.Main
+
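WebCrawler can also be driven without the interactive prompt in Main. The sketch below is a minimal programmatic usage example, not part of this change: the demo class name, page limit, start URL, and search word are placeholders, and only methods declared in WebCrawler above are used.

```java
// Hypothetical demo class, not part of this change.
// The page limit, start URL, and search word are placeholder values.
package main.java.webcrawler;

import java.util.Map;

public class WebCrawlerDemo {
    public static void main(String[] args) {
        // Visit at most 10 pages and keep the links that had hits.
        WebCrawler crawler = new WebCrawler(10, true);
        crawler.search("https://example.com", "example");

        // Inspect the results collected during the crawl.
        System.out.println("total hits: " + crawler.getAmountFound());
        for (Map.Entry<String, Integer> entry : crawler.getUrlHits().entrySet()) {
            System.out.println(entry.getKey() + " -> " + entry.getValue() + " hits");
        }

        // Reset the internal state before reusing the same instance.
        crawler.clear();
    }
}
```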