diff --git a/.gitignore b/.gitignore
index ab32039..e6ceafd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,6 @@ fabric.properties
.idea/**/markdown-navigator/
# End of https://www.gitignore.io/api/intellij
+.idea/artifacts/WebCrawler_jar.xml
+.idea/modules.xml
+.idea/misc.xml
diff --git a/WebCrawler.iml b/WebCrawler.iml
new file mode 100644
index 0000000..f47cdd6
--- /dev/null
+++ b/WebCrawler.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/main/java/webcrawler/CrawlBranch.java b/src/main/java/webcrawler/CrawlBranch.java
new file mode 100644
index 0000000..cc253c9
--- /dev/null
+++ b/src/main/java/webcrawler/CrawlBranch.java
@@ -0,0 +1,85 @@
+package main.java.webcrawler;
+
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+public class CrawlBranch {
+    private List<String> links = new LinkedList<>();
+    private Document htmlDocument;
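+    // identify as a desktop browser, since some sites block or limit requests from non-browser user agents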
+    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
+
+    /**
+     * crawls the given URL: downloads the page and collects every hyperlink found on it
+     * @param url the URL to crawl
+     * @return true if the page was fetched successfully, false otherwise
+     */
+    public boolean crawl(String url) {
+        try {
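+            // fetch and parse the page; jsoup throws an IOException for network failures
+            // and HTTP error statuses, which the catch block below turns into a failed crawl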
+            Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
+            this.htmlDocument = connection.get();
+
+            if (connection.response().statusCode() == 200) {
+                System.out.println("VISITING -- Received web page at " + url);
+            } else {
+                System.out.println("FAIL -- received HTTP status " + connection.response().statusCode());
+                return false;
+            }
+
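+            // grab every anchor tag with an href attribute; absUrl("href") resolves relative links to absolute URLs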
+            Elements linksOnPage = htmlDocument.select("a[href]");
+            System.out.println("FOUND (" + linksOnPage.size() + ") links");
+            for (Element link : linksOnPage) {
+                this.links.add(link.absUrl("href"));
+            }
+            return true;
+        } catch (Exception e) {
+            System.out.println("ERROR -- HTTP request failed: " + e);
+            return false;
+        }
+    }
+
+    /**
+     * counts how many times a word occurs in the crawled page
+     * @param word the word to look for
+     * @return the number of occurrences in the web page, or -1 if crawl() has not been called yet
+     */
+    public int searchForWord(String word) {
+        if (this.htmlDocument == null) {
+            System.out.println("ERROR -- call crawl() before searching");
+            return -1;
+        }
+        System.out.printf("Searching for %s...%n", word);
+        String bodyText = this.htmlDocument.body().text();
+        return count(bodyText.toLowerCase(), word.toLowerCase());
+    }
+
+    /**
+     * counts how many times a word occurs in a string
+     * @param text the string to search in for the word
+     * @param word the word to search for
+     * @return the number of times the given word was found in the string
+     */
+    private int count(String text, String word) {
+        int amount = 0;
+        // split on whitespace; contains() counts substring matches, so the word is
+        // still found when punctuation is attached (and inside longer words)
+        for (String token : text.split("\\s+")) {
+            if (token.contains(word)) amount++;
+        }
+        return amount;
+    }
+
+    /**
+     * gets the links collected during the last crawl
+     * @return the list of absolute URLs found on the page
+     */
+    public List<String> getLinks() {
+        return this.links;
+    }
+}
diff --git a/src/main/java/webcrawler/Main.java b/src/main/java/webcrawler/Main.java
new file mode 100644
index 0000000..3940852
--- /dev/null
+++ b/src/main/java/webcrawler/Main.java
@@ -0,0 +1,23 @@
+package main.java.webcrawler;
+
+import java.util.Scanner;
+
+public class Main {
+    public static void main(String[] args) {
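+        // read the crawl parameters from standard input, then start the crawl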
+        Scanner scanner = new Scanner(System.in);
+        System.out.print("Enter a starting URL : ");
+        String startUrl = scanner.nextLine().trim();
+        System.out.print("Enter a word to search for : ");
+        String word = scanner.nextLine().trim();
+        System.out.print("Enter the maximum number of pages the crawler should visit : ");
+        int amount = Integer.parseInt(scanner.nextLine().trim());
+        System.out.print("Should the crawler save the links with hits? (Y/N) : ");
+        String choice = scanner.nextLine().toLowerCase().trim();
+        // anything other than "y" is treated as "no"
+        boolean save = choice.equals("y");
+        WebCrawler crawler = new WebCrawler(amount, save);
+        crawler.search(startUrl, word);
+    }
+}
diff --git a/src/main/java/webcrawler/WebCrawler.java b/src/main/java/webcrawler/WebCrawler.java
new file mode 100644
index 0000000..4eb9bb1
--- /dev/null
+++ b/src/main/java/webcrawler/WebCrawler.java
@@ -0,0 +1,194 @@
+package main.java.webcrawler;
+
+import java.util.*;
+
+public class WebCrawler {
+    private int amountOfPages;
+    private Set<String> pagesVisited;
+    private List<String> pagesPending;
+    private List<String> resultPages;
+    private Map<String, Integer> urlHits;
+    private int amountFound = 0;
+    private int successPages = 0;
+    private boolean shouldSaveHitLinks;
+
+    /**
+     * creates a new WebCrawler with default settings (50 pages, hit links saved)
+     */
+    public WebCrawler() {
+        this(50, true);
+    }
+
+    /**
+     * creates a new WebCrawler that visits at most the given number of pages
+     * @param maxPages the maximum number of pages the crawler should visit
+     */
+    public WebCrawler(int maxPages) {
+        this(maxPages, true);
+    }
+
+    /**
+     * creates a new WebCrawler with the given page limit, optionally saving the URLs that contain hits
+     * @param maxPages the maximum number of pages the crawler should visit
+     * @param shouldSaveHitLinks whether the crawler should save the links that have one or more hits
+     */
+    public WebCrawler(int maxPages, boolean shouldSaveHitLinks) {
+        this.amountOfPages = maxPages;
+        this.shouldSaveHitLinks = shouldSaveHitLinks;
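+        // HashSet gives constant-time visited checks; the LinkedList acts as a FIFO queue of pages still to visit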
+        this.pagesVisited = new HashSet<>();
+        this.pagesPending = new LinkedList<>();
+        this.resultPages = new ArrayList<>();
+        this.urlHits = new HashMap<>();
+    }
+
+
+    /**
+     * gets the next unvisited URL from the pending list and marks it as visited
+     * @return the next URL to crawl, or null if there are no unvisited URLs left
+     */
+    private String nextUrl() {
+        String next;
+        do {
+            if (this.pagesPending.isEmpty()) return null;
+            next = this.pagesPending.remove(0);
+        } while (this.pagesVisited.contains(next));
+        this.pagesVisited.add(next);
+        return next;
+    }
+
+    /**
+     * searches for a word by crawling the web through hyperlinks
+     * @param url the url to start searching from
+     * @param searchWord the word to search for
+     */
+    public void search(String url, String searchWord) {
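+        // breadth-first crawl: visit the seed URL first, then keep pulling unvisited links
+        // off the pending list until the page limit is reached or no links remain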
+        while (this.pagesVisited.size() < amountOfPages) {
+            String curUrl;
+            CrawlBranch branch = new CrawlBranch();
+            if (this.pagesVisited.isEmpty()) {
+                curUrl = url;
+                this.pagesVisited.add(url);
+            } else {
+                curUrl = this.nextUrl();
+                if (curUrl == null) break;
+            }
+            // skip pages that could not be fetched
+            if (!branch.crawl(curUrl)) continue;
+
+            int amount = branch.searchForWord(searchWord);
+            if (amount > 0) {
+                System.out.printf("SUCCESS -- word %s found at %s %d times%n", searchWord, curUrl, amount);
+                successPages++;
+                amountFound += amount;
+                if (shouldSaveHitLinks) {
+                    resultPages.add(curUrl);
+                }
+                urlHits.put(curUrl, amount);
+            }
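+            // queue every link found on this page so later iterations can visit it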
+            this.pagesPending.addAll(branch.getLinks());
+        }
+        System.out.println("========================");
+        System.out.printf("DONE -- Visited %d web pages%nTotal hits: %d%nPages with hits: %d%n", this.pagesVisited.size(), amountFound, successPages);
+        if (shouldSaveHitLinks) {
+            System.out.printf("Successful pages:%n%s", showCombinations(urlHits));
+        }
+    }
+
+    private String display(List<String> list) {
+        StringBuilder res = new StringBuilder();
+        for (String entry : list) {
+            res.append(entry).append("\n");
+        }
+        return res.toString();
+    }
+
+    private String showCombinations(Map<String, Integer> urls) {
+        StringBuilder res = new StringBuilder();
+        for (Map.Entry<String, Integer> entry : urls.entrySet()) {
+            res.append(entry.getKey()).append(" (").append(entry.getValue()).append(" hits)\n");
+        }
+        return res.toString();
+    }
+
+    /**
+     * sets the maximum number of pages the crawler should visit
+     * @param amount the maximum number of pages
+     */
+    public void setAmountOfPages(int amount) {
+        this.amountOfPages = amount;
+    }
+
+    public void setShouldSaveHitLinks(boolean shouldSaveHitLinks) {
+        this.shouldSaveHitLinks = shouldSaveHitLinks;
+    }
+
+    public List<String> getResultPages() {
+        return this.resultPages;
+    }
+
+    public Map<String, Integer> getUrlHits() {
+        return urlHits;
+    }
+
+    public int getAmountFound() {
+        return amountFound;
+    }
+
+    /**
+     * clears the crawler
+     */
+    public void clear() {
+        this.urlHits.clear();
+        this.resultPages.clear();
+        this.pagesPending.clear();
+        this.pagesVisited.clear();
+        this.successPages = 0;
+        this.amountFound = 0;
+    }
+
+}
diff --git a/src/main/resources/META-INF/MANIFEST.MF b/src/main/resources/META-INF/MANIFEST.MF
new file mode 100644
index 0000000..239918c
--- /dev/null
+++ b/src/main/resources/META-INF/MANIFEST.MF
@@ -0,0 +1,3 @@
+Manifest-Version: 1.0
+Main-Class: main.java.webcrawler.Main
+