27 Commits

Author SHA1 Message Date
Sem van der Hoeven db1eaf4cc3 shit 2020-04-19 16:25:10 +02:00
Sem van der Hoeven 4792062925 added log window always scroll down 2020-03-10 21:56:25 +01:00
Sem van der Hoeven 3b8711a14c added button skin template 2020-03-05 09:09:04 +01:00
Sem van der Hoeven 009eaccccd added manifest 2020-03-05 08:59:52 +01:00
Sem van der Hoeven 580197507e removed gitkeep 2020-03-05 08:47:22 +01:00
Sem van der Hoeven bcd2a188ad added libraries 2020-03-05 08:46:51 +01:00
Sem van der Hoeven b67e59255d added lib folder to gitignore 2020-03-05 08:45:16 +01:00
Sem van der Hoeven 2db8175ce2 added gitkeep 2020-03-05 08:42:53 +01:00
Sem van der Hoeven 46150a4ffd updated gitignore 2020-03-05 08:39:36 +01:00
Sem van der Hoeven 0d3428fb6b added todos 2020-03-04 22:40:23 +01:00
Sem van der Hoeven 78a4d4dd9f made debug window functional 2020-03-04 22:38:48 +01:00
Sem van der Hoeven 6fc378b342 added writing of debug 2020-03-04 22:10:43 +01:00
Sem van der Hoeven faa2ff2b67 added log listview 2020-03-04 21:50:44 +01:00
Sem van der Hoeven ead13842c8 made textfield numeric 2020-03-04 21:40:34 +01:00
Sem van der Hoeven 3578f38bd2 added javafx parts and style 2020-03-04 21:38:02 +01:00
Sem van der Hoeven eff782aa86 added new thread 2020-03-04 20:51:40 +01:00
Sem van der Hoeven a296868817 changed packages 2020-03-04 20:38:33 +01:00
Sem van der Hoeven 9f0258ccca added canvas 2020-03-04 20:30:09 +01:00
Sem van der Hoeven 73672d8ae8 added java2d class 2020-03-04 20:25:37 +01:00
Sem van der Hoeven 5f79fb06f7 fixed bug 2020-03-03 18:55:29 +01:00
Sem van der Hoeven 996e6abc4b Merge remote-tracking branch 'origin/master' 2020-03-03 17:40:20 +01:00
Sem van der Hoeven 9946dc9d12 fixed counter 2020-03-03 17:39:42 +01:00
Sem van der Hoeven a922615e96 added debug mode 2020-03-03 16:50:38 +01:00
Sem van der Hoeven eb12a813b0 added debug boolean and get choice method 2020-03-03 16:38:36 +01:00
Sem van der Hoeven 38b0524b0d removed error message spam 2020-03-03 16:33:12 +01:00
SemvdH e2ce5cac3b Update README.md 2020-01-20 17:33:38 +01:00
SemvdH cb163f8ac7 Create README.md 2020-01-20 17:31:57 +01:00
13 changed files with 462 additions and 85 deletions

.gitignore (vendored), 3 additions

@@ -117,3 +117,6 @@ fabric.properties
.idea/artifacts/WebCrawler_jar.xml
.idea/modules.xml
.idea/misc.xml
.idea/**
# Keep lib folder
!lib/

README.md (new file), 6 additions

@@ -0,0 +1,6 @@
# WebCrawler
A small web crawler that searches for a word you give it.
## Usage
To run the jar from the command line:
`java -jar <JAR LOCATION>/WebCrawler.jar`
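
For context beyond the packaged jar, here is a minimal, hypothetical sketch of driving the crawler programmatically. It uses only the `WebCrawler(int maxPages, boolean shouldSaveHitLinks)` constructor and `search(String url, String searchWord)` method that appear later in this diff; the start URL, search word, page limit, and the `getResultPages()` call (referenced only in commented-out code in `Main`) are illustrative assumptions, not part of the README.

```java
// Hypothetical usage sketch; the values and the getResultPages() call are assumptions.
import main.java.webcrawler.crawler.WebCrawler;

public class CrawlExample {
    public static void main(String[] args) {
        // visit at most 50 pages and remember the URLs that contain hits
        WebCrawler crawler = new WebCrawler(50, true);
        // placeholder start URL and search word
        crawler.search("http://example.com", "java");
        // getResultPages() is only referenced in commented-out Main code, assumed to exist
        System.out.println(crawler.getResultPages());
    }
}
```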

IntelliJ module file (.iml), 1 addition

@@ -8,5 +8,6 @@
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="jsoup-1.12.1" level="project" />
<orderEntry type="library" name="fxgraphics2d-1.10" level="project" />
</component>
</module>

lib/fxgraphics2d-1.10.jar (new binary file, not shown)

lib/jsoup-1.13.1.jar (new binary file, not shown)

Main.java

@@ -1,23 +1,35 @@
package main.java.webcrawler;
import javafx.application.Application;
import main.java.webcrawler.crawler.CrawlThread;
import main.java.webcrawler.crawler.WebCrawler;
import main.java.webcrawler.visualiser.Visualiser;
import java.util.Scanner;
public class Main {
public static void main(String[] args) {
Scanner scanner = new Scanner(System.in);
System.out.print("Enter a starting URL : ");
String startUrl = scanner.nextLine().trim();
System.out.print("Enter a word to search for : ");
String word = scanner.nextLine().trim();
System.out.print("Enter the maximum amount of pages the crawler should visit : ");
int amount = Integer.parseInt(scanner.nextLine().trim());
System.out.print("Should the crawler save the links with hits? (Y/N) : ");
String choice = scanner.nextLine().toLowerCase().trim();
boolean save;
if (choice.equals("y")) save = true;
else if (choice.equals("n")) save = false;
else save = false;
WebCrawler crawler = new WebCrawler(amount,save);
crawler.search(startUrl,word);
public static void main(String[] args) throws InterruptedException {
//TODO add status text
// Scanner scanner = new Scanner(System.in);
// System.out.print("Enter a starting URL : ");
// String startUrl = scanner.nextLine().trim();
// System.out.print("Enter a word to search for : ");
// String word = scanner.nextLine().trim();
// System.out.print("Enter the maximum amount of pages the crawler should visit : ");
// int amount = Integer.parseInt(scanner.nextLine().trim());
// System.out.print("Should the crawler save the links with hits? (Y/N) : ");
// boolean save = getChoice(scanner.nextLine());
// System.out.print("Do you want to enable debug mode? (Y/N) : ");
// boolean debug = getChoice(scanner.nextLine());
// if (debug) System.out.println("[INFO] - Debug mode enabled");
Application.launch(Visualiser.class);
// CrawlThread t = new CrawlThread(amount,save,debug,startUrl,word);
// t.start();
// System.out.println(t.getCrawler().getResultPages());
}
private static boolean getChoice(String choice) {
if (choice.trim().toLowerCase().equals("y")) return true;
else return false;
}
}
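
The new `main` now only launches the JavaFX `Visualiser`; the old console flow survives as comments, and its commented `CrawlThread(amount, save, debug, startUrl, word)` call no longer matches the `CrawlThread(int amount, boolean debug, String startUrl, String word, Visualiser visualiser)` constructor defined later in this diff. A hedged sketch of a headless run against the constructor as actually defined follows; passing `null` for the `Visualiser` is an assumption, based on the logger never being dereferenced in the non-debug code paths shown here.

```java
// Hedged sketch of a console-only run using the CrawlThread constructor defined
// later in this diff. The URL, word, and page limit are placeholders, and passing
// null for the Visualiser is an assumption.
import main.java.webcrawler.crawler.CrawlThread;

public class HeadlessRun {
    public static void main(String[] args) throws InterruptedException {
        CrawlThread t = new CrawlThread(25, false, "http://example.com", "java", null);
        t.start();
        t.join(); // wait for the crawl to finish
        t.retrieveLog().forEach(System.out::println); // dump the collected log messages
    }
}
```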

CrawlBranch.java

@@ -1,5 +1,6 @@
package main.java.webcrawler;
package main.java.webcrawler.crawler;
import main.java.webcrawler.visualiser.Visualiser;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@@ -11,12 +12,27 @@ import java.util.LinkedList;
import java.util.List;
public class CrawlBranch {
private final Visualiser logger;
private List<String> links = new LinkedList<>();
private Document htmlDocument;
private boolean debug;
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
private WebCrawler parent;
public CrawlBranch() {
this(false,null,null);
}
public CrawlBranch(boolean debug, WebCrawler parent,Visualiser logger) {
this.debug = debug;
this.logger = logger;
this.parent = parent;
}
/**
* crawls the links in its current list of links
*
* @param url the url to start from
* @return <code>true</code> if the search was successful, <code>false</code> otherwise
*/
@@ -26,42 +42,47 @@ public class CrawlBranch {
this.htmlDocument = connection.get();
if (connection.response().statusCode() == 200) {
System.out.println("VISITING -- Recieved web page at " + url);
// print("VISITING -- Recieved web page at " + url);
sendMessage("VISITING -- Recieved web page at " + url);
} else {
System.out.println("FAIL -- recieved something else than a web page");
// print("FAIL -- recieved something else than a web page");
sendMessage("FAIL -- recieved something else than a web page");
return false;
}
Elements linksOnPage = htmlDocument.select("a[href]");
System.out.println("FOUND (" + linksOnPage.size() + ") links");
// print("FOUND (" + linksOnPage.size() + ") links");
sendMessage("FOUND (" + linksOnPage.size() + ") links");
for (Element link : linksOnPage) {
this.links.add(link.absUrl("href"));
}
return true;
} catch (Exception e) {
System.out.println("ERROR -- error in out http request : " + e);
//System.out.println("ERROR -- error in out http request : " + e);
return false;
}
}
/**
* searches how many times a word occurs in a page
*
* @param word the word to look for
* @return the amount of occurrences in the web page, -1 if the word is not found
*/
public int searchForWord(String word) {
if (this.htmlDocument == null){
System.out.println("ERROR -- call crawl before searhing");
if (this.htmlDocument == null) {
//System.out.println("ERROR -- call crawl before searhing");
return -1;
}
System.out.printf("Searching for %s...", word);
System.out.println();
// print(String.format("Searching for %s...", word));
sendMessage(String.format("Searching for %s...", word));
String bodyText = this.htmlDocument.body().text();
return count(bodyText.toLowerCase(),word.toLowerCase());
return count(bodyText.toLowerCase(), word.toLowerCase());
}
/**
* counts how many times a word occurs in a string
*
* @param text the string to search in for the word
* @param word the word to search for
* @return the amount of times the given word was found in the string
@@ -77,9 +98,18 @@ public class CrawlBranch {
/**
* gets the links
*
* @return the links
*/
public List<String> getLinks() {
return this.links;
}
private void print(String text) {
if (debug) logger.log(text);
}
private void sendMessage(String message) {
this.parent.addMessage(message);
}
}
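
The `count` helper is documented above, but its body falls outside the `@@ -77,9 +98,18 @@` hunk. A hypothetical sketch consistent with that javadoc, counting non-overlapping substring matches (an assumption about what "occurrences" means here), could look like the following; it is not the project's actual implementation.

```java
// Hypothetical sketch of the elided count(text, word) helper; the real body is
// not shown in this diff. Counts non-overlapping substring matches.
public class CountSketch {
    static int count(String text, String word) {
        int hits = 0;
        int index = text.indexOf(word);
        while (index != -1) {
            hits++;
            index = text.indexOf(word, index + word.length());
        }
        return hits;
    }

    public static void main(String[] args) {
        System.out.println(count("the cat sat on the mat", "the")); // prints 2
    }
}
```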

CrawlThread.java (new file)

@@ -0,0 +1,57 @@
package main.java.webcrawler.crawler;
import main.java.webcrawler.visualiser.Visualiser;
import java.util.LinkedList;
public class CrawlThread extends Thread {
private final int amount;
private boolean debug;
private final String startUrl;
private final String word;
private WebCrawler crawler;
private Visualiser visualiser;
public CrawlThread(int amount, boolean debug, String startUrl, String word, Visualiser visualiser) {
this.amount = amount;
this.debug = debug;
this.startUrl = startUrl;
this.word = word;
this.visualiser = visualiser;
this.crawler = new WebCrawler(amount, true, debug, visualiser);
}
public void run() {
// this.debug = false;
System.out.println("starting thread");
this.crawler.search(startUrl, word);
}
public WebCrawler getCrawler() {
return crawler;
}
public int getAmount() {
return amount;
}
public boolean isDebug() {
return debug;
}
public String getStartUrl() {
return startUrl;
}
public String getWord() {
return word;
}
public LinkedList<String> retrieveLog() {
return this.crawler.getMessages();
}
}

WebCrawler.java

@@ -1,4 +1,7 @@
package main.java.webcrawler;
package main.java.webcrawler.crawler;
import main.java.webcrawler.crawler.CrawlBranch;
import main.java.webcrawler.visualiser.Visualiser;
import java.util.*;
@@ -7,10 +10,15 @@ public class WebCrawler {
private Set<String> pagesVisited;
private List<String> pagesPending;
private ArrayList<String> resultPages;
private Map<String,Integer> urlHits;
private Map<String, Integer> urlHits;
private Visualiser logger;
private int amountFound = 0;
private int successPages = 0;
private boolean shouldSaveHitLinks;
private boolean debug;
private boolean done = false;
public LinkedList<String> messages;
/**
* creates a new WebCrawler object with standard values
@@ -21,6 +29,7 @@ public class WebCrawler {
/**
* creates a new WebCrawler object with the given amount of max pages
*
* @param maxPages the max amount of pages the crawler should visit
*/
public WebCrawler(int maxPages) {
@@ -29,21 +38,30 @@ public class WebCrawler {
/**
* creates a new WebCrawler object with the given amount of max pages, and if it should save the hit URLs
* @param maxPages the max amount of pages the crawler should visit
*
* @param maxPages the max amount of pages the crawler should visit
* @param shouldSaveHitLinks if the crawler should save the links that have one or more hits
*/
public WebCrawler(int maxPages, boolean shouldSaveHitLinks) {
this(maxPages, shouldSaveHitLinks, false, null);
}
public WebCrawler(int maxPages, boolean shouldSaveHitLinks, boolean debug, Visualiser logger) {
this.amountOfPages = maxPages;
this.shouldSaveHitLinks = shouldSaveHitLinks;
this.pagesVisited = new HashSet<>();
this.pagesPending = new LinkedList<>();
this.resultPages = new ArrayList<>();
this.urlHits = new HashMap<>();
this.debug = debug;
this.logger = logger;
this.messages = new LinkedList<>();
}
/**
* gets the next url in the list
*
* @return the next url in the list
*/
private String nextUrl() {
@@ -57,38 +75,50 @@ public class WebCrawler {
/**
* searches for a word by crawling the web through hyperlinks
* @param url the url to start searching from
*
* @param url the url to start searching from
* @param searchWord the word to search for
*/
public void search(String url, String searchWord) {
// System.out.println("searching for " + searchWord + " in " + url);
int counter = 0;
while (this.pagesVisited.size() < amountOfPages) {
String curUrl;
CrawlBranch branch = new CrawlBranch();
CrawlBranch branch = new CrawlBranch(debug, this, logger);
if (this.pagesPending.isEmpty()) {
curUrl = url;
this.pagesVisited.add(url);
} else {
curUrl = this.nextUrl();
counter++;
// print(String.format("visiting page %s / %s",counter,amountOfPages));
System.out.println(String.format("visiting page %s / %s", counter, amountOfPages));
addMessage(String.format("visiting page %s / %s", counter, amountOfPages));
}
branch.crawl(curUrl);
int amount = branch.searchForWord(searchWord);
if (amount > 0) {
System.out.printf("SUCCESS -- word %s found at %s %s times", searchWord, curUrl, amount);
System.out.println();
// print(String.format("SUCCESS -- word %s found at %s %s times\n", searchWord, curUrl, amount));
addMessage(String.format("SUCCESS -- word %s found at %s %s times\n", searchWord, curUrl, amount));
successPages++;
amountFound += amount;
if (shouldSaveHitLinks)
resultPages.add(curUrl);
urlHits.put(curUrl,amount);
resultPages.add(curUrl);
urlHits.put(curUrl, amount);
}
this.pagesPending.addAll(branch.getLinks());
}
System.out.println("========================");
System.out.printf("DONE -- Visited %s webpages\nHits: %s\nAmount of pages with hits: %s\n", this.pagesVisited.size(), amountFound, successPages);
System.out.println("done searching");
// print("========================");
addMessage("========================");
// print(String.format("DONE -- Visited %s webpages\nHits: %s\nAmount of pages with hits: %s\n", this.pagesVisited.size(), amountFound, successPages));
addMessage(String.format("DONE -- Visited %s webpages\nHits: %s\nAmount of pages with hits: %s\n", this.pagesVisited.size(), amountFound, successPages));
if (shouldSaveHitLinks) {
System.out.printf("Successful pages: \n%s", showCombinations(urlHits));
// print(String.format("Successful pages: \n%s", showCombinations(urlHits)));
addMessage(String.format("Successful pages: \n%s", showCombinations(urlHits)));
}
done = true;
}
private String display(List<String> list) {
@@ -110,6 +140,7 @@ public class WebCrawler {
/**
* sets the amount of max pages
*
* @param amount the amount of pages
*/
public void setAmountOfPages(int amount) {
@@ -132,6 +163,14 @@ public class WebCrawler {
return amountFound;
}
public boolean usesDebug() {
return debug;
}
public void setDebug(boolean debug) {
this.debug = debug;
}
/**
* clears the crawler
*/
@@ -142,53 +181,29 @@ public class WebCrawler {
this.pagesVisited.clear();
this.successPages = 0;
this.amountFound = 0;
this.done = false;
}
// public static void main(String[] args) {
// Scanner input = new Scanner(System.in);
// System.out.println("Enter a URL : ");
// String urlInput = input.nextLine().trim();
// crawler(urlInput);
//
// }
//
// public static void crawler(String startUrl) {
// ArrayList<String> pending = new ArrayList<>();
// ArrayList<String> traversed = new ArrayList<>();
//
// pending.add(startUrl);
// while (!pending.isEmpty() && traversed.size() <= 100) {
// String tempUrl = pending.remove(0);
// if (!traversed.contains(tempUrl)) {
// traversed.add(tempUrl);
// System.out.println("crawling: " + tempUrl);
//
// for (String s : getSubURLs(tempUrl)) {
// if (!traversed.contains(s)) pending.add(s);
// }
// }
// }
// }
//
// public static ArrayList<String> getSubURLs(String urlString) {
// ArrayList<String> subUrls = new ArrayList<>();
//
// try {
// URL url = new URL(urlString);
// Scanner urlScanner = new Scanner(url.openStream());
// int cur = 0;
// while (urlScanner.hasNext()) {
// String input = urlScanner.nextLine();
// cur = input.indexOf("http:", cur);
// while (cur > 0) {
// int endIndex = input.indexOf("\"", cur);
// cur = endIndex > 0 ? input.indexOf("http:", endIndex) : -1;
// }
// }
// } catch (IOException e) {
// e.printStackTrace();
// }
//
// return subUrls;
// }
private void print(String text) {
if (debug) System.out.println(text);
}
public void addMessage(String message) {
if (!this.messages.contains(message)) {
this.messages.add(message);
}
// System.out.println("ADDED MESSAGE " + message);
}
public LinkedList<String> getMessages() {
return this.messages;
}
public void clearMessages() {
this.messages.clear();
}
public boolean isDone() {
return done;
}
}
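
The `addMessage`/`getMessages`/`clearMessages`/`isDone` methods above form the queue that the (currently commented-out) polling code in `Visualiser.update`, later in this diff, is meant to consume. A minimal sketch of that consumer side, using only these methods, is below; the poll interval is an assumption, and `LinkedList` is not synchronized, so a real implementation would need to guard concurrent access.

```java
// Minimal sketch of draining the WebCrawler message queue from another thread,
// using only the methods defined above. The 250 ms interval is an assumption,
// and LinkedList is not thread-safe, so concurrent access would need locking.
import java.util.ArrayList;
import java.util.List;

import main.java.webcrawler.crawler.WebCrawler;

class LogPoller {
    static void drainUntilDone(WebCrawler crawler) throws InterruptedException {
        while (!crawler.isDone()) {
            List<String> batch = new ArrayList<>(crawler.getMessages()); // copy before clearing
            crawler.clearMessages();
            batch.forEach(System.out::println);
            Thread.sleep(250);
        }
    }
}
```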

RunSkin.java (new file)

@@ -0,0 +1,14 @@
package main.java.webcrawler.visualiser;
import com.sun.javafx.scene.control.skin.ButtonSkin;
import javafx.animation.ScaleTransition;
import javafx.scene.control.Button;
import javafx.util.Duration;
public class RunSkin extends ButtonSkin {
public RunSkin(Button button) {
super(button);
ScaleTransition transition = new ScaleTransition(Duration.millis(200));
}
}

Visualiser.java (new file)

@@ -0,0 +1,205 @@
package main.java.webcrawler.visualiser;
import javafx.animation.AnimationTimer;
import javafx.application.Application;
import javafx.beans.value.ChangeListener;
import javafx.beans.value.ObservableValue;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
import javafx.geometry.HPos;
import javafx.geometry.Insets;
import javafx.geometry.Pos;
import javafx.scene.Scene;
import javafx.scene.control.Button;
import javafx.scene.control.Label;
import javafx.scene.control.ListView;
import javafx.scene.control.ScrollPane;
import javafx.scene.control.TextField;
import javafx.scene.layout.BorderPane;
import javafx.scene.layout.HBox;
import javafx.scene.layout.VBox;
import javafx.stage.Stage;
import main.java.webcrawler.crawler.CrawlThread;
import main.java.webcrawler.crawler.WebCrawler;
import org.jfree.fx.FXGraphics2D;
import org.jfree.fx.ResizableCanvas;
import java.awt.*;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
public class Visualiser extends Application {
private double frameTime = 0;
private BorderPane pane;
private ResizableCanvas canvas;
private ListView<String> log;
private CrawlThread thread;
private WebCrawler crawler;
private int lastLogSize = 0;
//TODO make listview always scroll to bottom
//TODO implement visualisation
@Override
public void start(Stage primaryStage) throws Exception {
new Visualiser();
pane = new BorderPane();
canvas = new ResizableCanvas(this::draw, pane);
canvas.setWidth(1600);
canvas.setHeight(800);
pane.setCenter(canvas);
initGUIElements();
log("debug");
FXGraphics2D g2d = new FXGraphics2D(canvas.getGraphicsContext2D());
draw(g2d);
primaryStage.setScene(new Scene(pane));
primaryStage.setTitle("Webcrawler results");
primaryStage.show();
new AnimationTimer() {
long last = -1;
@Override
public void handle(long now) {
if (last == -1)
last = now;
update((now - last) / 1000000000.0);
last = now;
draw(g2d);
}
}.start();
}
private void initGUIElements() {
HBox top = new HBox(100);
top.getStyleClass().add("content");
top.setPadding(new Insets(10, 10, 10, 10));
top.setPrefWidth(canvas.getWidth());
top.setPrefHeight(200);
pane.setTop(top);
pane.getStylesheets().add(getClass().getResource("../../../resources/stylesheets/visualiser.css").toExternalForm());
// start url, word to search, amount of pages, debug (?)
TextField urlField = new TextField();
urlField.setPromptText("Enter the starting url");
TextField wordField = new TextField();
wordField.setPromptText("Enter the word to search for");
TextField amountField = new TextField();
makeNumeric(amountField);
amountField.setPromptText("Maximum amount of pages the crawler should visit...");
VBox content = new VBox(5);
content.setAlignment(Pos.CENTER_LEFT);
content.setMinWidth(400);
content.setPadding(new Insets(0, 0, 0, 100));
content.getChildren().addAll(
new Label("Starting url:"),
urlField,
new Label("Word to search for:"),
wordField,
new Label("Maximum amount of pages:"),
amountField);
top.getChildren().add(content);
Button button = new Button("Run");
button.setOnAction(e -> {
// log.getItems().clear();
thread = new CrawlThread(Integer.parseInt(amountField.getText()), true, parseUrl(urlField.getText()), wordField.getText(), this);
thread.start();
this.crawler = thread.getCrawler();
System.out.println(crawler);
ObservableList<String> crawlerMessages = FXCollections.observableList(crawler.messages);
this.log.setItems(crawlerMessages);
});
top.getChildren().add(button);
log = new ListView<>();
log.setMinWidth(1100);
top.setAlignment(Pos.CENTER_LEFT);
top.getChildren().add(log);
}
private String parseUrl(String text) {
if (!text.startsWith("http://")) {
text = "http://" + text;
}
if (text.startsWith("https")) {
text = text.replace("https", "http");
}
System.out.println("parsed to " + text);
return text;
}
private void makeNumeric(TextField textField) {
// force the field to be numeric only
textField.textProperty().addListener(new ChangeListener<String>() {
@Override
public void changed(ObservableValue<? extends String> observable, String oldValue,
String newValue) {
if (!newValue.matches("\\d*")) {
textField.setText(newValue.replaceAll("[^\\d]", ""));
}
}
});
}
public void draw(FXGraphics2D graphics) {
graphics.setBackground(new Color(43, 43, 46));
graphics.clearRect(0, 0, (int) canvas.getWidth(), (int) canvas.getHeight());
// graphics.setColor(Color.red);
// graphics.draw(new Rectangle2D.Double(10, 10, 500, 500));
}
private void updateFrame() {
}
public void update(double deltaTime) {
this.frameTime += deltaTime;
if (this.frameTime > 1d / 60d) {
updateFrame();
this.frameTime = 0d;
}
if (this.log.getItems().isEmpty()) {
this.log.getItems().add("test");
}
this.log.refresh();
// if (thread != null && thread.isAlive()) {
// if (crawler == null) {
// crawler = thread.getCrawler();
// }
// if (crawler != null) {
// if (!this.crawler.isDone()) {
//
// List<String> msgs = new ArrayList<>(crawler.getMessages());
// System.out.println(msgs);
//// if (!msgs.isEmpty()) {
//// System.out.println("adding messages:\n" + msgs);
// log.getItems().addAll(msgs);
// thread.getCrawler().clearMessages();
if (!log.getItems().isEmpty())
log.scrollTo(log.getItems().size() - 1);
//// lastLogSize = log.getItems().size();
//
//// }
// }
// }
//
// }
}
public void log(String item) {
try {
this.log.getItems().add(item);
} catch (Exception e) {
System.out.println("exception caught");
}
}
}

visualiser.css (new file)

@@ -0,0 +1,34 @@
.content {
-fx-background-color: #2b2b2e;
-fx-font-family: Consolas;
-fx-border-style: solid;
-fx-border-width: 0.5px;
-fx-border-color: #9cb8ae;
-fx-font-size: 16px;
}
.button {
-fx-background-color: #4c6a6b;
}
.label {
-fx-text-fill: #13e89a;
}
.text-field {
-fx-background-color: #7a7a7a;
}
.text-field:focused {
-fx-text-fill: #13e89a;
}
.list-view {
-fx-background-color: black;
}
.list-cell {
-fx-background-color: black;
-fx-text-fill: #00d60e;
-fx-font-size: 14px;
}