
add firefox support

Branch: master · commit 5745fab675
Jonathan Cobb · 3 years ago
12 changed files with 180 additions and 51 deletions
 1. README.md (+13 −5)
 2. bin/update_firefox_driver.sh (+20 −0)
 3. examples/crawl_config.json (+3 −2)
 4. src/main/java/bubble/crunch/CrawlJob.java (+1 −1)
 5. src/main/java/bubble/crunch/CrawlSite.java (+2 −2)
 6. src/main/java/bubble/crunch/CrawlStats.java (+14 −4)
 7. src/main/java/bubble/crunch/InvalidCrawlOptionsException.java (+7 −0)
 8. src/main/java/bubble/crunch/InvalidCrawlSiteException.java (+0 −7)
 9. src/main/java/bubble/crunch/WebDriverPool.java (+10 −4)
10. src/main/java/bubble/crunch/WebDriverType.java (+24 −3)
11. src/main/java/bubble/crunch/main/BubbleCrunchMain.java (+16 −2)
12. src/main/java/bubble/crunch/main/BubbleCrunchOptions.java (+70 −21)

README.md (+13 −5)

@@ -15,20 +15,28 @@ To build from source:

     mvn -Puberjar clean package

- # Install Chrome and Web Driver
- Before running, you need to install Google Chrome and the Chrome driver for Selenium.
+ # Install Browsers and Web Driver
+ Before running, you need to install one or both of:

- If you are running a desktop, you can probably just install Chrome normally, if you haven't already.
+ * Google Chrome and the Chrome web driver for Selenium.
+ * Firefox and the Firefox web driver for Selenium.

- If you're running Ubuntu 18.04 server with no desktop, install Chromium headless:
+ If you are running a desktop, you can probably just install Chrome or Firefox normally, if you haven't already.
+
+ ## Install Firefox
+
+     apt install firefox
+
+ ## Install Chrome

     apt install software-properties-common -y
     add-apt-repository ppa:canonical-chromium-builds/stage -y
     apt-get update -y
     apt install chromium-browser -y

- Install the Web Driver:
+ ## Install Web Drivers:

+     ./bin/update_firefox_driver.sh
     ./bin/update_chrome_driver.sh

 # Running


bin/update_firefox_driver.sh (+20 −0)

@@ -0,0 +1,20 @@
+ #!/bin/bash
+
+ function die {
+   echo 1>&2 "fatal error: ${1}"
+   exit 1
+ }
+
+ BASE_DIR="$(cd "$(dirname "${0}")/.." && pwd)"
+ DRIVER_DIR="${BASE_DIR}/drivers"
+
+ FF_DRIVER_URL=https://github.com/mozilla/geckodriver/releases/download/v0.26.0/geckodriver-v0.26.0-linux64.tar.gz
+ DRIVER_TEMP=$(mktemp /tmp/geckodriver_linux64.tar.gz.XXXXXXX)
+
+ curl -L ${FF_DRIVER_URL} > ${DRIVER_TEMP} || die "Error downloading ${FF_DRIVER_URL}"
+
+ if [[ ! -d "${DRIVER_DIR}" ]] ; then
+   mkdir "${DRIVER_DIR}" || die "Error creating driver dir: ${DRIVER_DIR}"
+ fi
+ cd "${DRIVER_DIR}" && tar xzf "${DRIVER_TEMP}" || die "Error untarring ${DRIVER_TEMP}"
+ rm -f "${DRIVER_TEMP}"
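Note: once this script has unpacked geckodriver into drivers/, a quick headless smoke test confirms the browser/driver pairing works. A minimal sketch, assuming Selenium 3.x on the classpath and Firefox at /usr/bin/firefox (both assumptions, matching defaults used elsewhere in this commit):

```java
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.firefox.FirefoxBinary;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;

import java.io.File;

public class DriverSmokeTest {
    public static void main(String[] args) {
        // point Selenium at the geckodriver that update_firefox_driver.sh unpacked
        System.setProperty("webdriver.gecko.driver", "drivers/geckodriver");

        // run Firefox headless, the same way the crawler does
        final FirefoxBinary binary = new FirefoxBinary(new File("/usr/bin/firefox"));
        binary.addCommandLineOptions("-headless");

        final WebDriver driver = new FirefoxDriver(new FirefoxOptions().setBinary(binary));
        try {
            driver.get("https://example.com/");
            System.out.println("loaded: " + driver.getTitle());
        } finally {
            driver.quit(); // always release the browser process
        }
    }
}
```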

examples/crawl_config.json (+3 −2)

@@ -3,14 +3,15 @@
 {
   // Run with -h to see info about these settings. All are optional.

-   "driverPath": "@CRUNCH_DRIVER_PATH", // use this env var name from chrome webdriver
+   "driverPath": "@CRUNCH_DRIVER_PATH", // use this env var name to find chrome webdriver
+   "driverType": "chrome", // web driver, firefox or chrome (default chrome)
    "screenshotsDir": "/tmp", // save screenshots here (set to /dev/null to skip screenshots, default /tmp)
    "numThreads": 5, // how many threads to run (default 5)
    "maxDepth": 10, // how many levels deep to crawl (default 10)
    "crawlTimeout": "10m", // how long for the entire crawl before stopping (default 10m)
    "pageTimeout": "1m", // how long to wait for page load before timing out (default 1m)
    "maxLinksPerPage": 50, // how many links to follow on each page (default 50)
-   "csvFile": "/tmp/crawls.csv", // write crawl stats here. (default is stdout)
+   "csvFile": "/tmp/crawls.csv", // write crawl stats here. (default stdout)

   // At least one entry must be present in the array below
   "crawl": [


src/main/java/bubble/crunch/CrawlJob.java (+1 −1)

@@ -85,7 +85,7 @@ public class CrawlJob implements Runnable {
         stack.push(url);

         if (stack.size() >= maxDepth) {
-             log.info("run(" + url + "): maxDepth (" + maxDepth + ") reached, not crawling");
+             log.trace("run(" + url + "): maxDepth (" + maxDepth + ") reached, not crawling");
             return;
         }
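The change above only lowers the log level: hitting maxDepth is routine during a deep crawl, so it moves from info to trace. For context, the check works because CrawlJob pushes each URL onto a stack as it descends, making stack depth equal crawl depth. A standalone sketch of that pattern (names illustrative, not the project's):

```java
import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Deque;

public class DepthLimitedVisitor {
    private final Deque<String> stack = new ArrayDeque<>();
    private final int maxDepth;

    public DepthLimitedVisitor(int maxDepth) { this.maxDepth = maxDepth; }

    public void visit(String url) {
        stack.push(url);
        try {
            // stack depth == crawl depth, so this caps how far we recurse
            if (stack.size() >= maxDepth) return;
            for (String link : linksOn(url)) visit(link);
        } finally {
            stack.pop(); // unwind so sibling pages see the correct depth
        }
    }

    private Iterable<String> linksOn(String url) { return Collections.emptyList(); } // stub
}
```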



src/main/java/bubble/crunch/CrawlSite.java (+2 −2)

@@ -20,10 +20,10 @@ public class CrawlSite {
         try {
             new URL(url);
         } catch (Exception e) {
-             throw new InvalidCrawlSiteException("Expected 'crawl' element to be URL string: "+url);
+             throw new InvalidCrawlOptionsException("Expected 'crawl' element to be URL string: "+url);
         }
         if (!isHttpOrHttps(url)) {
-             throw new InvalidCrawlSiteException("Expected 'crawl' element to begin with http:// or https:// -- "+url);
+             throw new InvalidCrawlOptionsException("Expected 'crawl' element to begin with http:// or https:// -- "+url);
         }
         this.url = url;
         return this;
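isHttpOrHttps comes from cobbzilla's HttpSchemes utility. For readers without that library, a self-contained sketch of the same two-step validation using only the JDK (hypothetical helper, not the project's code):

```java
import java.net.URL;

public class UrlValidation {
    /** Returns the url if it parses and uses http or https; otherwise throws. */
    public static String requireHttpUrl(String url) {
        try {
            new URL(url); // step 1: must parse as a URL at all
        } catch (Exception e) {
            throw new IllegalArgumentException("Expected a URL string: " + url);
        }
        final String lower = url.toLowerCase();
        if (!lower.startsWith("http://") && !lower.startsWith("https://")) {
            // step 2: restrict to the schemes the crawler can actually fetch
            throw new IllegalArgumentException("Expected http:// or https://: " + url);
        }
        return url;
    }
}
```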


src/main/java/bubble/crunch/CrawlStats.java (+14 −4)

@@ -4,10 +4,11 @@ import java.io.*;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;

- import static org.cobbzilla.util.daemon.ZillaRuntime.die;
- import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;
+ import static org.cobbzilla.util.daemon.ZillaRuntime.*;
 import static org.cobbzilla.util.http.HttpSchemes.SCHEME_HTTP;
 import static org.cobbzilla.util.http.HttpSchemes.SCHEME_HTTPS;
+ import static org.cobbzilla.util.io.FileUtil.abs;
+ import static org.cobbzilla.util.time.TimeUtil.DATE_FORMAT_YYYY_MM_DD_HH_mm_ss;

 public class CrawlStats {

@@ -15,8 +16,17 @@ public class CrawlStats {

     private final Map<String, CrawlData> data = new ConcurrentHashMap<>();

-     public CrawlStats(File csvFile) throws IOException {
-         this.writer = csvFile == null ? new OutputStreamWriter(System.out) : new FileWriter(csvFile);
+     public CrawlStats(File csvFile, boolean replay) throws IOException {
+         if (replay) {
+             this.writer = csvFile == null ? new OutputStreamWriter(System.out) : new FileWriter(replayFile(csvFile));
+         } else {
+             this.writer = csvFile == null ? new OutputStreamWriter(System.out) : new FileWriter(csvFile);
+         }
+     }
+
+     private File replayFile(File csvFile) {
+         final String path = abs(csvFile);
+         return new File(path.substring(0, path.length() - ".csv".length()) + "_replay_" + DATE_FORMAT_YYYY_MM_DD_HH_mm_ss.print(now()) + ".csv");
     }

     public boolean alreadyVisited(String url) {
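replayFile gives each replay run its own timestamped CSV so the original output is never clobbered: crawls.csv becomes something like crawls_replay_&lt;timestamp&gt;.csv. A standalone sketch of the derivation with java.time (the project uses cobbzilla's Joda-based DATE_FORMAT_YYYY_MM_DD_HH_mm_ss; the exact separator characters below are an assumption):

```java
import java.io.File;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;

public class ReplayFileNames {
    private static final DateTimeFormatter STAMP =
            DateTimeFormatter.ofPattern("yyyy-MM-dd-HH-mm-ss");

    /** e.g. /tmp/crawls.csv -> /tmp/crawls_replay_2020-04-01-09-30-00.csv */
    public static File replayFile(File csvFile) {
        final String path = csvFile.getAbsolutePath();
        // assumes the name ends in .csv, as the project's code does
        final String base = path.substring(0, path.length() - ".csv".length());
        return new File(base + "_replay_" + LocalDateTime.now().format(STAMP) + ".csv");
    }
}
```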


src/main/java/bubble/crunch/InvalidCrawlOptionsException.java (+7 −0)

@@ -0,0 +1,7 @@
+ package bubble.crunch;
+
+ public class InvalidCrawlOptionsException extends RuntimeException {
+
+     public InvalidCrawlOptionsException(String message) { super(message); }
+
+ }

src/main/java/bubble/crunch/InvalidCrawlSiteException.java (+0 −7)

@@ -1,7 +0,0 @@
- package bubble.crunch;
-
- public class InvalidCrawlSiteException extends RuntimeException {
-
-     public InvalidCrawlSiteException(String message) { super(message); }
-
- }

src/main/java/bubble/crunch/WebDriverPool.java (+10 −4)

@@ -7,12 +7,14 @@ import lombok.extern.slf4j.Slf4j;
 import org.openqa.selenium.WebDriver;
 import org.openqa.selenium.chrome.ChromeDriver;
 import org.openqa.selenium.chrome.ChromeOptions;
+ import org.openqa.selenium.firefox.FirefoxBinary;
 import org.openqa.selenium.firefox.FirefoxDriver;
 import org.openqa.selenium.firefox.FirefoxOptions;

 import java.util.ArrayList;
 import java.util.List;

+ import static java.lang.Boolean.TRUE;
 import static org.cobbzilla.util.daemon.ZillaRuntime.die;
 import static org.cobbzilla.util.daemon.ZillaRuntime.threadName;

@@ -61,21 +63,25 @@ public class WebDriverPool {
     }

     private WebDriver initDriver(BubbleCrunchOptions opts) {

         final WebDriverType driverType = opts.getDriverType();
         System.setProperty(driverType.getDriverProperty(), opts.getWebDriverPath());

         switch (driverType) {
             case chrome:
-                 final ChromeOptions chromeOptions = new ChromeOptions();
-                 // chromeOptions.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200","--ignore-certificate-errors");
+                 final ChromeOptions chromeOptions = new ChromeOptions().setBinary(opts.browserBinary());
+                 chromeOptions.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200");
                 return new ChromeDriver(chromeOptions);

             case firefox:
-                 return new FirefoxDriver(new FirefoxOptions().setLegacy(true));
+                 System.setProperty("webdriver.firefox.marionette", TRUE.toString());
+                 final FirefoxBinary firefoxBinary = new FirefoxBinary(opts.browserBinary());
+                 firefoxBinary.addCommandLineOptions("-headless");
+                 final FirefoxOptions firefoxOptions = new FirefoxOptions().setBinary(firefoxBinary).setLegacy(false);
+                 return new FirefoxDriver(firefoxOptions);

             default:
-                 return die("Invalid driver type: "+driverType);
+                 return die("Invalid or unsupported driver type: "+driverType);
         }
     }
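initDriver is called by the pool whenever a thread needs a browser it cannot reuse; the pooling itself is untouched by this commit. For readers new to the pattern, a minimal sketch of a WebDriver pool built on a BlockingQueue (illustrative only, not the project's implementation):

```java
import org.openqa.selenium.WebDriver;

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.function.Supplier;

public class SimpleDriverPool {
    private final BlockingQueue<WebDriver> idle = new LinkedBlockingQueue<>();
    private final Supplier<WebDriver> factory;

    public SimpleDriverPool(Supplier<WebDriver> factory) { this.factory = factory; }

    /** Reuse an idle driver when one exists; otherwise start a fresh browser. */
    public WebDriver borrow() {
        final WebDriver d = idle.poll();
        return d != null ? d : factory.get();
    }

    /** Return a driver for reuse by the next crawl job. */
    public void release(WebDriver driver) { idle.offer(driver); }

    /** Quit every idle browser, e.g. at the end of a crawl. */
    public void shutdown() {
        WebDriver d;
        while ((d = idle.poll()) != null) d.quit();
    }
}
```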



src/main/java/bubble/crunch/WebDriverType.java (+24 −3)

@@ -2,13 +2,34 @@ package bubble.crunch;

 import lombok.AllArgsConstructor;
 import lombok.Getter;
+ import lombok.extern.slf4j.Slf4j;

- @AllArgsConstructor
+ import java.io.File;
+
+ import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;
+ import static org.cobbzilla.util.system.CommandShell.execScript;
+
+ @AllArgsConstructor @Slf4j
 public enum WebDriverType {

-     chrome ("webdriver.chrome.driver"),
-     firefox ("webdriver.firefox.marionette");
+     chrome ("webdriver.chrome.driver",
+             new File("/usr/bin/google-chrome"),
+             "killall chrome ; killall chromedriver"),
+
+     firefox ("webdriver.gecko.driver",
+              new File("/usr/bin/firefox"),
+              "killall firefox ; killall geckodriver");

     @Getter private final String driverProperty;
+     @Getter private final File defaultBinary;
+     private final String cleanupScript;
+
+     public void cleanup () {
+         try {
+             execScript(cleanupScript);
+         } catch (Exception e) {
+             log.error("Error cleaning up "+this+": ("+cleanupScript+"): "+shortError(e));
+         }
+     }

 }
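With a per-constant driver property, default binary, and cleanup script, each WebDriverType now carries everything driver-specific in one place. Desugared, the Lombok annotations amount to roughly the following (a sketch, omitting the @Slf4j logger):

```java
import java.io.File;

public enum WebDriverTypePlain {

    chrome ("webdriver.chrome.driver", new File("/usr/bin/google-chrome"),
            "killall chrome ; killall chromedriver"),
    firefox ("webdriver.gecko.driver", new File("/usr/bin/firefox"),
            "killall firefox ; killall geckodriver");

    private final String driverProperty;
    private final File defaultBinary;
    private final String cleanupScript;

    // what Lombok's @AllArgsConstructor generates
    WebDriverTypePlain(String driverProperty, File defaultBinary, String cleanupScript) {
        this.driverProperty = driverProperty;
        this.defaultBinary = defaultBinary;
        this.cleanupScript = cleanupScript;
    }

    // what @Getter generates for the two @Getter fields
    public String getDriverProperty() { return driverProperty; }
    public File getDefaultBinary() { return defaultBinary; }
}
```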

src/main/java/bubble/crunch/main/BubbleCrunchMain.java (+16 −2)

@@ -63,7 +63,7 @@ public class BubbleCrunchMain extends BaseMain<BubbleCrunchOptions> {
         }

         final File csvFile = opts.getCsvFile();
-         final CrawlStats stats = new CrawlStats(csvFile);
+         final CrawlStats stats = new CrawlStats(csvFile, opts.isReplay());
         ExecutorService exec = null;
         try {
             exec = fixedPool(opts.getNumThreads());

@@ -72,7 +72,16 @@ public class BubbleCrunchMain extends BaseMain<BubbleCrunchOptions> {
             final long timeout = opts.getCrawlTimeoutMillis();

             final AtomicLong activeCrawls = new AtomicLong(0);
-             for (CrawlSite site : sites) {
+             for (int i=0; i<sites.size(); i++) {
+                 final CrawlSite site = sites.get(i);
+                 while (activeCrawls.get() > 100) {
+                     sleep(SECONDS.toMillis(5), "waiting to submit new crawls");
+                     log.info("waiting for "+activeCrawls.get()+" active crawls before submitting more, "+(sites.size()-i-1)+" pending");
+                     if (now() - start > timeout) {
+                         log.error("timeout! active crawls="+activeCrawls.get()+", pending="+(sites.size()-i-1));
+                         return;
+                     }
+                 }
                 final CrawlJob crawlJob = new CrawlJob(driverPool,
                     site,
                     screenshotsDir,

@@ -92,9 +101,14 @@ public class BubbleCrunchMain extends BaseMain<BubbleCrunchOptions> {
                 log.info("waiting for "+activeCrawls.get()+" active crawls");
                 sleep(SECONDS.toMillis(5), "waiting for crawl to finish");
             }
+             if (now() - start > timeout) {
+                 log.error("timeout! active crawls="+activeCrawls.get());
+                 return;
+             }
             out("Crawl completed");

         } finally {
+             opts.getDriverType().cleanup();
             if (exec != null) exec.shutdownNow();
         }
     }
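The new submission loop is a simple backpressure scheme: an AtomicLong counts in-flight crawls, submission pauses while more than 100 are active, and the overall deadline is re-checked during each pause so a stalled crawl cannot hang the run. A self-contained sketch of the pattern (the 100-job cap and 5-second poll mirror the code above; everything else is illustrative):

```java
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;

public class ThrottledSubmit {
    public static void main(String[] args) throws InterruptedException {
        final ExecutorService exec = Executors.newFixedThreadPool(5);
        final AtomicLong active = new AtomicLong(0);
        final long deadline = System.currentTimeMillis() + 10 * 60 * 1000; // 10m crawl timeout
        final List<String> sites = List.of("https://example.com/");

        for (String site : sites) {
            // wait until fewer than 100 crawls are in flight, but respect the deadline
            while (active.get() > 100) {
                Thread.sleep(5000);
                if (System.currentTimeMillis() > deadline) { exec.shutdownNow(); return; }
            }
            active.incrementAndGet();
            exec.submit(() -> {
                try { /* crawl the site */ }
                finally { active.decrementAndGet(); } // always release the slot
            });
        }
        exec.shutdown();
    }
}
```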


src/main/java/bubble/crunch/main/BubbleCrunchOptions.java (+70 −21)

@@ -1,7 +1,7 @@
 package bubble.crunch.main;

 import bubble.crunch.CrawlSite;
- import bubble.crunch.InvalidCrawlSiteException;
+ import bubble.crunch.InvalidCrawlOptionsException;
 import bubble.crunch.NoWebDriverException;
 import bubble.crunch.WebDriverType;
 import com.fasterxml.jackson.annotation.JsonIgnore;

@@ -17,7 +17,9 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

+ import static org.cobbzilla.util.daemon.ZillaRuntime.empty;
 import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;
+ import static org.cobbzilla.util.http.HttpSchemes.isHttpOrHttps;
 import static org.cobbzilla.util.io.FileUtil.abs;
 import static org.cobbzilla.util.json.JsonUtil.json;
 import static org.cobbzilla.util.json.JsonUtil.jsonWithComments;

@@ -31,6 +33,14 @@ public class BubbleCrunchOptions extends BaseMainOptions {
     @Option(name=OPT_DRIVER_TYPE, aliases=LONGOPT_DRIVER_TYPE, usage=USAGE_DRIVER_TYPE)
     @Getter @Setter private WebDriverType driverType = WebDriverType.chrome;

+     public static final String USAGE_BROWSER_BINARY = "Path to browser binary. Default is /usr/bin/google-chrome for Chrome or /usr/bin/firefox for Firefox";
+     public static final String OPT_BROWSER_BINARY = "-b";
+     public static final String LONGOPT_BROWSER_BINARY = "--browser";
+     @Option(name=OPT_BROWSER_BINARY, aliases=LONGOPT_BROWSER_BINARY, usage=USAGE_BROWSER_BINARY)
+     @Getter @Setter private File browserBinary = null;
+
+     public File browserBinary () { return browserBinary == null ? driverType.getDefaultBinary() : browserBinary; }
+
     public static final String ENV_CRUNCH_DRIVER_PATH = "CRUNCH_DRIVER_PATH";
     public static final String USAGE_DRIVER_PATH = "Path to driver binary, or if starts with '@', name of env var to check. Default is value of env variable " + ENV_CRUNCH_DRIVER_PATH;
     public static final String OPT_DRIVER_PATH = "-p";

@@ -51,7 +61,7 @@
         return path;
     }

-     public static final String USAGE_CRAWL_FILE = "What to crawl. Must be either (a) JSON file containing an array of CrawlSite definitions or (b) JSON file containing a BubbleCrunchOptions object, or (c) a text file with a list of URLs. Required.";
+     public static final String USAGE_CRAWL_FILE = "What to crawl. Must be one of (a) JSON file containing an array of CrawlSite definitions or (b) JSON file containing a BubbleCrunchOptions object or (c) a text file with a list of URLs, or (d) a CSV file that was the output from a previous run (in this case, only the URLs are requested without any crawling). Required.";
     public static final String OPT_CRAWL_FILE = "-c";
     public static final String LONGOPT_CRAWL_FILE = "--crawl";
     @Option(name=OPT_CRAWL_FILE, aliases=LONGOPT_CRAWL_FILE, usage=USAGE_CRAWL_FILE, required=true)
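Per USAGE_DRIVER_PATH above, a driver path beginning with '@' names an environment variable rather than a file, which is how the example config's "@CRUNCH_DRIVER_PATH" works. A sketch of that lookup (hypothetical helper; the project resolves this inside the options class itself):

```java
public class EnvIndirection {
    /** "@FOO" -> value of environment variable FOO; any other value is returned as-is. */
    public static String resolve(String value) {
        if (value == null || !value.startsWith("@")) return value;
        final String name = value.substring(1);
        final String fromEnv = System.getenv(name);
        if (fromEnv == null) throw new IllegalStateException("Environment variable not set: " + name);
        return fromEnv;
    }
}
```

So resolve("@CRUNCH_DRIVER_PATH") returns whatever path the shell exported, while resolve("/usr/local/bin/chromedriver") passes through unchanged.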
@@ -103,20 +113,33 @@
     public boolean hasMaxLinksPerPage () { return maxLinksPerPage == -1; }

     public static final String USAGE_CSV_OUTFILE = "CSV output file. Default is stdout";
-     public static final String OPT_CSV_OUTFILE = "-o";
-     public static final String LONGOPT_CSV_OUTFILE = "--output";
+     public static final String OPT_CSV_OUTFILE = "-C";
+     public static final String LONGOPT_CSV_OUTFILE = "--csv-file";
     @Option(name=OPT_CSV_OUTFILE, aliases=LONGOPT_CSV_OUTFILE, usage=USAGE_CSV_OUTFILE)
     @Getter @Setter private File csvFile;

+     public static final String USAGE_REPLAY = "Replay CSV file as input. Default is false. Output will be csv file with .replay_datestamp appended";
+     public static final String OPT_REPLAY = "-r";
+     public static final String LONGOPT_REPLAY = "--replay";
+     @Option(name=OPT_REPLAY, aliases=LONGOPT_REPLAY, usage=USAGE_REPLAY)
+     @Getter @Setter private boolean replay = false;
+
     @JsonIgnore @Getter private final List<CrawlSite> crawlSites = new ArrayList<>();

     public BubbleCrunchOptions init () throws IOException {
-         if (!abs(getCrawlFile()).endsWith(".json")) {
-             // not json, must be a list of urls
-             final List<String> urls = FileUtil.toStringList(getCrawlFile());
-             for (String url : urls) {
-                 if (url.startsWith("#")) continue;
-                 crawlSites.add(new CrawlSite(url));
+         final String crawlName = abs(getCrawlFile());
+         if (!crawlName.endsWith(".json")) {
+             // not JSON, check for CSV
+             if (crawlName.endsWith(".csv")) {
+                 addCsvCrawls(this);
+
+             } else {
+                 // not CSV, must be a list of urls
+                 final List<String> urls = FileUtil.toStringList(getCrawlFile());
+                 for (String url : urls) {
+                     if (url.startsWith("#")) continue;
+                     crawlSites.add(new CrawlSite(url));
+                 }
             }
             return this;
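The @Option annotations here are args4j. For reference, a minimal sketch of how such an options bean is parsed at startup (assuming args4j on the classpath; the bean is a stand-in for BubbleCrunchOptions):

```java
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;

public class OptionsDemo {
    @Option(name="-r", aliases="--replay", usage="Replay CSV file as input")
    private boolean replay = false;

    public static void main(String[] args) {
        final OptionsDemo opts = new OptionsDemo();
        final CmdLineParser parser = new CmdLineParser(opts);
        try {
            parser.parseArgument(args); // fills fields from -r/--replay etc.
        } catch (CmdLineException e) {
            parser.printUsage(System.err); // usage text is generated from the annotations
            System.exit(1);
        }
        System.out.println("replay=" + opts.replay);
    }
}
```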

@@ -126,7 +149,7 @@
         try {
             crawlSpec = jsonWithComments(FileUtil.toString(getCrawlFile()), JsonNode.class);
         } catch (Exception e) {
-             throw new InvalidCrawlSiteException("Error reading config file ("+abs(getCrawlFile())+"): "+shortError(e));
+             throw new InvalidCrawlOptionsException("Error reading config file ("+ crawlName +"): "+shortError(e));
         }
         // determine if json is array of CrawlSite objects, or a full BubbleCrunchOptions object
         if (crawlSpec.isArray()) {

@@ -137,20 +160,46 @@
         } else if (crawlSpec.isObject()) {
             // is json and a full config, re-parse...
             final BubbleCrunchOptions opts = json(crawlSpec, BubbleCrunchOptions.class);
-             final JsonNode crawlNode = opts.getCrawl();
-             if (crawlNode == null) {
-                 throw new InvalidCrawlSiteException("Expected value of 'crawl' property to be a JSON array, was null");
-             }
-             if (!crawlNode.isArray()) {
-                 throw new InvalidCrawlSiteException("Expected value of 'crawl' property to be a JSON array, was: "+json(crawlNode));
-             }
-             if (crawlNode.size() > 0) opts.getCrawlSites().addAll(addCrawlSites(crawlNode));
+             if (opts.isReplay()) {
+                 addCsvCrawls(opts);
+
+             } else {
+                 final JsonNode crawlNode = opts.getCrawl();
+                 if (crawlNode == null) {
+                     throw new InvalidCrawlOptionsException("Expected value of 'crawl' property to be a JSON array, was null");
+                 }
+                 if (!crawlNode.isArray()) {
+                     throw new InvalidCrawlOptionsException("Expected value of 'crawl' property to be a JSON array, was: " + json(crawlNode));
+                 }
+                 if (crawlNode.size() > 0) opts.getCrawlSites().addAll(addCrawlSites(crawlNode));
+             }
             return opts;

         } else {
-             throw new InvalidCrawlSiteException("Expected config file ("+abs(getCrawlFile())+") to be a JSON array of URLs/CrawlSite objects, or a BubbleCrunchOptions configuration object.");
+             throw new InvalidCrawlOptionsException("Expected config file ("+ crawlName +") to be a JSON array of URLs/CrawlSite objects, or a BubbleCrunchOptions configuration object.");
         }
     }

+     public void addCsvCrawls(BubbleCrunchOptions opts) throws IOException {
+         final File csvFile = opts.getCsvFile();
+         if (!csvFile.exists()) throw new InvalidCrawlOptionsException("CSV file does not exist: "+abs(csvFile));
+
+         final List<CrawlSite> crawlSites = opts.getCrawlSites();
+         final List<String> lines = FileUtil.toStringList(csvFile);
+         if (empty(lines)) throw new InvalidCrawlOptionsException("CSV file is empty: "+abs(csvFile));
+         for (String line : lines) {
+             final String[] parts = line.split(",");
+             for (String part : parts) {
+                 if (isHttpOrHttps(part)) {
+                     crawlSites.add(new CrawlSite(part));
+                     break;
+                 }
+             }
+         }
+         if (empty(crawlSites)) throw new InvalidCrawlOptionsException("CSV file did not contain any URLs: "+abs(csvFile));
+         opts.setMaxDepth(1);
+         opts.setMaxLinksPerPage(0);
+     }
+
     private List<CrawlSite> addCrawlSites(JsonNode crawlNode) {
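addCsvCrawls scans each CSV line and takes the first comma-separated field that looks like an http(s) URL, then forces maxDepth=1 and maxLinksPerPage=0 so a replay fetches each URL once without following links. The naive split(",") is fine as long as no quoted field contains a comma; a standalone sketch of the extraction:

```java
import java.util.Optional;

public class CsvUrlExtract {
    /** First comma-separated field starting with http:// or https://, if any. */
    public static Optional<String> firstUrl(String csvLine) {
        for (String part : csvLine.split(",")) {
            final String p = part.trim();
            if (p.startsWith("http://") || p.startsWith("https://")) return Optional.of(p);
        }
        return Optional.empty();
    }

    public static void main(String[] args) {
        System.out.println(firstUrl("2020-01-01 12:00:00,https://example.com/,200,OK"));
        // -> Optional[https://example.com/]
    }
}
```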
@@ -161,12 +210,12 @@
             try {
                 sites.add(json(node, CrawlSite.class));
             } catch (Exception e) {
-                 throw new InvalidCrawlSiteException("Expected 'crawl' element to be a CrawlSite object: "+json(node));
+                 throw new InvalidCrawlOptionsException("Expected 'crawl' element to be a CrawlSite object: "+json(node));
             }
         } else if (node.isTextual()) {
             sites.add(new CrawlSite(node.textValue()));
         } else {
-             throw new InvalidCrawlSiteException("Expected 'crawl' element to be CrawlSite object or URL string: "+json(node));
+             throw new InvalidCrawlOptionsException("Expected 'crawl' element to be CrawlSite object or URL string: "+json(node));
         }
     }
     return sites;

