README:

@@ -15,20 +15,28 @@ To build from source:

     mvn -Puberjar clean package

-# Install Chrome and Web Driver
-Before running, you need to install Google Chrome and the Chrome driver for Selenium.
-If you are running a desktop, you can probably just install Chrome normally, if you haven't already.
-If you're running Ubuntu 18.04 server with no desktop, install Chromium headless:
+# Install Browsers and Web Driver
+Before running, you need to install one or both of:
+
+* Google Chrome and the Chrome web driver for Selenium.
+* Firefox and the Firefox web driver for Selenium.
+
+If you are running a desktop, you can probably just install Chrome or Firefox normally, if you haven't already.
+
+## Install Firefox
+    apt install firefox
+
+## Install Chrome
     apt install software-properties-common -y
     add-apt-repository ppa:canonical-chromium-builds/stage -y
     apt-get update -y
     apt install chromium-browser -y

-Install the Web Driver:
+## Install Web Drivers:
+    ./bin/update_firefox_driver.sh
     ./bin/update_chrome_driver.sh

 # Running
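
Once the browsers are installed and the update scripts have fetched the drivers, a quick sanity check is to run each driver binary directly (a hedged check of my own, not part of the diff; it assumes the drivers land in ./drivers, as the update script below suggests):

```bash
# Each driver prints its version and exits if it was installed correctly.
./drivers/geckodriver --version
./drivers/chromedriver --version
```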

bin/update_firefox_driver.sh (new file):

@@ -0,0 +1,20 @@
+#!/bin/bash
+
+function die {
+  echo 1>&2 "fatal error: ${1}"
+  exit 1
+}
+
+BASE_DIR="$(cd "$(dirname "${0}")/.." && pwd)"
+DRIVER_DIR="${BASE_DIR}/drivers"
+
+FF_DRIVER_URL=https://github.com/mozilla/geckodriver/releases/download/v0.26.0/geckodriver-v0.26.0-linux64.tar.gz
+DRIVER_TEMP=$(mktemp /tmp/geckodriver_linux64.tar.gz.XXXXXXX)
+curl -L "${FF_DRIVER_URL}" > "${DRIVER_TEMP}" || die "Error downloading ${FF_DRIVER_URL}"
+
+if [[ ! -d "${DRIVER_DIR}" ]] ; then
+  mkdir "${DRIVER_DIR}" || die "Error creating driver dir: ${DRIVER_DIR}"
+fi
+
+cd "${DRIVER_DIR}" && tar xzf "${DRIVER_TEMP}" || die "Error untarring ${DRIVER_TEMP}"
+rm -f "${DRIVER_TEMP}"
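
The companion bin/update_chrome_driver.sh is referenced in the README but not shown in this diff. A minimal sketch of what it might look like, assuming the standard chromedriver download host and a pinned version (both are my assumptions, not taken from the PR):

```bash
#!/bin/bash
# Hypothetical sketch of bin/update_chrome_driver.sh (not part of this diff).
function die { echo 1>&2 "fatal error: ${1}"; exit 1; }

BASE_DIR="$(cd "$(dirname "${0}")/.." && pwd)"
DRIVER_DIR="${BASE_DIR}/drivers"

CHROME_DRIVER_VERSION=87.0.4280.88   # assumed; pick one matching your Chrome
CHROME_DRIVER_URL=https://chromedriver.storage.googleapis.com/${CHROME_DRIVER_VERSION}/chromedriver_linux64.zip
DRIVER_TEMP=$(mktemp /tmp/chromedriver_linux64.zip.XXXXXXX)
curl -L "${CHROME_DRIVER_URL}" > "${DRIVER_TEMP}" || die "Error downloading ${CHROME_DRIVER_URL}"

mkdir -p "${DRIVER_DIR}" || die "Error creating driver dir: ${DRIVER_DIR}"

# chromedriver ships as a zip, not a tarball
cd "${DRIVER_DIR}" && unzip -o "${DRIVER_TEMP}" || die "Error unzipping ${DRIVER_TEMP}"
rm -f "${DRIVER_TEMP}"
```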

Sample crawl config (JSON with comments):

@@ -3,14 +3,15 @@
 {
     // Run with -h to see info about these settings. All are optional.
-    "driverPath": "@CRUNCH_DRIVER_PATH", // use this env var name from chrome webdriver
+    "driverPath": "@CRUNCH_DRIVER_PATH", // use this env var name to find the web driver
+    "driverType": "chrome", // web driver, firefox or chrome (default chrome)
     "screenshotsDir": "/tmp", // save screenshots here (set to /dev/null to skip screenshots, default /tmp)
     "numThreads": 5, // how many threads to run (default 5)
     "maxDepth": 10, // how many levels deep to crawl (default 10)
     "crawlTimeout": "10m", // how long the entire crawl may run before stopping (default 10m)
     "pageTimeout": "1m", // how long to wait for a page to load before timing out (default 1m)
     "maxLinksPerPage": 50, // how many links to follow on each page (default 50)
-    "csvFile": "/tmp/crawls.csv", // write crawl stats here (default is stdout)
+    "csvFile": "/tmp/crawls.csv", // write crawl stats here (default stdout)

     // At least one entry must be present in the array below
     "crawl": [

CrawlJob.java:

@@ -85,7 +85,7 @@ public class CrawlJob implements Runnable {
         stack.push(url);
         if (stack.size() >= maxDepth) {
-            log.info("run(" + url + "): maxDepth (" + maxDepth + ") reached, not crawling");
+            log.trace("run(" + url + "): maxDepth (" + maxDepth + ") reached, not crawling");
             return;
         }
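
This hunk only lowers the log level: hitting maxDepth is a routine event on every deep page, so it should not flood the logs at info. For context, a standalone sketch of the depth check itself, since CrawlJob is otherwise not shown in this diff (the shape is assumed from the lines above):

```java
import java.util.ArrayDeque;
import java.util.Deque;

// Illustrative only: track crawl depth with a stack of URLs and stop
// descending, quietly, once the stack reaches maxDepth.
public class DepthLimitSketch {
    static final int MAX_DEPTH = 3;
    static final Deque<String> STACK = new ArrayDeque<>();

    static void crawl(String url) {
        STACK.push(url);
        try {
            if (STACK.size() >= MAX_DEPTH) return; // depth limit: skip quietly
            // ... fetch the page, then for each extracted link: crawl(link) ...
        } finally {
            STACK.pop();
        }
    }

    public static void main(String[] args) {
        crawl("https://example.com/");
    }
}
```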

CrawlSite.java:

@@ -20,10 +20,10 @@ public class CrawlSite {
         try {
             new URL(url);
         } catch (Exception e) {
-            throw new InvalidCrawlSiteException("Expected 'crawl' element to be URL string: "+url);
+            throw new InvalidCrawlOptionsException("Expected 'crawl' element to be URL string: "+url);
         }
         if (!isHttpOrHttps(url)) {
-            throw new InvalidCrawlSiteException("Expected 'crawl' element to begin with http:// or https:// -- "+url);
+            throw new InvalidCrawlOptionsException("Expected 'crawl' element to begin with http:// or https:// -- "+url);
         }
         this.url = url;
         return this;
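
The two checks above enforce that a crawl entry is a well-formed absolute URL with an http(s) scheme. A self-contained restating of the rule, with startsWith standing in for the project's isHttpOrHttps helper (names here are mine, not the project's):

```java
import java.net.URL;

// Illustrative only: the same two-step validation as CrawlSite above.
public class UrlRuleSketch {
    static boolean isValidCrawlUrl(String url) {
        try {
            new URL(url); // must parse as an absolute URL
        } catch (Exception e) {
            return false;
        }
        // must also use an http or https scheme
        return url.startsWith("http://") || url.startsWith("https://");
    }

    public static void main(String[] args) {
        System.out.println(isValidCrawlUrl("https://example.com")); // true
        System.out.println(isValidCrawlUrl("not a url"));           // false
        System.out.println(isValidCrawlUrl("ftp://example.com"));   // false
    }
}
```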

CrawlStats.java:

@@ -4,10 +4,11 @@ import java.io.*;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;

-import static org.cobbzilla.util.daemon.ZillaRuntime.die;
-import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;
+import static org.cobbzilla.util.daemon.ZillaRuntime.*;
 import static org.cobbzilla.util.http.HttpSchemes.SCHEME_HTTP;
 import static org.cobbzilla.util.http.HttpSchemes.SCHEME_HTTPS;
+import static org.cobbzilla.util.io.FileUtil.abs;
+import static org.cobbzilla.util.time.TimeUtil.DATE_FORMAT_YYYY_MM_DD_HH_mm_ss;

 public class CrawlStats {

@@ -15,8 +16,17 @@ public class CrawlStats {
     private final Map<String, CrawlData> data = new ConcurrentHashMap<>();

-    public CrawlStats(File csvFile) throws IOException {
-        this.writer = csvFile == null ? new OutputStreamWriter(System.out) : new FileWriter(csvFile);
+    public CrawlStats(File csvFile, boolean replay) throws IOException {
+        if (replay) {
+            this.writer = csvFile == null ? new OutputStreamWriter(System.out) : new FileWriter(replayFile(csvFile));
+        } else {
+            this.writer = csvFile == null ? new OutputStreamWriter(System.out) : new FileWriter(csvFile);
+        }
     }

+    private File replayFile(File csvFile) {
+        final String path = abs(csvFile);
+        return new File(path.substring(0, path.length()-".csv".length()) + "_replay_" + DATE_FORMAT_YYYY_MM_DD_HH_mm_ss.print(now()) + ".csv");
+    }

     public boolean alreadyVisited(String url) {
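
To make the replay naming concrete: given a csvFile of /tmp/crawls.csv, replayFile() produces something like /tmp/crawls_replay_&lt;timestamp&gt;.csv. A standalone sketch of the same derivation, using java.time in place of the project's TimeUtil formatter (whose exact pattern I am assuming from its name):

```java
import java.io.File;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;

// Illustrative only: mirrors replayFile() above with standard-library
// time formatting.
public class ReplayFileSketch {
    static File replayFile(File csvFile) {
        final String path = csvFile.getAbsolutePath();
        final String stamp = LocalDateTime.now()
                .format(DateTimeFormatter.ofPattern("yyyy-MM-dd-HH-mm-ss"));
        return new File(path.substring(0, path.length() - ".csv".length())
                + "_replay_" + stamp + ".csv");
    }

    public static void main(String[] args) {
        // prints e.g. /tmp/crawls_replay_2020-01-30-09-15-00.csv
        System.out.println(replayFile(new File("/tmp/crawls.csv")));
    }
}
```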

InvalidCrawlOptionsException.java (new file, replacing InvalidCrawlSiteException):

@@ -0,0 +1,7 @@
+package bubble.crunch;
+
+public class InvalidCrawlOptionsException extends RuntimeException {
+    public InvalidCrawlOptionsException(String message) { super(message); }
+}

InvalidCrawlSiteException.java (deleted):

@@ -1,7 +0,0 @@
-package bubble.crunch;
-
-public class InvalidCrawlSiteException extends RuntimeException {
-    public InvalidCrawlSiteException(String message) { super(message); }
-}

WebDriverPool.java:

@@ -7,12 +7,14 @@ import lombok.extern.slf4j.Slf4j;
 import org.openqa.selenium.WebDriver;
 import org.openqa.selenium.chrome.ChromeDriver;
 import org.openqa.selenium.chrome.ChromeOptions;
+import org.openqa.selenium.firefox.FirefoxBinary;
 import org.openqa.selenium.firefox.FirefoxDriver;
 import org.openqa.selenium.firefox.FirefoxOptions;

 import java.util.ArrayList;
 import java.util.List;

+import static java.lang.Boolean.TRUE;
 import static org.cobbzilla.util.daemon.ZillaRuntime.die;
 import static org.cobbzilla.util.daemon.ZillaRuntime.threadName;

@@ -61,21 +63,25 @@ public class WebDriverPool {
     }

     private WebDriver initDriver(BubbleCrunchOptions opts) {
         final WebDriverType driverType = opts.getDriverType();
         System.setProperty(driverType.getDriverProperty(), opts.getWebDriverPath());
         switch (driverType) {
             case chrome:
-                final ChromeOptions chromeOptions = new ChromeOptions();
-                // chromeOptions.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200","--ignore-certificate-errors");
+                final ChromeOptions chromeOptions = new ChromeOptions().setBinary(opts.browserBinary());
+                chromeOptions.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200");
                 return new ChromeDriver(chromeOptions);
             case firefox:
-                return new FirefoxDriver(new FirefoxOptions().setLegacy(true));
+                System.setProperty("webdriver.firefox.marionette", TRUE.toString());
+                final FirefoxBinary firefoxBinary = new FirefoxBinary(opts.browserBinary());
+                firefoxBinary.addCommandLineOptions("-headless");
+                final FirefoxOptions firefoxOptions = new FirefoxOptions().setBinary(firefoxBinary).setLegacy(false);
+                return new FirefoxDriver(firefoxOptions);
             default:
-                return die("Invalid driver type: "+driverType);
+                return die("Invalid or unsupported driver type: "+driverType);
         }
     }
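
Outside the pool, the same driver setup can be exercised directly. A minimal smoke test (my own sketch, not code from this PR; it assumes chromedriver sits in ./drivers, as the update scripts install it):

```java
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

import java.io.File;

// Illustrative only: start a headless Chrome like initDriver() does,
// load one page, and grab a screenshot.
public class HeadlessSmokeTest {
    public static void main(String[] args) {
        System.setProperty("webdriver.chrome.driver", "drivers/chromedriver");
        final ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200");
        final WebDriver driver = new ChromeDriver(options);
        try {
            driver.get("https://example.com/");
            final File shot = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
            System.out.println("title=" + driver.getTitle() + ", screenshot=" + shot);
        } finally {
            driver.quit(); // always release the browser process
        }
    }
}
```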

WebDriverType.java:

@@ -2,13 +2,34 @@ package bubble.crunch;

 import lombok.AllArgsConstructor;
 import lombok.Getter;
+import lombok.extern.slf4j.Slf4j;

+import java.io.File;
+
+import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;
+import static org.cobbzilla.util.system.CommandShell.execScript;
+
-@AllArgsConstructor
+@AllArgsConstructor @Slf4j
 public enum WebDriverType {

-    chrome  ("webdriver.chrome.driver"),
-    firefox ("webdriver.firefox.marionette");
+    chrome  ("webdriver.chrome.driver",
+             new File("/usr/bin/google-chrome"),
+             "killall chrome ; killall chromedriver"),
+
+    firefox ("webdriver.gecko.driver",
+             new File("/usr/bin/firefox"),
+             "killall firefox ; killall geckodriver");

     @Getter private final String driverProperty;
+    @Getter private final File defaultBinary;
+    private final String cleanupScript;
+
+    public void cleanup () {
+        try {
+            execScript(cleanupScript);
+        } catch (Exception e) {
+            log.error("Error cleaning up "+this+": ("+cleanupScript+"): "+shortError(e));
+        }
+    }

 }
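
execScript here comes from the cobbzilla utility library; the intent is a best-effort kill of any orphaned browser or driver processes, with failures logged rather than thrown. A standard-library sketch of the same idea (names and exact behavior are my assumptions):

```java
// Illustrative only: best-effort process cleanup via a shell, mirroring
// WebDriverType.cleanup() above without the cobbzilla helpers.
public class CleanupSketch {
    static void cleanup(String script) {
        try {
            new ProcessBuilder("/bin/sh", "-c", script)
                    .inheritIO()
                    .start()
                    .waitFor(); // killall exits nonzero if nothing matched; that's fine
        } catch (Exception e) {
            System.err.println("Error cleaning up (" + script + "): " + e);
        }
    }

    public static void main(String[] args) {
        cleanup("killall chrome ; killall chromedriver");
    }
}
```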

BubbleCrunchMain.java:

@@ -63,7 +63,7 @@ public class BubbleCrunchMain extends BaseMain<BubbleCrunchOptions> {
         }

         final File csvFile = opts.getCsvFile();
-        final CrawlStats stats = new CrawlStats(csvFile);
+        final CrawlStats stats = new CrawlStats(csvFile, opts.isReplay());
         ExecutorService exec = null;
         try {
             exec = fixedPool(opts.getNumThreads());

@@ -72,7 +72,16 @@ public class BubbleCrunchMain extends BaseMain<BubbleCrunchOptions> {
             final long timeout = opts.getCrawlTimeoutMillis();
             final AtomicLong activeCrawls = new AtomicLong(0);
-            for (CrawlSite site : sites) {
+            for (int i=0; i<sites.size(); i++) {
+                final CrawlSite site = sites.get(i);
+                while (activeCrawls.get() > 100) {
+                    sleep(SECONDS.toMillis(5), "waiting to submit new crawls");
+                    log.info("waiting for "+activeCrawls.get()+" active crawls before submitting more, "+(sites.size()-i-1)+" pending");
+                    if (now() - start > timeout) {
+                        log.error("timeout! active crawls="+activeCrawls.get()+", pending="+(sites.size()-i-1));
+                        return;
+                    }
+                }
                 final CrawlJob crawlJob = new CrawlJob(driverPool,
                                                        site,
                                                        screenshotsDir,

@@ -92,9 +101,14 @@ public class BubbleCrunchMain extends BaseMain<BubbleCrunchOptions> {
                 log.info("waiting for "+activeCrawls.get()+" active crawls");
                 sleep(SECONDS.toMillis(5), "waiting for crawl to finish");
             }
+            if (now() - start > timeout) {
+                log.error("timeout! active crawls="+activeCrawls.get());
+                return;
+            }
             out("Crawl completed");
         } finally {
+            opts.getDriverType().cleanup();
             if (exec != null) exec.shutdownNow();
         }
     }
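
The new submission loop is a simple backpressure scheme: never keep more than about 100 crawls in flight, sleep while over the cap, and abort the whole run once the overall timeout passes. The increment and decrement of activeCrawls presumably happen inside CrawlJob, which this diff does not show; a self-contained sketch of the whole pattern (all names here are mine):

```java
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

// Illustrative only: cap in-flight jobs, back off while over the cap,
// and give up when the overall deadline passes.
public class ThrottledSubmitSketch {
    public static void main(String[] args) throws InterruptedException {
        final ExecutorService exec = Executors.newFixedThreadPool(5);
        final AtomicLong active = new AtomicLong(0);
        final long start = System.currentTimeMillis();
        final long timeout = TimeUnit.MINUTES.toMillis(10);
        for (int i = 0; i < 1000; i++) {
            while (active.get() > 100) {
                Thread.sleep(TimeUnit.SECONDS.toMillis(5)); // wait for free slots
                if (System.currentTimeMillis() - start > timeout) {
                    exec.shutdownNow(); // overall timeout: stop submitting
                    return;
                }
            }
            active.incrementAndGet(); // claim a slot before submitting
            exec.submit(() -> {
                try {
                    // ... crawl one site ...
                } finally {
                    active.decrementAndGet(); // always release the slot
                }
            });
        }
        exec.shutdown();
    }
}
```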

BubbleCrunchOptions.java:

@@ -1,7 +1,7 @@
 package bubble.crunch.main;

 import bubble.crunch.CrawlSite;
-import bubble.crunch.InvalidCrawlSiteException;
+import bubble.crunch.InvalidCrawlOptionsException;
 import bubble.crunch.NoWebDriverException;
 import bubble.crunch.WebDriverType;
 import com.fasterxml.jackson.annotation.JsonIgnore;

@@ -17,7 +17,9 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

+import static org.cobbzilla.util.daemon.ZillaRuntime.empty;
 import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;
+import static org.cobbzilla.util.http.HttpSchemes.isHttpOrHttps;
 import static org.cobbzilla.util.io.FileUtil.abs;
 import static org.cobbzilla.util.json.JsonUtil.json;
 import static org.cobbzilla.util.json.JsonUtil.jsonWithComments;

@@ -31,6 +33,14 @@ public class BubbleCrunchOptions extends BaseMainOptions {
     @Option(name=OPT_DRIVER_TYPE, aliases=LONGOPT_DRIVER_TYPE, usage=USAGE_DRIVER_TYPE)
     @Getter @Setter private WebDriverType driverType = WebDriverType.chrome;

+    public static final String USAGE_BROWSER_BINARY = "Path to browser binary. Default is /usr/bin/google-chrome for Chrome or /usr/bin/firefox for Firefox";
+    public static final String OPT_BROWSER_BINARY = "-b";
+    public static final String LONGOPT_BROWSER_BINARY = "--browser";
+    @Option(name=OPT_BROWSER_BINARY, aliases=LONGOPT_BROWSER_BINARY, usage=USAGE_BROWSER_BINARY)
+    @Getter @Setter private File browserBinary = null;
+    public File browserBinary () { return browserBinary == null ? driverType.getDefaultBinary() : browserBinary; }

     public static final String ENV_CRUNCH_DRIVER_PATH = "CRUNCH_DRIVER_PATH";
     public static final String USAGE_DRIVER_PATH = "Path to driver binary, or, if it starts with '@', the name of an env var to check. Default is the value of env variable " + ENV_CRUNCH_DRIVER_PATH;
     public static final String OPT_DRIVER_PATH = "-p";

@@ -51,7 +61,7 @@ public class BubbleCrunchOptions extends BaseMainOptions {
         return path;
     }

-    public static final String USAGE_CRAWL_FILE = "What to crawl. Must be either (a) JSON file containing an array of CrawlSite definitions or (b) JSON file containing a BubbleCrunchOptions object, or (c) a text file with a list of URLs. Required.";
+    public static final String USAGE_CRAWL_FILE = "What to crawl. Must be one of: (a) a JSON file containing an array of CrawlSite definitions, (b) a JSON file containing a BubbleCrunchOptions object, (c) a text file with a list of URLs, or (d) a CSV file output by a previous run (in this case only the URLs are requested; no links are crawled). Required.";
     public static final String OPT_CRAWL_FILE = "-c";
     public static final String LONGOPT_CRAWL_FILE = "--crawl";
     @Option(name=OPT_CRAWL_FILE, aliases=LONGOPT_CRAWL_FILE, usage=USAGE_CRAWL_FILE, required=true)

@@ -103,20 +113,33 @@ public class BubbleCrunchOptions extends BaseMainOptions {
     public boolean hasMaxLinksPerPage () { return maxLinksPerPage == -1; }

     public static final String USAGE_CSV_OUTFILE = "CSV output file. Default is stdout";
-    public static final String OPT_CSV_OUTFILE = "-o";
-    public static final String LONGOPT_CSV_OUTFILE = "--output";
+    public static final String OPT_CSV_OUTFILE = "-C";
+    public static final String LONGOPT_CSV_OUTFILE = "--csv-file";
     @Option(name=OPT_CSV_OUTFILE, aliases=LONGOPT_CSV_OUTFILE, usage=USAGE_CSV_OUTFILE)
     @Getter @Setter private File csvFile;

+    public static final String USAGE_REPLAY = "Replay a CSV file as input. Default is false. Output is written to the input CSV file name with _replay_<timestamp> inserted before the .csv suffix";
+    public static final String OPT_REPLAY = "-r";
+    public static final String LONGOPT_REPLAY = "--replay";
+    @Option(name=OPT_REPLAY, aliases=LONGOPT_REPLAY, usage=USAGE_REPLAY)
+    @Getter @Setter private boolean replay = false;

     @JsonIgnore @Getter private final List<CrawlSite> crawlSites = new ArrayList<>();

     public BubbleCrunchOptions init () throws IOException {
-        if (!abs(getCrawlFile()).endsWith(".json")) {
-            // not json, must be a list of urls
-            final List<String> urls = FileUtil.toStringList(getCrawlFile());
-            for (String url : urls) {
-                if (url.startsWith("#")) continue;
-                crawlSites.add(new CrawlSite(url));
-            }
+        final String crawlName = abs(getCrawlFile());
+        if (!crawlName.endsWith(".json")) {
+            // not JSON, check for CSV
+            if (crawlName.endsWith(".csv")) {
+                addCsvCrawls(this);
+            } else {
+                // not CSV, must be a list of urls
+                final List<String> urls = FileUtil.toStringList(getCrawlFile());
+                for (String url : urls) {
+                    if (url.startsWith("#")) continue;
+                    crawlSites.add(new CrawlSite(url));
+                }
+            }
             return this;

@@ -126,7 +149,7 @@ public class BubbleCrunchOptions extends BaseMainOptions {
         try {
             crawlSpec = jsonWithComments(FileUtil.toString(getCrawlFile()), JsonNode.class);
         } catch (Exception e) {
-            throw new InvalidCrawlSiteException("Error reading config file ("+abs(getCrawlFile())+"): "+shortError(e));
+            throw new InvalidCrawlOptionsException("Error reading config file ("+ crawlName +"): "+shortError(e));
         }

         // determine if json is an array of CrawlSite objects, or a full BubbleCrunchOptions object
         if (crawlSpec.isArray()) {

@@ -137,20 +160,46 @@ public class BubbleCrunchOptions extends BaseMainOptions {
         } else if (crawlSpec.isObject()) {
             // is json and a full config, re-parse...
             final BubbleCrunchOptions opts = json(crawlSpec, BubbleCrunchOptions.class);
-            final JsonNode crawlNode = opts.getCrawl();
-            if (crawlNode == null) {
-                throw new InvalidCrawlSiteException("Expected value of 'crawl' property to be a JSON array, was null");
-            }
-            if (!crawlNode.isArray()) {
-                throw new InvalidCrawlSiteException("Expected value of 'crawl' property to be a JSON array, was: "+json(crawlNode));
-            }
-            if (crawlNode.size() > 0) opts.getCrawlSites().addAll(addCrawlSites(crawlNode));
+            if (opts.isReplay()) {
+                addCsvCrawls(opts);
+            } else {
+                final JsonNode crawlNode = opts.getCrawl();
+                if (crawlNode == null) {
+                    throw new InvalidCrawlOptionsException("Expected value of 'crawl' property to be a JSON array, was null");
+                }
+                if (!crawlNode.isArray()) {
+                    throw new InvalidCrawlOptionsException("Expected value of 'crawl' property to be a JSON array, was: " + json(crawlNode));
+                }
+                if (crawlNode.size() > 0) opts.getCrawlSites().addAll(addCrawlSites(crawlNode));
+            }
             return opts;
         } else {
-            throw new InvalidCrawlSiteException("Expected config file ("+abs(getCrawlFile())+") to be a JSON array of URLs/CrawlSite objects, or a BubbleCrunchOptions configuration object.");
+            throw new InvalidCrawlOptionsException("Expected config file ("+ crawlName +") to be a JSON array of URLs/CrawlSite objects, or a BubbleCrunchOptions configuration object.");
         }
     }

+    public void addCsvCrawls(BubbleCrunchOptions opts) throws IOException {
+        final File csvFile = opts.getCsvFile();
+        if (!csvFile.exists()) throw new InvalidCrawlOptionsException("CSV file does not exist: "+abs(csvFile));
+        final List<CrawlSite> crawlSites = opts.getCrawlSites();
+        final List<String> lines = FileUtil.toStringList(csvFile);
+        if (empty(lines)) throw new InvalidCrawlOptionsException("CSV file is empty: "+abs(csvFile));
+        for (String line : lines) {
+            final String[] parts = line.split(",");
+            for (String part : parts) {
+                if (isHttpOrHttps(part)) {
+                    crawlSites.add(new CrawlSite(part));
+                    break;
+                }
+            }
+        }
+        if (empty(crawlSites)) throw new InvalidCrawlOptionsException("CSV file did not contain any URLs: "+abs(csvFile));
+        opts.setMaxDepth(1);
+        opts.setMaxLinksPerPage(0);
+    }

     private List<CrawlSite> addCrawlSites(JsonNode crawlNode) {
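
addCsvCrawls() treats replay input loosely: it splits each CSV line on commas and takes the first field that looks like an http(s) URL, then forces maxDepth to 1 and maxLinksPerPage to 0 so each URL is fetched once with no link-following. A standalone sketch of the field-picking step (the column layout in the example row is hypothetical):

```java
// Illustrative only: pick the first http(s) field from a CSV row, as
// addCsvCrawls() above does. Note that a naive split would break on
// URLs that themselves contain commas.
public class CsvUrlSketch {
    public static void main(String[] args) {
        final String line = "2020-01-30 09:15:00,https://example.com/,200,OK";
        for (String part : line.split(",")) {
            if (part.startsWith("http://") || part.startsWith("https://")) {
                System.out.println("crawl target: " + part); // https://example.com/
                break;
            }
        }
    }
}
```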

@@ -161,12 +210,12 @@ public class BubbleCrunchOptions extends BaseMainOptions {
                 try {
                     sites.add(json(node, CrawlSite.class));
                 } catch (Exception e) {
-                    throw new InvalidCrawlSiteException("Expected 'crawl' element to be a CrawlSite object: "+json(node));
+                    throw new InvalidCrawlOptionsException("Expected 'crawl' element to be a CrawlSite object: "+json(node));
                 }
             } else if (node.isTextual()) {
                 sites.add(new CrawlSite(node.textValue()));
             } else {
-                throw new InvalidCrawlSiteException("Expected 'crawl' element to be CrawlSite object or URL string: "+json(node));
+                throw new InvalidCrawlOptionsException("Expected 'crawl' element to be CrawlSite object or URL string: "+json(node));
             }
         }
         return sites;