18 commits
14be875  Fixing synchronization issues in TroughCrawlLogFeed.java to reduce bl… (adam-miller, Jan 28, 2021)
d22ef12  Trying to avoid hard locks on updating url caches (adam-miller, Feb 2, 2021)
79e5633  Fix error log reporting of batch size for trough crawl logs (adam-miller, Feb 18, 2021)
86e238e  Create and insert into local sqlite dedup db shards (adam-miller, Feb 23, 2021)
09485a5  Merge branch 'fixes-trough-crawl-log-feed-synchronization' into adds-… (adam-miller, Feb 23, 2021)
e300051  Merge branch 'adds-trough-local-dedup-write-cache' into adds-trough-d… (adam-miller, Feb 23, 2021)
00947b0  Fixing sync issues on uncrawled urls batch as well (adam-miller, Feb 25, 2021)
88b7eea  On disk - per-seed sqlite dedup (adam-miller, Feb 25, 2021)
e700f38  Switching to in memory sqlite for dedup cache (adam-miller, Feb 26, 2021)
8724b14  Replacing some synchronized lists with threadsafe objects and fixing … (adam-miller, Mar 10, 2021)
54abea3  Merge branch 'adds-trough-local-dedup-write-cache' into adds-trough-d… (adam-miller, Mar 10, 2021)
466c679  Merge branch 'fixes-trough-crawl-log-feed-synchronization' into adds-… (adam-miller, Mar 10, 2021)
d721756  Ensure that the final batch is posted during shutdown. Make posting o… (adam-miller, Apr 5, 2021)
0b09340  Fix inconsistency with cached batch sizes and refine logic for forcin… (adam-miller, Apr 5, 2021)
18edb3e  Refactor crawl log batch posting. Add configuration options for troug… (adam-miller, May 13, 2021)
6fad293  Ensure we set dedup schema when writing dedup shards (adam-miller, May 17, 2021)
7fc648c  Add dedup load from in-memory dedup cache db (adam-miller, May 20, 2021)
58bf0cf  Refactor Trough client URL cache put. Limit TroughContentDigestHistor… (adam-miller, May 28, 2021)
contrib/pom.xml: 5 changes (5 additions, 0 deletions)
@@ -78,6 +78,11 @@
       <artifactId>gson</artifactId>
       <version>2.8.6</version>
     </dependency>
+    <dependency>
+      <groupId>org.xerial</groupId>
+      <artifactId>sqlite-jdbc</artifactId>
+      <version>3.34.0</version>
+    </dependency>
   </dependencies>
   <repositories>
     <repository>
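The sqlite-jdbc driver added above is what backs the PR's local dedup shards and in-memory dedup cache (commits 86e238e, 88b7eea, e700f38). As a rough sketch of the driver usage, and not the PR's actual schema or code, an in-memory SQLite dedup table via plain JDBC might look like this; the table and column names are hypothetical:

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class InMemoryDedupSketch {
    public static void main(String[] args) throws SQLException {
        // sqlite-jdbc registers itself with DriverManager; ":memory:" creates a
        // private in-memory database that lives as long as this connection.
        try (Connection conn = DriverManager.getConnection("jdbc:sqlite::memory:")) {
            try (Statement st = conn.createStatement()) {
                // hypothetical dedup schema -- the PR sets its real schema elsewhere
                st.executeUpdate("create table dedup ("
                        + "digest_key varchar(100) primary key, url varchar(2100), date varchar(14))");
            }
            try (PreparedStatement ins = conn.prepareStatement(
                    "insert or ignore into dedup (digest_key, url, date) values (?, ?, ?)")) {
                ins.setString(1, "sha1:EXAMPLEDIGEST");
                ins.setString(2, "http://example.com/");
                ins.setString(3, "20210226000000");
                ins.executeUpdate();
            }
            try (PreparedStatement sel = conn.prepareStatement(
                    "select url, date from dedup where digest_key = ?")) {
                sel.setString(1, "sha1:EXAMPLEDIGEST");
                try (ResultSet rs = sel.executeQuery()) {
                    if (rs.next()) {
                        System.out.println("dedup hit: " + rs.getString("url") + " " + rs.getString("date"));
                    }
                }
            }
        }
    }
}
```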
contrib/src/main/java/org/archive/modules/postprocessor/TroughCrawlLogFeed.java

@@ -18,10 +18,12 @@
  */
 package org.archive.modules.postprocessor;
 
+
 import java.net.MalformedURLException;
 import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.Date;
-import java.util.List;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
@@ -89,8 +91,12 @@ public class TroughCrawlLogFeed extends Processor implements Lifecycle {

     protected static final Logger logger = Logger.getLogger(TroughCrawlLogFeed.class.getName());
 
-    protected static final int BATCH_MAX_TIME_MS = 20 * 1000;
+    protected static final int BATCH_MAX_TIME_MS = 60 * 1000;
     protected static final int BATCH_MAX_SIZE = 400;
+    protected static final String CRAWLED_BATCH = "crawled";
+    protected static final String UNCRAWLED_BATCH = "uncrawled";
+    protected AtomicInteger crawledBatchSize = new AtomicInteger(0);
+    protected AtomicInteger uncrawledBatchSize = new AtomicInteger(0);
 
     protected KeyedProperties kp = new KeyedProperties();
     public KeyedProperties getKeyedProperties() {
@@ -119,15 +125,15 @@ public String getRethinkUrl() {

     protected TroughClient troughClient() throws MalformedURLException {
         if (troughClient == null) {
-            troughClient = new TroughClient(getRethinkUrl(), 60 * 60);
+            troughClient = new TroughClient(getRethinkUrl(), null);
             troughClient.start();
         }
         return troughClient;
     }
 
-    protected List<Object[]> crawledBatch = new ArrayList<Object[]>();
+    protected ConcurrentLinkedQueue<Object[]> crawledBatch = new ConcurrentLinkedQueue<Object[]>();
     protected long crawledBatchLastTime = System.currentTimeMillis();
-    protected List<Object[]> uncrawledBatch = new ArrayList<Object[]>();
+    protected ConcurrentLinkedQueue<Object[]> uncrawledBatch = new ConcurrentLinkedQueue<Object[]>();
     protected long uncrawledBatchLastTime = System.currentTimeMillis();
 
     protected Frontier frontier;
@@ -157,14 +163,32 @@ protected boolean shouldProcess(CrawlURI curi) {
             return false;
         }
     }
 
+    protected boolean dumpPendingAtClose = true;
+    public boolean getDumpPendingAtClose() {
+        return dumpPendingAtClose;
+    }
+    public void setDumpPendingAtClose(boolean dumpPendingAtClose) {
+        this.dumpPendingAtClose = dumpPendingAtClose;
+    }
+
+    protected boolean forceBatchPosting = false;
+    protected boolean getForceBatchPosting() {
+        return forceBatchPosting;
+    }
+    protected void setForceBatchPosting(boolean forceBatchPosting) {
+        this.forceBatchPosting = forceBatchPosting;
+    }
+
     @Override
     public synchronized void stop() {
         if (!isRunning) {
             return;
         }
         if (!crawledBatch.isEmpty()) {
-            postCrawledBatch();
+            setForceBatchPosting(true);
+            postBatch(crawledBatch, crawledBatchSize, CRAWLED_BATCH);
+            setForceBatchPosting(false);
         }
 
         if (frontier instanceof BdbFrontier) {
@@ -177,9 +201,14 @@ public void execute(Object o) {
                 }
             };
 
-            logger.info("dumping " + frontier.queuedUriCount() + " queued urls to trough feed");
-            ((BdbFrontier) frontier).forAllPendingDo(closure);
-            logger.info("dumped " + frontier.queuedUriCount() + " queued urls to trough feed");
+            if(getDumpPendingAtClose()) {
+                logger.info("dumping " + frontier.queuedUriCount() + " queued urls to trough feed");
+                setForceBatchPosting(true);
+                ((BdbFrontier) frontier).forAllPendingDo(closure);
+                postBatch(uncrawledBatch, uncrawledBatchSize, UNCRAWLED_BATCH);
+                setForceBatchPosting(false);
+                logger.info("dumped " + frontier.queuedUriCount() + " queued urls to trough feed");
+            }
         } else {
             logger.warning("frontier is not a BdbFrontier, cannot dump queued urls to trough feed");
         }
@@ -223,12 +252,10 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
                 serverCache.getHostFor(curi.getUURI()).getHostName(),
             };
 
-            synchronized (crawledBatch) {
-                crawledBatch.add(values);
-            }
+            crawledBatch.add(values);
 
-            if (crawledBatch.size() >= BATCH_MAX_SIZE || System.currentTimeMillis() - crawledBatchLastTime > BATCH_MAX_TIME_MS) {
-                postCrawledBatch();
+            if (crawledBatchSize.incrementAndGet() >= BATCH_MAX_SIZE || System.currentTimeMillis() - crawledBatchLastTime > BATCH_MAX_TIME_MS) {
+                postBatch(crawledBatch, crawledBatchSize, CRAWLED_BATCH);
             }
         } else {
             Object[] values = new Object[] {
@@ -241,72 +268,78 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
                 serverCache.getHostFor(curi.getUURI()).getHostName(),
             };
 
-            synchronized (uncrawledBatch) {
-                uncrawledBatch.add(values);
-            }
-
-            if (uncrawledBatch.size() >= BATCH_MAX_SIZE || System.currentTimeMillis() - uncrawledBatchLastTime > BATCH_MAX_TIME_MS) {
-                postUncrawledBatch();
+            uncrawledBatch.add(values);
+            if (uncrawledBatchSize.incrementAndGet() >= BATCH_MAX_SIZE || System.currentTimeMillis() - uncrawledBatchLastTime > BATCH_MAX_TIME_MS) {
+                postBatch(uncrawledBatch, uncrawledBatchSize, UNCRAWLED_BATCH);
             }
         }
     }
 
-    protected void postCrawledBatch() {
-        logger.info("posting batch of " + crawledBatch.size() + " crawled urls trough segment " + getSegmentId());
-        synchronized (crawledBatch) {
-            if (!crawledBatch.isEmpty()) {
-                StringBuffer sqlTmpl = new StringBuffer();
-                sqlTmpl.append("insert into crawled_url ("
-                        + "timestamp, status_code, size, payload_size, url, hop_path, is_seed_redirect, "
-                        + "via, mimetype, content_digest, seed, is_duplicate, warc_filename, "
-                        + "warc_offset, warc_content_bytes, host) values "
-                        + "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)");
-                for (int i = 1; i < crawledBatch.size(); i++) {
-                    sqlTmpl.append(", (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)");
+    protected void postBatch(ConcurrentLinkedQueue<Object[]> batch, AtomicInteger batchSize, String batchType) {
+        ArrayList<Object[]> crawlLogLines = new ArrayList<Object[]>();
+        synchronized (batch) {
+            //read and remove log lines from batch into local variable so we can exit the synchronized block asap
+            if (batchSize.get() >= BATCH_MAX_SIZE || getForceBatchPosting()) {
+                while(!batch.isEmpty()) {
+                    Object[] crawlLine = batch.poll();
+                    if(crawlLine != null)
+                        crawlLogLines.add(crawlLine);
+                    else
+                        break;
                 }
 
-                Object[] flattenedValues = new Object[16 * crawledBatch.size()];
-                for (int i = 0; i < crawledBatch.size(); i++) {
-                    System.arraycopy(crawledBatch.get(i), 0, flattenedValues, 16 * i, 16);
-                }
-
-                try {
-                    troughClient().write(getSegmentId(), sqlTmpl.toString(), flattenedValues);
-                } catch (Exception e) {
-                    logger.log(Level.WARNING, "problem posting batch of " + crawledBatch.size() + " crawled urls to trough segment " + getSegmentId(), e);
-                }
-
-                crawledBatchLastTime = System.currentTimeMillis();
-                crawledBatch.clear();
+                batchSize.getAndSet(batch.size()); //size() is O(n). Use sparingly. Should be near zero right now.
+                logger.info("posting batch of " + crawlLogLines.size() + " " + batchType + " urls to trough segment " + getSegmentId());
             }
         }
-    }
-    protected void postUncrawledBatch() {
-        logger.info("posting batch of " + uncrawledBatch.size() + " uncrawled urls trough segment " + getSegmentId());
-        synchronized (uncrawledBatch) {
-            if (!uncrawledBatch.isEmpty()) {
-                StringBuffer sqlTmpl = new StringBuffer();
-                sqlTmpl.append(
-                        "insert into uncrawled_url (timestamp, url, hop_path, status_code, via, seed, host)"
-                        + " values (%s, %s, %s, %s, %s, %s, %s)");
-
-                for (int i = 1; i < uncrawledBatch.size(); i++) {
-                    sqlTmpl.append(", (%s, %s, %s, %s, %s, %s, %s)");
-                }
+        if( crawlLogLines != null && crawlLogLines.size() > 0){
+            StringBuffer sqlTmpl = new StringBuffer();
+            int numCols=0;
+            switch(batchType){
+                case CRAWLED_BATCH:
+                    sqlTmpl.append("insert into crawled_url ("
+                            + "timestamp, status_code, size, payload_size, url, hop_path, is_seed_redirect, "
+                            + "via, mimetype, content_digest, seed, is_duplicate, warc_filename, "
+                            + "warc_offset, warc_content_bytes, host) values "
+                            + "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)");
+                    for (int i = 1; i < crawlLogLines.size(); i++) {
+                        sqlTmpl.append(", (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)");
+                    }
+                    numCols=16;
+                    break;
+                case UNCRAWLED_BATCH:
+                    sqlTmpl.append(
+                            "insert into uncrawled_url (timestamp, url, hop_path, status_code, via, seed, host)"
+                            + " values (%s, %s, %s, %s, %s, %s, %s)");
+
+                    for (int i = 1; i < crawlLogLines.size(); i++) {
+                        sqlTmpl.append(", (%s, %s, %s, %s, %s, %s, %s)");
+                    }
+                    numCols=7;
+                    break;
+            }
 
-            Object[] flattenedValues = new Object[7 * uncrawledBatch.size()];
-            for (int i = 0; i < uncrawledBatch.size(); i++) {
-                System.arraycopy(uncrawledBatch.get(i), 0, flattenedValues, 7 * i, 7);
-            }
+            /*
+            Trough takes a single object array for the values we insert, so an array of size (N columns * X rows).
+            We read each row in column by column, then repeat for each row.
+            */
+            Object[] flattenedValues = new Object[numCols * crawlLogLines.size()];
+            for (int i = 0; i < crawlLogLines.size(); i++) {
+                System.arraycopy(crawlLogLines.get(i),0,flattenedValues, numCols * i,numCols);
+            }
 
-            try {
-                troughClient().write(getSegmentId(), sqlTmpl.toString(), flattenedValues);
-            } catch (Exception e) {
-                logger.log(Level.WARNING, "problem posting batch of " + uncrawledBatch.size() + " uncrawled urls to trough segment " + getSegmentId(), e);
-            }
+            try {
+                troughClient().write(getSegmentId(), sqlTmpl.toString(), flattenedValues);
+            } catch (Exception e) {
+                logger.log(Level.WARNING, "problem posting batch of " + crawlLogLines.size() + " " + batchType + " urls to trough segment " + getSegmentId(), e);
+            }
 
-            uncrawledBatchLastTime = System.currentTimeMillis();
-            uncrawledBatch.clear();
-        }
+            switch(batchType) {
+                case CRAWLED_BATCH:
+                    crawledBatchLastTime = System.currentTimeMillis();
+                    break;
+                case UNCRAWLED_BATCH:
+                    uncrawledBatchLastTime = System.currentTimeMillis();
+                    break;
+            }
         }
     }
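Taken together, the refactor replaces the two lock-heavy postCrawledBatch()/postUncrawledBatch() methods with one postBatch() that drains a ConcurrentLinkedQueue inside a short synchronized block and performs the slow Trough write outside it; an AtomicInteger tracks the batch size because ConcurrentLinkedQueue.size() is O(n). A condensed, self-contained sketch of that drain-then-post pattern, with an illustrative class name and a stubbed write():

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicInteger;

public class BatchFeedSketch {
    static final int BATCH_MAX_SIZE = 400;

    private final ConcurrentLinkedQueue<Object[]> batch = new ConcurrentLinkedQueue<>();
    private final AtomicInteger batchSize = new AtomicInteger(0);

    /** Called concurrently by many producer threads; never blocks behind a slow post. */
    public void add(Object[] row) {
        batch.add(row);
        if (batchSize.incrementAndGet() >= BATCH_MAX_SIZE) {
            post(false);
        }
    }

    /** Drain quickly under the lock, then do the expensive write outside it. */
    public void post(boolean force) {
        List<Object[]> rows = new ArrayList<>();
        synchronized (batch) {
            if (batchSize.get() >= BATCH_MAX_SIZE || force) {
                Object[] row;
                while ((row = batch.poll()) != null) {
                    rows.add(row);
                }
                // size() walks the queue (O(n)); it is near zero right after draining
                batchSize.set(batch.size());
            }
        }
        if (!rows.isEmpty()) {
            write(rows); // network write happens with no lock held
        }
    }

    private void write(List<Object[]> rows) {
        System.out.println("posting batch of " + rows.size() + " rows");
    }
}
```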