diff --git a/distribution/pom.xml b/distribution/pom.xml index 0997caa..383ff4f 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -19,8 +19,9 @@ + org.apache.maven.plugins maven-assembly-plugin - 3.1.1 + 3.6.0 distro-assembly @@ -52,13 +53,6 @@ 1.2.3 - - uk.ac.gate - gcp-cli - 3.3-SNAPSHOT - - provided - diff --git a/distribution/src/assembly/distro.xml b/distribution/src/assembly/distro.xml index 1743480..b7d21e2 100644 --- a/distribution/src/assembly/distro.xml +++ b/distribution/src/assembly/distro.xml @@ -1,6 +1,6 @@ - + xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.2.0 https://maven.apache.org/xsd/assembly-2.2.0.xsd"> distro zip @@ -36,13 +36,32 @@ false false - - provided - - false - false - ${artifact.artifactId}.${artifact.extension} - + + + true + + uk.ac.gate:gcp-cli + + + + false + false + ${artifact.artifactId}.${artifact.extension} + + + + true + + uk.ac.gate:gcp-plugin-* + + + distro + false + true + plugins + + + diff --git a/impl/pom.xml b/impl/pom.xml index 8acdd02..9a797a0 100644 --- a/impl/pom.xml +++ b/impl/pom.xml @@ -34,26 +34,6 @@ compile - - commons-httpclient - commons-httpclient - 3.0.1 - compile - - - - - org.netpreserve.commons - webarchive-commons - 1.1.9 - compile - - - * - * - - - - - uk.ac.gate.mimir - mimir-connector - 6.2 - compile - - - - - commons-io - commons-io - 2.7 - compile - - commons-cli diff --git a/impl/src/main/java/gate/cloud/batch/BatchRunner.java b/impl/src/main/java/gate/cloud/batch/BatchRunner.java index df72fc2..56d2acb 100644 --- a/impl/src/main/java/gate/cloud/batch/BatchRunner.java +++ b/impl/src/main/java/gate/cloud/batch/BatchRunner.java @@ -717,6 +717,17 @@ public void uncaughtException(Thread t, Throwable e) { Gate.init(); + // load built-in plugins + File builtInPluginsDir = new File(gcpHome, "plugins"); + if(builtInPluginsDir.isDirectory()) { + File[] plugins = builtInPluginsDir.listFiles(File::isDirectory); + if(plugins != null) { + for(File pluginFile : plugins) { + Gate.getCreoleRegister().registerPlugin(new Plugin.Directory(pluginFile.toURI().toURL())); + } + } + } + // load any other plugins specified on the command line String[] pluginsToLoad = line.getOptionValues('p'); if(pluginsToLoad != null) { diff --git a/impl/src/main/java/gate/cloud/util/ByteArrayURLStreamHandler.java b/impl/src/main/java/gate/cloud/util/ByteArrayURLStreamHandler.java index e4b192d..24b4d7c 100644 --- a/impl/src/main/java/gate/cloud/util/ByteArrayURLStreamHandler.java +++ b/impl/src/main/java/gate/cloud/util/ByteArrayURLStreamHandler.java @@ -19,8 +19,6 @@ import java.util.List; import java.util.Map; -import org.apache.commons.httpclient.Header; - /** * This oddity is just a wrapper around a byte array and a URL, to * allow creation of GATE documents from a byte array with @@ -28,10 +26,36 @@ */ public class ByteArrayURLStreamHandler extends URLStreamHandler { + + public static class Header { + public Header(String name, String value) { + this.name = name; + this.value = value; + } + + private String name; + private String value; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + } private byte[] data; private Header[] headers; - + public ByteArrayURLStreamHandler(byte[] data) { this(data, null); } @@ -78,7 +102,7 @@ public Map> getHeaderFields() { } else if(values.size() == 1) { values = new ArrayList(values); values.add(h.getValue()); - fields.put(h.getName(), values); + fields.put(h.getName(), values); } else { values.add(h.getValue()); } diff --git a/impl/src/main/java/gate/cloud/util/Scratch.java b/impl/src/main/java/gate/cloud/util/Scratch.java deleted file mode 100644 index a549127..0000000 --- a/impl/src/main/java/gate/cloud/util/Scratch.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Scratch.java - * Copyright (c) 2007-2011, The University of Sheffield. - * - * This file is part of GCP (see http://gate.ac.uk/), and is free - * software, licenced under the GNU Affero General Public License, - * Version 3, November 2007. - * - * - * $Id: Scratch.java 17349 2014-02-19 18:02:24Z ian_roberts $ - */ -package gate.cloud.util; - -import static gate.cloud.io.IOConstants.PARAM_ARC_FILE_LOCATION; -import static gate.cloud.io.IOConstants.PARAM_MIME_TYPES; -import gate.cloud.io.arc.ARCDocumentEnumerator; -import gate.cloud.io.arc.ArchiveDocumentEnumerator; - -import java.io.File; -import java.io.FilenameFilter; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Map; - -/** - * Class for trying out various things. NOT part of the GCP distro. - */ -public class Scratch { - - public static void main(String args[]) { - try { -// enumerateArcs(args); - enumerateDocsInArc(args); - } catch (Throwable t) { - t.printStackTrace(); - } - } - - /** - * Test code to enumerate all the arc files inside a directory and print out - * the number of documents they would match. - * - * @param args - * args[0] should be the directory containing the ARC files. - * @throws Exception - */ - private static void enumerateArcs(String[] args) throws Exception { - Map configData = new HashMap(); - configData.put(PARAM_MIME_TYPES, "text/html application/pdf"); - - File topDir = new File(args[0]); - File arcs[] = topDir.listFiles(new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.endsWith(".arc.gz") || name.endsWith(".arc"); - } - }); - Arrays.sort(arcs, new Comparator() { - public int compare(File o1, File o2) { - return o1.getAbsolutePath().compareTo(o2.getAbsolutePath()); - } - }); - int arcCnt = 0; - long totalDocs = 0; - System.out.println("\"Archive\", \"Size (MBs)\", Documents, Total\n"); - for (File arcFile : arcs) { - try { - configData.put(PARAM_ARC_FILE_LOCATION, - arcFile.getAbsolutePath()); - ArchiveDocumentEnumerator enumerator = new ARCDocumentEnumerator(); - enumerator.config(configData); - enumerator.init(); - int docs = 0; - while (enumerator.hasNext()) { - enumerator.next(); - docs++; - } - totalDocs += docs; - System.out.printf("\"%s\", %.4f, %d, %d\n", - // name - arcFile.getName(), - // size (MBs) - ((double) arcFile.length() / 1048576), - // included docs - docs, - // total docs so far - totalDocs); - arcCnt++; - } catch (Exception e) { - // report and ignore - e.printStackTrace(); - } - } - } - - /** - * Test code to enumerate all the arc files inside a directory and print out - * the number of documents they would match. - * - * @param args - * args[0] should be the directory containing the ARC files. - * @throws Exception - */ - private static void enumerateDocsInArc(String[] args) throws Exception { - Map configData = new HashMap(); - configData.put(PARAM_MIME_TYPES, "text/html application/pdf"); - - File arcFile = new File(args[0]); - try { - configData.put(PARAM_ARC_FILE_LOCATION, arcFile.getAbsolutePath()); - ArchiveDocumentEnumerator enumerator = new ARCDocumentEnumerator(); - enumerator.config(configData); - enumerator.init(); - int docs = 0; - while (enumerator.hasNext()) { - System.out.println(enumerator.next()); - docs++; - if(docs > 200) return; - } - } catch (Exception e) { - // report and ignore - e.printStackTrace(); - } - } - -} diff --git a/plugins/arc-input/pom.xml b/plugins/arc-input/pom.xml new file mode 100644 index 0000000..ab3cc3f --- /dev/null +++ b/plugins/arc-input/pom.xml @@ -0,0 +1,117 @@ + + + + 4.0.0 + uk.ac.gate + gcp-plugin-arc-input + 3.3-SNAPSHOT + jar + http://gate.ac.uk/gcp + + GATE Cloud Paralleliser (ARC/WARC input plugin) + + + GCP is a tool for running saved GATE applications in multiple parallel + threads on a single machine. + + + + + + + org.apache.maven.plugins + maven-deploy-plugin + 2.7 + + true + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + lib/ + + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.6.0 + + + distro-assembly + package + + single + + + + src/assembly/distro.xml + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 8 + 8 + + + + + + + + uk.ac.gate + gcp-impl + 3.3-SNAPSHOT + provided + + + + + org.netpreserve.commons + webarchive-commons + 1.1.9 + compile + + + * + * + + + + + + + commons-io + commons-io + 2.7 + compile + + + + commons-httpclient + commons-httpclient + 3.0.1 + compile + + + + + diff --git a/plugins/arc-input/src/assembly/creole.xml b/plugins/arc-input/src/assembly/creole.xml new file mode 100644 index 0000000..d0a9c39 --- /dev/null +++ b/plugins/arc-input/src/assembly/creole.xml @@ -0,0 +1,3 @@ + + gcp-plugin-arc-input.jar + \ No newline at end of file diff --git a/plugins/arc-input/src/assembly/distro.xml b/plugins/arc-input/src/assembly/distro.xml new file mode 100644 index 0000000..6c40b44 --- /dev/null +++ b/plugins/arc-input/src/assembly/distro.xml @@ -0,0 +1,36 @@ + + distro + arc-input + + zip + + + + + src/assembly + + + creole.xml + + + + + + + runtime + /lib + false + + + true + + ${project.groupId}:${project.artifactId} + + + ${artifact.artifactId}.${artifact.extension} + + + + diff --git a/impl/src/main/java/gate/cloud/io/arc/ARCDocumentEnumerator.java b/plugins/arc-input/src/main/java/gate/cloud/io/arc/ARCDocumentEnumerator.java similarity index 100% rename from impl/src/main/java/gate/cloud/io/arc/ARCDocumentEnumerator.java rename to plugins/arc-input/src/main/java/gate/cloud/io/arc/ARCDocumentEnumerator.java diff --git a/impl/src/main/java/gate/cloud/io/arc/ARCDocumentNamingStrategy.java b/plugins/arc-input/src/main/java/gate/cloud/io/arc/ARCDocumentNamingStrategy.java similarity index 100% rename from impl/src/main/java/gate/cloud/io/arc/ARCDocumentNamingStrategy.java rename to plugins/arc-input/src/main/java/gate/cloud/io/arc/ARCDocumentNamingStrategy.java diff --git a/impl/src/main/java/gate/cloud/io/arc/ARCInputHandler.java b/plugins/arc-input/src/main/java/gate/cloud/io/arc/ARCInputHandler.java similarity index 99% rename from impl/src/main/java/gate/cloud/io/arc/ARCInputHandler.java rename to plugins/arc-input/src/main/java/gate/cloud/io/arc/ARCInputHandler.java index 0672f41..29142bf 100644 --- a/impl/src/main/java/gate/cloud/io/arc/ARCInputHandler.java +++ b/plugins/arc-input/src/main/java/gate/cloud/io/arc/ARCInputHandler.java @@ -13,6 +13,7 @@ import static gate.cloud.io.IOConstants.PARAM_ARC_FILE_LOCATION; import static gate.cloud.io.IOConstants.PARAM_SOURCE_FILE_LOCATION; + import gate.util.GateException; import java.io.ByteArrayInputStream; diff --git a/impl/src/main/java/gate/cloud/io/arc/ArchiveDocumentEnumerator.java b/plugins/arc-input/src/main/java/gate/cloud/io/arc/ArchiveDocumentEnumerator.java similarity index 100% rename from impl/src/main/java/gate/cloud/io/arc/ArchiveDocumentEnumerator.java rename to plugins/arc-input/src/main/java/gate/cloud/io/arc/ArchiveDocumentEnumerator.java diff --git a/impl/src/main/java/gate/cloud/io/arc/ArchiveInputHandler.java b/plugins/arc-input/src/main/java/gate/cloud/io/arc/ArchiveInputHandler.java similarity index 97% rename from impl/src/main/java/gate/cloud/io/arc/ArchiveInputHandler.java rename to plugins/arc-input/src/main/java/gate/cloud/io/arc/ArchiveInputHandler.java index c8a6245..3c42aea 100644 --- a/impl/src/main/java/gate/cloud/io/arc/ArchiveInputHandler.java +++ b/plugins/arc-input/src/main/java/gate/cloud/io/arc/ArchiveInputHandler.java @@ -299,8 +299,14 @@ record = getRecord(reader, offset); chunkIn.close(); content = baos.toByteArray(); } - - URL docUrl = new URL(null, header.getUrl(), new ByteArrayURLStreamHandler(content, httpHeaders)); + ByteArrayURLStreamHandler.Header[] handlerHeaders = null; + if(httpHeaders != null) { + handlerHeaders = new ByteArrayURLStreamHandler.Header[httpHeaders.length]; + for(int i = 0; i < httpHeaders.length; i++) { + handlerHeaders[i] = new ByteArrayURLStreamHandler.Header(httpHeaders[i].getName(), httpHeaders[i].getValue()); + } + } + URL docUrl = new URL(null, header.getUrl(), new ByteArrayURLStreamHandler(content, handlerHeaders)); FeatureMap docParams = Factory.newFeatureMap(); docParams.put(Document.DOCUMENT_URL_PARAMETER_NAME, docUrl); diff --git a/impl/src/main/java/gate/cloud/io/arc/WARCDocumentEnumerator.java b/plugins/arc-input/src/main/java/gate/cloud/io/arc/WARCDocumentEnumerator.java similarity index 100% rename from impl/src/main/java/gate/cloud/io/arc/WARCDocumentEnumerator.java rename to plugins/arc-input/src/main/java/gate/cloud/io/arc/WARCDocumentEnumerator.java diff --git a/impl/src/main/java/gate/cloud/io/arc/WARCInputHandler.java b/plugins/arc-input/src/main/java/gate/cloud/io/arc/WARCInputHandler.java similarity index 100% rename from impl/src/main/java/gate/cloud/io/arc/WARCInputHandler.java rename to plugins/arc-input/src/main/java/gate/cloud/io/arc/WARCInputHandler.java diff --git a/plugins/mimir-output/pom.xml b/plugins/mimir-output/pom.xml new file mode 100644 index 0000000..b260628 --- /dev/null +++ b/plugins/mimir-output/pom.xml @@ -0,0 +1,83 @@ + + + + 4.0.0 + uk.ac.gate + gcp-plugin-mimir-output + 3.3-SNAPSHOT + jar + http://gate.ac.uk/gcp + + GATE Cloud Paralleliser (Mimir output plugin) + + + GCP is a tool for running saved GATE applications in multiple parallel + threads on a single machine. + + + + + + + org.apache.maven.plugins + maven-deploy-plugin + 2.7 + + true + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + lib/ + + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.6.0 + + + distro-assembly + package + + single + + + + src/assembly/distro.xml + + + + + + + + + + + uk.ac.gate + gcp-api + 3.3-SNAPSHOT + provided + + + + + uk.ac.gate.mimir + mimir-connector + 6.2 + compile + + + + diff --git a/plugins/mimir-output/src/assembly/creole.xml b/plugins/mimir-output/src/assembly/creole.xml new file mode 100644 index 0000000..f82ddde --- /dev/null +++ b/plugins/mimir-output/src/assembly/creole.xml @@ -0,0 +1,3 @@ + + gcp-plugin-mimir-output.jar + \ No newline at end of file diff --git a/plugins/mimir-output/src/assembly/distro.xml b/plugins/mimir-output/src/assembly/distro.xml new file mode 100644 index 0000000..8b0eda6 --- /dev/null +++ b/plugins/mimir-output/src/assembly/distro.xml @@ -0,0 +1,36 @@ + + distro + mimir-output + + zip + + + + + src/assembly + + + creole.xml + + + + + + + runtime + /lib + false + + + true + + ${project.groupId}:${project.artifactId} + + + ${artifact.artifactId}.${artifact.extension} + + + + diff --git a/impl/src/main/java/gate/cloud/io/mimir/MimirOutputHandler.java b/plugins/mimir-output/src/main/java/gate/cloud/io/mimir/MimirOutputHandler.java similarity index 100% rename from impl/src/main/java/gate/cloud/io/mimir/MimirOutputHandler.java rename to plugins/mimir-output/src/main/java/gate/cloud/io/mimir/MimirOutputHandler.java diff --git a/pom.xml b/pom.xml index 4f6e24f..1be0d4f 100644 --- a/pom.xml +++ b/pom.xml @@ -60,6 +60,8 @@ api impl + plugins/arc-input + plugins/mimir-output cli distribution