Skip to content

Commit

Permalink
Pipeline continues after OutOfMemory-Exception
Browse files Browse the repository at this point in the history
  • Loading branch information
nreimers committed Oct 30, 2015
1 parent 739cbe7 commit 8f5d831
Show file tree
Hide file tree
Showing 4 changed files with 245 additions and 139 deletions.
110 changes: 110 additions & 0 deletions code/src/main/java/de/tudarmstadt/ukp/dariah/IO/GlobalFileStorage.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package de.tudarmstadt.ukp.dariah.IO;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.LinkedList;

import org.apache.tools.ant.DirectoryScanner;

/**
 * A global (process-wide) storage for the files that should be
 * processed in UIMA.
 *
 * <p>Files are kept in a {@link LinkedList}. {@link #push(File)} inserts at the
 * head and {@link #poll()} removes from the head, so the file pushed last is
 * returned first. The class performs no synchronization of its own.
 *
 * @author Nils Reimers
 */
public class GlobalFileStorage {

    /** The single shared instance; created eagerly during class initialization. */
    private static final GlobalFileStorage instance = new GlobalFileStorage();

    /** The files that still have to be processed. */
    private final LinkedList<File> files = new LinkedList<>();

    /** The result of the most recent {@link #poll()} call, or {@code null}. */
    private File lastPolledFile = null;

    private GlobalFileStorage() {}

    /**
     * Returns the shared storage instance.
     *
     * @return the one and only {@code GlobalFileStorage}
     */
    public static GlobalFileStorage getInstance() {
        return GlobalFileStorage.instance;
    }

    /**
     * Resolves {@code sourceLocation} to a set of files and pushes each of them
     * onto this storage.
     *
     * <ul>
     * <li>If the location contains a {@code '*'}, everything up to the last path
     * separator before the first asterisk is used as base directory and the rest
     * as a case-insensitive include pattern for a {@link DirectoryScanner}.</li>
     * <li>If the location is a regular file, that file is added.</li>
     * <li>If the location is a directory, every regular file directly inside it
     * whose path ends with {@code fileExtentsion} is added.</li>
     * </ul>
     *
     * @param sourceLocation a file, a directory, or a path containing wildcards
     * @param fileExtentsion the suffix a file inside a directory must have to be
     *        added; only used when {@code sourceLocation} is a directory
     * @throws FileNotFoundException if the location is neither a file nor a
     *         directory, or if the directory content cannot be listed
     **/
    public void readFilePaths(String sourceLocation, String fileExtentsion) throws FileNotFoundException {
        if (sourceLocation.contains("*")) {
            readWildcardLocation(sourceLocation);
        } else {
            readPlainLocation(sourceLocation, fileExtentsion);
        }
    }

    /** Pushes all files matching a location that contains at least one asterisk. */
    private void readWildcardLocation(String sourceLocation) {
        int asterisk = sourceLocation.indexOf('*');

        // Accept both the platform separator and '/' in front of the pattern.
        int separator = Math.max(
                sourceLocation.lastIndexOf(File.separatorChar, asterisk),
                sourceLocation.lastIndexOf('/', asterisk));

        String sourcePath = (separator >= 0) ? sourceLocation.substring(0, separator + 1) : ".";
        String pattern = sourceLocation.substring(separator + 1);

        DirectoryScanner scanner = new DirectoryScanner();
        scanner.setIncludes(new String[]{pattern});
        scanner.setBasedir(sourcePath);
        scanner.setCaseSensitive(false);
        scanner.scan();

        for (String file : scanner.getIncludedFiles()) {
            // Resolve against the base directory with the two-argument constructor:
            // plain string concatenation yields ".name" when the base is ".".
            this.push(new File(sourcePath, file));
        }
    }

    /** Pushes a single file, or the matching files directly inside a directory. */
    private void readPlainLocation(String sourceLocation, String fileExtentsion) throws FileNotFoundException {
        File inputPath = new File(sourceLocation);

        if (inputPath.isFile()) {
            this.push(inputPath);
        } else if (inputPath.isDirectory()) {
            File[] children = inputPath.listFiles();
            if (children == null) {
                // listFiles() returns null when the directory cannot be read.
                throw new FileNotFoundException("Content of directory "+sourceLocation+" could not be listed");
            }
            for (File file : children) {
                if (file.isFile() && (file.toString().endsWith(fileExtentsion))) {
                    this.push(file);
                }
            }
        } else {
            throw new FileNotFoundException("Path "+sourceLocation+" does not point to a valid file or directory");
        }
    }

    /**
     * @return {@code true} if no file is left in the storage
     */
    public boolean isEmpty() {
        return files.isEmpty();
    }

    /**
     * Retrieves and removes the head (first element) of this list and remembers
     * it as the last polled file.
     *
     * @return the removed file, or {@code null} if the storage is empty
     */
    public File poll() {
        this.lastPolledFile = files.poll();
        return this.lastPolledFile;
    }

    /**
     * Inserts a file at the head of the list, i.e. it is the next one
     * returned by {@link #poll()}.
     *
     * @param e the file to add
     */
    public void push(File e) {
        files.push(e);
    }

    /**
     * @return the number of files currently stored
     */
    public int size() {
        return files.size();
    }

    /**
     * @return the file returned by the most recent {@link #poll()} call;
     *         {@code null} if nothing was polled yet or the storage was empty
     *         at that time
     */
    public File getLastPolledFile() {
        return this.lastPolledFile;
    }

}
Original file line number Diff line number Diff line change
@@ -1,19 +1,98 @@
package de.tudarmstadt.ukp.dariah.IO;

import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.IOUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.CasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.Progress;

import com.ibm.icu.text.CharsetDetector;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;

/**
* Outputs which file is currently read
* @author reimers
*
*/
public class TextReaderWithInfo extends TextReader {
public class TextReaderWithInfo extends CasCollectionReader_ImplBase {

/**
* Set this as the language of the produced documents.
*/
public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
@ConfigurationParameter(name=PARAM_LANGUAGE, mandatory=false)
private String language;

/**
* Name of configuration parameter that contains the character encoding used by the input files.
*/
public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
@ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
private String encoding;

@Override
protected Resource nextFile() {
Resource next = super.nextFile();
System.out.println("Process file: "+next.toString());
return next;
/**
 * Polls the next file from the {@link GlobalFileStorage}, reads its content
 * with the configured encoding into the CAS and fills the document metadata
 * (title, id, base URI and URI) from the file's name and absolute path.
 *
 * @param aCAS the CAS to populate
 * @throws CollectionException if the JCas cannot be obtained, if no file is
 *         left in the storage, or if anything fails while reading the file
 */
public void getNext(CAS aCAS) throws IOException, CollectionException {
    JCas jcas;
    try {
        jcas = aCAS.getJCas();
    }
    catch (CASException e) {
        throw new CollectionException(e);
    }

    try {
        File file = GlobalFileStorage.getInstance().poll();
        if (file == null) {
            // poll() yields null on an empty storage; fail with a readable cause
            // instead of a bare NullPointerException further down.
            throw new IllegalStateException("getNext() was called but the GlobalFileStorage is empty");
        }

        System.out.println("Process file: "+file.getName());

        InputStream is = null;
        try {
            is = new BufferedInputStream(new FileInputStream(file));
            aCAS.setDocumentText(IOUtils.toString(is, encoding));
        }
        finally {
            closeQuietly(is);
        }

        jcas.setDocumentLanguage(language);

        // Resolve to an absolute file first: getParentFile() is null for a
        // relative path without directory part such as "a.txt".
        File absoluteFile = file.getAbsoluteFile();

        DocumentMetaData docMetaData = DocumentMetaData.create(aCAS);
        docMetaData.setDocumentTitle(file.getName());
        docMetaData.setDocumentId(absoluteFile.getPath());
        docMetaData.setDocumentBaseUri("file:"+absoluteFile.getParentFile().getAbsolutePath());
        docMetaData.setDocumentUri("file:"+absoluteFile.getPath());
    } catch(Exception e) {
        // Everything, including I/O errors, is reported as CollectionException.
        throw new CollectionException(e);
    }

}

/**
 * Progress is not tracked by this reader.
 *
 * @return an empty array (rather than {@code null}) so that callers can
 *         iterate over the result without a null check
 */
@Override
public Progress[] getProgress() {
    return new Progress[0];
}

/**
 * Tells whether another file is waiting in the {@link GlobalFileStorage}.
 *
 * @return {@code true} as long as the global storage is not empty
 */
@Override
public boolean hasNext()
        throws IOException, CollectionException {
    final GlobalFileStorage storage = GlobalFileStorage.getInstance();
    final boolean nothingLeft = storage.isEmpty();
    return !nothingLeft;
}



}
127 changes: 14 additions & 113 deletions code/src/main/java/de/tudarmstadt/ukp/dariah/IO/XmlReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,112 +49,28 @@





/**
* Reads in xml files. The xpath to each element is stored in a special annotation.
* @author Nils Reimers
*
*/
@TypeCapability(
outputs={
"de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field",
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"})

public class XmlReader extends CasCollectionReader_ImplBase {

/**
* Location from which the input is read.
*/
public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION;
@ConfigurationParameter(name=PARAM_SOURCE_LOCATION, mandatory=true)
private String inputPath;


/**
* Set this as the language of the produced documents.
*/
public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
@ConfigurationParameter(name=PARAM_LANGUAGE, mandatory=false)
private String language;

/**
* Optional: the tags that should be processed (if empty, all tags
* except those listed in ExcludeTag are processed).
*
* @NOTE: Currently not implemented
*/
// public static final String PARAM_INCLUDE_TAG = "IncludeTag";
// @ConfigurationParameter(name=PARAM_INCLUDE_TAG, mandatory=true, defaultValue={})
// private Set<String> includeTags;

/**
* Optional: the tags that should not be processed. No text is extracted
* from them and no annotations are produced for them.
*
* @NOTE: Currently not implemented
*/
// public static final String PARAM_EXCLUDE_TAG = "ExcludeTag";
// @ConfigurationParameter(name=PARAM_EXCLUDE_TAG, mandatory=true, defaultValue={})
// private Set<String> excludeTags;

/**
* tag which contains the docId
*
* @NOTE: Currently not implemented
*/
// public static final String PARAM_DOC_ID_TAG = "DocIdTag";
// @ConfigurationParameter(name=PARAM_DOC_ID_TAG, mandatory=false)
// private String docIdTag;

/**
* The collection ID to set in the {@link DocumentMetaData}.
*
* @NOTE: Currently not implemented
*/
// public static final String PARAM_COLLECTION_ID = "collectionId";
// @ConfigurationParameter(name=PARAM_COLLECTION_ID, mandatory=false)
// private String collectionId;



// mandatory, list of xml files to be read in
private final ArrayList<File> xmlFiles = new ArrayList<File>();

// index of the file that is currently being parsed
private int currentParsedFile;



/**
 * Collects the files to read from {@code inputPath}: either the single file
 * it points to, or all regular files directly inside the directory (no
 * subdirectories) whose path ends with {@code .xml} or {@code .sgml}, in
 * sorted order. Resets the index of the current file to 0.
 *
 * @param aContext the UIMA context passed on to the super class
 * @throws ResourceInitializationException if {@code inputPath} is neither a
 *         file nor a directory, or if the directory cannot be listed
 */
@Override
public void initialize(UimaContext aContext)
        throws ResourceInitializationException
{
    super.initialize(aContext);

    // mandatory, file or directory containing the XML files to parse
    File inPath = new File(inputPath);

    if (inPath.isDirectory()) {
        // get all xml files from the input directory (ignore the
        // subdirectories)
        File[] files = inPath.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be read.
            throw new ResourceInitializationException(
                    new IllegalStateException("Content of directory "+inputPath+" could not be listed"));
        }
        for (File file : files) {
            String path = file.toString();
            if (file.isFile() && (path.endsWith(".xml") || path.endsWith(".sgml"))) {
                xmlFiles.add(file);
            }
        }
        Collections.sort(xmlFiles);
    } else if (inPath.isFile()) {
        // isFile() already implies that the path exists
        xmlFiles.add(inPath);
    }
    else {
        throw new ResourceInitializationException("Invalid path", new Object[] {inputPath});
    }


    currentParsedFile = 0;

    // if (docIdTag != null && docIdTag.contains("/@")) {
    // int split = docIdTag.indexOf("/@");
    // docIdElementLocalName = docIdTag.substring(0, split);
    // docIdAttributeName = docIdTag.substring(split+2);
    // }
    // else {
    // docIdElementLocalName = docIdTag;
    // }
}

@Override
public void getNext(CAS aCAS)
Expand All @@ -170,7 +86,7 @@ public void getNext(CAS aCAS)

try {
// parse the xml file
File xmlFile = xmlFiles.get(currentParsedFile);
File xmlFile = GlobalFileStorage.getInstance().poll();

System.out.println("Process file: "+xmlFile.getName());

Expand Down Expand Up @@ -206,10 +122,9 @@ public void getNext(CAS aCAS)
docMetaData.setDocumentId(xmlFile.getAbsolutePath());
docMetaData.setDocumentBaseUri("file:"+xmlFile.getParentFile().getAbsolutePath());
docMetaData.setDocumentUri("file:"+xmlFile.getAbsolutePath());

currentParsedFile++;

} catch (Exception e) {
e.printStackTrace();
//e.printStackTrace();
throw new CollectionException(e);
}

Expand All @@ -218,28 +133,14 @@ public void getNext(CAS aCAS)
@Override
public Progress[] getProgress()
{
return new Progress[] { new ProgressImpl(currentParsedFile, xmlFiles
.size(), Progress.ENTITIES) };
return null;
}

@Override
public boolean hasNext()
throws IOException, CollectionException
{

if (currentParsedFile >= 0 && currentParsedFile < xmlFiles.size()) {
// There are additional files to parse
return true;
}
throws IOException, CollectionException {

return false;
}

@Override
public void close()
throws IOException
{
// Nothing to do
return !GlobalFileStorage.getInstance().isEmpty();
}


Expand Down
Loading

0 comments on commit 8f5d831

Please sign in to comment.