-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Pipeline continues after OutOfMemory-Exception
- Loading branch information
Showing
4 changed files
with
245 additions
and
139 deletions.
There are no files selected for viewing
110 changes: 110 additions & 0 deletions
110
code/src/main/java/de/tudarmstadt/ukp/dariah/IO/GlobalFileStorage.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
package de.tudarmstadt.ukp.dariah.IO; | ||
|
||
import java.io.File; | ||
import java.io.FileNotFoundException; | ||
import java.util.LinkedList; | ||
|
||
import org.apache.tools.ant.DirectoryScanner; | ||
|
||
/** | ||
* A global file storage for files that should be | ||
* process in UIMA | ||
* @author Nils Reimers | ||
* | ||
*/ | ||
public class GlobalFileStorage { | ||
private File lastPolledFile = null; | ||
|
||
private static GlobalFileStorage instance; | ||
|
||
private GlobalFileStorage () {} | ||
|
||
public static GlobalFileStorage getInstance () { | ||
if (GlobalFileStorage.instance == null) { | ||
GlobalFileStorage.instance = new GlobalFileStorage (); | ||
} | ||
return GlobalFileStorage.instance; | ||
} | ||
|
||
/** | ||
* Reads a path and stores internally all file path | ||
* @throws FileNotFoundException | ||
**/ | ||
public void readFilePaths(String sourceLocation, String fileExtentsion) throws FileNotFoundException { | ||
|
||
if(sourceLocation.contains("*")) { | ||
int asterisk = sourceLocation.indexOf('*'); | ||
|
||
int separator = Math.max( | ||
sourceLocation.lastIndexOf(File.separatorChar, asterisk), | ||
sourceLocation.lastIndexOf('/', asterisk)); | ||
|
||
String sourcePath; | ||
if(separator >= 0) { | ||
sourcePath = sourceLocation.substring(0, separator+1); | ||
} else { | ||
sourcePath = "."; | ||
} | ||
|
||
String pattern = sourceLocation.substring(separator+1); | ||
|
||
|
||
DirectoryScanner scanner = new DirectoryScanner(); | ||
scanner.setIncludes(new String[]{pattern}); | ||
scanner.setBasedir(sourcePath); | ||
scanner.setCaseSensitive(false); | ||
scanner.scan(); | ||
|
||
for(String file : scanner.getIncludedFiles()) { | ||
this.push(new File(sourcePath+file)); | ||
} | ||
} else { | ||
|
||
File inputPath = new File(sourceLocation); | ||
|
||
if(inputPath.isFile()) { | ||
this.push(inputPath); | ||
} else if(inputPath.isDirectory()) { | ||
|
||
File[] files = inputPath.listFiles(); | ||
for (File file : files) { | ||
if (file.isFile() && (file.toString().endsWith(fileExtentsion))) { | ||
this.push(file); | ||
} | ||
} | ||
} else { | ||
throw new FileNotFoundException("Path "+sourceLocation+" does not point to a valid file or directory"); | ||
} | ||
} | ||
|
||
} | ||
|
||
private LinkedList<File> files = new LinkedList<>(); | ||
public boolean isEmpty() { | ||
return files.isEmpty(); | ||
} | ||
|
||
/** | ||
* Retrieves and removes the head (first element) of this list | ||
*/ | ||
public File poll() { | ||
this.lastPolledFile = files.poll(); | ||
return this.lastPolledFile; | ||
} | ||
|
||
public void push(File e) { | ||
files.push(e); | ||
|
||
} | ||
|
||
public int size() { | ||
return files.size(); | ||
} | ||
|
||
|
||
public File getLastPolledFile() { | ||
return this.lastPolledFile; | ||
} | ||
|
||
|
||
} |
89 changes: 84 additions & 5 deletions
89
code/src/main/java/de/tudarmstadt/ukp/dariah/IO/TextReaderWithInfo.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,98 @@ | ||
package de.tudarmstadt.ukp.dariah.IO; | ||
|
||
import static org.apache.commons.io.IOUtils.closeQuietly; | ||
|
||
import java.io.BufferedInputStream; | ||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
|
||
import org.apache.commons.io.IOUtils; | ||
import org.apache.uima.cas.CAS; | ||
import org.apache.uima.cas.CASException; | ||
import org.apache.uima.collection.CollectionException; | ||
import org.apache.uima.fit.component.CasCollectionReader_ImplBase; | ||
import org.apache.uima.fit.descriptor.ConfigurationParameter; | ||
import org.apache.uima.jcas.JCas; | ||
import org.apache.uima.util.Progress; | ||
|
||
import com.ibm.icu.text.CharsetDetector; | ||
|
||
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; | ||
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; | ||
import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; | ||
|
||
/** | ||
* Collection reader that takes the files to read from GlobalFileStorage | ||
* and outputs which file is currently read. | ||
* @author reimers | ||
* | ||
*/ | ||
// NOTE(review): the two class headers below look like the removed and the added
// side of the same diff line; presumably only the CasCollectionReader_ImplBase
// variant is live after this commit - confirm against the repository.
public class TextReaderWithInfo extends TextReader { | ||
public class TextReaderWithInfo extends CasCollectionReader_ImplBase { | ||
|
||
/** | ||
* Set this as the language of the produced documents. | ||
*/ | ||
public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; | ||
@ConfigurationParameter(name=PARAM_LANGUAGE, mandatory=false) | ||
private String language; | ||
|
||
/** | ||
* Name of configuration parameter that contains the character encoding used by the input files. | ||
*/ | ||
public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; | ||
@ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8") | ||
private String encoding; | ||
|
||
@Override | ||
// NOTE(review): nextFile() and its four lines appear to be the pre-commit method
// (it delegates to super.nextFile()) shown inline with the new getNext() - confirm.
protected Resource nextFile() { | ||
Resource next = super.nextFile(); | ||
System.out.println("Process file: "+next.toString()); | ||
return next; | ||
// Fills the given CAS from the next file polled from GlobalFileStorage: prints
// the file name, reads the whole file with the configured encoding as document
// text, sets the document language and creates the DocumentMetaData entry.
// A CASException, or any exception raised while reading the file or setting the
// metadata, is rethrown wrapped in a CollectionException.
public void getNext(CAS aCAS) throws IOException, CollectionException { | ||
JCas jcas; | ||
try { | ||
jcas = aCAS.getJCas(); | ||
} | ||
catch (CASException e) { | ||
throw new CollectionException(e); | ||
} | ||
|
||
try { | ||
// The polled file is used without a null check; callers are presumably
// expected to consult hasNext() first - TODO confirm.
File file = GlobalFileStorage.getInstance().poll(); | ||
|
||
System.out.println("Process file: "+file.getName()); | ||
|
||
InputStream is = null; | ||
try { | ||
is = new BufferedInputStream(new FileInputStream(file)); | ||
aCAS.setDocumentText(IOUtils.toString(is, encoding)); | ||
} | ||
finally { | ||
// Close the stream whether or not reading succeeded.
closeQuietly(is); | ||
} | ||
|
||
jcas.setDocumentLanguage(language); | ||
|
||
// Title is the bare file name; id and URI use the absolute path, and the
// base URI uses the absolute path of the parent directory.
DocumentMetaData docMetaData = DocumentMetaData.create(aCAS); | ||
docMetaData.setDocumentTitle(file.getName()); | ||
docMetaData.setDocumentId(file.getAbsolutePath()); | ||
docMetaData.setDocumentBaseUri("file:"+file.getParentFile().getAbsolutePath()); | ||
docMetaData.setDocumentUri("file:"+file.getAbsolutePath()); | ||
} catch(Exception e) { | ||
throw new CollectionException(e); | ||
} | ||
|
||
} | ||
|
||
// No progress information is reported; this always returns null.
@Override | ||
public Progress[] getProgress() { | ||
return null; | ||
} | ||
|
||
// There is a next document as long as GlobalFileStorage still holds files.
@Override | ||
public boolean hasNext() | ||
throws IOException, CollectionException { | ||
|
||
return !GlobalFileStorage.getInstance().isEmpty(); | ||
} | ||
|
||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.