Skip to content

Commit

Permalink
TIKA-4355 -- LibPstParser should allow the path to readpst to be conf…
Browse files Browse the repository at this point in the history
…igurable
  • Loading branch information
tballison committed Nov 19, 2024
1 parent 4e32eb5 commit 45d4727
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -47,6 +48,7 @@
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;

/**
* This is an optional PST parser that relies on the user installing
Expand All @@ -65,8 +67,10 @@ public class LibPstParser implements Parser, Initializable {
private static final int MAX_STDERR = 10000;
private static final String READ_PST_COMMAND = "readpst";

private LibPstParserConfig defaultConfig = new LibPstParserConfig();

private final LibPstParserConfig defaultConfig = new LibPstParserConfig();
//for security purposes, this cannot be set via the parseContext. This must
//be set via the usual @Field setters in tika-config.xml
private String readPstPath = "";
@Override
public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
return SUPPORTED;
Expand Down Expand Up @@ -125,9 +129,10 @@ private void processContents(Path outDir, LibPstParserConfig config, XHTMLConten
Files.walkFileTree(outDir, new EmailVisitor(outDir, config.isProcessEmailAsMsg(), xhtml, metadata, parseContext));
}

private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Path outDir, Path debugFile) {
private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Path outDir, Path debugFile)
throws TikaConfigException {
List commands = new ArrayList<String>();
commands.add(READ_PST_COMMAND);
commands.add(getFullReadPstCommand());
if (config.isDebug()) {
commands.add("-d");
commands.add(ProcessUtils.escapeCommandLine(debugFile
Expand Down Expand Up @@ -157,6 +162,13 @@ private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Pa

@Override
public void initialize(Map<String, Param> map) throws TikaConfigException {
if (readPstPath.contains("\u0000")) {
throw new TikaConfigException("path can't include null values");
}
String fullReadPstCommand = getFullReadPstCommand();
if (! Files.isRegularFile(Paths.get(fullReadPstCommand))) {
throw new TikaConfigException("I regret I can't find the readpst executable: " + fullReadPstCommand);
}
try {
check();
} catch (IOException e) {
Expand All @@ -171,8 +183,10 @@ public void checkInitialization(InitializableProblemHandler initializableProblem
}

//throws exception if readpst is not available
private static void check() throws TikaConfigException, IOException {
ProcessBuilder pb = new ProcessBuilder(READ_PST_COMMAND, "-V");
private void check() throws TikaConfigException, IOException {
String fullReadPstCommand = getFullReadPstCommand();

ProcessBuilder pb = new ProcessBuilder(ProcessUtils.escapeCommandLine(fullReadPstCommand), "-V");
FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000, 10000);
if (result.getExitValue() != 0) {
throw new TikaConfigException(
Expand All @@ -183,7 +197,7 @@ private static void check() throws TikaConfigException, IOException {
}
}

public static boolean checkQuietly() {
public boolean checkQuietly() {
try {
check();
} catch (TikaConfigException | IOException e) {
Expand All @@ -192,6 +206,16 @@ public static boolean checkQuietly() {
return true;
}

/**
 * Builds the command used to launch readpst.
 * <p>
 * When {@code readPstPath} is blank (as judged by {@code StringUtils.isBlank}),
 * the bare command name is returned. Otherwise {@code readPstPath} is treated as
 * the directory containing the executable and the command name is appended to it,
 * inserting a "/" only when the configured path does not already end with a
 * "/" or "\" separator.
 *
 * @return the bare command name, or the configured directory joined with the command name
 * @throws TikaConfigException declared for callers; not thrown by this implementation
 */
private String getFullReadPstCommand() throws TikaConfigException {
    if (StringUtils.isBlank(readPstPath)) {
        return READ_PST_COMMAND;
    }
    // The path already ends in a separator: append the command name directly.
    if (readPstPath.endsWith("/") || readPstPath.endsWith("\\")) {
        return readPstPath + READ_PST_COMMAND;
    }
    // No trailing separator (e.g. "C:\my_bin"): one must be inserted, otherwise
    // the directory name and the command name would run together.
    return readPstPath + "/" + READ_PST_COMMAND;
}

@Field
public void setTimeoutSeconds(long timeoutSeconds) {
defaultConfig.setTimeoutSeconds(timeoutSeconds);
Expand All @@ -212,5 +236,14 @@ public void setMaxEmails(int maxEmails) {
defaultConfig.setMaxEmails(maxEmails);
}

/**
 * Sets the directory that contains the readpst executable.
 * <p>
 * This should include the path up to but not including 'readpst', e.g. "C:\my_bin" where
 * readpst is at "C:\my_bin\readpst".
 *
 * @param readPstPath directory holding the readpst executable; {@code null} is stored
 *                    as the empty string, which is also the field's initial value
 */
@Field
public void setReadPstPath(String readPstPath) {
    // initialize() calls readPstPath.contains(...), so storing null here would
    // surface later as a NullPointerException rather than a configuration error.
    this.readPstPath = readPstPath == null ? "" : readPstPath;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ public class TestLibPstParser extends TikaTest {

@BeforeAll
public static void setUp() {
LIBPST_EXISTS = LibPstParser.checkQuietly();
//test that readpst is on the path
LIBPST_EXISTS = new LibPstParser().checkQuietly();
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ public void testForkedPackageParsing() throws Exception {

@Test
public void testLibPstParser() throws Exception {
if (! LibPstParser.checkQuietly()) {
if (! new LibPstParser().checkQuietly()) {
return;
}
TikaConfig tikaConfig = new TikaConfig(
Expand Down

0 comments on commit 45d4727

Please sign in to comment.