Skip to content

Commit

Permalink
Merge pull request metafacture#141 from cboehme/multiline-pica-parser
Browse files Browse the repository at this point in the history
Multiline pica parser
  • Loading branch information
cboehme committed Dec 4, 2013
2 parents b245c80 + bd30086 commit a84bdc5
Show file tree
Hide file tree
Showing 4 changed files with 407 additions and 234 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
*/
final class PicaConstants {

public static final char FIELD_DELIMITER = '\u001e';
public static final char SUBFIELD_DELIMITER = '\u001f';
public static final char RECORD_MARKER = '\u001d';
public static final char FIELD_MARKER = '\u001e';
public static final char SUBFIELD_MARKER = '\u001f';
public static final char FIELD_END_MARKER = '\n';

private PicaConstants() {
// No instances allowed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,34 +23,89 @@


/**
* Parses a PICA+ record with UTF8 encoding assumed.
* <p>Parses pica+ records. The parser only parses single records.
* A string containing multiple records must be split into
* individual records before passing it to {@code PicaDecoder}.</p>
*
* <p>The parser is designed to accept any string as valid input and
* to parse pica plain format as well as normalised pica. To
* achieve this, the parser behaves as follows:</p>
*
* <ul>
* <li>Fields are separated by record markers (0x1d), field
* markers (0x1e) or field end markers (0x0a).</li>
* <li>The field name and the first subfield are separated by
* a subfield marker (0x1f).</li>
* <li>The parser assumes that the input starts with a field
* name.</li>
* <li>The parser assumes that the end of the input marks
* the end of the current field and the end of the record.
* </li>
* <li>Subfields are separated by subfield markers (0x1f).</li>
* <li>The first character of a subfield is the name of the
* subfield</li>
* <li>To handle input with multiple field and subfield separators
* following each other directly (for instance 0x0a and 0x1e), it
* is assumed that field names, subfields, subfield names or
* subfield values can be empty.</li>
* </ul>
*
* <p>Please note that the record marker is treated as a field
* delimiter and not as a record delimiter. Records need to be
* separated prior to parsing them.</p>
*
* <p>As the behaviour of the parser may result in unnamed fields or
* subfields or fields with no subfields the {@code PicaDecoder}
* automatically filters empty fields and subfields:</p>
*
* <ul>
* <li>Subfields without a name are ignored (such subfields cannot
* have any value because then the first character of the value
* would be the subfield name).</li>
* <li>Subfields which only have a name but no value are always
* parsed.</li>
* <li>Unnamed fields are only parsed if they contain subfields
* that are not ignored.</li>
* <li>Named fields containing none or only ignored subfields are
* only parsed if {@code skipEmptyFields} is set to {@code false}
* otherwise they are ignored.</li>
* <li>Input containing only whitespace (spaces and tabs) is
* completely ignored</li>
* </ul>
*
* <p>The {@code PicaDecoder} calls {@code receiver.startEntity} and
* {@code receiver.endEntity} for each parsed field and
* {@code receiver.literal} for each parsed subfield. Spaces in the
* field name are not included in the entity name. The input
* "028A \x1faAndy\x1fdWarhol\x1e" would produce the following
* sequence of calls:</p>
*
* For each field in the stream the module calls:
* <ol>
* <li>receiver.startEntity</li>
* <li>receiver.literal for each subfield of the field</li>
* <li>receiver.endEntity</li>
* <li>receiver.startEntity("028A")</li>
* <li>receiver.literal("a", "Andy")</li>
* <li>receiver.literal("d", "Warhol")</li>
* <li>receiver.endEntity()</li>
* </ol>
*
* Spaces in the field name are not included in the entity name.
*
* Empty subfields are skipped. For instance, processing the following input
* would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
* skips unnamed fields without any subfields.
* <p>The content of subfield 003@$0 is used for the record id. If
* {@code ignoreMissingIdn} is false and field 003@$0 is not found
* in the record a {@link MissingIdException} is thrown.</p>
*
* If {@code ignoreMissingIdn} is false and field 003@$0 is not found in the
* record a {@link MissingIdException} is thrown.
* <p>The parser assumes that the input is utf-8 encoded. The parser
* does not support other pica encodings.</p>
*
* @author Christoph Böhme
*
*/
@Description("Parses a PICA+ record with UTF8 encoding assumed.")
@Description("Parses pica+ records. The parser only parses single records. " +
"A string containing multiple records must be split into " +
"individual records before passing it to PicaDecoder.")
@In(String.class)
@Out(StreamReceiver.class)
public final class PicaDecoder
extends DefaultObjectPipe<String, StreamReceiver> {

private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_DELIMITER, '0'};
private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_MARKER, '0'};

private static final int BUFFER_SIZE = 1024 * 1024;

Expand Down Expand Up @@ -144,10 +199,9 @@ private boolean recordIsEmpty() {
/**
* Searches the record for the sequence specified in {@code ID_FIELD}
* and returns all characters following this sequence until the next
* {@link PicaConstants.FIELD_DELIMITER},
* {@link PicaConstants.SUBFIELD_DELIMITER} or the end of the record
* is reached. Only the first occurrence of the sequence is processed,
* later occurrences are ignored.
* control character (see {@link PicaConstants}) is found or the end of
* the record is reached. Only the first occurrence of the sequence is
* processed, later occurrences are ignored.
*
* If the sequence is not found in the string or if it is not followed
* by any characters then {@code null} is returned.
Expand All @@ -161,7 +215,7 @@ private String extractRecordId() {
int fieldPos = 0;
boolean skip = false;
for (int i = 0; i < recordLen; ++i) {
if (buffer[i] == PicaConstants.FIELD_DELIMITER) {
if (isFieldDelimiter(buffer[i])) {
if (idBuilder.length() > 0) {
break;
}
Expand All @@ -176,7 +230,7 @@ private String extractRecordId() {
skip = true;
}
} else {
if (buffer[i] == PicaConstants.SUBFIELD_DELIMITER) {
if (buffer[i] == PicaConstants.SUBFIELD_MARKER) {
break;
}
idBuilder.append(buffer[i]);
Expand All @@ -191,4 +245,10 @@ private String extractRecordId() {
return null;
}

/**
 * Checks whether a character ends the current field.
 *
 * @param ch the character to test
 * @return {@code true} if {@code ch} equals the record marker, the
 *         field marker or the field end marker defined in
 *         {@link PicaConstants}; {@code false} otherwise
 */
private static boolean isFieldDelimiter(final char ch) {
    if (ch == PicaConstants.RECORD_MARKER) {
        return true;
    }
    if (ch == PicaConstants.FIELD_MARKER) {
        return true;
    }
    return ch == PicaConstants.FIELD_END_MARKER;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

/**
* A parser for PICA+ records. Only single records can be parsed as the parser
* does not recognise end-of-record markers (usually new lines). The initial
* parser state is FIELD_NAME. All states are valid end states. The parser
* processes any input, there is no error state.
* ignores end of record markers. The initial parser state is FIELD_NAME. All
* states are valid end states. The parser processes any input, there is no
* error state.
*
* The parser ignores spaces in field names. They are not included in the
* field name.
Expand All @@ -38,14 +38,19 @@ enum PicaParserState {
@Override
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
final PicaParserState next;
if (ch == PicaConstants.FIELD_DELIMITER) {
switch (ch) {
case PicaConstants.RECORD_MARKER:
case PicaConstants.FIELD_MARKER:
case PicaConstants.FIELD_END_MARKER:
ctx.emitStartEntity();
ctx.emitEndEntity();
next = FIELD_NAME;
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
break;
case PicaConstants.SUBFIELD_MARKER:
ctx.emitStartEntity();
next = SUBFIELD_NAME;
} else {
break;
default:
if (ch != ' ') {
ctx.appendText(ch);
}
Expand All @@ -64,12 +69,17 @@ protected void endOfInput(final PicaParserContext ctx) {
@Override
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
final PicaParserState next;
if (ch == PicaConstants.FIELD_DELIMITER) {
switch (ch) {
case PicaConstants.RECORD_MARKER:
case PicaConstants.FIELD_MARKER:
case PicaConstants.FIELD_END_MARKER:
ctx.emitEndEntity();
next = FIELD_NAME;
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
break;
case PicaConstants.SUBFIELD_MARKER:
next = this;
} else {
break;
default:
ctx.setSubfieldName(ch);
next = SUBFIELD_VALUE;
}
Expand All @@ -85,14 +95,19 @@ protected void endOfInput(final PicaParserContext ctx) {
@Override
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
final PicaParserState next;
if (ch == PicaConstants.FIELD_DELIMITER) {
switch (ch) {
case PicaConstants.RECORD_MARKER:
case PicaConstants.FIELD_MARKER:
case PicaConstants.FIELD_END_MARKER:
ctx.emitLiteral();
ctx.emitEndEntity();
next = FIELD_NAME;
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
break;
case PicaConstants.SUBFIELD_MARKER:
ctx.emitLiteral();
next = SUBFIELD_NAME;
} else {
break;
default:
ctx.appendText(ch);
next = this;
}
Expand Down
Loading

0 comments on commit a84bdc5

Please sign in to comment.