Skip to content

Commit

Permalink
add integration branch changes
Browse files: browse the repository at this point in the history
  • Loading branch information
austin007008 committed Mar 13, 2024
1 parent ec3b1f8 commit f56d550
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 30 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -54,7 +54,7 @@ public class ExcerptTransform extends DocumentTransform.DefaultDocumentTransform
private final IteratorEnvironment env;
private final SortedKeyValueIterator<Key,Value> source;

private String hitTermValues = "";
private final ArrayList<String> hitTermValues = new ArrayList<>();

public ExcerptTransform(ExcerptFields excerptFields, IteratorEnvironment env, SortedKeyValueIterator<Key,Value> source) {
this(excerptFields, env, source, new TermFrequencyExcerptIterator());
Expand Down Expand Up @@ -126,7 +126,7 @@ private PhraseIndexes getPhraseIndexes(Document document) {
pos.getOffset());
}
// save the hit term for later callout
hitTermValues = (String) hitTuple.getValue();
Collections.addAll(hitTermValues, ((String) hitTuple.getValue()).split(Constants.SPACE));
}
}
}
Expand Down Expand Up @@ -313,7 +313,7 @@ private Set<Excerpt> getExcerpts(PhraseIndexes phraseIndexes) {
* the term values to match
* @return the excerpt
*/
private String getExcerpt(String field, int start, int end, Range range, String hitTermValues) {
private String getExcerpt(String field, int start, int end, Range range, ArrayList<String> hitTermValues) {
int prevLeftWordsSkipped = 0;
int currLeftWordsSkipped = 0;
boolean leftLock = false;
Expand Down Expand Up @@ -372,28 +372,13 @@ private String getExcerpt(String field, int start, int end, Range range, String
throw new RuntimeException("This should never be reached. Something went wrong!");
}

private static String getHitPhrase(String hitTermValues, String[] phraseParts) {
private static String getHitPhrase(ArrayList<String> hitTermValues, String[] phraseParts) {
List<String> hitPhrase = new ArrayList<>();
String[] hitTermValuesParts = hitTermValues.split(Constants.SPACE);
boolean startedCallout = false;
for (String phrasePart : phraseParts[1].split(Constants.SPACE)) {
// check if we have a multi value term
// if we do, call out the first and last term values only
if (hitTermValuesParts.length > 1) {
if (phrasePart.equals(hitTermValuesParts[0])) {
hitPhrase.add("[" + phrasePart);
startedCallout = true;
} else if (startedCallout && phrasePart.equals(hitTermValuesParts[hitTermValuesParts.length - 1])) {
hitPhrase.add(phrasePart + "]");
} else {
hitPhrase.add(phrasePart);
}
if (hitTermValues.contains(phrasePart)) {
hitPhrase.add("[" + phrasePart + "]");
} else {
if (hitTermValues.contains(phrasePart)) {
hitPhrase.add("[" + phrasePart + "]");
} else {
hitPhrase.add(phrasePart);
}
hitPhrase.add(phrasePart);
}
}
return String.join(" ", hitPhrase);
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -190,7 +190,7 @@ public void testExcerptOverlapped() throws IOException {
assertEquals(1, arg.size());
String excerpt = arg.getAttributes().iterator().next().getData().toString();
// only one excerpt should return
assertEquals("and the [quick brown] fox jumped over the lazy dog", excerpt);
assertEquals("and the [quick] [brown] fox jumped over the lazy dog", excerpt);
}

/**
Expand Down Expand Up @@ -230,7 +230,7 @@ public void testExcerptOverlappedAndPhraseOverlapped() throws IOException {
Set<String> excerpts = arg.getAttributes().stream().map(a -> a.getData().toString()).collect(Collectors.toSet());
// all excerpts should be returned
assertTrue(excerpts.contains("Jack and Jill jumped over the"));
assertTrue(excerpts.contains("the brown chicken layed an egg and the [quick brown] fox jumped over the lazy dog"));
assertTrue(excerpts.contains("the [brown] chicken layed an egg and the [quick] [brown] fox jumped over the lazy dog"));
}

/**
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,6 +24,11 @@
import datawave.data.type.OneToManyNormalizerType;
import datawave.data.type.Type;
import datawave.data.type.util.Geometry;
import datawave.ingest.data.config.NormalizedFieldAndValue;
import datawave.ingest.mapreduce.handler.shard.content.BoundedOffsetQueue;
import datawave.ingest.mapreduce.handler.shard.content.OffsetQueue;
import datawave.ingest.mapreduce.handler.shard.content.TermAndZone;
import datawave.ingest.protobuf.TermWeight;
import datawave.ingest.protobuf.Uid;
import datawave.query.QueryTestTableHelper;
import datawave.util.TableName;
Expand All @@ -45,12 +50,12 @@ public enum WhatKindaRange {
protected static final String shard = date + "_0";
protected static final ColumnVisibility columnVisibility = new ColumnVisibility("ALL");
protected static final Value emptyValue = new Value(new byte[0]);
protected static final long timeStamp = 1356998400000l;
protected static final long timeStamp = 1356998400000L;

public static final String corleoneUID = UID.builder().newId("Corleone".getBytes(), (Date) null).toString();
public static final String corleoneChildUID = UID.builder().newId("Corleone".getBytes(), (Date) null, "1").toString();
public static final String sopranoUID = UID.builder().newId("Soprano".toString().getBytes(), (Date) null).toString();
public static final String caponeUID = UID.builder().newId("Capone".toString().getBytes(), (Date) null).toString();
public static final String sopranoUID = UID.builder().newId("Soprano".getBytes(), (Date) null).toString();
public static final String caponeUID = UID.builder().newId("Capone".getBytes(), (Date) null).toString();

protected static String normalizeColVal(Map.Entry<String,String> colVal) throws Exception {
if ("FROM_ADDRESS".equals(colVal.getKey()) || "TO_ADDRESS".equals(colVal.getKey())) {
Expand Down Expand Up @@ -993,13 +998,25 @@ private static void addFiTfTokens(BatchWriter bw, WhatKindaRange range, String f
Mutation fi = new Mutation(shard);
fi.put("fi\u0000" + field.toUpperCase(), lcNoDiacriticsType.normalize(phrase) + "\u0000" + datatype + "\u0000" + uid, columnVisibility, timeStamp,
emptyValue);

OffsetQueue<Integer> tokenOffsetCache = new BoundedOffsetQueue<>(500);
int i = 0;
String[] tokens = phrase.split(" ");
for (String token : tokens) {
fi.put("fi\u0000" + field.toUpperCase(), lcNoDiacriticsType.normalize(token) + "\u0000" + datatype + "\u0000" + uid, columnVisibility, timeStamp,
emptyValue);
fi.put("tf", datatype + "\u0000" + uid + "\u0000" + lcNoDiacriticsType.normalize(token) + "\u0000" + field, columnVisibility, timeStamp,
emptyValue);
tokenOffsetCache.addOffset(new TermAndZone(token, field.toUpperCase()), i);

i++;
}
for (BoundedOffsetQueue.OffsetList<Integer> offsets : tokenOffsetCache.offsets()) {
NormalizedFieldAndValue nfv = new NormalizedFieldAndValue(offsets.termAndZone.zone, offsets.termAndZone.term);
TermWeight.Info.Builder builder = TermWeight.Info.newBuilder();
for (Integer offset : offsets.offsets) {
builder.addTermOffset(offset);
}
Value value = new Value(builder.build().toByteArray());
fi.put("tf", datatype + "\u0000" + uid + "\u0000" + lcNoDiacriticsType.normalize(nfv.getIndexedFieldValue()) + "\u0000" + nfv.getIndexedFieldName(),
columnVisibility, timeStamp, value);
}
bw.addMutation(fi);
}
Expand Down

0 comments on commit f56d550

Please sign in to comment.