Skip to content

Commit

Permalink
Fix implicit default highlighting fields (implements #152).
Browse files Browse the repository at this point in the history
This was an ugly limitation of the previous version, due to which users
always had to explicitly pass a list of 'regular' highlighting fields via
`hl.fl`. A refactoring of the `OcrHighlightComponent` allows us to
remedy this.

The fix is a bit hacky, but it works reliably: When we detect that
highlighting is enabled, but no `hl.fl` parameter was passed (i.e.
the highlighter will automatically highlight all stored fields), we
create a custom explicit `hl.fl` parameter at runtime, that includes all
of the default highlight fields, except for those fields contained in
the `hl.ocr.fl` field set.
jbaiter committed May 10, 2021

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent 59f4bc2 commit 41f91e5
Showing 4 changed files with 117 additions and 31 deletions.
5 changes: 0 additions & 5 deletions docs/query.md
Original file line number Diff line number Diff line change
@@ -2,11 +2,6 @@
To enable highlighting, make sure you set `hl=true` in your query. Additionally, you need to pass the OCR fields that
you want to have highlighted in the `hl.ocr.fl` parameter.

!!! caution "Highlighting Non-OCR Fields"
One unfortunate side effect of the way the plugin works is that you need to pass non-OCR fields to be highlighted
**explicitly** via the `hl.fl` parameter. By default, Solr falls back on highlighting all stored fields if the
parameter is not present, which no longer works if this plugin is used.

## Response Format
With OCR highlighting enabled, your Solr response will now include a new item `ocrHighlighting`, mapping all
highlighted OCR fields to their highlighting snippets:
Original file line number Diff line number Diff line change
@@ -1,22 +1,44 @@
package de.digitalcollections.solrocr.solr;

import com.google.common.base.Strings;
import de.digitalcollections.solrocr.lucene.OcrHighlighter;
import de.digitalcollections.solrocr.util.PageCacheWarmer;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.CloseHook;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.handler.component.ShardResponse;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.PluginInfoInitialized;
import org.apache.solr.util.plugin.SolrCoreAware;

public class OcrHighlightComponent extends SearchComponent
implements PluginInfoInitialized, SolrCoreAware {
public static final String COMPONENT_NAME = "ocrHighlight";
public static final String HL_RESPONSE_FIELD = "ocrHighlighting";

public class OcrHighlightComponent extends org.apache.solr.handler.component.HighlightComponent {
private PluginInfo info;
private SolrOcrHighlighter ocrHighlighter;

@@ -30,9 +52,31 @@ public void init(PluginInfo info) {
this.info = info;
}

/**
 * Reads the highlighting switch from the request and, when highlighting is enabled, requests the
 * document list and parses an optional dedicated highlight query.
 *
 * @param rb the response builder for the current request; its {@code doHighlights} flag is always
 *     updated, and its highlight query is set when the {@code HighlightParams.Q} parameter is
 *     present
 * @throws IOException declared by the overridden method
 * @throws SolrException with {@code BAD_REQUEST} if the highlight query cannot be parsed
 */
@Override
public void prepare(ResponseBuilder rb) throws IOException {
  SolrParams requestParams = rb.req.getParams();
  rb.doHighlights = requestParams.getBool(HighlightParams.HIGHLIGHT, false);
  if (!rb.doHighlights) {
    return;
  }
  rb.setNeedDocList(true);

  // Parser precedence: the highlight-specific parser, then the request's defType, then the
  // default query parser.
  String parserName = requestParams.get(HighlightParams.QPARSER);
  if (parserName == null) {
    parserName = requestParams.get(QueryParsing.DEFTYPE);
  }
  if (parserName == null) {
    parserName = QParserPlugin.DEFAULT_QTYPE;
  }

  String highlightQueryString = requestParams.get(HighlightParams.Q);
  if (highlightQueryString == null) {
    // No dedicated highlight query was passed, nothing to parse here.
    return;
  }
  try {
    QParser highlightParser = QParser.getParser(highlightQueryString, parserName, rb.req);
    rb.setHighlightQuery(highlightParser.getHighlightQuery());
  } catch (SyntaxError syntaxError) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, syntaxError);
  }
}

@Override
public void inform(SolrCore core) {
super.inform(core);
this.ocrHighlighter = new SolrOcrHighlighter();
if ("true".equals(info.attributes.getOrDefault("enablePreload", "false"))) {
PageCacheWarmer.enable(
@@ -57,8 +101,6 @@ public void postClose(SolrCore core) {
public void process(ResponseBuilder rb) throws IOException {
if (rb.doHighlights) {
SolrQueryRequest req = rb.req;
String[] defaultHighlightFields =
rb.getQparser() != null ? rb.getQparser().getDefaultHighlightFields() : null;
Query highlightQuery = rb.getHighlightQuery();
if (highlightQuery == null) {
if (rb.getQparser() != null) {
@@ -82,31 +124,65 @@ public void process(ResponseBuilder rb) throws IOException {
req,
rb.rsp.getResponseHeader().asShallowMap());
if (ocrHighlights != null) {
rb.rsp.add(highlightingResponseField(), ocrHighlights);
rb.rsp.add(HL_RESPONSE_FIELD, ocrHighlights);
}
}
fixRegularHighlighting(rb);
}
}

// Disable further highlighting if fields are not set to prevent the default highlighter
// from highlighting our OCR fields, which will break.
ModifiableSolrParams params = new ModifiableSolrParams(rb.req.getParams());
if (params.get("hl.fl") == null) {
params.set("hl", "false");
/**
 * Adjusts an outgoing shard request so that highlighting is only performed by the request that
 * fetches the fields.
 *
 * <p>Does nothing when highlighting is disabled for the current request. Otherwise, a shard
 * request with the {@code PURPOSE_GET_FIELDS} flag additionally gets the
 * {@code PURPOSE_GET_HIGHLIGHTS} flag and has highlighting switched on; every other shard request
 * has highlighting switched off.
 *
 * @param rb the response builder for the current request
 * @param who the component that created the shard request (not used here)
 * @param sreq the shard request to modify
 */
@Override
public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
  if (rb.doHighlights) {
    boolean fetchesFields = (sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0;
    if (fetchesFields) {
      sreq.purpose |= ShardRequest.PURPOSE_GET_HIGHLIGHTS;
    }
    sreq.params.set(HighlightParams.HIGHLIGHT, fetchesFields ? "true" : "false");
  }
}

/**
 * Restricts regular (non-OCR) highlighting to fields that are not used for OCR highlighting.
 *
 * <p>By default, the {@code HighlightComponent} will delegate to the query parser to pick the
 * default set of fields to be highlighted, in the case that the user doesn't submit a list
 * herself. Unfortunately this default set includes our OCR field, which in many cases is not
 * compatible with the regular highlighting approach since, from Solr's perspective, it doesn't
 * include plaintext. We thus set a customized list of highlighting fields, based on the default
 * list, but excluding any fields that are intended for OCR highlighting.
 *
 * <p>An explicit {@code hl.fl} parameter is left untouched. If no {@code hl.fl} was passed and no
 * regular field remains after the exclusion, regular highlighting is switched off
 * ({@code hl=off}, {@code rb.doHighlights = false}) and an empty {@code highlighting} entry is
 * added to the response.
 *
 * @param rb the response builder whose request parameters are replaced with the adjusted copy
 */
private void fixRegularHighlighting(ResponseBuilder rb) {
  ModifiableSolrParams params = new ModifiableSolrParams(rb.req.getParams());
  if (params.get("hl.fl") == null) {
    Set<String> ocrHlFields = new HashSet<>();
    String ocrHlStr = params.get(OcrHighlightParams.OCR_FIELDS);
    if (!Strings.isNullOrEmpty(ocrHlStr)) {
      // Split on commas and/or whitespace so that a list such as "ocr_a, ocr_b" yields clean
      // field names. A plain split(",") would keep " ocr_b" with its leading blank, which never
      // matches a default field, so the OCR field would leak into the regular highlighter.
      for (String field : ocrHlStr.split("[,\\s]+")) {
        if (!field.isEmpty()) {
          ocrHlFields.add(field);
        }
      }
    }
    String[] defaultHighlightFields =
        rb.getQparser() != null ? rb.getQparser().getDefaultHighlightFields() : null;
    if (defaultHighlightFields != null) {
      params.set(
          "hl.fl",
          Arrays.stream(defaultHighlightFields)
              .filter(fl -> !ocrHlFields.contains(fl))
              .collect(Collectors.joining(",")));
    }
    if (Strings.isNullOrEmpty(params.get("hl.fl"))) {
      // Nothing left for the regular highlighter: disable it and report an empty result.
      params.set("hl", "off");
      rb.doHighlights = false;
      // Set the highlighting result to an empty list
      rb.rsp.add("highlighting", new SimpleOrderedMap<>());
    }
  }
  // Install the (possibly adjusted) copy exactly once.
  rb.req.setParams(params);
}

@SuppressWarnings({"rawtypes", "unchecked"})
@Override
public void finishStage(ResponseBuilder rb) {
boolean setOcrHighlights =
rb.doHighlights
&& !rb.req.getParams().get(OcrHighlightParams.OCR_FIELDS, "").isEmpty()
!Strings.isNullOrEmpty(rb.req.getParams().get(OcrHighlightParams.OCR_FIELDS, ""))
&& rb.stage == ResponseBuilder.STAGE_GET_FIELDS;
if (setOcrHighlights) {
super.finishStage(rb);
final Object[] objArr = new NamedList.NamedListEntry[rb.resultIds.size()];
for (ShardRequest sreq : rb.finished) {
if ((sreq.purpose & ShardRequest.PURPOSE_GET_HIGHLIGHTS) == 0) continue;
for (ShardResponse srsp : sreq.responses) {
@@ -121,13 +197,16 @@ public void finishStage(ResponseBuilder rb) {
if (partialHls != null && partialHls) {
rb.rsp.getResponseHeader().add(OcrHighlighter.PARTIAL_OCR_HIGHLIGHTS, true);
}
Object hl = srsp.getSolrResponse().getResponse().get(HL_RESPONSE_FIELD);
SolrPluginUtils.copyNamedListIntoArrayByDocPosInResponse(
(NamedList) hl, rb.resultIds, (Map.Entry<String, Object>[]) objArr);
}
}
rb.rsp.add(
HL_RESPONSE_FIELD,
SolrPluginUtils.removeNulls(
(Map.Entry<String, Object>[]) objArr, new SimpleOrderedMap<>()));
fixRegularHighlighting(rb);
}
}

/**
 * Returns the response key {@code "ocrHighlighting"}; {@code process} adds the OCR highlighting
 * results to the response under the key returned by this method.
 */
@Override
protected String highlightingResponseField() {
return "ocrHighlighting";
}
}
Original file line number Diff line number Diff line change
@@ -113,15 +113,13 @@ public void testRegularHighlightingWorks() throws Exception {
}

@Test
public void testCombinedHighlightingWoriks() throws Exception {
public void testCombinedHighlightingWorks() throws Exception {
QueryResponse resp =
query(
"q",
"\"commodo consequat\" svadag",
"hl",
"true",
"hl.fl",
"some_text",
"defType",
"edismax",
"hl.weightMatches",
18 changes: 16 additions & 2 deletions src/test/java/de/digitalcollections/solrocr/solr/HocrTest.java
Original file line number Diff line number Diff line change
@@ -295,14 +295,28 @@ public void testRegularHighlighting() {
assertQ(req, "count(//lst[@name='ocrHighlighting']//arr[@name='snippets'])=0");
}

// Highlighting is enabled without passing `hl.fl` or `hl.ocr.fl`: the regular `highlighting`
// section must still contain exactly one `some_text` entry, and `ocrHighlighting` must not
// contain any snippets.
@Test
public void testImplicitRegularHighlighting() {
SolrQueryRequest req =
req(
"q",
"\"occaecat cupidatat\"",
"defType",
"edismax",
"qf",
"some_text ocr_text",
"hl",
"true");
assertQ(req, "count(//lst[@name='highlighting']//arr[@name='some_text'])=1");
assertQ(req, "count(//lst[@name='ocrHighlighting']//arr[@name='snippets'])=0");
}

@Test
public void testCombinedHighlighting() {
SolrQueryRequest req =
xmlQ(
"q",
"\"occaecat cupidatat\" Salomet",
"hl.fl",
"some_text",
"defType",
"edismax",
"qf",

0 comments on commit 41f91e5

Please sign in to comment.