Skip to content

Commit

Permalink
Individual medium for harvest/index and remove field binding from dis…
Browse files Browse the repository at this point in the history
…covery model

Note that Individual still uses Solr Java Object binding to SolrInputDocument for indexing.
  • Loading branch information
wwelling committed Aug 14, 2023
1 parent eed7107 commit 77ff765
Show file tree
Hide file tree
Showing 43 changed files with 457 additions and 830 deletions.
8 changes: 4 additions & 4 deletions solr/configsets/scholars-discovery/conf/managed-schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -202,10 +202,6 @@
field first in an ascending sort and last in a descending sort.
-->

<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
<fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />

<!-- Scholars whole string type -->
<fieldType name="whole_string" class="solr.TextField" omitNorms="true" termVectors="false" termPositions="false" termOffsets="false" sortMissingLast="true">
<analyzer>
Expand Down Expand Up @@ -358,6 +354,10 @@
</analyzer>
</fieldType>

<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
<fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />

<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
Expand Down
159 changes: 48 additions & 111 deletions solr/configsets/scholars-discovery/conf/solrconfig.xml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
that you fully re-index after changing this setting as it can
affect both how text is indexed and queried.
-->
<luceneMatchVersion>9.4</luceneMatchVersion>
<luceneMatchVersion>9.7</luceneMatchVersion>

<!-- <lib/> directives can be used to instruct Solr to load any Jars
identified and use them to resolve any "plugins" specified in
Expand Down Expand Up @@ -352,9 +352,9 @@
This limit only impacts boolean queries specified by a user as part of a query string,
and provides per-collection controls on how complex user specified boolean queries can
be. Query strings that specify more clauses then this will result in an error.
be. Query strings that specify more clauses than this will result in an error.
If this per-collection limit is greater then the global `maxBooleanClauses` limit
If this per-collection limit is greater than the global `maxBooleanClauses` limit
specified in `solr.xml`, it will have no effect, as that setting also limits the size
of user specified boolean queries.
-->
Expand Down Expand Up @@ -385,10 +385,9 @@
to occupy. Note that when this option is specified, the size
and initialSize parameters are ignored.
-->

<filterCache size="8192"
initialSize="2048"
autowarmCount="1024"/>
<filterCache size="1024"
initialSize="512"
autowarmCount="0"/>

<!-- Query Result Cache
Expand All @@ -398,20 +397,20 @@
maxRamMB - the maximum amount of RAM (in MB) that this cache is allowed
to occupy
-->

<queryResultCache size="8192"
initialSize="2048"
autowarmCount="1024"/>
<queryResultCache size="1024"
initialSize="512"
autowarmCount="0"
maxRamMB="200"/>

<!-- Document Cache
Caches Lucene Document objects (the stored fields for each
document). Since Lucene internal document ids are transient,
this cache will not be autowarmed.
-->
<documentCache size="8192"
initialSize="4096"
autowarmCount="2048"/>
<documentCache size="1024"
initialSize="512"
autowarmCount="0"/>

<!-- custom cache currently used by block join -->
<cache name="perSegFilter"
Expand All @@ -429,7 +428,8 @@
-->

<fieldValueCache size="1024"
autowarmCount="128" />
autowarmCount="128"/>


<!-- Custom Cache
Expand Down Expand Up @@ -563,60 +563,47 @@
Circuit Breaker Section - This section consists of configurations for
circuit breakers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

<!-- Circuit Breakers
Circuit breakers are designed to allow stability and predictable query
<!-- Circuit breakers are designed to allow stability and predictable query
execution. They prevent operations that can take down the node and cause
noisy neighbour issues.
This flag is the uber control switch which controls the activation/deactivation of all circuit
breakers. If a circuit breaker wishes to be independently configurable,
they are free to add their specific configuration but need to ensure that this flag is always
respected - this should have veto over all independent configuration flags.
-->
<circuitBreakers enabled="true">

<!-- Memory Circuit Breaker Configuration
Specific configuration for max JVM heap usage circuit breaker. This configuration defines whether
the circuit breaker is enabled and the threshold percentage of maximum heap allocated beyond which queries will be rejected until the
current JVM usage goes below the threshold. The valid value range for this value is 50-95.
Consider a scenario where the max heap allocated is 4 GB and memoryCircuitBreakerThreshold is
defined as 75. Threshold JVM usage will be 4 * 0.75 = 3 GB. It's generally a good idea to keep this value between 75 - 80% of maximum heap
allocated.
If, at any point, the current JVM heap usage goes above 3 GB, queries will be rejected until the heap usage goes below 3 GB again.
If you see queries getting rejected with 503 error code, check for "Circuit Breakers tripped"
in logs and the corresponding error message should tell you what transpired (if the failure
was caused by tripped circuit breakers).
If, at any point, the current JVM heap usage goes above 3 GB, queries will be rejected until the heap usage goes below 3 GB again.
If you see queries getting rejected with 503 error code, check for "Circuit Breakers tripped"
in logs and the corresponding error message should tell you what transpired (if the failure
was caused by tripped circuit breakers).
The CircuitBreakerManager is the default manager for all circuit breakers.
The enabled flag here controls the activation/deactivation of all circuit
breakers specified within.
-->
<circuitBreaker class="solr.CircuitBreakerManager" enabled="true">
<!-- Memory Circuit Breaker
Specific configuration for max JVM heap usage circuit breaker. This configuration defines
whether the circuit breaker is enabled and the threshold percentage of maximum heap allocated
beyond which queries will be rejected until the current JVM usage goes below the threshold.
The valid range for this value is 50-95.
Consider a scenario where the max heap allocated is 4 GB and memThreshold is defined as 75.
Threshold JVM usage will be 4 * 0.75 = 3 GB. It's generally a good idea to keep this value
between 75 - 80% of maximum heap allocated.
If, at any point, the current JVM heap usage goes above 3 GB, queries will be rejected until
the heap usage goes below 3 GB again. If you see queries getting rejected with 503 error code,
check for "Circuit Breakers tripped" in logs and the corresponding error message should tell
you what transpired (if the failure was caused by tripped circuit breakers).
-->
<!--
<memBreaker enabled="true" threshold="75"/>
<str name="memEnabled">true</str>
<str name="memThreshold">75</str>
-->

<!-- CPU Circuit Breaker Configuration
Specific configuration for CPU utilization based circuit breaker. This configuration defines whether the circuit breaker is enabled
and the average load over the last minute at which the circuit breaker should start rejecting queries.
<!-- CPU Circuit Breaker Configuration
Consider a scenario where the max heap allocated is 4 GB and memoryCircuitBreakerThreshold is
defined as 75. Threshold JVM usage will be 4 * 0.75 = 3 GB. It's generally a good idea to keep this value between 75 - 80% of maximum heap
allocated.
Specific configuration for CPU utilization based circuit breaker. This configuration defines
whether the circuit breaker is enabled and the average load over the last minute at which the
circuit breaker should start rejecting queries.
-->

<!--
<cpuBreaker enabled="true" threshold="75"/>
-->

</circuitBreakers>

<!--
<str name="cpuEnabled">true</str>
<str name="cpuThreshold">75</str>
-->
</circuitBreaker>

<!-- Request Dispatcher
Expand All @@ -631,9 +618,6 @@
what restrictions may be placed on the ContentStreams from
those requests
enableRemoteStreaming - enables use of the stream.file
and stream.url parameters for specifying remote streams.
multipartUploadLimitInKB - specifies the max size (in KiB) of
Multipart File Uploads that Solr will allow in a Request.
Expand All @@ -649,12 +633,7 @@
Solr components, but may be useful when developing custom
plugins.
*** WARNING ***
Before enabling remote streaming, you should make sure your
system has authentication enabled.
<requestParsers enableRemoteStreaming="false"
multipartUploadLimitInKB="-1"
<requestParsers multipartUploadLimitInKB="-1"
formdataUploadLimitInKB="-1"
addHttpRequestToContext="false"/>
-->
Expand Down Expand Up @@ -990,48 +969,6 @@
<str>[EEE, ]dd MMM yyyy HH:mm[:ss] z</str>
<str>EEEE, dd-MMM-yy HH:mm:ss z</str>
<str>EEE MMM ppd HH:mm:ss [z ]yyyy</str>

<str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
<str>yyyy-MM-dd'T'HH:mm:ss.SSSz</str>
<str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
<str>yyyy-MM-dd'T'HH:mm:ss.SSSZZ</str>

<str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
<str>yyyy-MM-dd'T'HH:mm:ss,SSSz</str>
<str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
<str>yyyy-MM-dd'T'HH:mm:ss,SSSZZ</str>

<str>yyyy-MM-dd'T'HH:mm:ss</str>
<str>yyyy-MM-dd'T'HH:mm:ssz</str>
<str>yyyy-MM-dd'T'HH:mm:ssZ</str>
<str>yyyy-MM-dd'T'HH:mm:ssZZ</str>

<str>yyyy-MM-dd'T'HH:mm</str>
<str>yyyy-MM-dd'T'HH:mmz</str>
<str>yyyy-MM-dd'T'HH:mmZ</str>
<str>yyyy-MM-dd'T'HH:mmZZ</str>

<str>yyyy-MM-dd HH:mm:ss.SSS</str>
<str>yyyy-MM-dd HH:mm:ss.SSSz</str>
<str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
<str>yyyy-MM-dd HH:mm:ss.SSSZZ</str>

<str>yyyy-MM-dd HH:mm:ss,SSS</str>
<str>yyyy-MM-dd HH:mm:ss,SSSz</str>
<str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
<str>yyyy-MM-dd HH:mm:ss,SSSZZ</str>

<str>yyyy-MM-dd HH:mm:ss</str>
<str>yyyy-MM-dd HH:mm:ssz</str>
<str>yyyy-MM-dd HH:mm:ssZ</str>
<str>yyyy-MM-dd HH:mm:ssZZ</str>

<str>yyyy-MM-dd HH:mm</str>
<str>yyyy-MM-dd HH:mmz</str>
<str>yyyy-MM-dd HH:mmZ</str>
<str>yyyy-MM-dd HH:mmZZ</str>

<str>yyyy-MM-dd</str>
</arr>
</updateProcessor>
<updateProcessor class="solr.AddSchemaFieldsUpdateProcessorFactory" name="add-schema-fields">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ protected void configureInbound(MessageSecurityMetadataSourceRegistry messages)
.simpSubscribeDestMatchers(
"/queue/public",
"/queue/themes",
"/queue/dataAndAnalyticsViews",
"/queue/directoryViews",
"/queue/discoveryViews",
"/queue/displayViews",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@ public class DiscoveryConstants {

public static final String CLASS = "class";

public static final String ABSTRACT = "abstract";

public static final String TYPE = "type";

public static final String SNIPPET = "snippet";

public static final String MOD_TIME = "modTime";

public static final String SYNC_IDS = "syncIds";

public static final String QUERY_DELIMETER = ":";

public static final String DEFAULT_QUERY = WILDCARD + QUERY_DELIMETER + WILDCARD;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
package edu.tamu.scholars.middleware.discovery.component;

import edu.tamu.scholars.middleware.discovery.model.AbstractIndexDocument;
import edu.tamu.scholars.middleware.discovery.model.Individual;
import reactor.core.publisher.Flux;

public interface Harvester {

public Flux<AbstractIndexDocument> harvest();
public Flux<Individual> harvest();

public AbstractIndexDocument harvest(String subject);
public Individual harvest(String subject);

public Class<AbstractIndexDocument> type();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
import java.util.Collection;

import edu.tamu.scholars.middleware.discovery.model.AbstractIndexDocument;
import edu.tamu.scholars.middleware.discovery.model.Individual;

public interface Indexer {

public void init();

public void index(Collection<AbstractIndexDocument> documents);
public void index(Collection<Individual> documents);

public void index(AbstractIndexDocument document);
public void index(Individual document);

public void optimize();

Expand Down
Loading

0 comments on commit 77ff765

Please sign in to comment.