Individual medium for harvest/index and remove field binding from discovery model

Note that Individual still uses Solr Java Object binding to SolrInputDocument for indexing.
TAMULib · Aug 14, 2023 · 77ff765 · 77ff765
1 parent eed7107
commit 77ff765
Show file tree

Hide file tree

Showing 43 changed files with 457 additions and 830 deletions.
diff --git a/solr/configsets/scholars-discovery/conf/managed-schema.xml b/solr/configsets/scholars-discovery/conf/managed-schema.xml
@@ -202,10 +202,6 @@
          field first in an ascending sort and last in a descending sort.
     -->
 
-    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
-    <fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
-    <fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />
-
     <!-- Scholars whole string type  -->
     <fieldType name="whole_string" class="solr.TextField" omitNorms="true" termVectors="false" termPositions="false" termOffsets="false" sortMissingLast="true">
       <analyzer>
@@ -358,6 +354,10 @@
       </analyzer>
     </fieldType>
 
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+    <fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
+    <fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />
+
     <!-- boolean type: "true" or "false" -->
     <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
     <fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>

diff --git a/solr/configsets/scholars-discovery/conf/solrconfig.xml b/solr/configsets/scholars-discovery/conf/solrconfig.xml
@@ -35,7 +35,7 @@
        that you fully re-index after changing this setting as it can
        affect both how text is indexed and queried.
   -->
-  <luceneMatchVersion>9.4</luceneMatchVersion>
+  <luceneMatchVersion>9.7</luceneMatchVersion>
 
   <!-- <lib/> directives can be used to instruct Solr to load any Jars
        identified and use them to resolve any "plugins" specified in
@@ -352,9 +352,9 @@
 
          This limit only impacts boolean queries specified by a user as part of a query string,
          and provides per-collection controls on how complex user specified boolean queries can
-         be.  Query strings that specify more clauses then this will result in an error.
+         be.  Query strings that specify more clauses than this will result in an error.
 
-         If this per-collection limit is greater then the global `maxBooleanClauses` limit
+         If this per-collection limit is greater than the global `maxBooleanClauses` limit
          specified in `solr.xml`, it will have no effect, as that setting also limits the size
          of user specified boolean queries.
       -->
@@ -385,10 +385,9 @@
                       to occupy. Note that when this option is specified, the size
                       and initialSize parameters are ignored.
       -->
-
-    <filterCache size="8192"
-                 initialSize="2048"
-                 autowarmCount="1024"/>
+    <filterCache size="1024"
+                 initialSize="512"
+                 autowarmCount="0"/>
 
     <!-- Query Result Cache
 
@@ -398,20 +397,20 @@
             maxRamMB - the maximum amount of RAM (in MB) that this cache is allowed
                        to occupy
       -->
-
-    <queryResultCache size="8192"
-                      initialSize="2048"
-                      autowarmCount="1024"/>
+    <queryResultCache size="1024"
+                      initialSize="512"
+                      autowarmCount="0"
+                      maxRamMB="200"/>
 
     <!-- Document Cache
 
          Caches Lucene Document objects (the stored fields for each
          document).  Since Lucene internal document ids are transient,
          this cache will not be autowarmed.
       -->
-    <documentCache size="8192"
-                   initialSize="4096"
-                   autowarmCount="2048"/>
+    <documentCache size="1024"
+                   initialSize="512"
+                   autowarmCount="0"/>
 
     <!-- custom cache currently used by block join -->
     <cache name="perSegFilter"
@@ -429,7 +428,8 @@
       -->
 
        <fieldValueCache size="1024"
-                        autowarmCount="128" />
+                        autowarmCount="128"/>
+
 
     <!-- Custom Cache
 
@@ -563,60 +563,47 @@
      Circuit Breaker Section - This section consists of configurations for
      circuit breakers
      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
-
-    <!-- Circuit Breakers
-
-     Circuit breakers are designed to allow stability and predictable query
+  <!-- Circuit breakers are designed to allow stability and predictable query
      execution. They prevent operations that can take down the node and cause
      noisy neighbour issues.
 
-     This flag is the uber control switch which controls the activation/deactivation of all circuit
-     breakers. If a circuit breaker wishes to be independently configurable,
-     they are free to add their specific configuration but need to ensure that this flag is always
-     respected - this should have veto over all independent configuration flags.
-    -->
-    <circuitBreakers enabled="true">
-
-    <!-- Memory Circuit Breaker Configuration
-
-     Specific configuration for max JVM heap usage circuit breaker. This configuration defines whether
-     the circuit breaker is enabled and the threshold percentage of maximum heap allocated beyond which queries will be rejected until the
-     current JVM usage goes below the threshold. The valid value range for this value is 50-95.
-
-     Consider a scenario where the max heap allocated is 4 GB and memoryCircuitBreakerThreshold is
-     defined as 75. Threshold JVM usage will be 4 * 0.75 = 3 GB. Its generally a good idea to keep this value between 75 - 80% of maximum heap
-     allocated.
-
-     If, at any point, the current JVM heap usage goes above 3 GB, queries will be rejected until the heap usage goes below 3 GB again.
-     If you see queries getting rejected with 503 error code, check for "Circuit Breakers tripped"
-     in logs and the corresponding error message should tell you what transpired (if the failure
-     was caused by tripped circuit breakers).
-
-     If, at any point, the current JVM heap usage goes above 3 GB, queries will be rejected until the heap usage goes below 3 GB again.
-     If you see queries getting rejected with 503 error code, check for "Circuit Breakers tripped"
-     in logs and the corresponding error message should tell you what transpired (if the failure
-     was caused by tripped circuit breakers).
+     The CircuitBreakerManager is the default manager for all circuit breakers.
+     The enabled flag here controls the activation/deactivation of all circuit
+     breakers specified within.
+  -->
+  <circuitBreaker class="solr.CircuitBreakerManager" enabled="true">
+    <!-- Memory Circuit Breaker
+
+     Specific configuration for max JVM heap usage circuit breaker. This configuration defines
+     whether the circuit breaker is enabled and the threshold percentage of maximum heap allocated
+     beyond which queries will be rejected until the current JVM usage goes below the threshold.
+     The valid range for this threshold is 50-95.
+
+     Consider a scenario where the max heap allocated is 4 GB and memThreshold is defined as 75.
+     Threshold JVM usage will be 4 * 0.75 = 3 GB. It's generally a good idea to keep this value
+     between 75 - 80% of maximum heap allocated.
+
+     If, at any point, the current JVM heap usage goes above 3 GB, queries will be rejected until
+     the heap usage goes below 3 GB again. If you see queries getting rejected with a 503 error code,
+     check for "Circuit Breakers tripped" in logs and the corresponding error message should tell
+     you what transpired (if the failure was caused by tripped circuit breakers).
     -->
     <!--
-   <memBreaker enabled="true" threshold="75"/>
+    <str name="memEnabled">true</str>
+    <str name="memThreshold">75</str>
     -->
 
-      <!-- CPU Circuit Breaker Configuration
-
-     Specific configuration for CPU utilization based circuit breaker. This configuration defines whether the circuit breaker is enabled
-     and the average load over the last minute at which the circuit breaker should start rejecting queries.
+    <!-- CPU Circuit Breaker Configuration
 
-     Consider a scenario where the max heap allocated is 4 GB and memoryCircuitBreakerThreshold is
-     defined as 75. Threshold JVM usage will be 4 * 0.75 = 3 GB. Its generally a good idea to keep this value between 75 - 80% of maximum heap
-     allocated.
+     Specific configuration for CPU utilization based circuit breaker. This configuration defines
+     whether the circuit breaker is enabled and the average load over the last minute at which the
+     circuit breaker should start rejecting queries.
     -->
-
-      <!--
-       <cpuBreaker enabled="true" threshold="75"/>
-      -->
-
-  </circuitBreakers>
-
+    <!--
+    <str name="cpuEnabled">true</str>
+    <str name="cpuThreshold">75</str>
+    -->
+  </circuitBreaker>
 
   <!-- Request Dispatcher
 
@@ -631,9 +618,6 @@
          what restrictions may be placed on the ContentStreams from
          those requests
 
-         enableRemoteStreaming - enables use of the stream.file
-         and stream.url parameters for specifying remote streams.
-
          multipartUploadLimitInKB - specifies the max size (in KiB) of
          Multipart File Uploads that Solr will allow in a Request.
 
@@ -649,12 +633,7 @@
          Solr components, but may be useful when developing custom
          plugins.
 
-         *** WARNING ***
-         Before enabling remote streaming, you should make sure your
-         system has authentication enabled.
-
-    <requestParsers enableRemoteStreaming="false"
-                    multipartUploadLimitInKB="-1"
+    <requestParsers multipartUploadLimitInKB="-1"
                     formdataUploadLimitInKB="-1"
                     addHttpRequestToContext="false"/>
       -->
@@ -990,48 +969,6 @@
       <str>[EEE, ]dd MMM yyyy HH:mm[:ss] z</str>
       <str>EEEE, dd-MMM-yy HH:mm:ss z</str>
       <str>EEE MMM ppd HH:mm:ss [z ]yyyy</str>
-
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSSz</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSSZZ</str>
-
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSSz</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSSZZ</str>
-
-      <str>yyyy-MM-dd'T'HH:mm:ss</str>
-      <str>yyyy-MM-dd'T'HH:mm:ssz</str>
-      <str>yyyy-MM-dd'T'HH:mm:ssZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ssZZ</str>
-
-      <str>yyyy-MM-dd'T'HH:mm</str>
-      <str>yyyy-MM-dd'T'HH:mmz</str>
-      <str>yyyy-MM-dd'T'HH:mmZ</str>
-      <str>yyyy-MM-dd'T'HH:mmZZ</str>
-
-      <str>yyyy-MM-dd HH:mm:ss.SSS</str>
-      <str>yyyy-MM-dd HH:mm:ss.SSSz</str>
-      <str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
-      <str>yyyy-MM-dd HH:mm:ss.SSSZZ</str>
-
-      <str>yyyy-MM-dd HH:mm:ss,SSS</str>
-      <str>yyyy-MM-dd HH:mm:ss,SSSz</str>
-      <str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
-      <str>yyyy-MM-dd HH:mm:ss,SSSZZ</str>
-
-      <str>yyyy-MM-dd HH:mm:ss</str>
-      <str>yyyy-MM-dd HH:mm:ssz</str>
-      <str>yyyy-MM-dd HH:mm:ssZ</str>
-      <str>yyyy-MM-dd HH:mm:ssZZ</str>
-
-      <str>yyyy-MM-dd HH:mm</str>
-      <str>yyyy-MM-dd HH:mmz</str>
-      <str>yyyy-MM-dd HH:mmZ</str>
-      <str>yyyy-MM-dd HH:mmZZ</str>
-
-      <str>yyyy-MM-dd</str>
     </arr>
   </updateProcessor>
   <updateProcessor class="solr.AddSchemaFieldsUpdateProcessorFactory" name="add-schema-fields">

diff --git a/src/main/java/edu/tamu/scholars/middleware/config/WebSocketSecurityConfig.java b/src/main/java/edu/tamu/scholars/middleware/config/WebSocketSecurityConfig.java
@@ -29,6 +29,7 @@ protected void configureInbound(MessageSecurityMetadataSourceRegistry messages)
             .simpSubscribeDestMatchers(
                     "/queue/public",
                     "/queue/themes",
+                    "/queue/dataAndAnalyticsViews",
                     "/queue/directoryViews",
                     "/queue/discoveryViews",
                     "/queue/displayViews",

diff --git a/src/main/java/edu/tamu/scholars/middleware/discovery/DiscoveryConstants.java b/src/main/java/edu/tamu/scholars/middleware/discovery/DiscoveryConstants.java
@@ -10,12 +10,16 @@ public class DiscoveryConstants {
 
     public static final String CLASS = "class";
 
+    public static final String ABSTRACT = "abstract";
+
     public static final String TYPE = "type";
 
     public static final String SNIPPET = "snippet";
 
     public static final String MOD_TIME = "modTime";
 
+    public static final String SYNC_IDS = "syncIds";
+
     public static final String QUERY_DELIMETER = ":";
 
     public static final String DEFAULT_QUERY = WILDCARD + QUERY_DELIMETER + WILDCARD;

diff --git a/src/main/java/edu/tamu/scholars/middleware/discovery/component/Harvester.java b/src/main/java/edu/tamu/scholars/middleware/discovery/component/Harvester.java
@@ -1,13 +1,14 @@
 package edu.tamu.scholars.middleware.discovery.component;
 
 import edu.tamu.scholars.middleware.discovery.model.AbstractIndexDocument;
+import edu.tamu.scholars.middleware.discovery.model.Individual;
 import reactor.core.publisher.Flux;
 
 public interface Harvester {
 
-    public Flux<AbstractIndexDocument> harvest();
+    public Flux<Individual> harvest();
 
-    public AbstractIndexDocument harvest(String subject);
+    public Individual harvest(String subject);
 
     public Class<AbstractIndexDocument> type();
 

diff --git a/src/main/java/edu/tamu/scholars/middleware/discovery/component/Indexer.java b/src/main/java/edu/tamu/scholars/middleware/discovery/component/Indexer.java
@@ -3,14 +3,15 @@
 import java.util.Collection;
 
 import edu.tamu.scholars.middleware.discovery.model.AbstractIndexDocument;
+import edu.tamu.scholars.middleware.discovery.model.Individual;
 
 public interface Indexer {
 
     public void init();
 
-    public void index(Collection<AbstractIndexDocument> documents);
+    public void index(Collection<Individual> documents);
 
-    public void index(AbstractIndexDocument document);
+    public void index(Individual document);
 
     public void optimize();