Skip to content
This repository has been archived by the owner on Aug 30, 2022. It is now read-only.

Commit

Permalink
#217: Some FieldType optimisations
Browse files Browse the repository at this point in the history
 * WordDelimiter: the original token is now also kept at query time, so that searches like `c++` actually match the indexed token `c++`
* added a PatternReplaceFilterFactory to remove quotes, `,` and `;` on both sides and other trailing punctuation marks on terms
  • Loading branch information
westei committed Apr 5, 2018
1 parent cfc3633 commit 6e629b8
Showing 1 changed file with 44 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_de.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="German2" />
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>

Expand All @@ -120,8 +121,10 @@
stemEnglishPossessive="1"
preserveOriginal="1"
/>
<filter class="solr.PatternReplaceFilterFactory" pattern="(^[,;&quot;'´`]+)|([\!\?\.,;&quot;'´`]+$)" replacement=""/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
Expand All @@ -131,48 +134,54 @@
splitOnCaseChange="0" splitOnNumerics="0"
catenateWords="0" catenateNumbers="1" catenateAll="0"
stemEnglishPossessive="1"
preserveOriginal="0"
preserveOriginal="1"
/>
<filter class="solr.PatternReplaceFilterFactory" pattern="(^[,;&quot;'´`]+)|([\!\?\.,;&quot;'´`]+$)" replacement=""/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>

<!-- German -->
<fieldType name="text_de" class="solr.TextField" positionIncrementGap="100">
  <!--
    German text analysis. The index and query chains run the same stop word,
    lower-casing, stemming, normalization and ASCII-folding filters; they differ
    in the WordDelimiterGraphFilterFactory flags and in the trailing
    FlattenGraphFilterFactory, which is applied at index time only.
  -->
  <analyzer type="index">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Index time: split on case changes and numerics, add all catenated
         forms and keep the unsplit original token as well. -->
    <filter class="solr.WordDelimiterGraphFilterFactory"
            generateWordParts="1" generateNumberParts="1"
            splitOnCaseChange="1" splitOnNumerics="1"
            catenateWords="1" catenateNumbers="1" catenateAll="1"
            stemEnglishPossessive="0"
            preserveOriginal="1"
    />
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" />
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GermanLightStemFilterFactory"/>
    <filter class="solr.GermanNormalizationFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <!-- WordDelimiterGraphFilterFactory produces a token graph. The Solr
         reference guide requires a FlattenGraphFilterFactory after it at index
         time, because the indexer cannot consume a graph directly. -->
    <filter class="solr.FlattenGraphFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Query time: only word and number parts are generated; no case or
         numeric splitting, no catenation, and the original token is dropped. -->
    <filter class="solr.WordDelimiterGraphFilterFactory"
            generateWordParts="1" generateNumberParts="1"
            splitOnCaseChange="0" splitOnNumerics="0"
            catenateWords="0" catenateNumbers="0" catenateAll="0"
            stemEnglishPossessive="0"
            preserveOriginal="0"
    />
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" />
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GermanLightStemFilterFactory"/>
    <filter class="solr.GermanNormalizationFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
  </analyzer>
</fieldType>
<fieldType name="text_de" class="solr.TextField" positionIncrementGap="100">
  <!--
    German text analysis. The index and query chains run the same punctuation
    stripping, stop word, lower-casing, stemming, normalization, ASCII-folding
    and duplicate-removal filters; they differ in the
    WordDelimiterGraphFilterFactory flags and in the trailing
    FlattenGraphFilterFactory, which is applied at index time only.
  -->
  <analyzer type="index">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Index time: split on case changes and numerics, add all catenated
         forms and keep the unsplit original token as well. -->
    <filter class="solr.WordDelimiterGraphFilterFactory"
            generateWordParts="1" generateNumberParts="1"
            splitOnCaseChange="1" splitOnNumerics="1"
            catenateWords="1" catenateNumbers="1" catenateAll="1"
            stemEnglishPossessive="0"
            preserveOriginal="1"
    />
    <!-- Strip leading commas, semicolons and quote characters, and trailing
         ! ? . , ; and quote characters, from every token (this mainly affects
         the preserved original token). -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="(^[,;&quot;'´`]+)|([\!\?\.,;&quot;'´`]+$)" replacement=""/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" />
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GermanLightStemFilterFactory"/>
    <filter class="solr.GermanNormalizationFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    <!-- WordDelimiterGraphFilterFactory produces a token graph. The Solr
         reference guide requires a FlattenGraphFilterFactory after it at index
         time, because the indexer cannot consume a graph directly. -->
    <filter class="solr.FlattenGraphFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Query time: no case or numeric splitting and no catenation, but the
         original token is kept so that a query such as c++ can still match
         the preserved original token in the index. -->
    <filter class="solr.WordDelimiterGraphFilterFactory"
            generateWordParts="1" generateNumberParts="1"
            splitOnCaseChange="0" splitOnNumerics="0"
            catenateWords="0" catenateNumbers="0" catenateAll="0"
            stemEnglishPossessive="0"
            preserveOriginal="1"
    />
    <!-- Same punctuation stripping as at index time, so both sides agree. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="(^[,;&quot;'´`]+)|([\!\?\.,;&quot;'´`]+$)" replacement=""/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" />
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GermanLightStemFilterFactory"/>
    <filter class="solr.GermanNormalizationFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>

<!-- Enum-backed field type: values come from the enum named "type" in enums.xml. -->
<fieldType name="type"
           class="solr.EnumField"
           enumsConfig="enums.xml"
           enumName="type"/>

Expand Down

0 comments on commit 6e629b8

Please sign in to comment.