Skip to content

Commit

Permalink
Item14485: better prefix and suffix search
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelDaum committed Sep 18, 2017
1 parent 69a896f commit c6edfca
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 43 deletions.
49 changes: 32 additions & 17 deletions data/System/SolrPlugin.txt
@@ -1,4 +1,4 @@
%META:TOPICINFO{author="ProjectContributor" date="1485183295" format="1.1" version="1"}%
%META:TOPICINFO{author="ProjectContributor" date="1505725909" format="1.1" version="1"}%
---+ Solr Plugin
%FORMFIELD{"Description"}%

Expand Down Expand Up @@ -679,7 +679,8 @@ useful for spatial search.
| text_generic | same as =text= but also splits words on case change while generating word parts. \
a general unstemmed text field - good if one does not know the language of the field. \
this field type is useful when searching for parts of a !WikiWord |
| text_substring | same as =text_generic= but with substring decomposition |
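| text_prefix | substring decomposition starting at the front of each token (edge n-grams of 3 to 15 characters); used for prefix matching |
| text_suffix | substring decomposition starting at the back of each token (edge n-grams of 3 to 15 characters on the reversed token); used for suffix matching |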
| text_spell | generic text analysis for spell checking |
| text_sort | this is a text field suitable for sorting alphabetically |
| text_rev | a general unstemmed text field that indexes tokens normally and also \
Expand Down Expand Up @@ -720,7 +721,8 @@ useful for spatial search.
| size | tint | | stored | size of an attachment in bytes |
| spell | text_spell | multivalued | | used for spellchecking |
| state | string | | | used by comments or any other application that tracks specific states of a document, such as "new", "unapproved", "approved", "draft", "unpublished", "published", ... |
| substrings | text_substring | multivalued | | holds substring analysis of the most important search fields |
| text_prefix | text_prefix | multivalued | | holds substring analysis of the most important search fields, starting at the front |
| text_suffix | text_suffix | multivalued | | holds substring analysis of the most important search fields, starting at the back |
| summary | text_generic | | stored | this is a plainified summary of the topic text |
| tag | string | multivalued | stored | list of tags assigned to this document; note: this field will only be used if Foswiki:Extensions/ClassificationPlugin is installed; content of this field is copied to =category_search= as well (see generic fields below) |
| text | text_generic | | | document text |
Expand Down Expand Up @@ -811,18 +813,30 @@ specific search applications. The destination fields are then analysed using the
| topic | catchall |
| type | catchall |
| state | catchall |
| attachment | substrings |
| category | substrings |
| comment | substrings |
| contributor | substrings |
| field_* | substrings |
| form | substrings |
| name | substrings |
| tag | substrings |
| text | substrings |
| title | substrings |
| topic | substrings |
| type | substrings |
| attachment | text_prefix |
| category | text_prefix |
| comment | text_prefix |
| contributor | text_prefix |
| field_* | text_prefix |
| form | text_prefix |
| name | text_prefix |
| tag | text_prefix |
| text | text_prefix |
| title | text_prefix |
| topic | text_prefix |
| type | text_prefix |
| attachment | text_suffix |
| category | text_suffix |
| comment | text_suffix |
| contributor | text_suffix |
| field_* | text_suffix |
| form | text_suffix |
| name | text_suffix |
| tag | text_suffix |
| text | text_suffix |
| title | text_suffix |
| topic | text_suffix |
| type | text_suffix |
| attachment | phonetic |
| category | phonetic |
| comment | phonetic |
Expand Down Expand Up @@ -860,6 +874,7 @@ specific search applications. The destination fields are then analysed using the

---++ Change History
%TABLE{columnwidths="7em" tablewidth="100%"}%
| 18 Sep 2017: | replacing =text_substring= with =text_prefix= and =text_suffix= to improve substring matching |
| 23 Jan 2017: | converted WebServices::Solr to Moo; \
fixed documentation for iwatch realtime indexing; \
documentation of SOLRSCRIPTURL macro; \
Expand Down Expand Up @@ -982,5 +997,5 @@ specific search applications. The destination fields are then analysed using the
%META:FIELD{name="License" title="License" value="GPL ([[http://www.gnu.org/copyleft/gpl.html][GNU General Public License]])"}%
%META:FIELD{name="Home" title="Home" value="Foswiki:Extensions/SolrPlugin"}%
%META:FIELD{name="Support" title="Support" value="Foswiki:Support/SolrPlugin"}%
%META:FILEATTACHMENT{name="SolrPluginSnap1.png" attr="" comment="" date="1485183295" size="93552" user="ProjectContributor" version="1"}%
%META:FILEATTACHMENT{name="SolrPluginSnap2.png" attr="" comment="" date="1485183295" size="158013" user="ProjectContributor" version="1"}%
%META:FILEATTACHMENT{name="SolrPluginSnap1.png" attr="" comment="" date="1505725909" size="93552" user="ProjectContributor" version="1"}%
%META:FILEATTACHMENT{name="SolrPluginSnap2.png" attr="" comment="" date="1505725909" size="158013" user="ProjectContributor" version="1"}%
85 changes: 64 additions & 21 deletions solr/configsets/foswiki_configs/conf/schema.xml
Expand Up @@ -214,8 +214,7 @@
</analyzer>
</fieldType>

<!-- same as text_generic but with substring match -->
<fieldType name="text_substring" class="solr.TextField" indexed="true" positionIncrementGap="100">
<fieldType name="text_prefix" class="solr.TextField" indexed="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory"
Expand All @@ -228,9 +227,7 @@
preserveOriginal="1"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords.txt" />
<filter class="solr.EdgeNGramFilterFactory" minGramSize="2" maxGramSize="15" />
<!-- <filter class="solr.ReverseStringFilterFactory" /> -->
<filter class="solr.EdgeNGramFilterFactory" minGramSize="3" maxGramSize="15" />
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
</analyzer>
<analyzer type="query">
Expand All @@ -245,8 +242,39 @@
preserveOriginal="1"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords.txt" />
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
</analyzer>
</fieldType>
<!-- text_suffix: suffix matching. At index time every token is lowercased,
reversed and then decomposed into edge n-grams of 3 to 15 characters, so the
indexed grams correspond to the endings of the original token. At query time
the term is only reversed (no n-grams are generated), so a reversed query term
is compared against the indexed reversed grams. -->
<fieldType name="text_suffix" class="solr.TextField" indexed="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- split on case changes and delimiters, keeping the original token as well -->
<filter class="solr.WordDelimiterFilterFactory"
splitOnCaseChange="1"
generateWordParts="1"
generateNumberParts="1"
catenateWords="0"
catenateNumbers="0"
catenateAll="0"
preserveOriginal="1"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<!-- reverse first: the edge n-grams below then cover the end of the original token -->
<filter class="solr.ReverseStringFilterFactory" />
<filter class="solr.EdgeNGramFilterFactory" minGramSize="3" maxGramSize="15" />
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- same word delimiter settings as the index analyzer -->
<filter class="solr.WordDelimiterFilterFactory"
splitOnCaseChange="1"
generateWordParts="1"
generateNumberParts="1"
catenateWords="0"
catenateNumbers="0"
catenateAll="0"
preserveOriginal="1"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<!-- the query term is reversed to line up with the reversed grams in the index;
no n-gram filter is applied on the query side -->
<filter class="solr.ReverseStringFilterFactory" />
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
</analyzer>
</fieldType>
Expand Down Expand Up @@ -1067,7 +1095,8 @@
<field name="createauthor" type="string" />
<field name="createdate" type="tdate" />
<field name="catchall" type="text_generic" multiValued="true" stored="false" />
<field name="substrings" type="text_substring" multiValued="true" stored="false" />
<field name="text_prefix" type="text_prefix" multiValued="true" stored="false" />
<field name="text_suffix" type="text_suffix" multiValued="true" stored="false" />
<field name="phonetic" type="phonetic" multiValued="true" />
<field name="charnorm" type="text_charnorm" multiValued="true" stored="false" />

Expand Down Expand Up @@ -1259,19 +1288,33 @@
<copyField source="concept" dest="charnorm"/>
<copyField source="sentence" dest="charnorm"/>

<copyField source="attachment" dest="substrings"/>
<copyField source="category" dest="substrings"/>
<copyField source="comment" dest="substrings"/>
<copyField source="field_*" dest="substrings"/>
<copyField source="form" dest="substrings"/>
<copyField source="name" dest="substrings"/>
<copyField source="tag" dest="substrings"/>
<copyField source="text" dest="substrings"/>
<copyField source="title" dest="substrings"/>
<copyField source="topic" dest="substrings"/>
<copyField source="type" dest="substrings"/>
<copyField source="concept" dest="substrings"/>
<copyField source="sentence" dest="substrings"/>
<copyField source="attachment" dest="text_prefix"/>
<copyField source="category" dest="text_prefix"/>
<copyField source="comment" dest="text_prefix"/>
<copyField source="field_*" dest="text_prefix"/>
<copyField source="form" dest="text_prefix"/>
<copyField source="name" dest="text_prefix"/>
<copyField source="tag" dest="text_prefix"/>
<copyField source="text" dest="text_prefix"/>
<copyField source="title" dest="text_prefix"/>
<copyField source="topic" dest="text_prefix"/>
<copyField source="type" dest="text_prefix"/>
<copyField source="concept" dest="text_prefix"/>
<copyField source="sentence" dest="text_prefix"/>

<copyField source="attachment" dest="text_suffix"/>
<copyField source="category" dest="text_suffix"/>
<copyField source="comment" dest="text_suffix"/>
<copyField source="field_*" dest="text_suffix"/>
<copyField source="form" dest="text_suffix"/>
<copyField source="name" dest="text_suffix"/>
<copyField source="tag" dest="text_suffix"/>
<copyField source="text" dest="text_suffix"/>
<copyField source="title" dest="text_suffix"/>
<copyField source="topic" dest="text_suffix"/>
<copyField source="type" dest="text_suffix"/>
<copyField source="concept" dest="text_suffix"/>
<copyField source="sentence" dest="text_suffix"/>

<copyField source="attachment" dest="phonetic"/>
<copyField source="category" dest="phonetic"/>
Expand Down
15 changes: 10 additions & 5 deletions solr/configsets/foswiki_configs/conf/solrconfig.xml
Expand Up @@ -830,7 +830,8 @@
category_search
tag_search^0.5
catchall
substrings
text_prefix
text_suffix
charnorm
phonetic
</str>
Expand All @@ -851,7 +852,8 @@
category_search
tag_search^0.5
catchall
substrings
text_prefix
text_suffix
charnorm
phonetic
</str>
Expand Down Expand Up @@ -890,7 +892,8 @@
category_search
tag_search^0.5
catchall
substrings
text_prefix
text_suffix
charnorm
phonetic
</str>
Expand All @@ -911,7 +914,8 @@
category_search
tag_search^0.5
catchall
substrings
text_prefix
text_suffix
charnorm
phonetic
</str>
Expand Down Expand Up @@ -1658,7 +1662,8 @@
<str>catchall</str>
<str>spell</str>
<str>phonetic</str>
<str>substrings</str>
<str>text_prefix</str>
<str>text_suffix</str>
<str>charnorm</str>
</arr>
</processor>
Expand Down

0 comments on commit c6edfca

Please sign in to comment.