We are using Solr as the search engine for our public access library
catalog. In testing I did a search for a French movie that I know is in
the catalog named: "Kirikou et la sorcière" and nothing was returned.
If I search for just the word "Kirikou", several results are returned,
and the problem becomes apparent. The records contain "Kirikou et la
sorcie?re" where the accent is a unicode combining character following
the "e".
After some research into Unicode normalization, I found and installed a
Unicode normalization filter that is set to convert letters followed by
combining codes into the precomposed form. I also installed a
solr.ISOLatin1AccentFilterFactory that will then convert these
precomposed forms into the latin equivalent without any accent. The
following is the fieldType definition taken from the schema.xml file:
<!-- General-purpose "text" field type: separate analysis chains for indexing and querying. -->
<fieldType name="text"
           class="solr.TextField"
           positionIncrementGap="100">

  <!-- Chain applied to field values at index time. -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Custom (non-stock) filter: intended to turn a letter followed by a
         combining mark into its precomposed form.
         NOTE(review): in the analysis.jsp output the term text after this
         filter is shown as "(Kirikou,0,7)", and WordDelimiterFilterFactory
         then emits "Kirikou", "0" and "7" as separate terms. This suggests the
         filter may be writing the token's string form (text plus offsets)
         into the term text. TODO confirm against the filter implementation. -->
    <filter class="schema.UnicodeNormalizationFilterFactory"/>
    <!-- Replaces precomposed accented Latin-1 letters with unaccented ones. -->
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"/>
    <!-- Index side catenates word and number parts (catenateWords/Numbers = 1). -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPorterFilterFactory"
            protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

  <!-- Chain applied to query text; differs from the index chain by adding
       synonym expansion and by not catenating word/number parts. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Same custom normalization filter as on the index side; the review
         note on that filter applies here as well (query term text is shown
         as "(sorciere,0,8)" in the analysis output). TODO confirm. -->
    <filter class="schema.UnicodeNormalizationFilterFactory"/>
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPorterFilterFactory"
            protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

</fieldType>
So it seems like this should work.
However again searching for "Kirikou et la sorcière" or "sorcière" or
"sorcie?re" or just "sorciere" doesn't return the document in question.
I've tried looking at the results from solr/admin/analysis.jsp entering
in text from the record for the Field value (Index) and entering in
sorciere in the Field value (Query) and I get the following results, which
seem to indicate that there should be a match between the stemmed entry
"sorcier" in the record and the stemmed word "sorcier" from the query.
So clearly I am either doing something wrong or misinterpreting the
analyzers, but I am at a loss as to how to figure out what is wrong.
Any suggestions?
org.apache.solr.analysis.WhitespaceTokenizerFactory {}
term position 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
17 18 19 20 21 22 23 24 25 26 27 28 29
term text Kirikou et la sorcie?re France 3 Cinema / RTBF
(Te?le?vision belge). Grand Prix du festival d'Annecy 1999
France French VHS VIDEO .VHS10969 1 vide?ocassette (1h10 min.)
(VHS) Ocelot, Michel
term type word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word
source start,end 0,7 8,10 11,13 14,23 25,31 32,33 34,40 41,42
43,47 48,61 62,69 72,77 78,82 83,85 86,94 95,103 104,108
110,116 117,123 124,127 129,134 135,144 147,148 149,163 164,169
170,175 176,181 183,190 191,197
schema.UnicodeNormalizationFilterFactory {}
term position 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
17 18 19 20 21 22 23 24 25 26 27 28 29
term text (Kirikou,0,7) (et,8,10) (la,11,13) (sorcière,14,23)
(France,25,31) (3,32,33) (Cinema,34,40) (/,41,42) (RTBF,43,47)
((Télévision,48,61) (belge).,62,69) (Grand,72,77) (Prix,78,82)
(du,83,85) (festival,86,94) (d'Annecy,95,103) (1999,104,108)
(France,110,116) (French,117,123) (VHS,124,127) (VIDEO,129,134)
(.VHS10969,135,144) (1,147,148) (vidéocassette,149,163)
((1h10,164,169) (min.),170,175) ((VHS),176,181) (Ocelot,,183,190)
(Michel,191,197)
term type word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word
source start,end 0,7 8,10 11,13 14,23 25,31 32,33 34,40 41,42
43,47 48,61 62,69 72,77 78,82 83,85 86,94 95,103 104,108
110,116 117,123 124,127 129,134 135,144 147,148 149,163 164,169
170,175 176,181 183,190 191,197
org.apache.solr.analysis.ISOLatin1AccentFilterFactory {}
term position 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
17 18 19 20 21 22 23 24 25 26 27 28 29
term text (Kirikou,0,7) (et,8,10) (la,11,13) (sorciere,14,23)
(France,25,31) (3,32,33) (Cinema,34,40) (/,41,42) (RTBF,43,47)
((Television,48,61) (belge).,62,69) (Grand,72,77) (Prix,78,82)
(du,83,85) (festival,86,94) (d'Annecy,95,103) (1999,104,108)
(France,110,116) (French,117,123) (VHS,124,127) (VIDEO,129,134)
(.VHS10969,135,144) (1,147,148) (videocassette,149,163)
((1h10,164,169) (min.),170,175) ((VHS),176,181) (Ocelot,,183,190)
(Michel,191,197)
term type word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word
source start,end 0,7 8,10 11,13 14,23 25,31 32,33 34,40 41,42
43,47 48,61 62,69 72,77 78,82 83,85 86,94 95,103 104,108
110,116 117,123 124,127 129,134 135,144 147,148 149,163 164,169
170,175 176,181 183,190 191,197
org.apache.solr.analysis.StopFilterFactory {words=stopwords.txt,
ignoreCase=true}
term position 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
17 18 19 20 21 22 23 24 25 26 27 28 29
term text (Kirikou,0,7) (et,8,10) (la,11,13) (sorciere,14,23)
(France,25,31) (3,32,33) (Cinema,34,40) (/,41,42) (RTBF,43,47)
((Television,48,61) (belge).,62,69) (Grand,72,77) (Prix,78,82)
(du,83,85) (festival,86,94) (d'Annecy,95,103) (1999,104,108)
(France,110,116) (French,117,123) (VHS,124,127) (VIDEO,129,134)
(.VHS10969,135,144) (1,147,148) (videocassette,149,163)
((1h10,164,169) (min.),170,175) ((VHS),176,181) (Ocelot,,183,190)
(Michel,191,197)
term type word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word
source start,end 0,7 8,10 11,13 14,23 25,31 32,33 34,40 41,42
43,47 48,61 62,69 72,77 78,82 83,85 86,94 95,103 104,108
110,116 117,123 124,127 129,134 135,144 147,148 149,163 164,169
170,175 176,181 183,190 191,197
org.apache.solr.analysis.WordDelimiterFilterFactory
{catenateWords=1, catenateNumbers=1, catenateAll=0,
generateNumberParts=1, generateWordParts=1}
term position 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
85 86 87 88 89 90
term text Kirikou 0 7 et 8 10 la 11 13 sorciere 14 23
France 25 31 3 32 33 Cinema 34 40 41 42 RTBF 43 47
Television 48 61 belge 62 69 Grand 72 77 Prix 78 82 du
83 85 festival 86 94 d Annecy 95 103 1999 104 108 France
110 116 French 117 123 VHS 124 127 VIDEO 129 134 VHS
10969 135 144 1 147 148 videocassette 149 163 1 h 10 164
169 min 170 175 VHS 176 181 Ocelot 183 190 Michel 191 197
07 810 1113 1423 2531 33233 3440 4142 4347 4861 6269 7277
7882 8385 8694 dAnnecy 95103 1999104108 110116 117123 124127
129134 10969135144 1147148 149163 10164169 170175 176181 183190
191197
term type word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word
source start,end 0,7 0,7 0,7 8,10 8,10 8,10 11,13 11,13 11,13
14,23 14,23 14,23 25,31 25,31 25,31 32,33 32,33 32,33 34,40
34,40 34,40 41,42 41,42 43,47 43,47 43,47 48,61 48,61 48,61
62,69 62,69 62,69 72,77 72,77 72,77 78,82 78,82 78,82 83,85
83,85 83,85 86,94 86,94 86,94 95,103 95,103 95,103 95,103
104,108 104,108 104,108 110,116 110,116 110,116 117,123 117,123
117,123 124,127 124,127 124,127 129,134 129,134 129,134 135,144
135,144 135,144 135,144 147,148 147,148 147,148 149,163 149,163
149,163 164,169 164,169 164,169 164,169 164,169 170,175 170,175
170,175 176,181 176,181 176,181 183,190 183,190 183,190 191,197
191,197 191,197
0,7 8,10 11,13 14,23 25,31 32,33 34,40 41,42 43,47 48,61
62,69 72,77 78,82 83,85 86,94 95,103 95,103 104,108 110,116
117,123 124,127 129,134 135,144 147,148 149,163 164,169 170,175
176,181 183,190 191,197
org.apache.solr.analysis.LowerCaseFilterFactory {}
term position 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
85 86 87 88 89 90
term text kirikou 0 7 et 8 10 la 11 13 sorciere 14 23
france 25 31 3 32 33 cinema 34 40 41 42 rtbf 43 47
television 48 61 belge 62 69 grand 72 77 prix 78 82 du
83 85 festival 86 94 d annecy 95 103 1999 104 108 france
110 116 french 117 123 vhs 124 127 video 129 134 vhs
10969 135 144 1 147 148 videocassette 149 163 1 h 10 164
169 min 170 175 vhs 176 181 ocelot 183 190 michel 191 197
07 810 1113 1423 2531 33233 3440 4142 4347 4861 6269 7277
7882 8385 8694 dannecy 95103 1999104108 110116 117123 124127
129134 10969135144 1147148 149163 10164169 170175 176181 183190
191197
term type word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word
source start,end 0,7 0,7 0,7 8,10 8,10 8,10 11,13 11,13 11,13
14,23 14,23 14,23 25,31 25,31 25,31 32,33 32,33 32,33 34,40
34,40 34,40 41,42 41,42 43,47 43,47 43,47 48,61 48,61 48,61
62,69 62,69 62,69 72,77 72,77 72,77 78,82 78,82 78,82 83,85
83,85 83,85 86,94 86,94 86,94 95,103 95,103 95,103 95,103
104,108 104,108 104,108 110,116 110,116 110,116 117,123 117,123
117,123 124,127 124,127 124,127 129,134 129,134 129,134 135,144
135,144 135,144 135,144 147,148 147,148 147,148 149,163 149,163
149,163 164,169 164,169 164,169 164,169 164,169 170,175 170,175
170,175 176,181 176,181 176,181 183,190 183,190 183,190 191,197
191,197 191,197
0,7 8,10 11,13 14,23 25,31 32,33 34,40 41,42 43,47 48,61
62,69 72,77 78,82 83,85 86,94 95,103 95,103 104,108 110,116
117,123 124,127 129,134 135,144 147,148 149,163 164,169 170,175
176,181 183,190 191,197
org.apache.solr.analysis.EnglishPorterFilterFactory
{protected=protwords.txt}
term position 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
85 86 87 88 89 90
term text kirikou 0 7 et 8 10 la 11 13 sorcier 14 23
franc 25 31 3 32 33 cinema 34 40 41 42 rtbf 43 47
televis 48 61 belg 62 69 grand 72 77 prix 78 82 du 83 85
festiv 86 94 d anneci 95 103 1999 104 108 franc 110 116
french 117 123 vhs 124 127 video 129 134 vhs 10969 135
144 1 147 148 videocassett 149 163 1 h 10 164 169 min
170 175 vhs 176 181 ocelot 183 190 michel 191 197
07 810 1113 1423 2531 33233 3440 4142 4347 4861 6269 7277
7882 8385 8694 danneci 95103 1999104108 110116 117123 124127
129134 10969135144 1147148 149163 10164169 170175 176181 183190
191197
term type word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word
source start,end 0,7 0,7 0,7 8,10 8,10 8,10 11,13 11,13 11,13
14,23 14,23 14,23 25,31 25,31 25,31 32,33 32,33 32,33 34,40
34,40 34,40 41,42 41,42 43,47 43,47 43,47 48,61 48,61 48,61
62,69 62,69 62,69 72,77 72,77 72,77 78,82 78,82 78,82 83,85
83,85 83,85 86,94 86,94 86,94 95,103 95,103 95,103 95,103
104,108 104,108 104,108 110,116 110,116 110,116 117,123 117,123
117,123 124,127 124,127 124,127 129,134 129,134 129,134 135,144
135,144 135,144 135,144 147,148 147,148 147,148 149,163 149,163
149,163 164,169 164,169 164,169 164,169 164,169 170,175 170,175
170,175 176,181 176,181 176,181 183,190 183,190 183,190 191,197
191,197 191,197
0,7 8,10 11,13 14,23 25,31 32,33 34,40 41,42 43,47 48,61
62,69 72,77 78,82 83,85 86,94 95,103 95,103 104,108 110,116
117,123 124,127 129,134 135,144 147,148 149,163 164,169 170,175
176,181 183,190 191,197
org.apache.solr.analysis.RemoveDuplicatesTokenFilterFactory {}
term position 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
85 86 87 88 89 90
term text kirikou 0 7 et 8 10 la 11 13 sorcier 14 23
franc 25 31 3 32 33 cinema 34 40 41 42 rtbf 43 47
televis 48 61 belg 62 69 grand 72 77 prix 78 82 du 83 85
festiv 86 94 d anneci 95 103 1999 104 108 franc 110 116
french 117 123 vhs 124 127 video 129 134 vhs 10969 135
144 1 147 148 videocassett 149 163 1 h 10 164 169 min
170 175 vhs 176 181 ocelot 183 190 michel 191 197
07 810 1113 1423 2531 33233 3440 4142 4347 4861 6269 7277
7882 8385 8694 danneci 95103 1999104108 110116 117123 124127
129134 10969135144 1147148 149163 10164169 170175 176181 183190
191197
term type word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word
word word word word word word word word word word word word
word word word word word word word word word word word word
word word word word word word
source start,end 0,7 0,7 0,7 8,10 8,10 8,10 11,13 11,13 11,13
14,23 14,23 14,23 25,31 25,31 25,31 32,33 32,33 32,33 34,40
34,40 34,40 41,42 41,42 43,47 43,47 43,47 48,61 48,61 48,61
62,69 62,69 62,69 72,77 72,77 72,77 78,82 78,82 78,82 83,85
83,85 83,85 86,94 86,94 86,94 95,103 95,103 95,103 95,103
104,108 104,108 104,108 110,116 110,116 110,116 117,123 117,123
117,123 124,127 124,127 124,127 129,134 129,134 129,134 135,144
135,144 135,144 135,144 147,148 147,148 147,148 149,163 149,163
149,163 164,169 164,169 164,169 164,169 164,169 170,175 170,175
170,175 176,181 176,181 176,181 183,190 183,190 183,190 191,197
191,197 191,197
0,7 8,10 11,13 14,23 25,31 32,33 34,40 41,42 43,47 48,61
62,69 72,77 78,82 83,85 86,94 95,103 95,103 104,108 110,116
117,123 124,127 129,134 135,144 147,148 149,163 164,169 170,175
176,181 183,190 191,197
Query Analyzer
org.apache.solr.analysis.WhitespaceTokenizerFactory {}
term position 1
term text sorciere
term type word
source start,end 0,8
schema.UnicodeNormalizationFilterFactory {}
term position 1
term text (sorciere,0,8)
term type word
source start,end 0,8
org.apache.solr.analysis.ISOLatin1AccentFilterFactory {}
term position 1
term text (sorciere,0,8)
term type word
source start,end 0,8
org.apache.solr.analysis.SynonymFilterFactory {expand=true,
ignoreCase=true, synonyms=synonyms.txt}
term position 1
term text (sorciere,0,8)
term type word
source start,end 0,8
org.apache.solr.analysis.StopFilterFactory {words=stopwords.txt,
ignoreCase=true}
term position 1
term text (sorciere,0,8)
term type word
source start,end 0,8
org.apache.solr.analysis.WordDelimiterFilterFactory
{catenateWords=0, catenateNumbers=0, catenateAll=0,
generateNumberParts=1, generateWordParts=1}
term position 1 2 3
term text sorciere 0 8
term type word word word
source start,end 0,8 0,8 0,8
org.apache.solr.analysis.LowerCaseFilterFactory {}
term position 1 2 3
term text sorciere 0 8
term type word word word
source start,end 0,8 0,8 0,8
org.apache.solr.analysis.EnglishPorterFilterFactory
{protected=protwords.txt}
term position 1 2 3
term text sorcier 0 8
term type word word word
source start,end 0,8 0,8 0,8
org.apache.solr.analysis.RemoveDuplicatesTokenFilterFactory {}
term position 1 2 3
term text sorcier 0 8
term type word word word
source start,end 0,8 0,8 0,8