Is your _browser_ set to handle the appropriate character set? Or whatever you're using to inspect your data? How about your servlet container?
Best, Erick On Mon, Sep 10, 2012 at 7:47 AM, Pranav Prakash <pra...@gmail.com> wrote: > Hi Folks, > > I am attempting to import documents to Solr from MySQL using DIH. One of > the fields contains the text - “Future of Mobile Value Added Services (VAS) > in Australia”. Notice the characters “ and ”. > > When I am importing, it gets stored as - “Future of Mobile Value Added > Services (VAS) in Australiaâ€�. > > The datasource config clearly mentions use of UTF-8 as follows: > > <dataSource type="JdbcDataSource" > driver="com.mysql.jdbc.Driver" > url="jdbc:mysql://localhost/ohapp_devel" > user="username" > useUnicode="true" > characterEncoding="UTF-8" > password="password" > zeroDateTimeBehavior="convertToNull" > name="app" /> > > > A plain SQL Select statement on the MySQL Console gives appropriate text. I > even tried using the following scriptTransformer to get rid of this char, but > it was of no particular use in my case. > > function gsub(source, pattern, replacement) { > var match, result; > if (!((pattern != null) && (replacement != null))) { > return source; > } > result = ''; > while (source.length > 0) { > if ((match = source.match(pattern))) { > result += source.slice(0, match.index); > result += replacement; > source = source.slice(match.index + match[0].length); > } else { > result += source; > source = ''; > } > } > return result; > } > > function fixQuotes(c){ > c = gsub(c, /\342\200(?:\234|\235)/,'"'); > c = gsub(c, /\342\200(?:\230|\231)/,"'"); > c = gsub(c, /\342\200\223/,"-"); > c = gsub(c, /\342\200\246/,"..."); > c = gsub(c, /\303\242\342\202\254\342\204\242/,"'"); > c = gsub(c, /\303\242\342\202\254\302\235/,'"'); > c = gsub(c, /\303\242\342\202\254\305\223/,'"'); > c = gsub(c, /\303\242\342\202\254"/,'-'); > c = gsub(c, /\342\202\254\313\234/,'"'); > c = gsub(c, /“/, '"'); > return c; > } > > function cleanFields(row){ > var fieldsToClean = ['title', 'description']; > for(i =0; i< fieldsToClean.length; i++){ > var old_text = 
String(row.get(fieldsToClean[i])); > row.put(fieldsToClean[i], fixQuotes(old_text) ); > } > return row; > } > > My understanding is that this must be a very common problem. It also > occurs with human names which have these chars. What is an appropriate way > to get the appropriate text indexed and searchable? The fieldtype where > this is stored is as follows: > > <fieldType name="text_commongrams" class="solr.TextField"> > <analyzer> > <charFilter class="solr.HTMLStripCharFilterFactory" /> > <tokenizer class="solr.StandardTokenizerFactory" /> > <filter class="solr.RemoveDuplicatesTokenFilterFactory" /> > <filter class="solr.TrimFilterFactory" /> > <filter class="solr.LowerCaseFilterFactory" /> > <filter class="solr.SnowballPorterFilterFactory" language="English" > protected="protwords.txt"/> > <filter class="solr.SynonymFilterFactory" > synonyms="synonyms.txt" > ignoreCase="true" > expand="true" /> > <filter class="solr.CommonGramsFilterFactory" > words="stopwords_en.txt" > ignoreCase="true" /> > <filter class="solr.StopFilterFactory" > words="stopwords_en.txt" > ignoreCase="true" /> > <filter class="solr.WordDelimiterFilterFactory" > generateWordParts="1" > generateNumberParts="1" > catenateWords="1" > catenateNumbers="1" > catenateAll="0" > preserveOriginal="1" /> > </analyzer> > </fieldType> > > > *Pranav Prakash* > > "temet nosce"