I think it would greatly help if you say specifically where you are
stuck. Otherwise, there are too many directions to guess. The
configuration snippet you have is a little too large to 'parse'.

I believe DataImportHandler has some support for nested processors.
Have you tried using those and run into problems?

Do you want extra custom processing for the blobs? Have you tried
writing a CustomProcessor that calls Tika, parses the content, and
adds it to the record? I am doing this to merge files on the filesystem
with metadata records during indexing (for a test). If that sounds
similar to what you are doing, I can share my sample privately.

Otherwise, just try to be very specific about:
*) What you are trying to do
*) What you are actually doing to get there, and
*) What specifically you are getting stuck on (Exception? Missed
records? Out of memory? etc)


Personal blog: http://blog.outerthoughts.com/
LinkedIn: http://www.linkedin.com/in/alexandrerafalovitch
- Time is the quality of nature that keeps events from happening all
at once. Lately, it doesn't seem to be working. (Anonymous - via GTD book)

On Thu, Aug 23, 2012 at 2:40 PM, anarchos78
<rigasathanasio...@hotmail.com> wrote:
> Greeting friends,
> Straight to the point. I have stored many BLOBS in a Mysql DB. These are
> mainly PDF's(80%) and .doc. I have also text in the DB. Till now i have
> indexed and i can query the text, but i cannot index the BLOBS. I am trying
> to make a single collection(document)-but sucks. Is there any recipe on how
> to do such a thing?
> *A portion of data-config.xml:*
> <?xml version="1.0" encoding="utf-8"?>
> <dataConfig>
>   <dataSource type="JdbcDataSource"
>                   autoCommit="true" batchSize="-1"
>                   convertType="false"
>                   driver="com.mysql.jdbc.Driver"
>                   url="jdbc:mysql://"
>                   user="root"
>                   password="1a2b3c4d"
>                   name="db"/>
>                  <dataSource name="fieldReader" type="FieldStreamDataSource" 
> />
>   <document>
>   <entity name="aitiologikes_ektheseis"
>         dataSource="db"
>         transformer="HTMLStripTransformer"
>         query="select id, title, title AS grid_title, model, type, url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT(
> body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'"
>         deltaImportQuery="select id, title, title AS grid_title, model, type, 
> url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT(
> body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'
> and id='${dataimporter.delta.id}'"
>         deltaQuery="select id, title, title AS grid_title, model, type, url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT(
> body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'
> and last_modified > '${dataimporter.last_index_time}'">
>                 <field column="id" name="ida" />
>                 <field column="solr_id" name="solr_id" />
>                 <field column="title" name="title" stripHTML="true" />
>                 <field column="grid_title" name="grid_title" stripHTML="true" 
> />
>                 <field column="model" name="model" stripHTML="true" />
>                 <field column="type" name="type" stripHTML="true" />
>                 <field column="url" name="url" stripHTML="true" />
>                 <field column="last_modified" name="last_modified" 
> stripHTML="true"  />
>                 <field column="search_tag" name="search_tag" stripHTML="true" 
> />
>                 <field column="content" name="content" stripHTML="true" />
>     </entity>
>     <entity name="aitiologikes_ektheseis_bin"
>           query="select id, title, title AS grid_title, model, type, url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS
> text from aitiologikes_ektheseis where type = 'bin'"
>           deltaImportQuery="select id, title, title AS grid_title, model, 
> type,
> url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con
> AS text from aitiologikes_ektheseis where type = 'bin' and
> id='${dataimporter.delta.id}'"
>           deltaQuery="select id, title, title AS grid_title, model, type, url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS
> text from aitiologikes_ektheseis where type = 'bin' and last_modified >
> '${dataimporter.last_index_time}'"
>           transformer="TemplateTransformer"
>           dataSource="db">
>                   <field column="id" name="ida" />
>                 <field column="solr_id" name="solr_id" />
>                   <field column="title" name="title" stripHTML="true" />
>                   <field column="grid_title" name="grid_title" 
> stripHTML="true" />
>                   <field column="model" name="model" stripHTML="true" />
>                   <field column="type" name="type" stripHTML="true" />
>                   <field column="url" name="url" stripHTML="true" />
>                   <field column="last_modified" name="last_modified" 
> stripHTML="true"  />
>                   <field column="search_tag" name="search_tag" 
> stripHTML="true" />
>                 <entity dataSource="fieldReader" 
> processor="TikaEntityProcessor"
> dataField="aitiologikes_ektheseis_bin.text" format="text">
>                   <field column="text" name="contentbin" stripHTML="true" />
>                 </entity>
>         </entity>
> ...
> ...
>     </document>
> </dataConfig>
> *A portion from schema.xml (the fieldTypes and filed definition):*
> <fieldType name="text_ktimatologio" class="solr.TextField"
> positionIncrementGap="100">
>       <analyzer type="index">
>         <tokenizer class="solr.StandardTokenizerFactory"/>
>         <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
>         <filter class="solr.LowerCaseFilterFactory"/>
>             <filter class="solr.EnglishPossessiveFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
>             <filter class="solr.GreekLowerCaseFilterFactory"/>
>             <filter class="solr.GreekStemFilterFactory"/>
>         <filter class="solr.KeywordMarkerFilterFactory"
> protected="protwords.txt"/>
>         <filter class="solr.PorterStemFilterFactory"/>
>       </analyzer>
>       <analyzer type="query">
>         <tokenizer class="solr.StandardTokenizerFactory"/>
>         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
>         <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
>                 <filter class="solr.GreekLowerCaseFilterFactory"/>
>         <filter class="solr.GreekStemFilterFactory"/>
>         <filter class="solr.LowerCaseFilterFactory"/>
>             <filter class="solr.EnglishPossessiveFilterFactory"/>
>         <filter class="solr.KeywordMarkerFilterFactory"
> protected="protwords.txt"/>
>         <filter class="solr.PorterStemFilterFactory"/>
>       </analyzer>
> </fieldType>
> <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
>         <analyzer type="index">
>                 <charFilter class="solr.HTMLStripCharFilterFactory"/>
>                 <tokenizer class="solr.StandardTokenizerFactory"/>
>                 <filter class="solr.StandardFilterFactory"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
>                 <filter class="solr.GreekLowerCaseFilterFactory"/>
>                 <filter class="solr.GreekStemFilterFactory"/>
>                 <filter class="solr.HunspellStemFilterFactory"
> dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff"
> ignoreCase="true" />
>         </analyzer>
>         <analyzer type="query">
>                 <charFilter class="solr.HTMLStripCharFilterFactory"/>
>                 <tokenizer class="solr.StandardTokenizerFactory"/>
>                 <filter class="solr.StandardFilterFactory"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
>                 <filter class="solr.GreekLowerCaseFilterFactory"/>
>                 <filter class="solr.GreekStemFilterFactory"/>
>                 <filter class="solr.HunspellStemFilterFactory"
> dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff"
> ignoreCase="true" />
>         </analyzer>
> </fieldType>
> <fields>
>   <field  name="ida" type="string" indexed="true" stored="true"
> multiValued="false"/>
>   <field  name="solr_id" type="string" indexed="true" stored="true"
> multiValued="false"/>
>   <field  name="title" type="text_ktimatologio" indexed="true"
> stored="true"/>
>   <field  name="grid_title" type="text_ktimatologio" indexed="true"
> stored="true"/>
>   <field  name="model" type="string" indexed="true" stored="true"
> multiValued="false"/>
>   <field  name="type" type="string" indexed="true" stored="true"/>
>   <field  name="url" type="string" indexed="true" stored="true"/>
>   <field  name="last_modified" type="string" indexed="true" stored="true"/>
>   <field  name="search_tag" type="string" indexed="true" stored="true"/>
>   <field  name="contentbin" type="text" indexed="true" stored="true"
> multiValued="true"/>
>   <field  name="content" type="text_ktimatologio" indexed="true"
> stored="true" multiValued="true"/>
> </fields>
> I really need help on this!
> With respect,
> Tom
> Greece
> --
> View this message in context: 
> http://lucene.472066.n3.nabble.com/Indexing-and-querying-BLOBS-stored-in-Mysql-tp4002940.html
> Sent from the Solr - User mailing list archive at Nabble.com.

Reply via email to