Hi - I don't know the specific field names, but you can check them using
the parsechecker tool; it prints all detected metadata.
 
-----Original message-----
> From:Jorge Luis Betancourt Gonzalez <[email protected]>
> Sent: Wed 24-Oct-2012 16:05
> To: [email protected]
> Subject: Re: problems with image dynamic fields in nutch 1.4
> 
> And how can I find out the names of the fields generated by the Tika parser?
> Is there any prefix used?
> 
> Greetings,
> 
> On Oct 24, 2012, at 10:02 AM, Markus Jelsma <[email protected]> 
> wrote:
> 
> > Hi - you need a custom indexing filter that adds the fields from parsemeta 
> > to the document.
> > 
> > Cheers,
> > 
> > 
> > 
> > -----Original message-----
> >> From:Eyeris Rodriguez Rueda <[email protected]>
> >> Sent: Wed 24-Oct-2012 14:59
> >> To: [email protected]
> >> Subject: problems with image dynamic fields in nutch 1.4 
> >> 
> >> Hi all.
> >> I have a problem when I try to crawl images, specifically with the dynamic
> >> fields of those images.
> >> When I do a crawl, Nutch ignores these dynamic fields.
> >> When I manually upload some images directly to the Solr index, Solr's Tika
> >> is able to extract some metadata into dynamic fields such as width, height,
> >> and content-type, but with a Nutch crawl those fields are ignored.
> >> I have tried declaring them as static fields in the Solr and Nutch schemas,
> >> but still without results. Here are my schema and solrindex-mapping; I'm
> >> using Nutch 1.4 and Solr 3.6. Any help or advice will be appreciated.
> >> ************************************************
> >> Schema.xml
> >> 
> >> <fields>  
> >> <field name="segment" type="string" stored="true" indexed="false"/>
> >> <field name="digest" type="string" stored="true" indexed="false"/>
> >> <field name="boost" type="float" stored="true" indexed="false"/>
> >> 
> >> 
> >> <!-- fields for index-basic plugin -->
> >>        <field name="host" type="url" stored="true" indexed="true"/>
> >>        <field name="site" type="string" stored="true" indexed="true"/>
> >>        <field name="url" type="url" stored="true" indexed="true"/>
> >>        <field name="content" type="text" stored="true" indexed="false"/>
> >>        <field name="title" type="text" stored="true" indexed="true"/>
> >>        <field name="cache" type="string" stored="true" indexed="false"/>
> >>        <field name="tstamp" type="date" stored="true" indexed="true"
> >> default="NOW"/>
> >> 
> >> <!-- fields for index-more plugin -->
> >> <field name="date" type="date" stored="true" indexed="true"/>
> >> <field name="contentLength" type="long" stored="true" indexed="true"/>
> >> <field name="lastModified" type="date" stored="true" indexed="true"/>
> >> 
> >> 
> >> <!-- fields for languageidentifier plugin -->
> >>        <field name="lang" type="string" stored="true" indexed="true"/>
> >> 
> >>  <!-- general -->
> >>  <field name="id" type="string" indexed="true"  stored="true"
> >> multiValued="false"/>
> >>  <field name="type" type="string" indexed="true"  stored="true"
> >> multiValued="true" /> 
> >>  <field name="name" type="string" indexed="true"  stored="true"
> >> multiValued="false" /> 
> >>  <field name="thumbnail" type="string" stored="true" indexed="true"/>
> >>  <field name="core0" type="string" indexed="true"  stored="true"
> >> multiValued="false" />
> >> <!-- campos estáticos -->
> >> <!-- para los png -->
> >> <field name="content_type" type="string" indexed="true"  stored="true"
> >> multiValued="false"/>
> >> <field name="width" type="string" indexed="true"  stored="true"
> >> multiValued="false"/>
> >> <field name="height" type="string" indexed="true"  stored="true"
> >> multiValued="false"/>
> >> <field name="stream_name" type="string" indexed="true" stored="true"
> >> multiValued="false"/>
> >> <field name="stream_size" type="string" indexed="true" stored="true"
> >> multiValued="false"/>
> >> 
> >> 
> >>  <dynamicField name="attr_*" type="binary"/>
> >> </fields>
> >> 
> >> <!-- field to use to determine and enforce document uniqueness. -->
> >> <uniqueKey>id</uniqueKey>
> >> 
> >> <!-- field for the QueryParser to use when an explicit fieldname is absent
> >> -->
> >> <defaultSearchField>name</defaultSearchField>
> >> 
> >> <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
> >> <solrQueryParser defaultOperator="OR"/>
> >> </schema>
> >> 
> >> ***********************************************************************
> >> Solrindex-mapping
> >> 
> >> <fields>
> >> <field dest="id" source="url"/>
> >> <field dest="host" source="host"/>
> >> <field dest="site" source="site"/>
> >> <field dest="title" source="title"/>
> >> <field dest="tstamp" source="tstamp"/>
> >> 
> >> <field dest="type" source="type"/>
> >> 
> >> <field dest="date" source="date"/>
> >> <field dest="lang" source="lang"/>
> >> <field dest="contentLength" source="contentLength"/>
> >> <field dest="lastModified" source="lastModified"/>
> >> 
> >> 
> >> <!--Para los tipos de imágenes específicos -->
> >> <field dest="content_type" source="content_type"/>
> >> <field dest="width" source="width"/>
> >> <field dest="height" source="height"/>
> >> <field dest="stream_name" source="stream_name"/>
> >> <field dest="stream_size" source="stream_size"/>
> >> 
> >> 
> >> <field dest="thumbnail" source="thumbnail"/>
> >>    </fields>
> >>    <uniqueKey>id</uniqueKey>
> >> </mapping>
> >> 
> >> 
> >> 
> >> 
> >> _____________________________________________________________________
> >> Ing. Eyeris Rodriguez Rueda
> >> Teléfono:837-3370
> >> Universidad de las Ciencias Informáticas
> >> _____________________________________________________________________
> >> 
> >> 
> >> 10mo. ANIVERSARIO DE LA CREACION DE LA UNIVERSIDAD DE LAS CIENCIAS 
> >> INFORMATICAS...
> >> CONECTADOS AL FUTURO, CONECTADOS A LA REVOLUCION
> >> 
> >> http://www.uci.cu
> >> http://www.facebook.com/universidad.uci
> >> http://www.flickr.com/photos/universidad_uci
> >> 
> > 
> > 10mo. ANIVERSARIO DE LA CREACION DE LA UNIVERSIDAD DE LAS CIENCIAS 
> > INFORMATICAS...
> > CONECTADOS AL FUTURO, CONECTADOS A LA REVOLUCION
> > 
> > http://www.uci.cu
> > http://www.facebook.com/universidad.uci
> > http://www.flickr.com/photos/universidad_uci
> 
> 
> 10mo. ANIVERSARIO DE LA CREACION DE LA UNIVERSIDAD DE LAS CIENCIAS 
> INFORMATICAS...
> CONECTADOS AL FUTURO, CONECTADOS A LA REVOLUCION
> 
> http://www.uci.cu
> http://www.facebook.com/universidad.uci
> http://www.flickr.com/photos/universidad_uci
> 

Reply via email to