Hi all.
I have a problem when I try to crawl images, specifically with the dynamic
fields of those images.
When I do a crawl, Nutch ignores these dynamic fields.
When I manually upload some images directly to the Solr index, Solr's Tika is
able to extract some metadata into dynamic fields such as width, height, and
content-type, but with a Nutch crawl those fields are ignored.
I have tried declaring them as static fields in both the Solr and Nutch
schemas, but I still get no results. Here are my schema and solrindex-mapping;
I'm using Nutch 1.4 and Solr 3.6. Any help or advice will be appreciated.
************************************************
Schema.xml

<!-- Field declarations for the index schema: each <field> gives a name, a type,
     and stored/indexed flags (plus multiValued or default where present). -->
<fields>  
<field name="segment" type="string" stored="true" indexed="false"/>
<field name="digest" type="string" stored="true" indexed="false"/>
<field name="boost" type="float" stored="true" indexed="false"/>


<!-- fields for index-basic plugin -->
        <field name="host" type="url" stored="true" indexed="true"/>
        <field name="site" type="string" stored="true" indexed="true"/>
        <field name="url" type="url" stored="true" indexed="true"/>
        <field name="content" type="text" stored="true" indexed="false"/>
        <field name="title" type="text" stored="true" indexed="true"/>
        <field name="cache" type="string" stored="true" indexed="false"/>
        <field name="tstamp" type="date" stored="true" indexed="true"
default="NOW"/>

<!-- fields for index-more plugin -->
<field name="date" type="date" stored="true" indexed="true"/>
<field name="contentLength" type="long" stored="true" indexed="true"/>
<field name="lastModified" type="date" stored="true" indexed="true"/>


<!-- fields for languageidentifier plugin -->
        <field name="lang" type="string" stored="true" indexed="true"/>

  <!-- general -->
  <field name="id" type="string" indexed="true"  stored="true"
multiValued="false"/>
  <field name="type" type="string" indexed="true"  stored="true"
multiValued="true" /> 
  <field name="name" type="string" indexed="true"  stored="true"
multiValued="false" /> 
  <field name="thumbnail" type="string" stored="true" indexed="true"/>
  <field name="core0" type="string" indexed="true"  stored="true"
multiValued="false" />
<!-- static fields -->
<!-- for PNG images: image metadata declared explicitly; every field below is
     a single-valued string that is both indexed and stored -->
<field name="content_type" type="string" indexed="true"  stored="true"
multiValued="false"/>
<field name="width" type="string" indexed="true"  stored="true"
multiValued="false"/>
<field name="height" type="string" indexed="true"  stored="true"
multiValued="false"/>
<field name="stream_name" type="string" indexed="true" stored="true"
multiValued="false"/>
<field name="stream_size" type="string" indexed="true" stored="true"
multiValued="false"/>


  <!-- Pattern-named field for names matching attr_*; declared with type
       "binary" and no explicit stored/indexed flags.
       NOTE(review): the "binary" type definition is not shown in this excerpt;
       confirm it exists and suits the extracted metadata values. -->
  <dynamicField name="attr_*" type="binary"/>
 </fields>

 <!-- field to use to determine and enforce document uniqueness. -->
 <uniqueKey>id</uniqueKey>

 <!-- field for the QueryParser to use when an explicit fieldname is absent
-->
 <defaultSearchField>name</defaultSearchField>

 <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
 <solrQueryParser defaultOperator="OR"/>
</schema>

***********************************************************************
Solrindex-mapping

<!-- Field mapping: each <field> pairs a source field name with the dest field
     name it is written to. Every entry below is an identity mapping except
     the first, which maps source "url" to dest "id". -->
<fields>
<field dest="id" source="url"/>
<field dest="host" source="host"/>
<field dest="site" source="site"/>
<field dest="title" source="title"/>
<field dest="tstamp" source="tstamp"/>

<field dest="type" source="type"/>

<field dest="date" source="date"/>
<field dest="lang" source="lang"/>
<field dest="contentLength" source="contentLength"/>
<field dest="lastModified" source="lastModified"/>


<!-- For the specific image types -->
<field dest="content_type" source="content_type"/>
<field dest="width" source="width"/>
<field dest="height" source="height"/>
<field dest="stream_name" source="stream_name"/>
<field dest="stream_size" source="stream_size"/>


<field dest="thumbnail" source="thumbnail"/>
        </fields>
        <uniqueKey>id</uniqueKey>
</mapping>




_____________________________________________________________________
Ing. Eyeris Rodriguez Rueda
Teléfono:837-3370
Universidad de las Ciencias Informáticas
_____________________________________________________________________


10mo. ANIVERSARIO DE LA CREACION DE LA UNIVERSIDAD DE LAS CIENCIAS 
INFORMATICAS...
CONECTADOS AL FUTURO, CONECTADOS A LA REVOLUCION

http://www.uci.cu
http://www.facebook.com/universidad.uci
http://www.flickr.com/photos/universidad_uci

Reply via email to