Hi all.
I have a problem when I try to crawl images, specifically with the dynamic
fields of those images.
When I do a crawl, Nutch ignores these dynamic fields.
When I manually upload some images directly to the Solr index, Solr's Tika is
able to extract some metadata into dynamic fields like width, height,
content-type, but with a Nutch crawl those fields are ignored.
I have tried declaring them as static fields in both the Solr and Nutch
schemas, but still without results. Here are my schema and solrindex-mapping;
I'm using Nutch 1.4 and Solr 3.6. Any help or advice will be appreciated.
************************************************
Schema.xml
<fields>
  <!-- Stored-only fields: kept with the document but not indexed. -->
  <field name="segment" type="string" indexed="false" stored="true"/>
  <field name="digest" type="string" indexed="false" stored="true"/>
  <field name="boost" type="float" indexed="false" stored="true"/>

  <!-- Fields for the index-basic plugin. -->
  <field name="host" type="url" indexed="true" stored="true"/>
  <field name="site" type="string" indexed="true" stored="true"/>
  <field name="url" type="url" indexed="true" stored="true"/>
  <field name="content" type="text" indexed="false" stored="true"/>
  <field name="title" type="text" indexed="true" stored="true"/>
  <field name="cache" type="string" indexed="false" stored="true"/>
  <field name="tstamp" type="date" indexed="true" stored="true" default="NOW"/>

  <!-- Fields for the index-more plugin. -->
  <field name="date" type="date" indexed="true" stored="true"/>
  <field name="contentLength" type="long" indexed="true" stored="true"/>
  <field name="lastModified" type="date" indexed="true" stored="true"/>

  <!-- Field for the languageidentifier plugin. -->
  <field name="lang" type="string" indexed="true" stored="true"/>

  <!-- General fields; "id" is the one referenced by uniqueKey. -->
  <field name="id" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="type" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="name" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="thumbnail" type="string" indexed="true" stored="true"/>
  <field name="core0" type="string" indexed="true" stored="true" multiValued="false"/>

  <!-- Static fields, declared explicitly for PNG images. -->
  <field name="content_type" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="width" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="height" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="stream_name" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="stream_size" type="string" indexed="true" stored="true" multiValued="false"/>

  <!-- Catch-all for any field whose name starts with "attr_".
       NOTE(review): the "binary" fieldType definition is not shown in this
       excerpt; verify that it suits the values this field is expected to
       receive (TODO confirm). -->
  <dynamicField name="attr_*" type="binary"/>
</fields>
<!-- Field used to determine and enforce document uniqueness. -->
<uniqueKey>id</uniqueKey>
<!-- Field the QueryParser uses when a query gives no explicit field name. -->
<defaultSearchField>name</defaultSearchField>
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser defaultOperator="OR"/>
</schema>
***********************************************************************
Solrindex-mapping
<fields>
  <!-- Each entry copies the "source" field into the "dest" field;
       the url is mapped to the id field. -->
  <field dest="id" source="url"/>
  <field dest="host" source="host"/>
  <field dest="site" source="site"/>
  <field dest="title" source="title"/>
  <field dest="tstamp" source="tstamp"/>
  <field dest="type" source="type"/>
  <field dest="date" source="date"/>
  <field dest="lang" source="lang"/>
  <field dest="contentLength" source="contentLength"/>
  <field dest="lastModified" source="lastModified"/>

  <!-- For the specific image types. -->
  <field dest="content_type" source="content_type"/>
  <field dest="width" source="width"/>
  <field dest="height" source="height"/>
  <field dest="stream_name" source="stream_name"/>
  <field dest="stream_size" source="stream_size"/>
  <field dest="thumbnail" source="thumbnail"/>
</fields>
<!-- Names the id field as the unique key. -->
<uniqueKey>id</uniqueKey>
</mapping>
_____________________________________________________________________
Ing. Eyeris Rodriguez Rueda
Teléfono:837-3370
Universidad de las Ciencias Informáticas
_____________________________________________________________________
10mo. ANIVERSARIO DE LA CREACION DE LA UNIVERSIDAD DE LAS CIENCIAS
INFORMATICAS...
CONECTADOS AL FUTURO, CONECTADOS A LA REVOLUCION
http://www.uci.cu
http://www.facebook.com/universidad.uci
http://www.flickr.com/photos/universidad_uci