Hi, I am new to Nutch and Solr; please help!
I am using Nutch 1.9, Solr 4.10.2 and Hadoop 2.4.1. I always get org.apache.solr.common.SolrException: unknown field 'host'. What could be wrong? The schema.xml has <field name="host" type="string" stored="false" indexed="true"/>. I have already copied [nutch] schema.xml to [solr] schema.xml and restarted Solr. During the indexing phase, Nutch always returns the "unknown field 'host'" error. Below are my settings. What could be wrong? Regards, Arthur input_url/seed.txt http://nutch.apache.org/ conf/solrindex-mapping.xml <?xml version="1.0" encoding="UTF-8"?> <mapping> <fields> <field dest="content" source="content"/> <field dest="title" source="title"/> <field dest="host" source="host"/> <field dest="segment" source="segment"/> <field dest="boost" source="boost"/> <field dest="digest" source="digest"/> <field dest="tstamp" source="tstamp"/> <field dest="subject" source="subject"/> <field dest="description" source="description"/> <field dest="comments" source="comments"/> <field dest="author" source="author"/> <field dest="keywords" source="keywords"/> <field dest="category" source="category"/> <field dest="lastModified" source="lastModified"/> </fields> <uniqueKey>id</uniqueKey> </mapping> conf/regex-urlfilter.txt # The default url filter. # Better for whole-internet crawling. # skip file: ftp: and mailto: urls -^(file|ftp|mailto): # skip image and other suffixes we can't yet parse # for a more extensive coverage use the urlfilter-suffix plugin -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$ # skip URLs containing certain characters as probable queries, etc. -[?*!@=] # skip URLs with slash-delimited segment that repeats 3+ times, to break loops -.*(/[^/]+)/[^/]+\1/[^/]+\1/ # accept anything else #+. 
+^http://([a-z0-9]*\.)*nutch.apache.org/ conf/nutch-site.xml <?xml version="1.0"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <!-- Put site-specific property overrides in this file. --> <configuration> <property> <name>http.agent.name</name> <value>MyBot</value> </property> <property> <name>http.robots.agents</name> <value>MyBot,*</value> </property> <property> <name>fetcher.store.content</name> <value>true</value> </property> <property> <name>fetcher.max.crawl.delay</name> <value>-1</value> </property> <property> <name>plugin.includes</name> <value>protocol-http|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|indexer-solr|urlnormalizer-(pass|regex|basic)</value> </property> <property> <name>mapred.temp.dir</name> <value>/tmp</value> </property> <property> <name>metatags.names</name> <value>metatag.keywords;metatag.description</value> </property> <property> <name>index.parse.md</name> <value>metatag.description,metatag.keywords</value> </property> </configuration> solr/collection1/conf/schema.xml <?xml version="1.0" encoding="UTF-8" ?> <schema name="nutch" version="1.5"> <types> <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <fieldType name="date" class="solr.TrieDateField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <fieldType name="text" class="solr.TextField" positionIncrementGap="100"> <analyzer> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" 
catenateAll="0" splitOnCaseChange="1"/> <filter class="solr.LowerCaseFilterFactory"/> <!-- <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> --> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> </analyzer> </fieldType> <fieldType name="url" class="solr.TextField" positionIncrementGap="100"> <analyzer> <tokenizer class="solr.StandardTokenizerFactory"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/> </analyzer> </fieldType> </types> <fields> <field name="_root_" type="string" indexed="true" stored="false"/> <field name="id" type="string" stored="true" indexed="true" required="true"/> <!-- core fields --> <field name="_version_" type="long" indexed="true" stored="true"/> <field name="host" type="string" stored="false" indexed="true"/> <field name="digest" type="string" stored="true" indexed="false"/> <field name="segment" type="string" stored="true" indexed="false"/> <field name="boost" type="float" stored="true" indexed="false"/> <field name="tstamp" type="date" stored="true" indexed="false"/> <field name="url" type="text" indexed="true" stored="true" required="true"/> <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/> <field name="last_modified" type="date" indexed="true" stored="true"/> <field name="links" type="string" indexed="true" stored="true" multiValued="true"/> <!-- fields for the metatags plugin --> <field name="title" type="text" indexed="true" stored="true" multiValued="true"/> <field name="subject" type="text" indexed="true" stored="true"/> <field name="description" type="text" stored="true" indexed="true"/> <field name="comments" type="text" indexed="true" stored="true"/> <field name="author" type="text" indexed="true" stored="true"/> <field name="keywords" type="text" stored="true" indexed="true"/> <field name="category" type="text" indexed="true" stored="true"/> <field 
name="resourcename" type="text" indexed="true" stored="true"/> <!-- fields for index-basic plugin --> <field name="content" type="text" indexed="true" stored="true" multiValued="true"/> <field name="title" type="text" stored="true" indexed="true"/> <field name="cache" type="string" stored="true" indexed="false"/> <!-- fields for index-anchor plugin --> <field name="anchor" type="string" stored="true" indexed="true" multiValued="true"/> <!-- fields for index-more plugin --> <field name="type" type="string" stored="true" indexed="true" multiValued="true"/> <field name="contentLength" type="long" stored="true" indexed="false"/> <field name="lastModified" type="date" stored="true" indexed="false"/> <field name="date" type="date" stored="true" indexed="true"/> <!-- fields for languageidentifier plugin --> <field name="lang" type="string" stored="true" indexed="true"/> <!-- fields for subcollection plugin --> <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/> <!-- fields for feed plugin (tag is also used by microformats-reltag)--> <field name="author" type="string" stored="true" indexed="true"/> <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/> <field name="feed" type="string" stored="true" indexed="true"/> <field name="publishedDate" type="date" stored="true" indexed="true"/> <field name="updatedDate" type="date" stored="true" indexed="true"/> <!-- fields for creativecommons plugin --> <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/> <!-- fields for tld plugin --> <field name="tld" type="string" stored="false" indexed="false"/> </fields> <!--<uniqueKey>id</uniqueKey> --> <uniqueKey>url</uniqueKey> <defaultSearchField>content</defaultSearchField> <solrQueryParser defaultOperator="OR"/> </schema> Nutch log: 2014-12-03 06:53:12,513 INFO crawl.LinkDb - LinkDb: starting at 2014-12-03 06:53:12 2014-12-03 06:53:12,513 INFO crawl.LinkDb - LinkDb: linkdb: output_url/linkdb 
2014-12-03 06:53:12,513 INFO crawl.LinkDb - LinkDb: URL normalize: true 2014-12-03 06:53:12,513 INFO crawl.LinkDb - LinkDb: URL filter: true 2014-12-03 06:53:12,513 INFO crawl.LinkDb - LinkDb: internal links will be ignored. 2014-12-03 06:53:12,513 INFO crawl.LinkDb - LinkDb: adding segment: output_url/segments/20141203064933 2014-12-03 06:53:13,688 INFO crawl.LinkDb - LinkDb: merging with existing linkdb: output_url/linkdb 2014-12-03 06:53:14,755 INFO crawl.LinkDb - LinkDb: finished at 2014-12-03 06:53:14, elapsed: 00:00:02 2014-12-03 06:53:15,085 INFO crawl.DeduplicationJob - DeduplicationJob: starting at 2014-12-03 06:53:15 2014-12-03 06:53:15,204 WARN util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 2014-12-03 06:53:16,365 INFO crawl.DeduplicationJob - Deduplication: 0 documents marked as duplicates 2014-12-03 06:53:16,365 INFO crawl.DeduplicationJob - Deduplication: Updating status of duplicate urls into crawl db. 
2014-12-03 06:53:17,466 INFO crawl.DeduplicationJob - Deduplication finished at 2014-12-03 06:53:17, elapsed: 00:00:02 2014-12-03 06:53:17,819 INFO indexer.IndexingJob - Indexer: starting at 2014-12-03 06:53:17 2014-12-03 06:53:17,859 INFO indexer.IndexingJob - Indexer: deleting gone documents: false 2014-12-03 06:53:17,860 INFO indexer.IndexingJob - Indexer: URL filtering: false 2014-12-03 06:53:17,860 INFO indexer.IndexingJob - Indexer: URL normalizing: false 2014-12-03 06:53:17,969 INFO indexer.IndexWriters - Adding org.apache.nutch.indexwriter.solr.SolrIndexWriter 2014-12-03 06:53:17,969 INFO indexer.IndexingJob - Active IndexWriters : SOLRIndexWriter solr.server.url : URL of the SOLR instance (mandatory) solr.commit.size : buffer size when sending to SOLR (default 1000) solr.mapping.file : name of the mapping file for fields (default solrindex-mapping.xml) solr.auth : use authentication (default false) solr.auth.username : use authentication (default false) solr.auth : username for authentication solr.auth.password : password for authentication 2014-12-03 06:53:17,971 INFO indexer.IndexerMapReduce - IndexerMapReduce: crawldb: output_url/crawldb 2014-12-03 06:53:17,971 INFO indexer.IndexerMapReduce - IndexerMapReduce: linkdb: output_url/linkdb 2014-12-03 06:53:17,971 INFO indexer.IndexerMapReduce - IndexerMapReduces: adding segment: output_url/segments/20141203064933 2014-12-03 06:53:18,038 WARN util.NativeCodeLoader - Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable 2014-12-03 06:53:18,273 INFO anchor.AnchorIndexingFilter - Anchor deduplication is: off 2014-12-03 06:53:18,657 INFO indexer.IndexWriters - Adding org.apache.nutch.indexwriter.solr.SolrIndexWriter 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: content dest: content 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: title dest: title 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: host dest: host 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: segment dest: segment 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: boost dest: boost 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: digest dest: digest 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: tstamp dest: tstamp 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: subject dest: subject 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: description dest: description 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: comments dest: comments 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: author dest: author 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: keywords dest: keywords 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: category dest: category 2014-12-03 06:53:18,671 INFO solr.SolrMappingReader - source: lastModified dest: lastModified 2014-12-03 06:53:18,742 INFO solr.SolrIndexWriter - Indexing 39 documents 2014-12-03 06:53:18,793 INFO solr.SolrIndexWriter - Indexing 39 documents 2014-12-03 06:53:18,805 WARN mapred.LocalJobRunner - job_local637755932_0001 org.apache.solr.common.SolrException: Bad Request Bad Request request: http://192.168.0.1:8983/solr/update?wt=javabin&version=2 at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:430) at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:244) at 
org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(AbstractUpdateRequest.java:105) at org.apache.nutch.indexwriter.solr.SolrIndexWriter.close(SolrIndexWriter.java:155) at org.apache.nutch.indexer.IndexWriters.close(IndexWriters.java:118) at org.apache.nutch.indexer.IndexerOutputFormat$1.close(IndexerOutputFormat.java:44) at org.apache.hadoop.mapred.ReduceTask$OldTrackingRecordWriter.close(ReduceTask.java:467) at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:535) at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:421) at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:398) 2014-12-03 06:53:19,196 ERROR indexer.IndexingJob - Indexer: java.io.IOException: Job failed! at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1357) at org.apache.nutch.indexer.IndexingJob.index(IndexingJob.java:114) at org.apache.nutch.indexer.IndexingJob.run(IndexingJob.java:176) at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65) at org.apache.nutch.indexer.IndexingJob.main(IndexingJob.java:186) Solr log: logs/solr.log INFO - 2014-12-03 06:53:18.776; org.apache.solr.update.processor.LogUpdateProcessor; [collection1] webapp=/solr path=/update params={wt=javabin&version=2} {} 0 0 ERROR - 2014-12-03 06:53:18.777; org.apache.solr.common.SolrException; org.apache.solr.common.SolrException: ERROR: [doc=http://nutch.apache.org/apidocs/apidocs-1.1/allclasses-frame.html] unknown field 'host' at org.apache.solr.update.DocumentBuilder.toDocument(DocumentBuilder.java:185) at org.apache.solr.update.AddUpdateCommand.getLuceneDocument(AddUpdateCommand.java:78) at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:238) at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:164) at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69) at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51) 
at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:926) at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:1080) at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:692) at org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100) at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:247) at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:174) at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:99) at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74) at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1967) at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:777) at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:418) at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:207) at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1419) at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:455) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137) at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:557) at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231) at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1075) at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:384) at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193) at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1009) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135) 
at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255) at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154) at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116) at org.eclipse.jetty.server.Server.handle(Server.java:368) at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:489) at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53) at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:953) at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:1014) at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:953) at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240) at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72) at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264) at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608) at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543) at java.lang.Thread.run(Thread.java:745) INFO - 2014-12-03 06:53:18.796; org.apache.solr.update.processor.LogUpdateProcessor; [collection1] webapp=/solr path=/update params={wt=javabin&version=2} {} 0 0 ERROR - 2014-12-03 06:53:18.797; org.apache.solr.common.SolrException; org.apache.solr.common.SolrException: ERROR: [doc=http://nutch.apache.org/apidocs/apidocs-1.1/allclasses-frame.html] unknown field 'host' at org.apache.solr.update.DocumentBuilder.toDocument(DocumentBuilder.java:185) at org.apache.solr.update.AddUpdateCommand.getLuceneDocument(AddUpdateCommand.java:78) at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:238) at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:164) at 
org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69) at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51) at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:926) at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:1080) at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:692) at org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100) at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:247) at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:174) at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:99) at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74) at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1967) at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:777) at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:418) at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:207) at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1419) at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:455) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137) at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:557) at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231) at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1075) at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:384) at 
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193) at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1009) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135) at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255) at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154) at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116) at org.eclipse.jetty.server.Server.handle(Server.java:368) at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:489) at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53) at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:953) at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:1014) at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:953) at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240) at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72) at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264) at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608) at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543) at java.lang.Thread.run(Thread.java:745)

