hi
I tried to index a huge url set with nutch-1.4 with hadoop-0.20. In
reduce part it throws an error like this. I think some char break xml. Any
idea how to resolve this?
May 15, 2012 10:37:59 AM org.apache.solr.update.processor.LogUpdateProcessor
finish
INFO: [collection1] webapp=/solr path=/update params={wt=javabin&version=2}
{add=[http://www.all-pac.com/corrugated-packaging.html,
http://www.all-pac.com/plastic-packaging.html,
http://www.all-pop.com/housewares_category_01.html,
http://www.all-pro-sol.com/cd-dvd-duplicators-publishers-news-and-events/index.html,
http://www.all-propainting.com/en/stratum,
http://www.all-rock-climbing-gear.com/tag/file/,
http://www.all-rock-climbing-gear.com/tag/life/,
http://www.all-rock-climbing-gear.com/tag/magazine/,
http://www.all-safe.com.au/faqs/faq-windspire/howwindspireswork,
http://www.all-startelescope.com/index.php/Star-Parties.html, ... (253
adds)]} 0 1375
May 15, 2012 10:37:59 AM org.apache.solr.common.SolrException log
SEVERE: org.apache.solr.common.SolrException: Unexpected end of input block;
expected an identifier
at [row,col {unknown-source}]: [1,984856]
at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:81)
at
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:73)
at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1544)
at
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:442)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:263)
at
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1337)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:484)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:119)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:524)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:233)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1065)
at
org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:413)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:192)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:999)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:117)
at
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:250)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:149)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:111)
at org.eclipse.jetty.server.Server.handle(Server.java:351)
at
org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:454)
at
org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:47)
at
org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:900)
at
org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:954)
at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:952)
at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:230)
at
org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:66)
at
org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:254)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:599)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:534)
at java.lang.Thread.run(Thread.java:662)
Caused by: com.ctc.wstx.exc.WstxEOFException: Unexpected end of input block;
expected an identifier
at [row,col {unknown-source}]: [1,984856]
at
com.ctc.wstx.sr.StreamScanner.throwUnexpectedEOB(StreamScanner.java:700)
at
com.ctc.wstx.sr.StreamScanner.loadMoreFromCurrent(StreamScanner.java:1054)
at
com.ctc.wstx.sr.StreamScanner.getNextCharFromCurrent(StreamScanner.java:811)
at
com.ctc.wstx.sr.BasicStreamReader.handleNsAttrs(BasicStreamReader.java:2982)
at
com.ctc.wstx.sr.BasicStreamReader.handleStartElem(BasicStreamReader.java:2936)
at
com.ctc.wstx.sr.BasicStreamReader.nextFromTree(BasicStreamReader.java:2848)
at com.ctc.wstx.sr.BasicStreamReader.next(BasicStreamReader.java:1019)
at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:273)
at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:134)
at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:79)
... 30 more
May 15, 2012 10:37:59 AM org.apache.solr.common.SolrException log
SEVERE: null:org.eclipse.jetty.io.EofException
at org.eclipse.jetty.server.HttpOutput.write(HttpOutput.java:154)
at org.eclipse.jetty.server.HttpOutput.write(HttpOutput.java:101)
at
org.apache.solr.common.util.FastOutputStream.flushBuffer(FastOutputStream.java:185)
at
org.apache.solr.common.util.JavaBinCodec.marshal(JavaBinCodec.java:94)
at
org.apache.solr.response.BinaryResponseWriter.write(BinaryResponseWriter.java:49)
at
org.apache.solr.servlet.SolrDispatchFilter.writeResponse(SolrDispatchFilter.java:391)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:276)
at
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1337)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:484)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:119)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:524)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:233)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1065)
at
org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:413)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:192)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:999)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:117)
at
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:250)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:149)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:111)
at org.eclipse.jetty.server.Server.handle(Server.java:351)
at
org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:454)
at
org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:47)
at
org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:900)
at
org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:954)
at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:952)
at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:230)
at
org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:66)
at
org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:254)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:599)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:534)
at java.lang.Thread.run(Thread.java:662)
May 15, 2012 10:37:59 AM org.apache.solr.common.SolrException log
SEVERE: null:org.eclipse.jetty.io.EofException
at org.eclipse.jetty.server.HttpOutput.write(HttpOutput.java:154)
at org.eclipse.jetty.server.HttpOutput.write(HttpOutput.java:101)
at
org.apache.solr.common.util.FastOutputStream.flushBuffer(FastOutputStream.java:185)
at
org.apache.solr.common.util.JavaBinCodec.marshal(JavaBinCodec.java:94)
at
org.apache.solr.response.BinaryResponseWriter.write(BinaryResponseWriter.java:49)
at
org.apache.solr.servlet.SolrDispatchFilter.writeResponse(SolrDispatchFilter.java:391)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:276)
at
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1337)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:484)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:119)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:524)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:233)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1065)
at
org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:413)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:192)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:999)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:117)
at
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:250)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:149)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:111)
at org.eclipse.jetty.server.Server.handle(Server.java:351)
at
org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:454)
at
org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:47)
at
org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:900)
at
org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:954)
at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:952)
at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:230)
at
org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:66)
at
org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:254)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:599)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:534)
at java.lang.Thread.run(Thread.java:662)
--
View this message in context:
http://lucene.472066.n3.nabble.com/nutch-1-4-with-solr-4-tp3983810.html
Sent from the Nutch - User mailing list archive at Nabble.com.