[
https://issues.apache.org/jira/browse/NUTCH-1016?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13271354#comment-13271354
]
Christian Johnsson commented on NUTCH-1016:
-------------------------------------------
Ok, never got the error before with 1.5rc1, It started this morning. Been
running for 1 week without errors.
May 9, 2012 1:46:31 PM org.apache.solr.common.SolrException log
SEVERE: java.lang.RuntimeException: [was class java.io.CharConversionException]
Invalid UTF-8 character 0xffff at char #1427640, byte #1564649)
at
com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
at
com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3657)
at com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:301)
at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:157)
at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:79)
at
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:58)
at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1372)
at
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:356)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:252)
at
org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
at
org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at
org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)
at
org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)
at
org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
at
org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
at
org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
at
org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:293)
at
org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:859)
at
org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:602)
at
org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:489)
at java.lang.Thread.run(Thread.java:636)
Caused by: java.io.CharConversionException: Invalid UTF-8 character 0xffff at
char #1427640, byte #1564649)
at com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335)
at com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
at
com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java:57)
at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
at
com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.java:4628)
at
com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.java:4126)
at
com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701)
at
com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3649)
... 21 more
and
May 9, 2012 1:46:36 PM org.apache.solr.common.SolrException log
SEVERE: java.lang.RuntimeException: [was class java.io.IOException] Invalid CRLF
at
com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
at
com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3657)
at com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:301)
at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:157)
at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:79)
at
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:58)
at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1372)
at
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:356)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:252)
at
org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
at
org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at
org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)
at
org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)
at
org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
at
org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
at
org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
at
org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:293)
at
org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:859)
at
org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:602)
at
org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:489)
at java.lang.Thread.run(Thread.java:636)
Caused by: java.io.IOException: Invalid CRLF
at
org.apache.coyote.http11.filters.ChunkedInputFilter.parseCRLF(ChunkedInputFilter.java:352)
at
org.apache.coyote.http11.filters.ChunkedInputFilter.doRead(ChunkedInputFilter.java:151)
at
org.apache.coyote.http11.InternalInputBuffer.doRead(InternalInputBuffer.java:710)
at org.apache.coyote.Request.doRead(Request.java:427)
at
org.apache.catalina.connector.InputBuffer.realReadBytes(InputBuffer.java:304)
at org.apache.tomcat.util.buf.ByteChunk.substract(ByteChunk.java:419)
at org.apache.catalina.connector.InputBuffer.read(InputBuffer.java:327)
at
org.apache.catalina.connector.CoyoteInputStream.read(CoyoteInputStream.java:162)
at com.ctc.wstx.io.UTF8Reader.loadMore(UTF8Reader.java:365)
at com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:110)
at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
at
com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java:57)
at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
at
com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.java:4628)
at
com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.java:4126)
at
com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701)
at
com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3649)
... 21 more
> Strip UTF-8 non-character codepoints
> ------------------------------------
>
> Key: NUTCH-1016
> URL: https://issues.apache.org/jira/browse/NUTCH-1016
> Project: Nutch
> Issue Type: Bug
> Components: indexer
> Affects Versions: 1.3
> Reporter: Markus Jelsma
> Assignee: Markus Jelsma
> Fix For: 1.4
>
> Attachments: NUTCH-1016-1.4-4.patch, NUTCH-1016-2.0.patch
>
>
> During a very large crawl i found a few documents producing non-character
> codepoints. When indexing to Solr this will yield the following exception:
> {code}
> SEVERE: java.lang.RuntimeException: [was class
> java.io.CharConversionException] Invalid UTF-8 character 0xffff at char
> #1142033, byte #1155068)
> at
> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
> at
> com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
> {code}
> Quite annoying! Here's quick fix for SolrWriter that'll pass the value of the
> content field to a method to strip away non-characters. I'm not too sure
> about this implementation but the tests i've done locally with a huge dataset
> now passes correctly. Here's a list of codepoints to strip away:
> http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
> Please comment!
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators:
https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira