lewismc commented on pull request #541: URL: https://github.com/apache/nutch/pull/541#issuecomment-667447680
Additionally, ``` ./runtime/local/bin/nutch parsechecker http://nutch.apache.org fetching: http://nutch.apache.org robots.txt whitelist not configured. parsing: http://nutch.apache.org contentType: text/html signature: 0a441dff545768701b539bf0ea7407bf http://nutch.apache.org Version: 5 Status: failed(2,200): org.apache.nutch.parse.ParseException: Unable to successfully parse content Title: Outlinks: 0 Content Metadata: nutch.crawl.score = 0.0 Parse Metadata: ``` The hadoop.log looks as follows ``` cat runtime/local/logs/hadoop.log 2020-07-31 18:33:05,687 INFO parse.ParserChecker - fetching: http://nutch.apache.org 2020-07-31 18:33:05,789 INFO protocol.RobotRulesParser - robots.txt whitelist not configured. 2020-07-31 18:33:05,790 INFO http.Http - http.proxy.host = null 2020-07-31 18:33:05,790 INFO http.Http - http.proxy.port = 8080 2020-07-31 18:33:05,790 INFO http.Http - http.proxy.exception.list = false 2020-07-31 18:33:05,791 INFO http.Http - http.timeout = 10000 2020-07-31 18:33:05,791 INFO http.Http - http.content.limit = 1048576 2020-07-31 18:33:05,791 INFO http.Http - http.agent = lewismc/Nutch-1.18-SNAPSHOT 2020-07-31 18:33:05,791 INFO http.Http - http.accept.language = en-us,en-gb,en;q=0.7,*;q=0.3 2020-07-31 18:33:05,791 INFO http.Http - http.accept = text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 2020-07-31 18:33:05,791 INFO http.Http - http.enable.cookie.header = true 2020-07-31 18:33:06,217 ERROR extractor.ExtractorRegistryImpl - Failed to instantiate service java.util.ServiceConfigurationError: org.apache.any23.extractor.ExtractorFactory: Provider org.apache.any23.extractor.rdf.FunctionalSyntaxExtractorFactory could not be instantiated at java.util.ServiceLoader.fail(ServiceLoader.java:232) at java.util.ServiceLoader.access$100(ServiceLoader.java:185) at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384) at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404) at 
java.util.ServiceLoader$1.next(ServiceLoader.java:480) at org.eclipse.rdf4j.common.lang.service.ServiceRegistry.<init>(ServiceRegistry.java:43) at org.apache.any23.extractor.ExtractorRegistryImpl.<init>(ExtractorRegistryImpl.java:45) at org.apache.any23.extractor.ExtractorRegistryImpl.getInstance(ExtractorRegistryImpl.java:56) at org.apache.any23.Any23.<init>(Any23.java:134) at org.apache.any23.Any23.<init>(Any23.java:144) at org.apache.nutch.any23.Any23ParseFilter$Any23Parser.parse(Any23ParseFilter.java:101) at org.apache.nutch.any23.Any23ParseFilter$Any23Parser.<init>(Any23ParseFilter.java:83) at org.apache.nutch.any23.Any23ParseFilter.filter(Any23ParseFilter.java:155) at org.apache.nutch.parse.HtmlParseFilters.filter(HtmlParseFilters.java:45) at org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:256) at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:34) at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:23) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.ExceptionInInitializerError at org.eclipse.rdf4j.rio.RDFFormat.<clinit>(RDFFormat.java:72) at org.apache.any23.extractor.rdf.FunctionalSyntaxExtractorFactory.<init>(FunctionalSyntaxExtractorFactory.java:42) at org.apache.any23.extractor.rdf.FunctionalSyntaxExtractorFactory.<clinit>(FunctionalSyntaxExtractorFactory.java:39) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:423) at java.lang.Class.newInstance(Class.java:442) at 
java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380) ... 18 more Caused by: java.lang.ClassCastException: org.apache.xerces.jaxp.datatype.DatatypeFactoryImpl cannot be cast to javax.xml.datatype.DatatypeFactory at javax.xml.datatype.DatatypeFactory.newInstance(Unknown Source) at org.eclipse.rdf4j.model.impl.AbstractValueFactory.<clinit>(AbstractValueFactory.java:53) ... 27 more 2020-07-31 18:33:06,219 ERROR extractor.ExtractorRegistryImpl - Failed to instantiate service java.util.ServiceConfigurationError: org.apache.any23.extractor.ExtractorFactory: Provider org.apache.any23.extractor.rdf.ManchesterSyntaxExtractorFactory could not be instantiated at java.util.ServiceLoader.fail(ServiceLoader.java:232) at java.util.ServiceLoader.access$100(ServiceLoader.java:185) at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384) at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404) at java.util.ServiceLoader$1.next(ServiceLoader.java:480) at org.eclipse.rdf4j.common.lang.service.ServiceRegistry.<init>(ServiceRegistry.java:43) at org.apache.any23.extractor.ExtractorRegistryImpl.<init>(ExtractorRegistryImpl.java:45) at org.apache.any23.extractor.ExtractorRegistryImpl.getInstance(ExtractorRegistryImpl.java:56) at org.apache.any23.Any23.<init>(Any23.java:134) at org.apache.any23.Any23.<init>(Any23.java:144) at org.apache.nutch.any23.Any23ParseFilter$Any23Parser.parse(Any23ParseFilter.java:101) at org.apache.nutch.any23.Any23ParseFilter$Any23Parser.<init>(Any23ParseFilter.java:83) at org.apache.nutch.any23.Any23ParseFilter.filter(Any23ParseFilter.java:155) at org.apache.nutch.parse.HtmlParseFilters.filter(HtmlParseFilters.java:45) at org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:256) at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:34) at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:23) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.semanticweb.owlapi.rio.OWLAPIRDFFormat at org.apache.any23.extractor.rdf.ManchesterSyntaxExtractorFactory.<init>(ManchesterSyntaxExtractorFactory.java:42) at org.apache.any23.extractor.rdf.ManchesterSyntaxExtractorFactory.<clinit>(ManchesterSyntaxExtractorFactory.java:39) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:423) at java.lang.Class.newInstance(Class.java:442) at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380) ... 
18 more 2020-07-31 18:33:06,220 ERROR extractor.ExtractorRegistryImpl - Failed to instantiate service java.util.ServiceConfigurationError: org.apache.any23.extractor.ExtractorFactory: Provider org.apache.any23.extractor.rdf.NQuadsExtractorFactory could not be instantiated at java.util.ServiceLoader.fail(ServiceLoader.java:232) at java.util.ServiceLoader.access$100(ServiceLoader.java:185) at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384) at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404) at java.util.ServiceLoader$1.next(ServiceLoader.java:480) at org.eclipse.rdf4j.common.lang.service.ServiceRegistry.<init>(ServiceRegistry.java:43) at org.apache.any23.extractor.ExtractorRegistryImpl.<init>(ExtractorRegistryImpl.java:45) at org.apache.any23.extractor.ExtractorRegistryImpl.getInstance(ExtractorRegistryImpl.java:56) at org.apache.any23.Any23.<init>(Any23.java:134) at org.apache.any23.Any23.<init>(Any23.java:144) at org.apache.nutch.any23.Any23ParseFilter$Any23Parser.parse(Any23ParseFilter.java:101) at org.apache.nutch.any23.Any23ParseFilter$Any23Parser.<init>(Any23ParseFilter.java:83) at org.apache.nutch.any23.Any23ParseFilter.filter(Any23ParseFilter.java:155) at org.apache.nutch.parse.HtmlParseFilters.filter(HtmlParseFilters.java:45) at org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:256) at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:34) at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:23) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.eclipse.rdf4j.rio.RDFFormat at org.apache.any23.extractor.rdf.NQuadsExtractorFactory.<init>(NQuadsExtractorFactory.java:42) at 
org.apache.any23.extractor.rdf.NQuadsExtractorFactory.<clinit>(NQuadsExtractorFactory.java:39) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:423) at java.lang.Class.newInstance(Class.java:442) at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380) ... 18 more 2020-07-31 18:33:06,232 WARN parse.ParseUtil - Error parsing http://nutch.apache.org with org.apache.nutch.parse.html.HtmlParser java.util.concurrent.ExecutionException: java.lang.ExceptionInInitializerError at java.util.concurrent.FutureTask.report(FutureTask.java:122) at java.util.concurrent.FutureTask.get(FutureTask.java:206) at org.apache.nutch.parse.ParseUtil.runParser(ParseUtil.java:188) at org.apache.nutch.parse.ParseUtil.parse(ParseUtil.java:92) at org.apache.nutch.parse.ParserChecker.process(ParserChecker.java:260) at org.apache.nutch.util.AbstractChecker.processSingle(AbstractChecker.java:87) at org.apache.nutch.parse.ParserChecker.run(ParserChecker.java:145) at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76) at org.apache.nutch.parse.ParserChecker.main(ParserChecker.java:302) Caused by: java.lang.ExceptionInInitializerError at org.apache.tika.config.TikaConfig.<init>(TikaConfig.java:158) at org.apache.any23.mime.TikaMIMETypeDetector.<init>(TikaMIMETypeDetector.java:205) at org.apache.any23.Any23.<init>(Any23.java:83) at org.apache.any23.Any23.<init>(Any23.java:133) at org.apache.any23.Any23.<init>(Any23.java:144) at org.apache.nutch.any23.Any23ParseFilter$Any23Parser.parse(Any23ParseFilter.java:101) at org.apache.nutch.any23.Any23ParseFilter$Any23Parser.<init>(Any23ParseFilter.java:83) at org.apache.nutch.any23.Any23ParseFilter.filter(Any23ParseFilter.java:155) at 
org.apache.nutch.parse.HtmlParseFilters.filter(HtmlParseFilters.java:45) at org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:256) at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:34) at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:23) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.ClassCastException: org.apache.xerces.jaxp.SAXParserFactoryImpl cannot be cast to javax.xml.parsers.SAXParserFactory at javax.xml.parsers.SAXParserFactory.newInstance(Unknown Source) at org.apache.tika.utils.XMLReaderUtils.getSAXParserFactory(XMLReaderUtils.java:216) at org.apache.tika.utils.XMLReaderUtils.setPoolSize(XMLReaderUtils.java:611) at org.apache.tika.utils.XMLReaderUtils.<clinit>(XMLReaderUtils.java:112) ... 16 more 2020-07-31 18:33:06,232 WARN parse.ParseUtil - Unable to successfully parse content http://nutch.apache.org of type text/html 2020-07-31 18:33:06,234 INFO crawl.SignatureFactory - Using Signature impl: org.apache.nutch.crawl.MD5Signature 2020-07-31 18:33:06,238 INFO parse.ParserChecker - parsing: http://nutch.apache.org 2020-07-31 18:33:06,238 INFO parse.ParserChecker - contentType: text/html 2020-07-31 18:33:06,238 INFO parse.ParserChecker - signature: 0a441dff545768701b539bf0ea7407bf ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org

