[ 
https://issues.apache.org/jira/browse/NUTCH-2545?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16422684#comment-16422684
 ] 

ASF GitHub Bot commented on NUTCH-2545:
---------------------------------------

lewismc closed pull request #306: NUTCH-2545 Upgrade to Any23 2.2
URL: https://github.com/apache/nutch/pull/306
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml
index e4ef4831d..d9b504400 100644
--- a/ivy/ivysettings.xml
+++ b/ivy/ivysettings.xml
@@ -34,9 +34,6 @@
   <property name="repository.apache.org"
     value="https://repository.apache.org/content/repositories/snapshots/"
     override="false"/>
-  <property name="any23.svn.apache.org"
-    value="http://svn.apache.org/repos/asf/any23/repo-ext/"
-    override="false"/>
   <property name="maven2.pattern"
     value="[organisation]/[module]/[revision]/[module]-[revision](-[classifier])"/>
   <property name="maven2.pattern.ext"
@@ -50,13 +47,6 @@
       pattern="${maven2.pattern.ext}"
       m2compatible="true"
       />
-    <ibiblio name="any23-svn-repository"
-      root="${any23.svn.apache.org}"
-      m2compatible="true"
-      changingMatcher="regexp"
-      changingPattern=".*SNAPSHOT.*"
-      checkmodified="true"
-      />
     <ibiblio name="apache-snapshot"
       root="${repository.apache.org}"
       m2compatible="true" 
@@ -74,7 +64,6 @@
       <resolver ref="maven2"/>
       <resolver ref="apache-snapshot"/>
       <resolver ref="sonatype"/>
-      <resolver ref="any23-svn-repository"/>
     </chain>
     <chain name="internal">
       <resolver ref="local"/>
@@ -86,7 +75,6 @@
       <resolver ref="maven2"/>
       <resolver ref="apache-snapshot"/>
       <resolver ref="sonatype"/>
-      <resolver ref="any23-svn-repository"/>
     </chain>
   </resolvers>
   <modules>
diff --git a/src/plugin/any23/howto_upgrade_any23.txt b/src/plugin/any23/howto_upgrade_any23.txt
index 45eb92e6c..cf0d07703 100644
--- a/src/plugin/any23/howto_upgrade_any23.txt
+++ b/src/plugin/any23/howto_upgrade_any23.txt
@@ -1,8 +1,6 @@
-1. Upgrade Any23 dependency in trunk/ivy/ivy.xml
+1. Upgrade Any23 dependency in src/plugin/any23/ivy.xml
 
-2. Upgrade Any223 dependency in src/plugin/any23/ivy.xml
-
-3. Upgrade Any23's own dependencies in src/plugin/any23/plugin.xml
+2. Upgrade Any23's own dependencies in src/plugin/any23/plugin.xml
    To get the list of dependencies and their versions execute:
    $ ant -f ./build-ivy.xml
-   $ ls lib/
+   $ ls lib | sed 's/^/      <library name="/g' | sed 's/$/"\/>/g'
diff --git a/src/plugin/any23/ivy.xml b/src/plugin/any23/ivy.xml
index 4b526e26f..0e65d931d 100644
--- a/src/plugin/any23/ivy.xml
+++ b/src/plugin/any23/ivy.xml
@@ -36,13 +36,14 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.any23" name="apache-any23-core" rev="2.1" conf="*->default">
+    <dependency org="org.apache.any23" name="apache-any23-core" rev="2.2" conf="*->default">
       <exclude org="org.apache.commons" name="commons-lang" />
       <exclude org="org.apache.commons" name="commons-compress" />
       <exclude org="org.slf4j" name="slf4j-log4j12" />
       <exclude org="org.slf4j" name="slf4j-api" />
       <exclude org="xerces" />
     </dependency>
+    <dependency org="org.apache.commons" name="commons-rdf-api" rev="0.5.0" conf="*->default"/>
   </dependencies>
 
 </ivy-module>
diff --git a/src/plugin/any23/plugin.xml b/src/plugin/any23/plugin.xml
index 3b099cd2b..71c552215 100644
--- a/src/plugin/any23/plugin.xml
+++ b/src/plugin/any23/plugin.xml
@@ -25,173 +25,162 @@
     <library name="any23.jar">
       <export name="*"/>
     </library>
-    <library name="annotations-12.0.jar"/>
-    <library name="aopalliance-1.0.jar"/>
-    <library name="apache-any23-api-2.1.jar"/>
-    <library name="apache-any23-core-2.1.jar"/>
-    <library name="apache-any23-csvutils-2.1.jar"/>
-    <library name="apache-any23-encoding-2.1.jar"/>
-    <library name="apache-any23-mime-2.1.jar"/>
-    <library name="asm-5.0.4.jar"/>
-    <library name="bcmail-jdk15on-1.54.jar"/>
-    <library name="bcpkix-jdk15on-1.54.jar"/>
-    <library name="bcprov-jdk15on-1.54.jar"/>
-    <library name="boilerpipe-1.1.0.jar"/>
-    <library name="bzip2-0.9.1.jar"/>
-    <library name="c3p0-0.9.1.1.jar"/>
-    <library name="caffeine-2.3.5.jar"/>
-    <library name="cdm-4.5.5.jar"/>
-    <library name="commons-codec-1.10.jar"/>
-    <library name="commons-collections4-4.1.jar"/>
-    <library name="commons-compress-1.13.jar"/>
-    <library name="commons-csv-1.0-SNAPSHOT-rev1148315.jar"/>
-    <library name="commons-exec-1.3.jar"/>
-    <library name="commons-httpclient-3.1.jar"/>
-    <library name="commons-io-2.4.jar"/>
-    <library name="commons-logging-1.0.4.jar"/>
-    <library name="commons-rdf-api-0.5.0.jar"/>
-    <library name="commons-vfs2-2.0.jar"/>
-    <library name="curvesapi-1.04.jar"/>
-    <library name="cxf-core-3.0.12.jar"/>
-    <library name="cxf-rt-frontend-jaxrs-3.0.12.jar"/>
-    <library name="cxf-rt-rs-client-3.0.12.jar"/>
-    <library name="cxf-rt-transports-http-3.0.12.jar"/>
-    <library name="ehcache-core-2.6.2.jar"/>
-    <library name="fluent-hc-4.5.3.jar"/>
-    <library name="fontbox-2.0.6.jar"/>
-    <library name="geoapi-3.0.0.jar"/>
-    <library name="grib-4.5.5.jar"/>
-    <library name="gson-2.2.4.jar"/>
-    <library name="guava-20.0.jar"/>
-    <library name="guice-4.1.0.jar"/>
-    <library name="guice-assistedinject-4.1.0.jar"/>
-    <library name="guice-multibindings-4.1.0.jar"/>
-    <library name="httpclient-4.5.3.jar"/>
-    <library name="httpclient-cache-4.5.3.jar"/>
-    <library name="httpclient-osgi-4.5.3.jar"/>
-    <library name="httpcore-4.4.6.jar"/>
-    <library name="httpcore-nio-4.4.6.jar"/>
-    <library name="httpcore-osgi-4.4.6.jar"/>
-    <library name="httpmime-4.5.3.jar"/>
-    <library name="httpservices-4.5.5.jar"/>
-    <library name="isoparser-1.1.18.jar"/>
-    <library name="jackcess-2.1.4.jar"/>
-    <library name="jackcess-encrypt-2.1.1.jar"/>
-    <library name="jackson-annotations-2.8.5.jar"/>
-    <library name="jackson-core-2.8.5.jar"/>
-    <library name="jackson-databind-2.8.5.jar"/>
-    <library name="java-libpst-0.8.1.jar"/>
-    <library name="javax.annotation-api-1.2.jar"/>
-    <library name="javax.inject-1.jar"/>
-    <library name="javax.ws.rs-api-2.0.1.jar"/>
-    <library name="jcip-annotations-1.0.jar"/>
-    <library name="jcl-over-slf4j-1.7.25.jar"/>
-    <library name="jcommander-1.30.jar"/>
-    <library name="jdom2-2.0.4.jar"/>
-    <library name="jempbox-1.8.13.jar"/>
-    <library name="jhighlight-1.0.2.jar"/>
-    <library name="jj2000-5.2.jar"/>
-    <library name="jmatio-1.2.jar"/>
-    <library name="jna-4.1.0.jar"/>
-    <library name="joda-time-2.2.jar"/>
-    <library name="json-20140107.jar"/>
-    <library name="json-simple-1.1.1.jar"/>
-    <library name="jsonic-1.2.11.jar"/>
-    <library name="jsonld-java-0.10.0.jar"/>
-    <library name="jsoup-1.7.2.jar"/>
-    <library name="jsr-275-0.9.3.jar"/>
-    <library name="jsr305-3.0.1.jar"/>
-    <library name="jul-to-slf4j-1.7.24.jar"/>
-    <library name="juniversalchardet-1.0.3.jar"/>
-    <library name="junrar-0.7.jar"/>
-    <library name="language-detector-0.5.jar"/>
-    <library name="mapdb-1.0.8.jar"/>
-    <library name="maven-scm-api-1.4.jar"/>
-    <library name="maven-scm-provider-svn-commons-1.4.jar"/>
-    <library name="maven-scm-provider-svnexe-1.4.jar"/>
-    <library name="metadata-extractor-2.9.1.jar"/>
-    <library name="nekohtml-1.9.20.jar"/>
-    <library name="netcdf4-4.5.5.jar"/>
-    <library name="opennlp-tools-1.6.0.jar"/>
-    <library name="owlapi-api-5.1.0.jar"/>
-    <library name="owlapi-apibinding-5.1.0.jar"/>
-    <library name="owlapi-impl-5.1.0.jar"/>
-    <library name="owlapi-oboformat-5.1.0.jar"/>
-    <library name="owlapi-parsers-5.1.0.jar"/>
-    <library name="owlapi-rio-5.1.0.jar"/>
-    <library name="owlapi-tools-5.1.0.jar"/>
-    <library name="pdfbox-2.0.6.jar"/>
-    <library name="pdfbox-debugger-2.0.6.jar"/>
-    <library name="pdfbox-tools-2.0.6.jar"/>
-    <library name="plexus-utils-1.5.6.jar"/>
-    <library name="poi-3.16.jar"/>
-    <library name="poi-ooxml-3.16.jar"/>
-    <library name="poi-ooxml-schemas-3.16.jar"/>
-    <library name="poi-scratchpad-3.16.jar"/>
-    <library name="protobuf-java-2.5.0.jar"/>
-    <library name="quartz-2.2.0.jar"/>
-    <library name="rdf4j-http-client-2.2.2.jar"/>
-    <library name="rdf4j-http-protocol-2.2.2.jar"/>
-    <library name="rdf4j-model-2.2.2.jar"/>
-    <library name="rdf4j-query-2.2.2.jar"/>
-    <library name="rdf4j-queryalgebra-evaluation-2.2.2.jar"/>
-    <library name="rdf4j-queryalgebra-model-2.2.2.jar"/>
-    <library name="rdf4j-queryparser-api-2.2.2.jar"/>
-    <library name="rdf4j-queryparser-serql-2.2.2.jar"/>
-    <library name="rdf4j-queryparser-sparql-2.2.2.jar"/>
-    <library name="rdf4j-queryresultio-api-2.2.2.jar"/>
-    <library name="rdf4j-queryresultio-sparqlxml-2.2.2.jar"/>
-    <library name="rdf4j-repository-api-2.2.2.jar"/>
-    <library name="rdf4j-repository-sail-2.2.2.jar"/>
-    <library name="rdf4j-repository-sparql-2.2.2.jar"/>
-    <library name="rdf4j-rio-api-2.2.2.jar"/>
-    <library name="rdf4j-rio-binary-2.1.4.jar"/>
-    <library name="rdf4j-rio-datatypes-2.2.2.jar"/>
-    <library name="rdf4j-rio-jsonld-2.2.2.jar"/>
-    <library name="rdf4j-rio-languages-2.2.2.jar"/>
-    <library name="rdf4j-rio-n3-2.2.2.jar"/>
-    <library name="rdf4j-rio-nquads-2.2.2.jar"/>
-    <library name="rdf4j-rio-ntriples-2.2.2.jar"/>
-    <library name="rdf4j-rio-rdfjson-2.2.2.jar"/>
-    <library name="rdf4j-rio-rdfxml-2.2.2.jar"/>
-    <library name="rdf4j-rio-trig-2.2.2.jar"/>
-    <library name="rdf4j-rio-trix-2.2.2.jar"/>
-    <library name="rdf4j-rio-turtle-2.2.2.jar"/>
-    <library name="rdf4j-sail-api-2.2.2.jar"/>
-    <library name="rdf4j-sail-base-2.2.2.jar"/>
-    <library name="rdf4j-sail-inferencer-2.2.2.jar"/>
-    <library name="rdf4j-sail-memory-2.2.2.jar"/>
-    <library name="rdf4j-sail-model-2.2.2.jar"/>
-    <library name="rdf4j-util-2.2.2.jar"/>
-    <library name="regexp-1.3.jar"/>
-    <library name="rome-1.5.1.jar"/>
-    <library name="rome-utils-1.5.1.jar"/>
-    <library name="semargl-core-0.7.jar"/>
-    <library name="semargl-rdf-0.7.jar"/>
-    <library name="semargl-rdf4j-0.7.jar"/>
-    <library name="semargl-rdfa-0.7.jar"/>
-    <library name="sentiment-analysis-parser-0.1.jar"/>
-    <library name="sis-metadata-0.6.jar"/>
-    <library name="sis-netcdf-0.6.jar"/>
-    <library name="sis-referencing-0.6.jar"/>
-    <library name="sis-storage-0.6.jar"/>
-    <library name="sis-utility-0.6.jar"/>
-    <library name="snakeyaml-1.17.jar"/>
-    <library name="stax2-api-3.1.4.jar"/>
-    <library name="tagsoup-1.2.1.jar"/>
-    <library name="tika-core-1.15.jar"/>
-    <library name="tika-langdetect-1.13.jar"/>
-    <library name="tika-parsers-1.15.jar"/>
-    <library name="trove4j-3.0.3.jar"/>
-    <library name="udunits-4.5.5.jar"/>
-    <library name="vorbis-java-core-0.8.jar"/>
-    <library name="vorbis-java-tika-0.8.jar"/>
-    <library name="woodstox-core-asl-4.4.1.jar"/>
-    <library name="xml-apis-1.4.01.jar"/>
-    <library name="xmlbeans-2.6.0.jar"/>
-    <library name="xmlschema-core-2.2.1.jar"/>
-    <library name="xmpcore-5.1.2.jar"/>
-    <library name="xz-1.6.jar"/>
+      <library name="aopalliance-1.0.jar"/>
+      <library name="apache-any23-api-2.2.jar"/>
+      <library name="apache-any23-core-2.2.jar"/>
+      <library name="apache-any23-csvutils-2.2.jar"/>
+      <library name="apache-any23-encoding-2.2.jar"/>
+      <library name="apache-any23-mime-2.2.jar"/>
+      <library name="apache-mime4j-core-0.8.1.jar"/>
+      <library name="apache-mime4j-dom-0.8.1.jar"/>
+      <library name="asm-5.0.4.jar"/>
+      <library name="bcmail-jdk15on-1.54.jar"/>
+      <library name="bcpkix-jdk15on-1.54.jar"/>
+      <library name="bcprov-jdk15on-1.54.jar"/>
+      <library name="boilerpipe-1.1.0.jar"/>
+      <library name="bzip2-0.9.1.jar"/>
+      <library name="c3p0-0.9.1.1.jar"/>
+      <library name="caffeine-2.5.6.jar"/>
+      <library name="cdm-4.5.5.jar"/>
+      <library name="commons-codec-1.10.jar"/>
+      <library name="commons-collections4-4.1.jar"/>
+      <library name="commons-csv-1.5.jar"/>
+      <library name="commons-exec-1.3.jar"/>
+      <library name="commons-httpclient-3.1.jar"/>
+      <library name="commons-io-2.4.jar"/>
+      <library name="commons-lang-2.6.jar"/>
+      <library name="commons-logging-1.2.jar"/>
+      <library name="commons-rdf-api-0.5.0.jar"/>
+      <library name="curvesapi-1.04.jar"/>
+      <library name="cxf-core-3.0.16.jar"/>
+      <library name="cxf-rt-frontend-jaxrs-3.0.16.jar"/>
+      <library name="cxf-rt-rs-client-3.0.16.jar"/>
+      <library name="cxf-rt-transports-http-3.0.16.jar"/>
+      <library name="ehcache-core-2.6.2.jar"/>
+      <library name="fluent-hc-4.5.3.jar"/>
+      <library name="fontbox-2.0.8.jar"/>
+      <library name="geoapi-3.0.0.jar"/>
+      <library name="grib-4.5.5.jar"/>
+      <library name="gson-2.8.1.jar"/>
+      <library name="guava-20.0.jar"/>
+      <library name="guice-4.1.0.jar"/>
+      <library name="guice-assistedinject-4.1.0.jar"/>
+      <library name="guice-multibindings-4.1.0.jar"/>
+      <library name="httpclient-4.5.3.jar"/>
+      <library name="httpclient-cache-4.5.3.jar"/>
+      <library name="httpclient-osgi-4.5.3.jar"/>
+      <library name="httpcore-4.4.6.jar"/>
+      <library name="httpcore-nio-4.4.6.jar"/>
+      <library name="httpcore-osgi-4.4.6.jar"/>
+      <library name="httpmime-4.5.3.jar"/>
+      <library name="httpservices-4.5.5.jar"/>
+      <library name="isoparser-1.1.18.jar"/>
+      <library name="jackcess-2.1.8.jar"/>
+      <library name="jackcess-encrypt-2.1.2.jar"/>
+      <library name="jackson-annotations-2.9.0.jar"/>
+      <library name="jackson-core-2.9.2.jar"/>
+      <library name="jackson-databind-2.9.0.jar"/>
+      <library name="java-libpst-0.8.1.jar"/>
+      <library name="javax.annotation-api-1.2.jar"/>
+      <library name="javax.inject-1.jar"/>
+      <library name="javax.ws.rs-api-2.0.1.jar"/>
+      <library name="jcip-annotations-1.0.jar"/>
+      <library name="jcl-over-slf4j-1.7.25.jar"/>
+      <library name="jcommander-1.30.jar"/>
+      <library name="jdom2-2.0.4.jar"/>
+      <library name="jempbox-1.8.13.jar"/>
+      <library name="jhighlight-1.0.2.jar"/>
+      <library name="jmatio-1.2.jar"/>
+      <library name="jna-4.1.0.jar"/>
+      <library name="joda-time-2.2.jar"/>
+      <library name="json-1.8.jar"/>
+      <library name="json-simple-1.1.1.jar"/>
+      <library name="jsonld-java-0.11.1.jar"/>
+      <library name="jsoup-1.11.2.jar"/>
+      <library name="jsr-275-0.9.3.jar"/>
+      <library name="jsr305-3.0.1.jar"/>
+      <library name="jul-to-slf4j-1.7.24.jar"/>
+      <library name="juniversalchardet-1.0.3.jar"/>
+      <library name="junrar-0.7.jar"/>
+      <library name="mapdb-1.0.8.jar"/>
+      <library name="metadata-extractor-2.10.1.jar"/>
+      <library name="nekohtml-1.9.20.jar"/>
+      <library name="netcdf4-4.5.5.jar"/>
+      <library name="opennlp-tools-1.8.3.jar"/>
+      <library name="owlapi-api-5.1.3.jar"/>
+      <library name="owlapi-apibinding-5.1.3.jar"/>
+      <library name="owlapi-impl-5.1.3.jar"/>
+      <library name="owlapi-oboformat-5.1.3.jar"/>
+      <library name="owlapi-parsers-5.1.3.jar"/>
+      <library name="owlapi-rio-5.1.3.jar"/>
+      <library name="owlapi-tools-5.1.3.jar"/>
+      <library name="pdfbox-2.0.8.jar"/>
+      <library name="pdfbox-tools-2.0.8.jar"/>
+      <library name="poi-3.16.jar"/>
+      <library name="poi-ooxml-3.16.jar"/>
+      <library name="poi-ooxml-schemas-3.16.jar"/>
+      <library name="poi-scratchpad-3.16.jar"/>
+      <library name="protobuf-java-2.5.0.jar"/>
+      <library name="quartz-2.2.0.jar"/>
+      <library name="rdf4j-http-client-2.2.4.jar"/>
+      <library name="rdf4j-http-protocol-2.2.4.jar"/>
+      <library name="rdf4j-model-2.2.4.jar"/>
+      <library name="rdf4j-query-2.2.4.jar"/>
+      <library name="rdf4j-queryalgebra-evaluation-2.2.4.jar"/>
+      <library name="rdf4j-queryalgebra-model-2.2.4.jar"/>
+      <library name="rdf4j-queryparser-api-2.2.4.jar"/>
+      <library name="rdf4j-queryparser-serql-2.2.4.jar"/>
+      <library name="rdf4j-queryparser-sparql-2.2.4.jar"/>
+      <library name="rdf4j-queryresultio-api-2.2.4.jar"/>
+      <library name="rdf4j-queryresultio-sparqlxml-2.2.4.jar"/>
+      <library name="rdf4j-repository-api-2.2.4.jar"/>
+      <library name="rdf4j-repository-sail-2.2.4.jar"/>
+      <library name="rdf4j-repository-sparql-2.2.4.jar"/>
+      <library name="rdf4j-rio-api-2.2.4.jar"/>
+      <library name="rdf4j-rio-binary-2.2.2.jar"/>
+      <library name="rdf4j-rio-datatypes-2.2.4.jar"/>
+      <library name="rdf4j-rio-jsonld-2.2.4.jar"/>
+      <library name="rdf4j-rio-languages-2.2.4.jar"/>
+      <library name="rdf4j-rio-n3-2.2.4.jar"/>
+      <library name="rdf4j-rio-nquads-2.2.4.jar"/>
+      <library name="rdf4j-rio-ntriples-2.2.4.jar"/>
+      <library name="rdf4j-rio-rdfjson-2.2.4.jar"/>
+      <library name="rdf4j-rio-rdfxml-2.2.4.jar"/>
+      <library name="rdf4j-rio-trig-2.2.4.jar"/>
+      <library name="rdf4j-rio-trix-2.2.4.jar"/>
+      <library name="rdf4j-rio-turtle-2.2.4.jar"/>
+      <library name="rdf4j-sail-api-2.2.4.jar"/>
+      <library name="rdf4j-sail-base-2.2.4.jar"/>
+      <library name="rdf4j-sail-inferencer-2.2.4.jar"/>
+      <library name="rdf4j-sail-memory-2.2.4.jar"/>
+      <library name="rdf4j-sail-model-2.2.4.jar"/>
+      <library name="rdf4j-util-2.2.4.jar"/>
+      <library name="rome-1.5.1.jar"/>
+      <library name="rome-utils-1.5.1.jar"/>
+      <library name="semargl-core-0.7.jar"/>
+      <library name="semargl-rdf-0.7.jar"/>
+      <library name="semargl-rdf4j-0.7.jar"/>
+      <library name="semargl-rdfa-0.7.jar"/>
+      <library name="sentiment-analysis-parser-0.1.jar"/>
+      <library name="sis-metadata-0.6.jar"/>
+      <library name="sis-netcdf-0.6.jar"/>
+      <library name="sis-referencing-0.6.jar"/>
+      <library name="sis-storage-0.6.jar"/>
+      <library name="sis-utility-0.6.jar"/>
+      <library name="snakeyaml-1.17.jar"/>
+      <library name="stax2-api-3.1.4.jar"/>
+      <library name="tagsoup-1.2.1.jar"/>
+      <library name="tika-core-1.17.jar"/>
+      <library name="tika-parsers-1.17.jar"/>
+      <library name="trove4j-3.0.3.jar"/>
+      <library name="udunits-4.5.5.jar"/>
+      <library name="vorbis-java-core-0.8.jar"/>
+      <library name="vorbis-java-tika-0.8.jar"/>
+      <library name="woodstox-core-asl-4.4.1.jar"/>
+      <library name="xmlbeans-2.6.0.jar"/>
+      <library name="xmlschema-core-2.2.2.jar"/>
+      <library name="xmpcore-5.1.3.jar"/>
+      <library name="xz-1.6.jar"/>
   </runtime>
 
   <requires>
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
index e64131046..6cfd4211a 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
@@ -18,8 +18,6 @@
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.StringReader;
 import java.net.URISyntaxException;
 import java.nio.charset.Charset;
 import java.util.Set;
@@ -40,14 +38,9 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.protocol.Content;
-import org.ccil.cowan.tagsoup.XMLWriter;
-import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.XMLReader;
 
 /**
  * <p>This implementation of {@link org.apache.nutch.parse.HtmlParseFilter}
@@ -75,7 +68,7 @@
    * Constant identifier used as a Key for writing and reading
    * triples to and from the metadata Map field.
    */
-  public final static String ANY23_TRIPLES = "Any23-Triples";
+  public static final String ANY23_TRIPLES = "Any23-Triples";
 
   public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors";
   public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types";
@@ -85,10 +78,11 @@
     Set<String> triples = null;
 
     Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException {
-      triples = new TreeSet<String>();
+      triples = new TreeSet<>();
       try {
         parse(url, htmlContent, contentType, extractorNames);
       } catch (URISyntaxException e) {
+        LOG.error("Error parsing URI: {}", url, e);
         throw new RuntimeException(e.getReason());
       } catch (IOException e) {
         e.printStackTrace();
@@ -106,21 +100,12 @@
     private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
       Any23 any23 = new Any23(extractorNames);
       any23.setMIMETypeDetector(null);
-
+      ByteArrayOutputStream baos = new ByteArrayOutputStream();
       try {
-        // Fix input to avoid extraction error (https://github.com/semarglproject/semargl/issues/37#issuecomment-69381281)
-        XMLReader reader = SAXParserImpl.newInstance(null).getXMLReader();
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        XMLWriter writer = new XMLWriter(new OutputStreamWriter(baos));
-        reader.setContentHandler(writer);
-        reader.parse(new InputSource(new StringReader(htmlContent)));
-        String input = new String(baos.toByteArray(), Charset.forName("UTF-8"));
-
-        baos = new ByteArrayOutputStream();
         TripleHandler tHandler = new NTriplesWriter(baos);
         BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
         try {
-          any23.extract(input, url, contentType, "UTF-8", bHandler);
+          any23.extract(htmlContent, url, contentType, "UTF-8", bHandler);
         } catch (IOException e) {
           LOG.error("Error while reading the source", e);
         } catch (ExtractionException e) {
@@ -135,18 +120,18 @@ private void parse(String url, String htmlContent, String contentType, String...
         String n3 = baos.toString("UTF-8");
         String[] triplesStrings = n3.split("\n");
         Collections.addAll(triples, triplesStrings);
-      } catch (SAXException e) {
-        LOG.error("Unexpected SAXException", e);
       } catch (IOException e) {
         LOG.error("Unexpected IOException", e);
       }
     }
   }
 
+  @Override
   public Configuration getConf() {
     return this.conf;
   }
 
+  @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
   }
diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java
index 571a3d5a5..4271730c3 100644
--- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java
+++ b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java
@@ -62,16 +62,19 @@
   
   private String file2 = "microdata_basic.html";
 
-  private static final int EXPECTED_TRIPLES_1 = 79;
+  private static final int EXPECTED_TRIPLES_1 = 68;
   
-  private static final int EXPECTED_TRIPLES_2 = 39;
+  private static final int EXPECTED_TRIPLES_2 = 38;
   
   @Before
   public void setUp() {
     this.conf = NutchConfiguration.create();
     conf.set("file.content.limit", "-1");
     conf.set("parser.timeout", "-1");
-    conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links,html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard,html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate,html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath");
+    conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links,"
+            + "html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard,"
+            + "html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate,"
+            + "html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath");
     conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html");
   }
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Upgrade to Any23 2.2
> --------------------
>
>                 Key: NUTCH-2545
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2545
>             Project: Nutch
>          Issue Type: Improvement
>          Components: any23, plugin
>            Reporter: Lewis John McGibbney
>            Assignee: Lewis John McGibbney
>            Priority: Minor
>             Fix For: 1.15
>
>
> We recently released Any23 2.2. I would like to update the Any23 plugin to 
> this newest version.
> PR coming up.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to