Repository: any23 Updated Branches: refs/heads/master e650a8d1a -> 3475ebd67
ANY23-347 fixed RDFParseExceptions caused by unbound xml prefixes Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/3475ebd6 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/3475ebd6 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/3475ebd6 Branch: refs/heads/master Commit: 3475ebd6708e7857cbb021d3c51961148cb8ce87 Parents: e650a8d Author: Hans <[email protected]> Authored: Thu Jun 28 11:57:24 2018 -0500 Committer: Hans <[email protected]> Committed: Thu Jun 28 17:14:24 2018 -0500 ---------------------------------------------------------------------- .../any23/extractor/rdf/BaseRDFExtractor.java | 12 ++++++++ .../rdfa/AbstractRDFaExtractorTestCase.java | 4 +-- .../extractor/rdfa/RDFa11ExtractorTest.java | 11 ++++++++ .../resources/html/rdfa/basic-with-errors.html | 29 ++++++++++++++++++++ 4 files changed, 54 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/3475ebd6/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index 3391c33..a1eab72 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -149,12 +149,24 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { // fix for ANY23-350: valid xml attribute names are ^[a-zA-Z_:][-a-zA-Z0-9_:.] Attribute attr = it.next(); String key = attr.getKey().replaceAll("[^-a-zA-Z0-9_:.]", ""); + + // fix for ANY23-347: strip xml namespaces + int prefixlen = key.lastIndexOf(':') + 1; + String prefix = key.substring(0, prefixlen).toLowerCase(); + key = (prefix.equals("xmlns:") || prefix.equals("xml:") ? prefix : "") + + key.substring(prefixlen); + if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*")) { attr.setKey(key); } else { it.remove(); } } + + String tagName = ((Element)node).tagName().replaceAll("[^-a-zA-Z0-9_:.]", ""); + tagName = tagName.substring(tagName.lastIndexOf(':') + 1); + ((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName : "div"); + return FilterResult.CONTINUE; } return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType http://git-wip-us.apache.org/repos/asf/any23/blob/3475ebd6/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java b/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java index a187077..21935cb 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java +++ b/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java @@ -41,7 +41,7 @@ public abstract class AbstractRDFaExtractorTestCase extends /** * Verify the basic RDFa support. * - * @throws org.openrdf.repository.RepositoryException + * @throws org.eclipse.rdf4j.repository.RepositoryException */ @Test public void testBasic() throws Exception { @@ -118,7 +118,7 @@ public abstract class AbstractRDFaExtractorTestCase extends * href="http://files.openspring.net/tmp/drupal-test-frontpage.html">Drupal * test page</a>. * - * @throws org.openrdf.repository.RepositoryException + * @throws org.eclipse.rdf4j.repository.RepositoryException */ @Test public void testDrupalTestPage() throws Exception { http://git-wip-us.apache.org/repos/asf/any23/blob/3475ebd6/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java index 7849f50..8c65df9 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java @@ -79,6 +79,17 @@ public class RDFa11ExtractorTest extends AbstractRDFaExtractorTestCase { } @Test + public void testBasicWithSyntaxErrors() { + //test issues ANY23-347 and ANY23-350 + assertExtract("/html/rdfa/basic-with-errors.html"); + assertContains(null, vDCTERMS.creator, RDFUtils.literal("Alice", "en")); + assertContains(null, vDCTERMS.title, + RDFUtils.literal("The trouble with Bob", "en")); + assertContains(null, RDFUtils.iri("http://fake.org/prop"), + RDFUtils.literal("Mary", "en")); + } + + @Test public void testIssue326() { assertExtract("/html/rdfa/rdfa-issue326-and-267.html"); } http://git-wip-us.apache.org/repos/asf/any23/blob/3475ebd6/test-resources/src/test/resources/html/rdfa/basic-with-errors.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/html/rdfa/basic-with-errors.html b/test-resources/src/test/resources/html/rdfa/basic-with-errors.html new file mode 100644 index 0000000..b0c4ad3 --- /dev/null +++ b/test-resources/src/test/resources/html/rdfa/basic-with-errors.html @@ -0,0 +1,29 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml"> +<head> + <link rel=\"shortcut icon\"> +</head> +<body> +<div xmlns:dc="http://purl.org/dc/terms/" xmlns:fake="http://fake.org/" pw:twitter-via="AgendaCulturel" pw:share-popups="true" pw:#="should remove"> + <pw:h2 property="dc:title">The trouble with Bob</pw:h2> + <pw:" property="dc:creator">Alice</pw:"> + <h3" property="fake:prop">Mary</h3"> + ... +</div> +</body> +</html> \ No newline at end of file
