Repository: any23
Updated Branches:
  refs/heads/master e650a8d1a -> 3475ebd67


ANY23-347 fixed RDFParseExceptions caused by unbound xml prefixes


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/3475ebd6
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/3475ebd6
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/3475ebd6

Branch: refs/heads/master
Commit: 3475ebd6708e7857cbb021d3c51961148cb8ce87
Parents: e650a8d
Author: Hans <[email protected]>
Authored: Thu Jun 28 11:57:24 2018 -0500
Committer: Hans <[email protected]>
Committed: Thu Jun 28 17:14:24 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/rdf/BaseRDFExtractor.java   | 12 ++++++++
 .../rdfa/AbstractRDFaExtractorTestCase.java     |  4 +--
 .../extractor/rdfa/RDFa11ExtractorTest.java     | 11 ++++++++
 .../resources/html/rdfa/basic-with-errors.html  | 29 ++++++++++++++++++++
 4 files changed, 54 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/3475ebd6/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index 3391c33..a1eab72 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -149,12 +149,24 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
                                 // fix for ANY23-350: valid xml attribute 
names are ^[a-zA-Z_:][-a-zA-Z0-9_:.]
                                 Attribute attr = it.next();
                                 String key = 
attr.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
+
+                                // fix for ANY23-347: strip xml namespaces
+                                int prefixlen = key.lastIndexOf(':') + 1;
+                                String prefix = key.substring(0, 
prefixlen).toLowerCase();
+                                key = (prefix.equals("xmlns:") || 
prefix.equals("xml:") ? prefix : "")
+                                        + key.substring(prefixlen);
+
                                 if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*")) 
{
                                     attr.setKey(key);
                                 } else {
                                     it.remove();
                                 }
                             }
+
+                            String tagName = 
((Element)node).tagName().replaceAll("[^-a-zA-Z0-9_:.]", "");
+                            tagName = 
tagName.substring(tagName.lastIndexOf(':') + 1);
+                            
((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName 
: "div");
+
                             return FilterResult.CONTINUE;
                         }
                         return node instanceof DataNode || node instanceof 
Comment || node instanceof DocumentType

http://git-wip-us.apache.org/repos/asf/any23/blob/3475ebd6/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java
 
b/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java
index a187077..21935cb 100644
--- 
a/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java
+++ 
b/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java
@@ -41,7 +41,7 @@ public abstract class AbstractRDFaExtractorTestCase extends
        /**
         * Verify the basic RDFa support.
         *
-        * @throws org.openrdf.repository.RepositoryException
+        * @throws org.eclipse.rdf4j.repository.RepositoryException
         */
        @Test
        public void testBasic() throws Exception {
@@ -118,7 +118,7 @@ public abstract class AbstractRDFaExtractorTestCase extends
         * 
href="http://files.openspring.net/tmp/drupal-test-frontpage.html">Drupal
         * test page</a>.
         *
-        * @throws org.openrdf.repository.RepositoryException
+        * @throws org.eclipse.rdf4j.repository.RepositoryException
         */
        @Test
        public void testDrupalTestPage() throws Exception {

http://git-wip-us.apache.org/repos/asf/any23/blob/3475ebd6/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java 
b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
index 7849f50..8c65df9 100644
--- 
a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
+++ 
b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
@@ -79,6 +79,17 @@ public class RDFa11ExtractorTest extends 
AbstractRDFaExtractorTestCase {
     }
 
     @Test
+    public void testBasicWithSyntaxErrors() {
+        //test issues ANY23-347 and ANY23-350
+        assertExtract("/html/rdfa/basic-with-errors.html");
+        assertContains(null, vDCTERMS.creator, RDFUtils.literal("Alice", 
"en"));
+        assertContains(null, vDCTERMS.title,
+                RDFUtils.literal("The trouble with Bob", "en"));
+        assertContains(null, RDFUtils.iri("http://fake.org/prop"),
+                RDFUtils.literal("Mary", "en"));
+    }
+
+    @Test
     public void testIssue326() {
         assertExtract("/html/rdfa/rdfa-issue326-and-267.html");
     }

http://git-wip-us.apache.org/repos/asf/any23/blob/3475ebd6/test-resources/src/test/resources/html/rdfa/basic-with-errors.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/html/rdfa/basic-with-errors.html 
b/test-resources/src/test/resources/html/rdfa/basic-with-errors.html
new file mode 100644
index 0000000..b0c4ad3
--- /dev/null
+++ b/test-resources/src/test/resources/html/rdfa/basic-with-errors.html
@@ -0,0 +1,29 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
+<head>
+    <link rel=\"shortcut icon\">
+</head>
+<body>
+<div xmlns:dc="http://purl.org/dc/terms/" xmlns:fake="http://fake.org/" 
pw:twitter-via="AgendaCulturel" pw:share-popups="true" pw:#="should remove">
+    <pw:h2 property="dc:title">The trouble with Bob</pw:h2>
+    <pw:" property="dc:creator">Alice</pw:">
+    <h3" property="fake:prop">Mary</h3">
+    ...
+</div>
+</body>
+</html>
\ No newline at end of file

Reply via email to