Repository: any23
Updated Branches:
  refs/heads/master 31e1142d1 -> c3abfe1c0


ANY23-350: Fixed RDFParseException caused by bad attribute names/values in
malformed HTML.


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/c3abfe1c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/c3abfe1c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/c3abfe1c

Branch: refs/heads/master
Commit: c3abfe1c0cc51dbdd0404b313a2691ab62b33f78
Parents: 31e1142
Author: Hans <[email protected]>
Authored: Wed Jun 27 17:19:53 2018 -0500
Committer: Hans <[email protected]>
Committed: Wed Jun 27 17:19:53 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/rdf/BaseRDFExtractor.java       | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/c3abfe1c/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index e09e20a..3391c33 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -29,10 +29,12 @@ import org.eclipse.rdf4j.rio.RDFParser;
 import org.eclipse.rdf4j.rio.RDFHandlerException;
 import org.eclipse.rdf4j.rio.RioSetting;
 import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
+import org.jsoup.nodes.Attribute;
 import org.jsoup.nodes.Comment;
 import org.jsoup.nodes.DataNode;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.DocumentType;
+import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Entities;
 import org.jsoup.nodes.Node;
 import org.jsoup.select.NodeFilter;
@@ -47,6 +49,7 @@ import java.io.PushbackInputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
+import java.util.Iterator;
 
 /**
  * Base class for a generic <i>RDF</i>
@@ -141,6 +144,19 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
                 NodeTraversor.filter(new NodeFilter() {
                     @Override
                     public FilterResult head(Node node, int depth) {
+                        if (node instanceof Element) {
+                            for (Iterator<Attribute> it = 
node.attributes().iterator(); it.hasNext(); ) {
+                                // fix for ANY23-350: valid XML attribute 
names match ^[a-zA-Z_:][-a-zA-Z0-9_:.]*$
+                                Attribute attr = it.next();
+                                String key = 
attr.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
+                                if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*")) 
{
+                                    attr.setKey(key);
+                                } else {
+                                    it.remove();
+                                }
+                            }
+                            return FilterResult.CONTINUE;
+                        }
                         return node instanceof DataNode || node instanceof 
Comment || node instanceof DocumentType
                                 ? FilterResult.REMOVE : FilterResult.CONTINUE;
                     }

Reply via email to