Repository: any23
Updated Branches:
  refs/heads/master d283d70ce -> 6173637bb


ANY23-376 fix IllegalArgumentException in microdata extractor


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/6173637b
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/6173637b
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/6173637b

Branch: refs/heads/master
Commit: 6173637bb801da62b07b69be64fa2c75f8d54904
Parents: d283d70
Author: Hans <[email protected]>
Authored: Tue Jul 31 15:35:55 2018 -0500
Committer: Hans <[email protected]>
Committed: Tue Jul 31 15:35:55 2018 -0500

----------------------------------------------------------------------
 .../extractor/microdata/MicrodataParser.java    |  11 +-
 .../microdata/MicrodataExtractorTest.java       |  15 ++-
 .../microdata-bad-properties-expected.nquads    |  84 +++++++++++++
 .../microdata/microdata-bad-properties.html     | 125 +++++++++++++++++++
 4 files changed, 231 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java 
b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
index 32faec3..f305620 100644
--- 
a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
+++ 
b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
@@ -17,6 +17,7 @@
 package org.apache.any23.extractor.microdata;
 
 import org.apache.any23.extractor.html.DomUtils;
+import org.apache.commons.lang.StringUtils;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -394,9 +395,15 @@ public class MicrodataParser {
         while (treeWalker.nextNode() != null);
 
         final List<ItemProp> result = new ArrayList<>();
-        for(Node itemPropNode :  accepted) {
+        for (Node itemPropNode : accepted) {
             final String itemProp = DomUtils.readAttribute(itemPropNode, 
ITEMPROP_ATTRIBUTE, null);
-            final String[] propertyNames = itemProp.split(" ");
+
+            if (StringUtils.isBlank(itemProp)) {
+                manageError(new MicrodataParserException("invalid property name '" + itemProp + "'", itemPropNode));
+                continue;
+            }
+
+            final String[] propertyNames = itemProp.trim().split("\\s+");
             ItemPropValue itemPropValue;
             for (String propertyName : propertyNames) {
                 try {

http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
 
b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
index 280b3f7..e858ea3 100644
--- 
a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
+++ 
b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
@@ -19,6 +19,7 @@ package org.apache.any23.extractor.microdata;
 
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.extractor.html.AbstractExtractorTestCase;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
@@ -89,7 +90,6 @@ public class MicrodataExtractorTest extends 
AbstractExtractorTestCase {
         assertExtract("/microdata/microdata-missing-scheme.html");
         assertModelNotEmpty();
         assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer"));
-        System.out.println(dumpHumanReadableTriples());
     }
 
     /**
@@ -206,9 +206,20 @@ public class MicrodataExtractorTest extends 
AbstractExtractorTestCase {
         extractAndVerifyAgainstNQuads("microdata-bad-types.html", 
"microdata-bad-types-expected.nquads");
     }
 
+    @Test
+    public void testBadPropertyNames() throws IOException {
+        extractAndVerifyAgainstNQuads("microdata-bad-properties.html", 
"microdata-bad-properties-expected.nquads", false);
+        assertIssue(IssueReport.IssueLevel.ERROR, ".*invalid property name ''.*\"path\" : \"/HTML\\[1\\]/BODY\\[1\\]/DIV\\[1\\]/DIV\\[2\\]/DIV\\[1\\]\".*");
+    }
+
     private void extractAndVerifyAgainstNQuads(String actual, String expected)
+            throws RepositoryException, RDFHandlerException, IOException, 
RDFParseException {
+        extractAndVerifyAgainstNQuads(actual, expected, true);
+    }
+
+    private void extractAndVerifyAgainstNQuads(String actual, String expected, 
boolean assertNoIssues)
     throws RepositoryException, RDFHandlerException, IOException, 
RDFParseException {
-        assertExtract("/microdata/" + actual);
+        assertExtract("/microdata/" + actual, assertNoIssues);
         assertModelNotEmpty();
         logger.debug( dumpModelToNQuads() );
         List<Statement> expectedStatements = loadResultStatement("/microdata/" 
+ expected);

http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads
 
b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads
new file mode 100644
index 0000000..e5b6f29
--- /dev/null
+++ 
b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads
@@ -0,0 +1,84 @@
+#
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+_:node1cjov1p83x2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/endDate> "2018-07-29T17:00:00-07:00" 
<http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/name> "Midwest Fire Fest" 
<http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/description> "Come to the most unique 
festival in the Midwest" <http://bob.example.com/> .
+_:node1cjov1p83x3 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x3 <http://schema.org/hasMap> "http://maps.google.com/?q=300+Water+St%2C+Cambridge%2C+WI+53523" <http://bob.example.com/> .
+_:node1cjov1p83x3 <http://schema.org/name> "Westside Park" 
<http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/location> _:node1cjov1p83x3 
<http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/url> 
<https://cambridgewi.com/events-calendar/?event_rdate=20180729090000%2C20180729170000>
 <http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/startDate> "2018-07-29T09:00:00-07:00" 
<http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node1cjov1p83x2 <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/endDate> "2018-07-31T13:00:00-07:00" 
<http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/name> "Cambridge Senior Meals" 
<http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/description> "Cambridge Senior Meals are 
served at Noon every Tuesday and Friday" <http://bob.example.com/> .
+_:node1cjov1p83x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x6 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/PostalAddress> <http://bob.example.com/> .
+_:node1cjov1p83x6 <http://schema.org/streetAddress> "200 Spring Steet" 
<http://bob.example.com/> .
+_:node1cjov1p83x6 <http://schema.org/postalCode> "53523" 
<http://bob.example.com/> .
+_:node1cjov1p83x6 <http://schema.org/addressLocality> "Cambridge" 
<http://bob.example.com/> .
+_:node1cjov1p83x6 <http://schema.org/addressRegion> "WI" 
<http://bob.example.com/> .
+_:node1cjov1p83x5 <http://schema.org/address> _:node1cjov1p83x6 
<http://bob.example.com/> .
+_:node1cjov1p83x5 <http://schema.org/hasMap> "http://maps.google.com/?q=200+Spring+Street%2C+Cambridge%2C+WI+53523" <http://bob.example.com/> .
+_:node1cjov1p83x5 <http://schema.org/name> "Amundson Center" 
<http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/location> _:node1cjov1p83x5 
<http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/url> 
<https://cambridgewi.com/events-calendar/?event_rdate=20180731120000%2C20180731130000>
 <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/startDate> "2018-07-31T12:00:00-07:00" 
<http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node1cjov1p83x4 <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/endDate> "2018-07-31T19:00:00-07:00" 
<http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/name> "Begin to Knit Classes" 
<http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/description> "Learn to knit at 
Kaleidoscope Fibers - Cambridge's speciality yarn,..." 
<http://bob.example.com/> .
+_:node1cjov1p83x8 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x9 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/PostalAddress> <http://bob.example.com/> .
+_:node1cjov1p83x9 <http://schema.org/streetAddress> "Null" 
<http://bob.example.com/> .
+_:node1cjov1p83x8 <http://schema.org/address> _:node1cjov1p83x9 
<http://bob.example.com/> .
+_:node1cjov1p83x8 <http://schema.org/name> "Kaleidoscope Fibers (131 W. Main 
Street" <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/location> _:node1cjov1p83x8 
<http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/url> 
<https://cambridgewi.com/events-calendar/?event_rdate=20180731170000%2C20180731190000>
 <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/startDate> "2018-07-31T17:00:00-07:00" 
<http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node1cjov1p83x7 <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/endDate> "2018-08-01T15:00:00-07:00" 
<http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/name> "Cambridge Historic School Museum 
Tour" <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/description> "Built in 1906, the 
Cambridge Historic School - listed on the..." <http://bob.example.com/> .
+_:node1cjov1p83x11 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x12 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/PostalAddress> <http://bob.example.com/> .
+_:node1cjov1p83x12 <http://schema.org/streetAddress> "Null" 
<http://bob.example.com/> .
+_:node1cjov1p83x11 <http://schema.org/address> _:node1cjov1p83x12 
<http://bob.example.com/> .
+_:node1cjov1p83x11 <http://schema.org/name> "Cambridge Historic School" 
<http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/location> _:node1cjov1p83x11 
<http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/url> 
<https://cambridgewi.com/events-calendar/?event_rdate=20180801123000%2C20180801150000>
 <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/startDate> "2018-08-01T12:30:00-07:00" 
<http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node1cjov1p83x10 <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/endDate> "2018-08-01T15:00:00-07:00" 
<http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/name> "Begin to Knit Classes" 
<http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/description> "Learn to knit at 
Kaleidoscope Fibers - Cambridge's speciality yarn,..." 
<http://bob.example.com/> .
+_:node1cjov1p83x14 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x15 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/PostalAddress> <http://bob.example.com/> .
+_:node1cjov1p83x15 <http://schema.org/streetAddress> "Null" 
<http://bob.example.com/> .
+_:node1cjov1p83x14 <http://schema.org/address> _:node1cjov1p83x15 
<http://bob.example.com/> .
+_:node1cjov1p83x14 <http://schema.org/name> "Kaleidoscope Fibers (131 W. Main 
Street" <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/location> _:node1cjov1p83x14 
<http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/url> 
<https://cambridgewi.com/events-calendar/?event_rdate=20180801130000%2C20180801150000>
 <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/startDate> "2018-08-01T13:00:00-07:00" 
<http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node1cjov1p83x13 <http://bob.example.com/> .

http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/test-resources/src/test/resources/microdata/microdata-bad-properties.html
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/microdata/microdata-bad-properties.html 
b/test-resources/src/test/resources/microdata/microdata-bad-properties.html
new file mode 100644
index 0000000..23d4e80
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/microdata-bad-properties.html
@@ -0,0 +1,125 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- Excerpted from: https://cambridgewi.com/events-calendar/ -->
+<html>
+
+<head></head>
+
+<body>
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180729090000%2C20180729170000" itemprop="url"><span itemprop="name">Midwest Fire Fest</span></a>
+            <div><span>Jul 29, 2018</span>&nbsp;<span>9:00am</span></div>
+        </div>
+        <div itemprop="description">Come to the most unique festival in the 
Midwest</div>
+    </div>
+    <meta itemprop=" startDate " content="2018-07-29T09:00:00-07:00">
+    <meta itemprop=" endDate " content="2018-07-29T17:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop=" name" content="Westside Park">
+        <meta itemprop="hasMap " content="http://maps.google.com/?q=300+Water+St%2C+Cambridge%2C+WI+53523">
+        <div itemprop="" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="300 Water Street">
+            <meta itemprop="addressLocality" content="Cambridge">
+            <meta itemprop="addressRegion" content="WI">
+            <meta itemprop="postalCode" content="53523">
+        </div>
+    </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180731120000%2C20180731130000" itemprop="url"><span itemprop="name">Cambridge Senior Meals</span></a>
+            <div><span>Jul 31, 2018</span>&nbsp;<span>12:00pm</span></div>
+        </div>
+        <div itemprop="description">Cambridge Senior Meals are served&nbsp;at 
Noon every Tuesday and Friday</div>
+    </div>
+    <meta itemprop="startDate" content="2018-07-31T12:00:00-07:00">
+    <meta itemprop="endDate" content="2018-07-31T13:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop="name" content="Amundson Center">
+        <meta itemprop="hasMap" content="http://maps.google.com/?q=200+Spring+Street%2C+Cambridge%2C+WI+53523">
+        <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="200 Spring Steet">
+            <meta itemprop="addressLocality" content="Cambridge">
+            <meta itemprop="addressRegion" content="WI">
+            <meta itemprop="postalCode" content="53523">
+        </div>
+    </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180731170000%2C20180731190000" itemprop="url"><span itemprop="name">Begin to Knit Classes</span></a>
+            <div><span>Jul 31, 2018</span>&nbsp;<span>5:00pm</span></div>
+
+        </div>
+        <div itemprop="description">Learn to knit at Kaleidoscope Fibers - 
Cambridge's speciality yarn,...</div>
+    </div>
+    <meta itemprop="startDate" content="2018-07-31T17:00:00-07:00">
+    <meta itemprop="endDate" content="2018-07-31T19:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop="name" content="Kaleidoscope Fibers (131 W. Main Street">
+        <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="">
+        </div>
+    </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180801123000%2C20180801150000" itemprop="url"><span itemprop="name">Cambridge Historic School Museum Tour</span></a>
+            <div><span>Aug 1, 2018</span>&nbsp;<span>12:30pm</span></div>
+        </div>
+        <div itemprop="description">Built in 1906, the Cambridge Historic 
School -&nbsp;listed on the...</div>
+    </div>
+    <div class="rhc-clear"></div>
+    <meta itemprop="startDate" content="2018-08-01T12:30:00-07:00">
+    <meta itemprop="endDate" content="2018-08-01T15:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop="name" content="Cambridge Historic School">
+        <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="">
+        </div>
+    </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180801130000%2C20180801150000" itemprop="url"><span itemprop="name">Begin to Knit Classes</span></a>
+            <div><span>Aug 1, 2018</span>&nbsp;<span>1:00pm</span></div>
+        </div>
+        <div itemprop="description">Learn to knit at Kaleidoscope Fibers - 
Cambridge's speciality yarn,...</div>
+    </div>
+    <meta itemprop="startDate" content="2018-08-01T13:00:00-07:00">
+    <meta itemprop="endDate" content="2018-08-01T15:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop="name" content="Kaleidoscope Fibers (131 W. Main Street">
+        <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="">
+        </div>
+    </div>
+</div>
+
+</body>
+</html>
\ No newline at end of file

Reply via email to