Repository: any23 Updated Branches: refs/heads/master d283d70ce -> 6173637bb
ANY23-376 fix IllegalArgumentException in microdata extractor Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/6173637b Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/6173637b Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/6173637b Branch: refs/heads/master Commit: 6173637bb801da62b07b69be64fa2c75f8d54904 Parents: d283d70 Author: Hans <[email protected]> Authored: Tue Jul 31 15:35:55 2018 -0500 Committer: Hans <[email protected]> Committed: Tue Jul 31 15:35:55 2018 -0500 ---------------------------------------------------------------------- .../extractor/microdata/MicrodataParser.java | 11 +- .../microdata/MicrodataExtractorTest.java | 15 ++- .../microdata-bad-properties-expected.nquads | 84 +++++++++++++ .../microdata/microdata-bad-properties.html | 125 +++++++++++++++++++ 4 files changed, 231 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java index 32faec3..f305620 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java @@ -17,6 +17,7 @@ package org.apache.any23.extractor.microdata; import org.apache.any23.extractor.html.DomUtils; +import org.apache.commons.lang.StringUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; @@ -394,9 +395,15 @@ public class MicrodataParser { while (treeWalker.nextNode() != null); final List<ItemProp> result = new ArrayList<>(); - for(Node itemPropNode : accepted) { + for (Node itemPropNode : accepted) { final String itemProp = DomUtils.readAttribute(itemPropNode, ITEMPROP_ATTRIBUTE, null); - final String[] propertyNames = itemProp.split(" "); + + if (StringUtils.isBlank(itemProp)) { + manageError(new MicrodataParserException("invalid property name '" + itemProp + "'", itemPropNode)); + continue; + } + + final String[] propertyNames = itemProp.trim().split("\\s+"); ItemPropValue itemPropValue; for (String propertyName : propertyNames) { try { http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java index 280b3f7..e858ea3 100644 --- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java @@ -19,6 +19,7 @@ package org.apache.any23.extractor.microdata; import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.IssueReport; import org.apache.any23.extractor.html.AbstractExtractorTestCase; import org.apache.any23.rdf.RDFUtils; import org.apache.any23.vocab.SINDICE; @@ -89,7 +90,6 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase { assertExtract("/microdata/microdata-missing-scheme.html"); assertModelNotEmpty(); assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer")); - System.out.println(dumpHumanReadableTriples()); } /** @@ -206,9 +206,20 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase { extractAndVerifyAgainstNQuads("microdata-bad-types.html", "microdata-bad-types-expected.nquads"); } + @Test + public void testBadPropertyNames() throws IOException { + extractAndVerifyAgainstNQuads("microdata-bad-properties.html", "microdata-bad-properties-expected.nquads", false); + assertIssue(IssueReport.IssueLevel.ERROR, ".*invalid property name ''.*\"path\" : \"/HTML\\[1\\]/BODY\\[1\\]/DIV\\[1\\]/DIV\\[2\\]/DIV\\[1\\]\".*"); + } + private void extractAndVerifyAgainstNQuads(String actual, String expected) + throws RepositoryException, RDFHandlerException, IOException, RDFParseException { + extractAndVerifyAgainstNQuads(actual, expected, true); + } + + private void extractAndVerifyAgainstNQuads(String actual, String expected, boolean assertNoIssues) throws RepositoryException, RDFHandlerException, IOException, RDFParseException { - assertExtract("/microdata/" + actual); + assertExtract("/microdata/" + actual, assertNoIssues); assertModelNotEmpty(); logger.debug( dumpModelToNQuads() ); List<Statement> expectedStatements = loadResultStatement("/microdata/" + expected); http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads new file mode 100644 index 0000000..e5b6f29 --- /dev/null +++ b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads @@ -0,0 +1,84 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +_:node1cjov1p83x2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> . +_:node1cjov1p83x2 <http://schema.org/endDate> "2018-07-29T17:00:00-07:00" <http://bob.example.com/> . +_:node1cjov1p83x2 <http://schema.org/name> "Midwest Fire Fest" <http://bob.example.com/> . +_:node1cjov1p83x2 <http://schema.org/description> "Come to the most unique festival in the Midwest" <http://bob.example.com/> . +_:node1cjov1p83x3 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> . +_:node1cjov1p83x3 <http://schema.org/hasMap> "http://maps.google.com/?q=300+Water+St%2C+Cambridge%2C+WI+53523" <http://bob.example.com/> . +_:node1cjov1p83x3 <http://schema.org/name> "Westside Park" <http://bob.example.com/> . +_:node1cjov1p83x2 <http://schema.org/location> _:node1cjov1p83x3 <http://bob.example.com/> . +_:node1cjov1p83x2 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180729090000%2C20180729170000> <http://bob.example.com/> . +_:node1cjov1p83x2 <http://schema.org/startDate> "2018-07-29T09:00:00-07:00" <http://bob.example.com/> . +<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x2 <http://bob.example.com/> . +_:node1cjov1p83x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> . +_:node1cjov1p83x4 <http://schema.org/endDate> "2018-07-31T13:00:00-07:00" <http://bob.example.com/> . +_:node1cjov1p83x4 <http://schema.org/name> "Cambridge Senior Meals" <http://bob.example.com/> . +_:node1cjov1p83x4 <http://schema.org/description> "Cambridge Senior Meals are served at Noon every Tuesday and Friday" <http://bob.example.com/> . +_:node1cjov1p83x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> . +_:node1cjov1p83x6 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress> <http://bob.example.com/> . +_:node1cjov1p83x6 <http://schema.org/streetAddress> "200 Spring Steet" <http://bob.example.com/> . +_:node1cjov1p83x6 <http://schema.org/postalCode> "53523" <http://bob.example.com/> . +_:node1cjov1p83x6 <http://schema.org/addressLocality> "Cambridge" <http://bob.example.com/> . +_:node1cjov1p83x6 <http://schema.org/addressRegion> "WI" <http://bob.example.com/> . +_:node1cjov1p83x5 <http://schema.org/address> _:node1cjov1p83x6 <http://bob.example.com/> . +_:node1cjov1p83x5 <http://schema.org/hasMap> "http://maps.google.com/?q=200+Spring+Street%2C+Cambridge%2C+WI+53523" <http://bob.example.com/> . +_:node1cjov1p83x5 <http://schema.org/name> "Amundson Center" <http://bob.example.com/> . +_:node1cjov1p83x4 <http://schema.org/location> _:node1cjov1p83x5 <http://bob.example.com/> . +_:node1cjov1p83x4 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180731120000%2C20180731130000> <http://bob.example.com/> . +_:node1cjov1p83x4 <http://schema.org/startDate> "2018-07-31T12:00:00-07:00" <http://bob.example.com/> . +<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x4 <http://bob.example.com/> . +_:node1cjov1p83x7 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> . +_:node1cjov1p83x7 <http://schema.org/endDate> "2018-07-31T19:00:00-07:00" <http://bob.example.com/> . +_:node1cjov1p83x7 <http://schema.org/name> "Begin to Knit Classes" <http://bob.example.com/> . +_:node1cjov1p83x7 <http://schema.org/description> "Learn to knit at Kaleidoscope Fibers - Cambridge's speciality yarn,..." <http://bob.example.com/> . +_:node1cjov1p83x8 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> . +_:node1cjov1p83x9 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress> <http://bob.example.com/> . +_:node1cjov1p83x9 <http://schema.org/streetAddress> "Null" <http://bob.example.com/> . +_:node1cjov1p83x8 <http://schema.org/address> _:node1cjov1p83x9 <http://bob.example.com/> . +_:node1cjov1p83x8 <http://schema.org/name> "Kaleidoscope Fibers (131 W. Main Street" <http://bob.example.com/> . +_:node1cjov1p83x7 <http://schema.org/location> _:node1cjov1p83x8 <http://bob.example.com/> . +_:node1cjov1p83x7 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180731170000%2C20180731190000> <http://bob.example.com/> . +_:node1cjov1p83x7 <http://schema.org/startDate> "2018-07-31T17:00:00-07:00" <http://bob.example.com/> . +<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x7 <http://bob.example.com/> . +_:node1cjov1p83x10 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> . +_:node1cjov1p83x10 <http://schema.org/endDate> "2018-08-01T15:00:00-07:00" <http://bob.example.com/> . +_:node1cjov1p83x10 <http://schema.org/name> "Cambridge Historic School Museum Tour" <http://bob.example.com/> . +_:node1cjov1p83x10 <http://schema.org/description> "Built in 1906, the Cambridge Historic School - listed on the..." <http://bob.example.com/> . +_:node1cjov1p83x11 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> . +_:node1cjov1p83x12 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress> <http://bob.example.com/> . +_:node1cjov1p83x12 <http://schema.org/streetAddress> "Null" <http://bob.example.com/> . +_:node1cjov1p83x11 <http://schema.org/address> _:node1cjov1p83x12 <http://bob.example.com/> . +_:node1cjov1p83x11 <http://schema.org/name> "Cambridge Historic School" <http://bob.example.com/> . +_:node1cjov1p83x10 <http://schema.org/location> _:node1cjov1p83x11 <http://bob.example.com/> . +_:node1cjov1p83x10 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180801123000%2C20180801150000> <http://bob.example.com/> . +_:node1cjov1p83x10 <http://schema.org/startDate> "2018-08-01T12:30:00-07:00" <http://bob.example.com/> . +<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x10 <http://bob.example.com/> . +_:node1cjov1p83x13 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> . +_:node1cjov1p83x13 <http://schema.org/endDate> "2018-08-01T15:00:00-07:00" <http://bob.example.com/> . +_:node1cjov1p83x13 <http://schema.org/name> "Begin to Knit Classes" <http://bob.example.com/> . +_:node1cjov1p83x13 <http://schema.org/description> "Learn to knit at Kaleidoscope Fibers - Cambridge's speciality yarn,..." <http://bob.example.com/> . +_:node1cjov1p83x14 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> . +_:node1cjov1p83x15 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress> <http://bob.example.com/> . +_:node1cjov1p83x15 <http://schema.org/streetAddress> "Null" <http://bob.example.com/> . +_:node1cjov1p83x14 <http://schema.org/address> _:node1cjov1p83x15 <http://bob.example.com/> . +_:node1cjov1p83x14 <http://schema.org/name> "Kaleidoscope Fibers (131 W. Main Street" <http://bob.example.com/> . +_:node1cjov1p83x13 <http://schema.org/location> _:node1cjov1p83x14 <http://bob.example.com/> . +_:node1cjov1p83x13 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180801130000%2C20180801150000> <http://bob.example.com/> . +_:node1cjov1p83x13 <http://schema.org/startDate> "2018-08-01T13:00:00-07:00" <http://bob.example.com/> . +<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x13 <http://bob.example.com/> . http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/test-resources/src/test/resources/microdata/microdata-bad-properties.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/microdata-bad-properties.html b/test-resources/src/test/resources/microdata/microdata-bad-properties.html new file mode 100644 index 0000000..23d4e80 --- /dev/null +++ b/test-resources/src/test/resources/microdata/microdata-bad-properties.html @@ -0,0 +1,125 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- Excerpted from: https://cambridgewi.com/events-calendar/ --> +<html> + +<head></head> + +<body> +<div itemscope="" itemtype="http://schema.org/Event"> + <div> + <div> + <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180729090000%2C20180729170000" itemprop="url"><span itemprop="name">Midwest Fire Fest</span></a> + <div><span>Jul 29, 2018</span> <span>9:00am</span></div> + </div> + <div itemprop="description">Come to the most unique festival in the Midwest</div> + </div> + <meta itemprop=" startDate " content="2018-07-29T09:00:00-07:00"> + <meta itemprop=" endDate " content="2018-07-29T17:00:00-07:00"> + <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place"> + <meta itemprop=" name" content="Westside Park"> + <meta itemprop="hasMap " content="http://maps.google.com/?q=300+Water+St%2C+Cambridge%2C+WI+53523"> + <div itemprop="" itemscope="itemscope" itemtype="http://schema.org/PostalAddress"> + <meta itemprop="streetAddress" content="300 Water Street"> + <meta itemprop="addressLocality" content="Cambridge"> + <meta itemprop="addressRegion" content="WI"> + <meta itemprop="postalCode" content="53523"> + </div> + </div> +</div> + +<div itemscope="" itemtype="http://schema.org/Event"> + <div> + <div> + <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180731120000%2C20180731130000" itemprop="url"><span itemprop="name">Cambridge Senior Meals</span></a> + <div><span>Jul 31, 2018</span> <span>12:00pm</span></div> + </div> + <div itemprop="description">Cambridge Senior Meals are served at Noon every Tuesday and Friday</div> + </div> + <meta itemprop="startDate" content="2018-07-31T12:00:00-07:00"> + <meta itemprop="endDate" content="2018-07-31T13:00:00-07:00"> + <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place"> + <meta itemprop="name" content="Amundson Center"> + <meta itemprop="hasMap" content="http://maps.google.com/?q=200+Spring+Street%2C+Cambridge%2C+WI+53523"> + <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress"> + <meta itemprop="streetAddress" content="200 Spring Steet"> + <meta itemprop="addressLocality" content="Cambridge"> + <meta itemprop="addressRegion" content="WI"> + <meta itemprop="postalCode" content="53523"> + </div> + </div> +</div> + +<div itemscope="" itemtype="http://schema.org/Event"> + <div> + <div> + <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180731170000%2C20180731190000" itemprop="url"><span itemprop="name">Begin to Knit Classes</span></a> + <div><span>Jul 31, 2018</span> <span>5:00pm</span></div> + + </div> + <div itemprop="description">Learn to knit at Kaleidoscope Fibers - Cambridge's speciality yarn,...</div> + </div> + <meta itemprop="startDate" content="2018-07-31T17:00:00-07:00"> + <meta itemprop="endDate" content="2018-07-31T19:00:00-07:00"> + <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place"> + <meta itemprop="name" content="Kaleidoscope Fibers (131 W. Main Street"> + <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress"> + <meta itemprop="streetAddress" content=""> + </div> + </div> +</div> + +<div itemscope="" itemtype="http://schema.org/Event"> + <div> + <div> + <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180801123000%2C20180801150000" itemprop="url"><span itemprop="name">Cambridge Historic School Museum Tour</span></a> + <div><span>Aug 1, 2018</span> <span>12:30pm</span></div> + </div> + <div itemprop="description">Built in 1906, the Cambridge Historic School - listed on the...</div> + </div> + <div class="rhc-clear"></div> + <meta itemprop="startDate" content="2018-08-01T12:30:00-07:00"> + <meta itemprop="endDate" content="2018-08-01T15:00:00-07:00"> + <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place"> + <meta itemprop="name" content="Cambridge Historic School"> + <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress"> + <meta itemprop="streetAddress" content=""> + </div> + </div> +</div> + +<div itemscope="" itemtype="http://schema.org/Event"> + <div> + <div> + <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180801130000%2C20180801150000" itemprop="url"><span itemprop="name">Begin to Knit Classes</span></a> + <div><span>Aug 1, 2018</span> <span>1:00pm</span></div> + </div> + <div itemprop="description">Learn to knit at Kaleidoscope Fibers - Cambridge's speciality yarn,...</div> + </div> + <meta itemprop="startDate" content="2018-08-01T13:00:00-07:00"> + <meta itemprop="endDate" content="2018-08-01T15:00:00-07:00"> + <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place"> + <meta itemprop="name" content="Kaleidoscope Fibers (131 W. Main Street"> + <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress"> + <meta itemprop="streetAddress" content=""> + </div> + </div> +</div> + +</body> +</html> \ No newline at end of file
