Repository: any23 Updated Branches: refs/heads/master 5b93f21ec -> 36682ccdf
ANY23-154 allow unused itemprops Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/36682ccd Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/36682ccd Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/36682ccd Branch: refs/heads/master Commit: 36682ccdfbddcd924cb5840e25d956f581e7125f Parents: 5b93f21 Author: Hans <[email protected]> Authored: Sun Oct 28 20:13:31 2018 -0500 Committer: Hans <[email protected]> Committed: Sun Oct 28 20:13:31 2018 -0500 ---------------------------------------------------------------------- .../extractor/microdata/ItemPropValue.java | 3 +- .../extractor/microdata/MicrodataParser.java | 68 ++++++++++++++++---- .../microdata/MicrodataExtractorTest.java | 7 ++ .../resources/microdata/unused-itemprop.html | 30 +++++++++ 4 files changed, 93 insertions(+), 15 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/36682ccd/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java index 2b6659a..8b5bffd 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java @@ -97,6 +97,7 @@ public class ItemPropValue { public ItemPropValue(Object content, Type type) { this.type = Objects.requireNonNull(type, "type cannot be null"); this.content = type.checkClass(content); + this.literal = null; } ItemPropValue(Literal literal) { @@ -122,7 +123,7 @@ public class ItemPropValue { this.content = content; } - Literal literal; + final Literal literal; /** * @return the content object. http://git-wip-us.apache.org/repos/asf/any23/blob/36682ccd/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java index f59bbdb..8964b32 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java @@ -41,6 +41,7 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; /** * This class provides utility methods for handling <b>Microdata</b> @@ -162,6 +163,27 @@ public class MicrodataParser { return DomUtils.readAttribute(node, ITEMPROP_ATTRIBUTE, null) != null; } + private static boolean isContainedInItemScope(Node node) { + for (Node p = node.getParentNode(); p != null; p = p.getParentNode()) { + NamedNodeMap attrs = p.getAttributes(); + if (attrs != null && attrs.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) { + return true; + } + } + return false; + } + + private static boolean isContainedInId(Node node, Set<String> ids) { + do { + String id = DomUtils.readAttribute(node, "id", null); + if (id != null && ids.contains(id)) { + return true; + } + node = node.getParentNode(); + } while (node != null); + return false; + } + /** * Returns only the <i>itemScope</i>s that are top level items. * @@ -171,13 +193,27 @@ public class MicrodataParser { public static List<Node> getTopLevelItemScopeNodes(Node node) { final List<Node> itemScopes = getItemScopeNodes(node); final List<Node> topLevelItemScopes = new ArrayList<>(); - for(Node itemScope : itemScopes) { - if( ! isItemProp(itemScope) ) { + final List<Node> possibles = new ArrayList<>(); + for (Node itemScope : itemScopes) { + if (!isItemProp(itemScope)) { topLevelItemScopes.add(itemScope); + } else if (!isContainedInItemScope(itemScope)) { + possibles.add(itemScope); + } + } + + if (!possibles.isEmpty()) { + Set<String> refIds = itemScopes.stream() + .flatMap(n -> Arrays.stream(itemrefIds(n))) + .collect(Collectors.toSet()); + + for (Node itemScope : possibles) { + if (!isContainedInId(itemScope, refIds)) { + topLevelItemScopes.add(itemScope); + } } } - // ANY23-131 Nested Microdata are not extracted - //return getUnnestedNodes( topLevelItemScopes ); + return topLevelItemScopes; } @@ -470,15 +506,14 @@ public class MicrodataParser { continue; } - final String[] propertyNames = itemProp.trim().split("\\s+"); ItemPropValue itemPropValue; - for (String propertyName : propertyNames) { - try { - itemPropValue = getPropertyValue(itemPropNode); - } catch (MicrodataParserException mpe) { - manageError(mpe); - continue; - } + try { + itemPropValue = getPropertyValue(itemPropNode); + } catch (MicrodataParserException mpe) { + manageError(mpe); + continue; + } + for (String propertyName : itemProp.trim().split("\\s+")) { result.add( new ItemProp( DomUtils.getXPathForNode(itemPropNode), @@ -537,6 +572,12 @@ public class MicrodataParser { return result.toArray( new ItemProp[result.size()] ); } + private static final String[] EMPTY_STRINGS = new String[0]; + private static String[] itemrefIds(Node node) { + String itemref = DomUtils.readAttribute(node, "itemref" , null); + return StringUtils.isBlank(itemref) ? EMPTY_STRINGS : itemref.trim().split("\\s+"); + } + /** * Returns the {@link ItemScope} instance described within the specified <code>node</code>. * @@ -550,12 +591,11 @@ public class MicrodataParser { return itemScope; final String id = DomUtils.readAttribute(node, "id" , null); - final String itemref = DomUtils.readAttribute(node, "itemref" , null); final String itemType = DomUtils.readAttribute(node, "itemtype", null); final String itemId = DomUtils.readAttribute(node, "itemid" , null); final List<ItemProp> itemProps = getItemProps(node, true); - final String[] itemrefIDs = itemref == null ? new String[0] : itemref.split("\\s+"); + final String[] itemrefIDs = itemrefIds(node); final ItemProp[] deferredProperties; try { deferredProperties = deferProperties(itemrefIDs); http://git-wip-us.apache.org/repos/asf/any23/blob/36682ccd/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java index 9d7a079..11aa353 100644 --- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java @@ -99,6 +99,13 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase { } @Test + public void testUnusedItemprop() { + //Test for ANY23-154 + assertExtract("/microdata/unused-itemprop.html"); + assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Offer")); + } + + @Test public void testExample2() { //Property URI generation for hcard assertExtract("/microdata/example2.html"); http://git-wip-us.apache.org/repos/asf/any23/blob/36682ccd/test-resources/src/test/resources/microdata/unused-itemprop.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/unused-itemprop.html b/test-resources/src/test/resources/microdata/unused-itemprop.html new file mode 100644 index 0000000..ca50180 --- /dev/null +++ b/test-resources/src/test/resources/microdata/unused-itemprop.html @@ -0,0 +1,30 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<!-- Test for ANY23-154 --> + +<!DOCTYPE html> +<html lang="en"> +<head> +</head> +<body> + +<div id="someid" itemprop="offer" itemscope itemtype="http://schema.org/Offer"> +</div> + +</body> +</html> \ No newline at end of file
