Repository: any23 Updated Branches: refs/heads/master 316b4ec0d -> a1b72b720
ANY23-339 fixes itemscope hashcode collision problem, allows absolute URIs as subjects Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/a1b72b72 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/a1b72b72 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/a1b72b72 Branch: refs/heads/master Commit: a1b72b720a2cdb2802fd8e82856ee67702d002cd Parents: 316b4ec Author: Hans <[email protected]> Authored: Fri Mar 30 12:04:25 2018 -0500 Committer: Hans <[email protected]> Committed: Fri Mar 30 12:04:25 2018 -0500 ---------------------------------------------------------------------- .../extractor/microdata/MicrodataExtractor.java | 29 ++++++++++++-------- .../microdata/MicrodataExtractorTest.java | 9 ++++++ 2 files changed, 27 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/a1b72b72/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java index aa01dfe..d2fa7aa 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java @@ -40,6 +40,8 @@ import org.w3c.dom.NodeList; import java.io.IOException; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.Date; import java.util.HashMap; @@ -430,21 +432,12 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { IRI documentIRI, ExtractionResult out, Map<ItemScope, Resource> mappings ) throws ExtractionException { - Resource subject; - if (mappings.containsKey(itemScope)) { - subject = mappings.get(itemScope); - } else if (isAbsoluteURL(itemScope.getItemId())) { - subject = RDFUtils.iri(itemScope.getItemId()); - } else { - subject = RDFUtils.getBNode(Integer.toString(itemScope.hashCode())); - } - mappings.put(itemScope, subject); + Resource subject = mappings.computeIfAbsent(itemScope, scope -> createSubjectForItemId(scope.getItemId())); // ItemScope.type could be null, but surely it's a valid URL String itemScopeType = ""; if (itemScope.getType() != null) { - String itemType; - itemType = itemScope.getType().toString(); + String itemType = itemScope.getType().toString(); out.writeTriple(subject, RDF.TYPE, RDFUtils.iri(itemType)); itemScopeType = itemScope.getType().toString(); } @@ -472,6 +465,20 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { return subject; } + private static Resource createSubjectForItemId(String itemId) { + if (itemId != null) { + try { + URI uri = new URI(itemId.trim()); + if (uri.isAbsolute()) { + return RDFUtils.iri(uri.toString()); + } + } catch (URISyntaxException e) { + //not an absolute uri + } + } + return RDFUtils.bnode(); + } + private void processProperty( Resource subject, String propName, http://git-wip-us.apache.org/repos/asf/any23/blob/a1b72b72/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java index f8a0650..8161b36 100644 --- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java @@ -20,6 +20,7 @@ package org.apache.any23.extractor.microdata; import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractorFactory; import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.apache.any23.rdf.RDFUtils; import org.apache.any23.vocab.SINDICE; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -73,6 +74,14 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase { logger.debug(dumpModelToNQuads()); } + @Test + public void testMicrodataBasic() { + assertExtract("/microdata/microdata-basic.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 40); + assertStatementsSize(RDFUtils.iri("urn:isbn:0-330-34032-8"), null, null, 4); + } + /** * Reference test as provided by <a href="http://googlewebmastercentral.blogspot.com/2010/03/microdata-support-for-rich-snippets.html">Google Rich Snippet for Microdata.</a> *
