Repository: any23
Updated Branches:
  refs/heads/master 7cbd82e88 -> 6b1469152
ANY23-404 hardcode default microdata registry Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/6b146915 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/6b146915 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/6b146915 Branch: refs/heads/master Commit: 6b1469152ccd30f93b0686a73bd1ba02955d6411 Parents: 7cbd82e Author: Hans <[email protected]> Authored: Tue Oct 23 19:37:37 2018 -0500 Committer: Hans <[email protected]> Committed: Tue Oct 23 19:37:37 2018 -0500 ---------------------------------------------------------------------- .../extractor/microdata/MicrodataExtractor.java | 54 ++++++++++++-------- .../microdata/MicrodataExtractorTest.java | 21 ++++++++ .../src/test/resources/microdata/example2.html | 28 ++++++++++ .../src/test/resources/microdata/example5.html | 31 +++++++++++ 4 files changed, 113 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java index 3663800..3b45dd4 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java @@ -64,8 +64,6 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { private String documentLanguage; - private IRI defaultNamespace; - @Override public ExtractorDescription getDescription() { return MicrodataExtractorFactory.getDescriptionInstance(); @@ -95,7 +93,10 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { return; } + final IRI documentIRI = 
extractionContext.getDocumentIRI(); + boolean isStrict = extractionParameters.getFlag("any23.microdata.strict"); + final IRI defaultNamespace; if (!isStrict) { defaultNamespace = RDFUtils.iri(extractionParameters.getProperty("any23.microdata.ns.default")); if (!defaultNamespace.getLocalName().isEmpty()) { @@ -110,10 +111,9 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { /** * 5.2.6 */ - final IRI documentIRI = extractionContext.getDocumentIRI(); final Map<ItemScope, Resource> mappings = new HashMap<>(); for (ItemScope itemScope : itemScopes) { - Resource subject = processType(itemScope, documentIRI, out, mappings); + Resource subject = processType(itemScope, documentIRI, out, mappings, defaultNamespace); out.writeTriple( documentIRI, MICRODATA_ITEM, @@ -417,26 +417,31 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { private Resource processType( ItemScope itemScope, IRI documentIRI, ExtractionResult out, - Map<ItemScope, Resource> mappings + Map<ItemScope, Resource> mappings, IRI defaultNamespace ) throws ExtractionException { Resource subject = mappings.computeIfAbsent(itemScope, scope -> createSubjectForItemId(scope.getItemId())); IRI itemScopeType = getType(itemScope); if (itemScopeType != null) { out.writeTriple(subject, RDF.TYPE, itemScopeType); + defaultNamespace = getNamespaceIRI(itemScopeType); } for (Map.Entry<String, List<ItemProp>> itemProps : itemScope.getProperties().entrySet()) { String propName = itemProps.getKey(); + IRI predicate = getPredicate(defaultNamespace, propName); + if (predicate == null) { + continue; + } for (ItemProp itemProp : itemProps.getValue()) { try { processProperty( subject, - propName, + predicate, itemProp, - itemScopeType, documentIRI, mappings, - out + out, + defaultNamespace ); } catch (URISyntaxException e) { throw new ExtractionException( @@ -461,40 +466,47 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { private void processProperty( 
Resource subject, - String propName, + IRI predicate, ItemProp itemProp, - IRI itemScopeType, IRI documentIRI, Map<ItemScope, Resource> mappings, - ExtractionResult out + ExtractionResult out, + IRI defaultNamespace ) throws URISyntaxException, ExtractionException { - IRI predicate = getPredicate(itemScopeType != null ? itemScopeType : defaultNamespace, propName); - if (predicate == null) { - return; - } - Value value; Object propValue = itemProp.getValue().getContent(); ItemPropValue.Type propType = itemProp.getValue().getType(); if (propType.equals(ItemPropValue.Type.Nested)) { - value = processType((ItemScope) propValue, documentIRI, out, mappings); + value = processType((ItemScope) propValue, documentIRI, out, mappings, defaultNamespace); } else if (propType.equals(ItemPropValue.Type.Plain)) { value = RDFUtils.literal((String) propValue, documentLanguage); } else if (propType.equals(ItemPropValue.Type.Link)) { value = toAbsoluteIRI(documentIRI, (String)propValue); + //TODO: support registries so hardcoding not needed + if (predicate.stringValue().equals("http://schema.org/additionalType")) { + out.writeTriple(subject, RDF.TYPE, value); + } } else if (propType.equals(ItemPropValue.Type.Date)) { value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE); } else { throw new RuntimeException("Invalid Type '" + - propType + "' for ItemPropValue with name: '" + propName + "'"); + propType + "' for ItemPropValue with name: '" + predicate + "'"); } out.writeTriple(subject, predicate, value); } - private static IRI getPredicate(IRI itemType, String localName) { - return toAbsoluteIRI(localName).orElseGet(() -> itemType == null ? 
null : - RDFUtils.iri(itemType.getNamespace(), localName.trim())); + private static final String hcardPrefix = "http://microformats.org/profile/hcard"; + private static final IRI hcardNamespaceIRI = RDFUtils.iri("http://microformats.org/profile/hcard#"); + + private static IRI getNamespaceIRI(IRI itemType) { + //TODO: support registries so hardcoding not needed + return itemType.stringValue().startsWith(hcardPrefix) ? hcardNamespaceIRI : itemType; + } + + private static IRI getPredicate(IRI namespaceIRI, String localName) { + return toAbsoluteIRI(localName).orElseGet(() -> namespaceIRI == null ? null : + RDFUtils.iri(namespaceIRI.getNamespace(), localName.trim())); } private static Optional<IRI> toAbsoluteIRI(String urlString) { http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java index e858ea3..fedd5fa 100644 --- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java @@ -24,6 +24,7 @@ import org.apache.any23.extractor.html.AbstractExtractorTestCase; import org.apache.any23.rdf.RDFUtils; import org.apache.any23.vocab.SINDICE; import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Value; import org.eclipse.rdf4j.model.vocabulary.RDF; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -78,6 +79,26 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase { } @Test + public void testExample2() { + //Property URI generation for hcard + assertExtract("/microdata/example2.html"); + assertContains(null, RDF.TYPE, RDFUtils.iri("http://microformats.org/profile/hcard")); + 
assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#given-name"), (Value)null); + assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#n"), (Value)null); + } + + @Test + public void testExample5() { + //Vocabulary expansion for schema.org + assertExtract("/microdata/example5.html"); + assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Person")); + assertContains(null, RDF.TYPE, RDFUtils.iri("http://xmlns.com/foaf/0.1/Person")); + assertContains(null, RDFUtils.iri("http://schema.org/additionalType"), RDFUtils.iri("http://xmlns.com/foaf/0.1/Person")); + assertContains(null, RDFUtils.iri("http://schema.org/email"), RDFUtils.iri("mailto:[email protected]")); + assertContains(null, RDFUtils.iri("http://xmlns.com/foaf/0.1/mbox"), RDFUtils.iri("mailto:[email protected]")); + } + + @Test public void testMicrodataBasic() { assertExtract("/microdata/microdata-basic.html"); assertModelNotEmpty(); http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/test-resources/src/test/resources/microdata/example2.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/example2.html b/test-resources/src/test/resources/microdata/example2.html new file mode 100644 index 0000000..6ad5a33 --- /dev/null +++ b/test-resources/src/test/resources/microdata/example2.html @@ -0,0 +1,28 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- source: http://w3c.github.io/microdata-rdf --> + +<html lang="en"> +<body> +<span itemscope itemtype="http://microformats.org/profile/hcard"> + <span itemprop="n" itemscope> + <span itemprop="given-name">Princeton</span> + </span> +</span> +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/test-resources/src/test/resources/microdata/example5.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/example5.html b/test-resources/src/test/resources/microdata/example5.html new file mode 100644 index 0000000..ba05051 --- /dev/null +++ b/test-resources/src/test/resources/microdata/example5.html @@ -0,0 +1,31 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<!-- source: http://w3c.github.io/microdata-rdf --> + +<html lang="en"> +<head> +</head> +<body> +<div itemscope itemtype="http://schema.org/Person"> + <link itemprop="additionalType" href="http://xmlns.com/foaf/0.1/Person"/> + <a itemprop="email http://xmlns.com/foaf/0.1/mbox" href="mailto:[email protected]"> + [email protected] + </a> +</div> +</body> +</html> \ No newline at end of file
