Repository: any23 Updated Branches: refs/heads/master db25f0213 -> d82e0e501
ANY23-16 fix microdata property URIs Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/d82e0e50 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/d82e0e50 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/d82e0e50 Branch: refs/heads/master Commit: d82e0e501b7bcf6b10cdb34a97c3dd9dd94719d9 Parents: db25f02 Author: Hans <[email protected]> Authored: Thu Apr 5 01:19:03 2018 -0500 Committer: Hans <[email protected]> Committed: Thu Apr 5 01:19:03 2018 -0500 ---------------------------------------------------------------------- .../extractor/microdata/MicrodataExtractor.java | 48 +++++++++++--------- .../microdata/microdata-nested-expected.nquads | 8 ++-- .../microdata-richsnippet-expected.nquads | 24 +++++----- .../schemaorg-example-1-expected.nquads | 10 ++-- .../schemaorg-example-2-expected.nquads | 10 ++-- 5 files changed, 53 insertions(+), 47 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java index d2fa7aa..42d9133 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java @@ -48,6 +48,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; /** @@ -64,9 +65,7 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { private String documentLanguage; - private boolean isStrict; - - private String defaultNamespace; + private IRI defaultNamespace; @Override public ExtractorDescription getDescription() { @@ -97,9 +96,14 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { return; } - isStrict = extractionParameters.getFlag("any23.microdata.strict"); + boolean isStrict = extractionParameters.getFlag("any23.microdata.strict"); if (!isStrict) { - defaultNamespace = extractionParameters.getProperty("any23.microdata.ns.default"); + defaultNamespace = RDFUtils.iri(extractionParameters.getProperty("any23.microdata.ns.default")); + if (!defaultNamespace.getLocalName().isEmpty()) { + throw new IllegalArgumentException("invalid namespace IRI: " + defaultNamespace); + } + } else { + defaultNamespace = null; } documentLanguage = getDocumentLanguage(in); @@ -435,11 +439,11 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { Resource subject = mappings.computeIfAbsent(itemScope, scope -> createSubjectForItemId(scope.getItemId())); // ItemScope.type could be null, but surely it's a valid URL - String itemScopeType = ""; + IRI itemScopeType = null; if (itemScope.getType() != null) { String itemType = itemScope.getType().toString(); out.writeTriple(subject, RDF.TYPE, RDFUtils.iri(itemType)); - itemScopeType = itemScope.getType().toString(); + itemScopeType = RDFUtils.iri(itemScope.getType().toString()); } for (String propName : itemScope.getProperties().keySet()) { List<ItemProp> itemProps = itemScope.getProperties().get(propName); @@ -483,25 +487,17 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { Resource subject, String propName, ItemProp itemProp, - String itemScopeType, + IRI itemScopeType, IRI documentIRI, Map<ItemScope, Resource> mappings, ExtractionResult out ) throws MalformedURLException, ExtractionException { - IRI predicate; - if (!isAbsoluteURL(propName) && "".equals(itemScopeType) && isStrict) { + + IRI predicate = getPredicate(itemScopeType != null ? itemScopeType : defaultNamespace, propName); + if (predicate == null) { return; - } else if (!isAbsoluteURL(propName) && "".equals(itemScopeType) && !isStrict) { - predicate = RDFUtils.iri(toAbsoluteURL( - defaultNamespace, - propName, - '/').toString()); - } else { - predicate = RDFUtils.iri(toAbsoluteURL( - itemScopeType, - propName, - '/').toString()); } + Value value; Object propValue = itemProp.getValue().getContent(); ItemPropValue.Type propType = itemProp.getValue().getType(); @@ -523,7 +519,17 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { out.writeTriple(subject, predicate, value); } - private boolean isAbsoluteURL(String urlString) { + private static IRI getPredicate(IRI itemType, String localName) { + if (isAbsoluteURL(localName)) { + return RDFUtils.iri(localName); + } else if (itemType != null) { + return RDFUtils.iri(itemType.getNamespace(), Objects.requireNonNull(localName)); + } else { + return null; + } + } + + private static boolean isAbsoluteURL(String urlString) { boolean result = false; try { URL url = new URL(urlString); http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads b/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads index dbf6d4a..663ad5b 100644 --- a/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads +++ b/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads @@ -18,8 +18,8 @@ <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node295195eb5d5124e03da26bafc7313bc <http://bob.example.com/> . _:node3ecb85b37ebfd65a5d57ab82374a5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Movie> <http://bob.example.com/> . _:node1fd8d9ab2f041cdaecbae55b76fadc1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <http://bob.example.com/> . -_:node1fd8d9ab2f041cdaecbae55b76fadc1 <http://schema.org/Person/name> "James Cameron" <http://bob.example.com/> . -_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/Movie/director> _:node1fd8d9ab2f041cdaecbae55b76fadc1 <http://bob.example.com/> . -_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/Movie/name> "Avatar" <http://bob.example.com/> . -_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/Movie/name> "James Cameron" <http://bob.example.com/> . +_:node1fd8d9ab2f041cdaecbae55b76fadc1 <http://schema.org/name> "James Cameron" <http://bob.example.com/> . +_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/director> _:node1fd8d9ab2f041cdaecbae55b76fadc1 <http://bob.example.com/> . +_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/name> "Avatar" <http://bob.example.com/> . +_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/name> "James Cameron" <http://bob.example.com/> . <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node3ecb85b37ebfd65a5d57ab82374a5 <http://bob.example.com/> . \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads b/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads index f59e6a0..73cf794 100644 --- a/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads +++ b/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads @@ -17,16 +17,16 @@ _:node9423934b5f186fd49d90edd31b5625ba <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://data-vocabulary.org/Person> <http://bob.example.com/> . _:nodee94f8737ad89876c85bd87156a1eb585 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://data-vocabulary.org/Address> <http://bob.example.com/> . -_:nodee94f8737ad89876c85bd87156a1eb585 <http://data-vocabulary.org/Address/street-address> "1234 Peach Drive" <http://bob.example.com/> . -_:nodee94f8737ad89876c85bd87156a1eb585 <http://data-vocabulary.org/Address/locality> "Warner Robins" <http://bob.example.com/> . -_:nodee94f8737ad89876c85bd87156a1eb585 <http://data-vocabulary.org/Address/region> "Georgia" <http://bob.example.com/> . -_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/address> _:nodee94f8737ad89876c85bd87156a1eb585 <http://bob.example.com/> . -_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/affiliation> "University of Dreams" <http://bob.example.com/> . -_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/street-address> "1234 Peach Drive" <http://bob.example.com/> . -_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/name> "John Doe" <http://bob.example.com/> . -_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/nickname> "Johnny" <http://bob.example.com/> . -_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/locality> "Warner Robins" <http://bob.example.com/> . -_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/title> "graduate research assistant" <http://bob.example.com/> . -_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/region> "Georgia" <http://bob.example.com/> . -_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/url> <http://www.JohnnyD.com> <http://bob.example.com/> . +_:nodee94f8737ad89876c85bd87156a1eb585 <http://data-vocabulary.org/street-address> "1234 Peach Drive" <http://bob.example.com/> . +_:nodee94f8737ad89876c85bd87156a1eb585 <http://data-vocabulary.org/locality> "Warner Robins" <http://bob.example.com/> . +_:nodee94f8737ad89876c85bd87156a1eb585 <http://data-vocabulary.org/region> "Georgia" <http://bob.example.com/> . +_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/address> _:nodee94f8737ad89876c85bd87156a1eb585 <http://bob.example.com/> . +_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/affiliation> "University of Dreams" <http://bob.example.com/> . +_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/street-address> "1234 Peach Drive" <http://bob.example.com/> . +_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/name> "John Doe" <http://bob.example.com/> . +_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/nickname> "Johnny" <http://bob.example.com/> . +_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/locality> "Warner Robins" <http://bob.example.com/> . +_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/title> "graduate research assistant" <http://bob.example.com/> . +_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/region> "Georgia" <http://bob.example.com/> . +_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/url> <http://www.JohnnyD.com> <http://bob.example.com/> . <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node9423934b5f186fd49d90edd31b5625ba <http://bob.example.com/> . \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads b/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads index 360443a..47f9cab 100644 --- a/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads +++ b/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads @@ -16,9 +16,9 @@ # _:node86af95e129f7381bd44dceb4ff02b7e <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/AudioObject> <http://bob.example.com/> . -_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/AudioObject/duration> "T0M15S" <http://bob.example.com/> . -_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/AudioObject/description> "Recorded on a terrace of Girona a sunday morning" <http://bob.example.com/> . -_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/AudioObject/name> "12oclock_girona.mp3" <http://bob.example.com/> . -_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/AudioObject/encodingFormat> "mp3" <http://bob.example.com/> . -_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/AudioObject/contentURL> "http://media.freesound.org/data/0/previews/719__elmomo__12oclock_girona_preview.mp3" <http://bob.example.com/> . +_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/duration> "T0M15S" <http://bob.example.com/> . +_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/description> "Recorded on a terrace of Girona a sunday morning" <http://bob.example.com/> . +_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/name> "12oclock_girona.mp3" <http://bob.example.com/> . +_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/encodingFormat> "mp3" <http://bob.example.com/> . +_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/contentURL> "http://media.freesound.org/data/0/previews/719__elmomo__12oclock_girona_preview.mp3" <http://bob.example.com/> . <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node86af95e129f7381bd44dceb4ff02b7e <http://bob.example.com/> . \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads index 8b054d4..504b6c8 100644 --- a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads +++ b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads @@ -16,11 +16,11 @@ # _:node8b30931f1dde708283dc52546c5572a6 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Product> <http://bob.example.com/> . -_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/price> "$55,000.00" <http://bob.example.com/> . -_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/description> "2010 Dodge Challenger SRT8 Limited EditionBright Silver Metallic with Dark Slate Gray Leather Interior6.1 Liter (370 CI) V8 SRT HEMI Engine6 Speed Manual Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" <http://bob.example.com/> . -_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/name> "2010 Dodge Challenger SRT8" <http://bob.example.com/> . -_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/image> <http://bob.example.com//microdata/images/2010-dodge-challenger-srt8.jpg> <http://bob.example.com/> . -_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/url> <http://vheminc.com/> <http://bob.example.com/> . +_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/price> "$55,000.00" <http://bob.example.com/> . +_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/description> "2010 Dodge Challenger SRT8 Limited EditionBright Silver Metallic with Dark Slate Gray Leather Interior6.1 Liter (370 CI) V8 SRT HEMI Engine6 Speed Manual Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" <http://bob.example.com/> . +_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/name> "2010 Dodge Challenger SRT8" <http://bob.example.com/> . +_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/image> <http://bob.example.com//microdata/images/2010-dodge-challenger-srt8.jpg> <http://bob.example.com/> . +_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/url> <http://vheminc.com/> <http://bob.example.com/> . <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node8b30931f1dde708283dc52546c5572a6 <http://bob.example.com/> . <http://bob.example.com/> <http://purl.org/dc/terms/title> "HTML5 Microdata Example - http://schema.org/Product" <http://bob.example.com/> . <http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> <http://bob.example.com//images/favicon1.ico> <http://bob.example.com/> .
