Repository: any23
Updated Branches:
  refs/heads/master db25f0213 -> d82e0e501


ANY23-16 fix microdata property URIs


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/d82e0e50
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/d82e0e50
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/d82e0e50

Branch: refs/heads/master
Commit: d82e0e501b7bcf6b10cdb34a97c3dd9dd94719d9
Parents: db25f02
Author: Hans <[email protected]>
Authored: Thu Apr 5 01:19:03 2018 -0500
Committer: Hans <[email protected]>
Committed: Thu Apr 5 01:19:03 2018 -0500

----------------------------------------------------------------------
 .../extractor/microdata/MicrodataExtractor.java | 48 +++++++++++---------
 .../microdata/microdata-nested-expected.nquads  |  8 ++--
 .../microdata-richsnippet-expected.nquads       | 24 +++++-----
 .../schemaorg-example-1-expected.nquads         | 10 ++--
 .../schemaorg-example-2-expected.nquads         | 10 ++--
 5 files changed, 53 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
 
b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
index d2fa7aa..42d9133 100644
--- 
a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
+++ 
b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
@@ -48,6 +48,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Set;
 
 /**
@@ -64,9 +65,7 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
 
     private String documentLanguage;
 
-    private boolean isStrict;
-
-    private String defaultNamespace;
+    private IRI defaultNamespace;
 
     @Override
     public ExtractorDescription getDescription() {
@@ -97,9 +96,14 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
             return;
         }
 
-        isStrict = extractionParameters.getFlag("any23.microdata.strict");
+        boolean isStrict = 
extractionParameters.getFlag("any23.microdata.strict");
         if (!isStrict) {
-            defaultNamespace = 
extractionParameters.getProperty("any23.microdata.ns.default");
+            defaultNamespace = 
RDFUtils.iri(extractionParameters.getProperty("any23.microdata.ns.default"));
+            if (!defaultNamespace.getLocalName().isEmpty()) {
+                throw new IllegalArgumentException("invalid namespace IRI: " + 
defaultNamespace);
+            }
+        } else {
+            defaultNamespace = null;
         }
 
         documentLanguage = getDocumentLanguage(in);
@@ -435,11 +439,11 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
         Resource subject = mappings.computeIfAbsent(itemScope, scope -> 
createSubjectForItemId(scope.getItemId()));
 
         // ItemScope.type could be null, but surely it's a valid URL
-        String itemScopeType = "";
+        IRI itemScopeType = null;
         if (itemScope.getType() != null) {
             String itemType = itemScope.getType().toString();
             out.writeTriple(subject, RDF.TYPE, RDFUtils.iri(itemType));
-            itemScopeType = itemScope.getType().toString();
+            itemScopeType = RDFUtils.iri(itemScope.getType().toString());
         }
         for (String propName : itemScope.getProperties().keySet()) {
             List<ItemProp> itemProps = itemScope.getProperties().get(propName);
@@ -483,25 +487,17 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
             Resource subject,
             String propName,
             ItemProp itemProp,
-            String itemScopeType,
+            IRI itemScopeType,
             IRI documentIRI,
             Map<ItemScope, Resource> mappings,
             ExtractionResult out
     ) throws MalformedURLException, ExtractionException {
-        IRI predicate;
-        if (!isAbsoluteURL(propName) && "".equals(itemScopeType) && isStrict) {
+
+        IRI predicate = getPredicate(itemScopeType != null ? itemScopeType : 
defaultNamespace, propName);
+        if (predicate == null) {
             return;
-        } else if (!isAbsoluteURL(propName) && "".equals(itemScopeType) && 
!isStrict) {
-            predicate = RDFUtils.iri(toAbsoluteURL(
-                    defaultNamespace,
-                    propName,
-                    '/').toString());
-        } else {
-            predicate = RDFUtils.iri(toAbsoluteURL(
-                    itemScopeType,
-                    propName,
-                    '/').toString());
         }
+
         Value value;
         Object propValue = itemProp.getValue().getContent();
         ItemPropValue.Type propType = itemProp.getValue().getType();
@@ -523,7 +519,17 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
         out.writeTriple(subject, predicate, value);
     }
 
-    private boolean isAbsoluteURL(String urlString) {
+    private static IRI getPredicate(IRI itemType, String localName) {
+        if (isAbsoluteURL(localName)) {
+            return RDFUtils.iri(localName);
+        } else if (itemType != null) {
+            return RDFUtils.iri(itemType.getNamespace(), 
Objects.requireNonNull(localName));
+        } else {
+            return null;
+        }
+    }
+
+    private static boolean isAbsoluteURL(String urlString) {
         boolean result = false;
         try {
             URL url = new URL(urlString);

http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads 
b/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads
index dbf6d4a..663ad5b 100644
--- 
a/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads
+++ 
b/test-resources/src/test/resources/microdata/microdata-nested-expected.nquads
@@ -18,8 +18,8 @@
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node295195eb5d5124e03da26bafc7313bc <http://bob.example.com/> .
 _:node3ecb85b37ebfd65a5d57ab82374a5 
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Movie> 
<http://bob.example.com/> .
 _:node1fd8d9ab2f041cdaecbae55b76fadc1 
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> 
<http://bob.example.com/> .
-_:node1fd8d9ab2f041cdaecbae55b76fadc1 <http://schema.org/Person/name> "James 
Cameron" <http://bob.example.com/> .
-_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/Movie/director> 
_:node1fd8d9ab2f041cdaecbae55b76fadc1 <http://bob.example.com/> .
-_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/Movie/name> "Avatar" 
<http://bob.example.com/> .
-_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/Movie/name> "James 
Cameron" <http://bob.example.com/> .
+_:node1fd8d9ab2f041cdaecbae55b76fadc1 <http://schema.org/name> "James Cameron" 
<http://bob.example.com/> .
+_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/director> 
_:node1fd8d9ab2f041cdaecbae55b76fadc1 <http://bob.example.com/> .
+_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/name> "Avatar" 
<http://bob.example.com/> .
+_:node3ecb85b37ebfd65a5d57ab82374a5 <http://schema.org/name> "James Cameron" 
<http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node3ecb85b37ebfd65a5d57ab82374a5 <http://bob.example.com/> .
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads
 
b/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads
index f59e6a0..73cf794 100644
--- 
a/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads
+++ 
b/test-resources/src/test/resources/microdata/microdata-richsnippet-expected.nquads
@@ -17,16 +17,16 @@
 
 _:node9423934b5f186fd49d90edd31b5625ba 
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://data-vocabulary.org/Person> <http://bob.example.com/> .
 _:nodee94f8737ad89876c85bd87156a1eb585 
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://data-vocabulary.org/Address> <http://bob.example.com/> .
-_:nodee94f8737ad89876c85bd87156a1eb585 
<http://data-vocabulary.org/Address/street-address> "1234 Peach Drive" 
<http://bob.example.com/> .
-_:nodee94f8737ad89876c85bd87156a1eb585 
<http://data-vocabulary.org/Address/locality> "Warner Robins" 
<http://bob.example.com/> .
-_:nodee94f8737ad89876c85bd87156a1eb585 
<http://data-vocabulary.org/Address/region> "Georgia" <http://bob.example.com/> 
.
-_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/Person/address> 
_:nodee94f8737ad89876c85bd87156a1eb585 <http://bob.example.com/> .
-_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/Person/affiliation> "University of Dreams" 
<http://bob.example.com/> .
-_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/Person/street-address> "1234 Peach Drive" 
<http://bob.example.com/> .
-_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/Person/name> "John Doe" <http://bob.example.com/> .
-_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/Person/nickname> "Johnny" <http://bob.example.com/> 
.
-_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/Person/locality> "Warner Robins" 
<http://bob.example.com/> .
-_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/Person/title> "graduate research assistant" 
<http://bob.example.com/> .
-_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/Person/region> "Georgia" <http://bob.example.com/> .
-_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/Person/url> 
<http://www.JohnnyD.com> <http://bob.example.com/> .
+_:nodee94f8737ad89876c85bd87156a1eb585 
<http://data-vocabulary.org/street-address> "1234 Peach Drive" 
<http://bob.example.com/> .
+_:nodee94f8737ad89876c85bd87156a1eb585 <http://data-vocabulary.org/locality> 
"Warner Robins" <http://bob.example.com/> .
+_:nodee94f8737ad89876c85bd87156a1eb585 <http://data-vocabulary.org/region> 
"Georgia" <http://bob.example.com/> .
+_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/address> 
_:nodee94f8737ad89876c85bd87156a1eb585 <http://bob.example.com/> .
+_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/affiliation> "University of Dreams" 
<http://bob.example.com/> .
+_:node9423934b5f186fd49d90edd31b5625ba 
<http://data-vocabulary.org/street-address> "1234 Peach Drive" 
<http://bob.example.com/> .
+_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/name> "John 
Doe" <http://bob.example.com/> .
+_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/nickname> 
"Johnny" <http://bob.example.com/> .
+_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/locality> 
"Warner Robins" <http://bob.example.com/> .
+_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/title> 
"graduate research assistant" <http://bob.example.com/> .
+_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/region> 
"Georgia" <http://bob.example.com/> .
+_:node9423934b5f186fd49d90edd31b5625ba <http://data-vocabulary.org/url> 
<http://www.JohnnyD.com> <http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node9423934b5f186fd49d90edd31b5625ba <http://bob.example.com/> .
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads
 
b/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads
index 360443a..47f9cab 100644
--- 
a/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads
+++ 
b/test-resources/src/test/resources/microdata/schemaorg-example-1-expected.nquads
@@ -16,9 +16,9 @@
 #
 
 _:node86af95e129f7381bd44dceb4ff02b7e 
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
<http://schema.org/AudioObject> <http://bob.example.com/> .
-_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/AudioObject/duration> 
"T0M15S" <http://bob.example.com/> .
-_:node86af95e129f7381bd44dceb4ff02b7e 
<http://schema.org/AudioObject/description> "Recorded on a terrace of Girona a 
sunday morning" <http://bob.example.com/> .
-_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/AudioObject/name> 
"12oclock_girona.mp3" <http://bob.example.com/> .
-_:node86af95e129f7381bd44dceb4ff02b7e 
<http://schema.org/AudioObject/encodingFormat> "mp3" <http://bob.example.com/> .
-_:node86af95e129f7381bd44dceb4ff02b7e 
<http://schema.org/AudioObject/contentURL> 
"http://media.freesound.org/data/0/previews/719__elmomo__12oclock_girona_preview.mp3"
 <http://bob.example.com/> .
+_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/duration> "T0M15S" 
<http://bob.example.com/> .
+_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/description> 
"Recorded on a terrace of Girona a sunday morning" <http://bob.example.com/> .
+_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/name> 
"12oclock_girona.mp3" <http://bob.example.com/> .
+_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/encodingFormat> "mp3" 
<http://bob.example.com/> .
+_:node86af95e129f7381bd44dceb4ff02b7e <http://schema.org/contentURL> 
"http://media.freesound.org/data/0/previews/719__elmomo__12oclock_girona_preview.mp3"
 <http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node86af95e129f7381bd44dceb4ff02b7e <http://bob.example.com/> .
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/d82e0e50/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads
 
b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads
index 8b054d4..504b6c8 100644
--- 
a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads
+++ 
b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads
@@ -16,11 +16,11 @@
 #
 
 _:node8b30931f1dde708283dc52546c5572a6 
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Product> 
<http://bob.example.com/> .
-_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/price> 
"$55,000.00" <http://bob.example.com/> .
-_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/description> 
"2010 Dodge Challenger SRT8 Limited EditionBright Silver Metallic with Dark 
Slate Gray Leather Interior6.1 Liter (370 CI) V8 SRT HEMI Engine6 Speed Manual 
Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" 
<http://bob.example.com/> .
-_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/name> "2010 
Dodge Challenger SRT8" <http://bob.example.com/> .
-_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/image> 
<http://bob.example.com//microdata/images/2010-dodge-challenger-srt8.jpg> 
<http://bob.example.com/> .
-_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/Product/url> 
<http://vheminc.com/> <http://bob.example.com/> .
+_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/price> "$55,000.00" 
<http://bob.example.com/> .
+_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/description> "2010 
Dodge Challenger SRT8 Limited EditionBright Silver Metallic with Dark Slate 
Gray Leather Interior6.1 Liter (370 CI) V8 SRT HEMI Engine6 Speed Manual 
Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" 
<http://bob.example.com/> .
+_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/name> "2010 Dodge 
Challenger SRT8" <http://bob.example.com/> .
+_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/image> 
<http://bob.example.com//microdata/images/2010-dodge-challenger-srt8.jpg> 
<http://bob.example.com/> .
+_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/url> 
<http://vheminc.com/> <http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> 
_:node8b30931f1dde708283dc52546c5572a6 <http://bob.example.com/> .
 <http://bob.example.com/> <http://purl.org/dc/terms/title> "HTML5 Microdata 
Example - http://schema.org/Product" <http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> 
<http://bob.example.com//images/favicon1.ico> <http://bob.example.com/> .

Reply via email to