Repository: any23
Updated Branches:
  refs/heads/master a58d59e35 -> 8b951d8e0
ANY23-409 allow multiple microdata itemtype values

Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/8b951d8e
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/8b951d8e
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/8b951d8e

Branch: refs/heads/master
Commit: 8b951d8e06ed5ad941ec4ba452532bb93d04a057
Parents: a58d59e
Author: Hans <[email protected]>
Authored: Wed Oct 24 16:36:12 2018 -0500
Committer: Hans <[email protected]>
Committed: Wed Oct 24 16:36:12 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/microdata/ItemScope.java    | 63 +++++++++++---------
 .../extractor/microdata/MicrodataExtractor.java | 21 +++----
 .../extractor/microdata/MicrodataParser.java    | 36 ++++++++---
 3 files changed, 73 insertions(+), 47 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/any23/blob/8b951d8e/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
index 2f079bb..1612aad 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
@@ -17,14 +17,17 @@
 
 package org.apache.any23.extractor.microdata;
 
+import org.apache.any23.rdf.RDFUtils;
 import org.apache.commons.lang.StringUtils;
 import org.eclipse.rdf4j.common.net.ParsedIRI;
+import org.eclipse.rdf4j.model.IRI;
 
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -34,6 +37,7 @@ import 
java.util.regex.Pattern; * This class describes a <b>Microdata <i>itemscope</i></b>. * * @author Michele Mostarda ([email protected]) + * @author Hans Brende ([email protected]) */ public class ItemScope extends Item { @@ -55,7 +59,7 @@ public class ItemScope extends Item { /** * <i>itemscope</i> type. */ - private final URL type; + private final List<IRI> type; /** * <i>itemscope</i> external identifier. @@ -73,44 +77,39 @@ public class ItemScope extends Item { * @param itemId <i>itemscope</i> id. Can be <code>null</code>. */ public ItemScope(String xpath, ItemProp[] itemProps, String id, String[] refs, String type, String itemId) { - this(xpath, itemProps, id, refs, stringToUrl(type), itemId); + this(xpath, itemProps, id, refs, stringToSingletonIRI(type), itemId); } private static final Pattern looksLikeStartsWithHost = Pattern.compile("[^:/.]+(\\.[^:/.]+)+(:\\d+)?([/#?].*)?"); - static URL stringToUrl(String type) { + static List<IRI> stringToSingletonIRI(String type) { if (StringUtils.isNotBlank(type)) { - try { - ParsedIRI iri = ParsedIRI.create(type.trim()); - if (StringUtils.isBlank(iri.getScheme())) { - String host = iri.getHost(); - if (StringUtils.isNotBlank(host)) { - iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment()); - } else { - String path = iri.getPath(); - if (path != null && looksLikeStartsWithHost.matcher(path).matches()) { - iri = ParsedIRI.create("http://" + iri.toString()); - } + ParsedIRI iri = ParsedIRI.create(type.trim()); + if (StringUtils.isBlank(iri.getScheme())) { + String host = iri.getHost(); + if (StringUtils.isNotBlank(host)) { + iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment()); + } else { + String path = iri.getPath(); + if (path != null && looksLikeStartsWithHost.matcher(path).matches()) { + iri = ParsedIRI.create("http://" + iri.toString()); } } - - return new URL(iri.toString()); - } catch 
(MalformedURLException murle) { - throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL. " + murle.getMessage()); } + return Collections.singletonList(RDFUtils.iri(iri.toString())); } else { - return null; + return Collections.emptyList(); } } - ItemScope(String xpath, ItemProp[] itemProps, String id, String[] refs, URL type, String itemId) { + ItemScope(String xpath, ItemProp[] itemProps, String id, String[] refs, List<IRI> types, String itemId) { super(xpath); if (itemProps == null) { throw new NullPointerException("itemProps list cannot be null."); } - this.type = type; + this.type = types; this.id = id; this.refs = refs; this.itemId = itemId; @@ -162,6 +161,20 @@ public class ItemScope extends Item { * @return <i>itemscope</i> type. */ public URL getType() { + //No longer using URL. + //But for backwards compatibility: + try { + return type.isEmpty() ? null : new URL(type.get(0).stringValue()); + } catch (MalformedURLException e) { + try { + return new URL(ParsedIRI.create(type.get(0).stringValue()).toASCIIString()); + } catch (Exception e1) { + return null; + } + } + } + + List<IRI> getTypes() { return type; } @@ -200,7 +213,7 @@ public class ItemScope extends Item { getXpath(), id == null ? null : "\"" + id + "\"", refs == null ? null : toJSON(refs), - type == null ? null : "\"" + type + "\"", + type.isEmpty() ? null : "\"" + type.get(0) + "\"", itemId == null ? 
null : "\"" + itemId + "\"", sb.toString() ); @@ -248,11 +261,7 @@ public class ItemScope extends Item { } protected void acquireProperty(ItemProp itemProp) { - List<ItemProp> itemProps = properties.get(itemProp.getName()); - if (itemProps == null) { - itemProps = new ArrayList<>(); - properties.put(itemProp.getName(), itemProps); - } + List<ItemProp> itemProps = properties.computeIfAbsent(itemProp.getName(), k -> new ArrayList<>()); if (!itemProps.contains(itemProp)) itemProps.add(itemProp); } http://git-wip-us.apache.org/repos/asf/any23/blob/8b951d8e/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java index 1e1f021..efd54e9 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java @@ -41,7 +41,6 @@ import org.w3c.dom.NodeList; import java.io.IOException; import java.net.URISyntaxException; -import java.net.URL; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -419,12 +418,15 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { IRI documentIRI, ExtractionResult out, Map<ItemScope, Resource> mappings, IRI defaultNamespace ) throws ExtractionException { - Resource subject = mappings.computeIfAbsent(itemScope, scope -> createSubjectForItemId(documentIRI, scope.getItemId())); - - IRI itemScopeType = getType(itemScope); - if (itemScopeType != null) { - out.writeTriple(subject, RDF.TYPE, itemScopeType); - defaultNamespace = getNamespaceIRI(itemScopeType); + Resource subject = mappings.computeIfAbsent(itemScope, scope -> + createSubjectForItemId(documentIRI, scope.getItemId())); + + List<IRI> itemScopeTypes = itemScope.getTypes(); 
+ if (!itemScopeTypes.isEmpty()) { + defaultNamespace = getNamespaceIRI(itemScopeTypes.get(0)); + for (IRI type : itemScopeTypes) { + out.writeTriple(subject, RDF.TYPE, type); + } } for (Map.Entry<String, List<ItemProp>> itemProps : itemScope.getProperties().entrySet()) { String propName = itemProps.getKey(); @@ -454,11 +456,6 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { return subject; } - private static IRI getType(ItemScope scope) { - URL type = scope.getType(); - return type == null ? null : RDFUtils.iri(type.toString()); - } - private static Resource createSubjectForItemId(IRI documentIRI, String itemId) { if (itemId == null) { return RDFUtils.bnode(); http://git-wip-us.apache.org/repos/asf/any23/blob/8b951d8e/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java index dfb9de6..95fd94b 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java @@ -19,6 +19,7 @@ package org.apache.any23.extractor.microdata; import org.apache.any23.extractor.html.DomUtils; import org.apache.any23.rdf.RDFUtils; import org.apache.commons.lang.StringUtils; +import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Literal; import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil; import org.eclipse.rdf4j.model.vocabulary.XMLSchema; @@ -31,7 +32,6 @@ import org.w3c.dom.traversal.NodeFilter; import org.w3c.dom.traversal.TreeWalker; import java.io.PrintStream; -import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -47,6 +47,7 @@ import java.util.Set; * nodes contained within a <i>DOM</i> document. 
* * @author Michele Mostarda ([email protected]) + * @author Hans Brende ([email protected]) */ public class MicrodataParser { @@ -578,12 +579,31 @@ public class MicrodataParser { itemProps.add(deferredProperty); } - URL type; - try { - type = ItemScope.stringToUrl(itemType); - } catch (IllegalArgumentException e) { - manageError(new MicrodataParserException(e.getMessage(), node)); - type = null; + List<IRI> types; + if (itemType == null) { + types = Collections.emptyList(); + } else { + types = new ArrayList<>(); + boolean canConcatWithPrev = false; + for (String s : itemType.trim().split("\\s+")) { + try { + canConcatWithPrev = types.addAll(ItemScope.stringToSingletonIRI(s)); + } catch (RuntimeException e) { + if (canConcatWithPrev) { + int lastInd = types.size() - 1; + try { + List<IRI> secondTry = ItemScope.stringToSingletonIRI(types.get(lastInd).stringValue() + " " + s); + types.remove(lastInd); + canConcatWithPrev = types.addAll(secondTry); + } catch (RuntimeException e2) { + manageError(new MicrodataParserException(e.getMessage(), node)); + canConcatWithPrev = false; + } + } else { + manageError(new MicrodataParserException(e.getMessage(), node)); + } + } + } } final ItemScope newItemScope = new ItemScope( @@ -591,7 +611,7 @@ public class MicrodataParser { itemProps.toArray(new ItemProp[itemProps.size()]), id, itemrefIDs, - type, + types, itemId ); itemScopes.put(node, newItemScope);
