Merge branch 'master' into ANY23-311 - Resolve conflict in YAMLExtractor.java
Signed-off-by:Jacek Grzebyta <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/94caa68e Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/94caa68e Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/94caa68e Branch: refs/heads/master Commit: 94caa68ec6f0a86281c147667e75bbd044e4f658 Parents: a56d549 c40b788 Author: Jacek Grzebyta <[email protected]> Authored: Tue Aug 29 12:41:16 2017 +0100 Committer: Jacek Grzebyta <[email protected]> Committed: Tue Aug 29 12:41:16 2017 +0100 ---------------------------------------------------------------------- .../configuration/DefaultConfiguration.java | 23 +- .../DefaultModifiableConfiguration.java | 4 +- .../apache/any23/extractor/ExtractorGroup.java | 14 +- .../java/org/apache/any23/vocab/Vocabulary.java | 26 +- .../resources/default-configuration.properties | 4 + cli/pom.xml | 22 + .../org/apache/any23/cli/PluginVerifier.java | 8 +- .../main/java/org/apache/any23/cli/Rover.java | 10 +- .../java/org/apache/any23/cli/ToolRunner.java | 7 +- .../org/apache/any23/cli/ToolRunnerTest.java | 15 +- .../any23/extractor/ExtractorRegistryImpl.java | 30 +- .../extractor/SingleDocumentExtraction.java | 6 +- .../extractor/html/EmbeddedJSONLDExtractor.java | 4 +- .../any23/extractor/html/GeoExtractor.java | 7 +- .../any23/extractor/html/TagSoupParser.java | 2 - .../any23/extractor/xpath/XPathExtractor.java | 3 +- .../any23/extractor/yaml/YAMLExtractor.java | 18 +- .../java/org/apache/any23/rdf/RDFUtils.java | 50 +- .../java/org/apache/any23/util/StreamUtils.java | 69 +- .../any23/extractor/ExtractionAPITest.java | 4 +- .../extractor/ExtractionResultImplTest.java | 2 +- csvutils/pom.xml | 2 +- encoding/pom.xml | 2 +- openie/pom.xml | 153 +++++ .../any23/extractor/openie/OpenIEExtractor.java | 130 ++++ .../openie/OpenIEExtractorFactory.java | 52 ++ .../org.apache.any23.extractor.ExtractorFactory | 1 + .../any23/openie/OpenIEExtractorTest.java | 88 +++ plugins/basic-crawler/pom.xml | 53 +- plugins/html-scraper/pom.xml | 19 - plugins/integration-test/pom.xml | 16 +- .../java/org/apache/any23/plugin/PluginIT.java | 40 +- plugins/office-scraper/pom.xml | 19 - pom.xml | 19 +- service/pom.xml | 2 +- src/site/apt/any23-plugins.apt | 16 +- src/site/apt/configuration.apt | 8 +- src/site/apt/dev-csv-extractor.apt | 2 +- src/site/apt/dev-data-conversion.apt | 20 +- src/site/apt/dev-data-extraction.apt | 20 +- src/site/apt/dev-microformat-extractors.apt | 12 +- src/site/apt/dev-validation-fix.apt | 12 +- src/site/apt/dev-xpath-extractor.apt | 2 +- src/site/apt/extractors.apt | 50 +- src/site/apt/getting-started.apt | 2 +- src/site/apt/plugin-basic-crawler.apt | 4 +- src/site/apt/plugin-office-scraper.apt | 2 +- .../any23/extractor/openie/example-openie.html | 638 +++++++++++++++++++ 48 files changed, 1451 insertions(+), 261 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/94caa68e/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java ---------------------------------------------------------------------- diff --cc core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java index bf70b63,1e968c0..4eae6b9 --- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java @@@ -75,10 -74,10 +76,10 @@@ public class YAMLExtractor implements E // Iterate over page(s) for (Object p : docIterate) { - Resource pageNode = YAMLExtractor.this.makeUri("document", documentURI); + Resource pageNode = RDFUtils.makeIRI("document", documentIRI, true); out.writeTriple(documentRoot, vocab.contains, pageNode); out.writeTriple(pageNode, RDF.TYPE, vocab.document); - buildNode(documentURI, p, out, pageNode); - out.writeTriple(pageNode, vocab.contains, buildNode(documentIRI, p, out)); ++ buildNode(documentIRI, p, out, pageNode); } } @@@ -117,13 -116,12 +118,14 @@@ } } - private Value processMap(IRI file, Map<String, Object> node, ExtractionResult out) { - Resource nodeURI = RDFUtils.makeIRI(file); + private Value processMap(IRI file, Map<String, Object> node, ExtractionResult out, Resource... parent) { + Resource nodeURI = Arrays.asList(parent).isEmpty() ? YAMLExtractor.this.makeUri(file) : parent[0]; + ++ for (String k : node.keySet()) { - Resource predicate = makeUri(k, file, false); + Resource predicate = RDFUtils.makeIRI(k, file, true); Value value = buildNode(file, node.get(k), out); - out.writeTriple(nodeURI, RDF.TYPE, vocab.node); + out.writeTriple(nodeURI, RDF.TYPE, vocab.mapping); out.writeTriple(nodeURI, (IRI) predicate, value); out.writeTriple(predicate, RDF.TYPE, RDF.PREDICATE); out.writeTriple(predicate, RDFS.LABEL, RDFUtils.literal(k)); @@@ -172,36 -158,4 +174,36 @@@ nodeId++; return bnode; } + + private Resource makeUri(IRI docUri) { + return makeUri("node", docUri); - } ++} + + private Resource makeUri(String type, IRI docUri) { + return makeUri(type, docUri, true); + } + + private Resource makeUri(String type, IRI docUri, boolean addId) { + + // preprocess string: converts - -> _ + // converts <space>: word1 word2 -> word1Word2 + String newType = StringUtils.implementJavaNaming(type); + + String uriString; + if (docUri.toString().endsWith("/")) { + uriString = docUri.toString() + newType; + } else { + uriString = docUri.toString() + "#" + newType; + } + + if (addId) { + uriString = uriString + "_" + Integer.toString(nodeId); + } + + Resource node = RDFUtils.uri(uriString); + if (addId) { + nodeId++; + } + return node; + } }
