Changes according to mentor Michele's feedback
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/ff816027 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/ff816027 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/ff816027 Branch: refs/heads/master Commit: ff816027510f731f3e3f6a3c410feb5c48ffd972 Parents: 5b10339 Author: Nisala Nirmana <[email protected]> Authored: Sun Jun 28 22:33:29 2015 +0530 Committer: Nisala Nirmana <[email protected]> Committed: Sun Jun 28 22:33:29 2015 +0530 ---------------------------------------------------------------------- .../html/microformats2/HAdrExtractor.java | 69 +++++++++----------- .../html/microformats2/HGeoExtractor.java | 52 +++++++++------ .../microformats2/Microformats2Prefixes.java | 26 ++++++++ .../html/microformats2/HAdrExtractorTest.java | 2 +- .../html/microformats2/HGeoExtractorTest.java | 2 +- .../microformats2/h-adr/h-adr-test.html | 21 +++--- .../microformats2/h-geo/h-geo-test.html | 8 +-- 7 files changed, 105 insertions(+), 75 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/ff816027/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java index 022bf47..d0d9257 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java @@ -17,12 +17,14 @@ package org.apache.any23.extractor.html.microformats2; +import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; import 
org.apache.any23.extractor.TagSoupExtractionResult; import org.apache.any23.extractor.html.microformats2.annotations.Includes; import org.apache.any23.vocab.VCard; import org.openrdf.model.BNode; +import org.openrdf.model.Resource; import org.openrdf.model.vocabulary.RDF; import org.w3c.dom.Node; import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; @@ -40,17 +42,23 @@ public class HAdrExtractor extends EntityBasedMicroformatExtractor { private static final VCard vVCARD = VCard.getInstance(); private static final String[] addressFields = { - "p-street-address", - "p-extended-address", - "p-locality", - "p-region", - "p-postal-code", - "p-country-name", - "p-geo" + "street-address", + "extended-address", + "locality", + "region", + "postal-code", + "country-name", + "geo" + }; + + private static final String[] geoFields = { + "latitude", + "longitude", + "altitude" }; protected String getBaseClassName() { - return "h-adr"; + return Microformats2Prefixes.CLASS_PREFIX+"adr"; } @Override @@ -58,60 +66,45 @@ public class HAdrExtractor extends EntityBasedMicroformatExtractor { // Empty. 
} - protected boolean extractEntity(Node node, ExtractionResult out) { + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException { if (null == node) return false; final HTMLDocument document = new HTMLDocument(node); BNode adr = getBlankNodeFor(node); out.writeTriple(adr, RDF.TYPE, vVCARD.Address); final String extractorName = getDescription().getExtractorName(); for (String field : addressFields) { - HTMLDocument.TextField[] values = document.getPluralTextField(field); + HTMLDocument.TextField[] values = document.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); for (HTMLDocument.TextField val : values) { - if(!field.equals("p-geo")) { + if(!field.equals("geo")) { conditionallyAddStringProperty( val.source(), - adr, vVCARD.getProperty(field.replaceFirst("p-", "")), val.value() + adr, vVCARD.getProperty(field), val.value() ); }else { String[] composed = val.value().split(";"); - if (composed.length == 3){ - conditionallyAddStringProperty( - val.source(), - adr, vVCARD.latitude, composed[0] - ); + for(int counter=0;counter<composed.length;counter++){ conditionallyAddStringProperty( val.source(), - adr, vVCARD.longitude, composed[1] - ); - conditionallyAddStringProperty( - val.source(), - adr, vVCARD.altitude, composed[2] + adr, vVCARD.getProperty(geoFields[counter]), composed[counter] ); - }else if (composed.length == 2){ - conditionallyAddStringProperty( - val.source(), - adr, vVCARD.latitude, composed[0] - ); - conditionallyAddStringProperty( - val.source(), - adr, vVCARD.longitude, composed[1] - ); - }else { - //we discard if only length is 1 } - } - } } - + addGeoAsUrlResource(adr,document); final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult(); - tser.addResourceRoot( document.getPathToLocalRoot(), adr, this.getClass() ); - + tser.addResourceRoot( document.getPathToLocalRoot(), adr, this.getClass()); return true; } + private void addGeoAsUrlResource(Resource 
card,HTMLDocument document) throws ExtractionException { + HTMLDocument.TextField[] links = document.getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX+"geo"); + for (HTMLDocument.TextField link : links) { + conditionallyAddResourceProperty(card, vVCARD.geo, getHTMLDocument().resolveURI(link.value())); + } + } + @Override public ExtractorDescription getDescription() { return HAdrExtractorFactory.getDescriptionInstance(); http://git-wip-us.apache.org/repos/asf/any23/blob/ff816027/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java index 4a1fbfd..c9c061a 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java @@ -26,6 +26,9 @@ import org.openrdf.model.vocabulary.RDF; import org.w3c.dom.Node; import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; import org.apache.any23.extractor.html.HTMLDocument; + +import java.util.ArrayList; + /** * Extractor for the <a href="http://microformats.org/wiki/h-geo">h-geo</a> * microformat. 
@@ -36,13 +39,19 @@ public class HGeoExtractor extends EntityBasedMicroformatExtractor { private static final VCard vVCARD = VCard.getInstance(); + private static final String[] geoFields = { + "latitude", + "longitude", + "altitude" + }; + @Override public ExtractorDescription getDescription() { return HGeoExtractorFactory.getDescriptionInstance(); } protected String getBaseClassName() { - return "h-geo"; + return Microformats2Prefixes.CLASS_PREFIX+"geo"; } @Override @@ -53,31 +62,32 @@ public class HGeoExtractor extends EntityBasedMicroformatExtractor { protected boolean extractEntity(Node node, ExtractionResult out) { if (null == node) return false; final HTMLDocument document = new HTMLDocument(node); - HTMLDocument.TextField latNode = document.getSingularTextField("p-latitude"); - HTMLDocument.TextField lonNode = document.getSingularTextField("p-longitude"); - HTMLDocument.TextField altNode = document.getSingularTextField("p-altitude"); - String lat = latNode.value(); - String lon = lonNode.value(); - String alt = altNode.value(); BNode geo = getBlankNodeFor(node); out.writeTriple(geo, RDF.TYPE, vVCARD.Location); final String extractorName = getDescription().getExtractorName(); - conditionallyAddStringProperty( - latNode.source(), - geo, vVCARD.latitude , lat - ); - conditionallyAddStringProperty( - lonNode.source(), - geo, vVCARD.longitude, lon - ); - conditionallyAddStringProperty( - altNode.source(), - geo, vVCARD.altitude, alt - ); - + ArrayList<HTMLDocument.TextField> geoNodes = new ArrayList<HTMLDocument.TextField>(); + for(String field : geoFields){ + geoNodes.add(document.getSingularTextField(Microformats2Prefixes.PROPERTY_PREFIX+field)); + } + if(geoNodes.get(0).source()==null){ + String[] composed = document.getSingularUrlField(Microformats2Prefixes.CLASS_PREFIX +"geo") + .value().split(";"); + for(int counter=0;counter<composed.length;counter++){ + conditionallyAddStringProperty( + 
document.getSingularUrlField(Microformats2Prefixes.CLASS_PREFIX+"geo").source(), + geo, vVCARD.getProperty(geoFields[counter]), composed[counter] + ); + } + }else{ + for(int counter=0;counter<geoNodes.size();counter++){ + conditionallyAddStringProperty( + geoNodes.get(counter).source(), + geo, vVCARD.getProperty(geoFields[counter]) , geoNodes.get(counter).value() + ); + } + } final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult(); tser.addResourceRoot( document.getPathToLocalRoot(), geo, this.getClass() ); - return true; } http://git-wip-us.apache.org/repos/asf/any23/blob/ff816027/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java new file mode 100644 index 0000000..18ac1b1 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +public class Microformats2Prefixes { + public static final String CLASS_PREFIX = "h-"; + public static final String PROPERTY_PREFIX = "p-"; + public static final String URL_PROPERTY_PREFIX = "u-"; + public static final String EMBEDDED_PROPERTY_PREFIX = "e-"; + public static final String TIME_PROPERTY_PREFIX = "dt-"; +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/ff816027/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java index 0fb3625..69abb55 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java @@ -32,6 +32,6 @@ public class HAdrExtractorTest extends AbstractExtractorTestCase { public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { assertExtract("/microformats2/h-adr/h-adr-test.html"); assertModelNotEmpty(); - assertStatementsSize(null, null, null, 10); + assertStatementsSize(null, null, null, 11); } } http://git-wip-us.apache.org/repos/asf/any23/blob/ff816027/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java index eba89de..0d29fda 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java +++ 
b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java @@ -41,7 +41,7 @@ public class HGeoExtractorTest extends AbstractExtractorTestCase { public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { assertExtract("/microformats2/h-geo/h-geo-test.html"); assertModelNotEmpty(); - assertStatementsSize(null, null, null, 4); + assertStatementsSize(null, null, null, 8); } } http://git-wip-us.apache.org/repos/asf/any23/blob/ff816027/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html b/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html index b5c095a..5438b90 100644 --- a/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html +++ b/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html @@ -18,16 +18,17 @@ <html> <body> - <!-- Microformats 2 --> - <div class="h-adr"> - <span class="p-street-address">349/B</span> - <span class="p-extended-address">Batagama,North</span> - <span class="p-locality">Jaela</span> - <span class="p-region">Western</span> - <span class="p-postal-code">11325</span> - <span class="p-country-name">SL</span></span> - <span class="p-geo">51.526421;-0.081067;25</span> - </div> + <!-- Microformats 2 --> + <div class="h-adr"> + <span class="p-street-address">349/B</span> + <span class="p-extended-address">Batagama,North</span> + <span class="p-locality">Jaela</span> + <span class="p-region">Western</span> + <span class="p-postal-code">11325</span> + <span class="p-country-name">SL</span></span> + <span class="p-geo">51.526421;-0.081067;25</span> + <a class="u-geo" href="geo:51.526421;-0.081067;crs=wgs84;u=40">Home</a> + </div> </body> </html> http://git-wip-us.apache.org/repos/asf/any23/blob/ff816027/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html 
---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html b/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html index c0181fe..38d906f 100644 --- a/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html +++ b/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html @@ -20,13 +20,13 @@ <body> <!-- Microformats 2 --> - <p> - <span class="h-geo"> + <span class="h-geo"> <span class="p-latitude">54.155278</span>, <span class="p-longitude">-2.249722</span> <span class="p-altitude">694</span> - </span> - </p> + </span> + + <span class="h-geo">51.513458;-0.14812;50</span> </body>
