added HAdr and HGeo Extractors support
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/5b10339b Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/5b10339b Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/5b10339b Branch: refs/heads/master Commit: 5b10339b55ea04e097a960fd722e8553573daccf Parents: a03bafa Author: nisalanirmana <[email protected]> Authored: Mon Jun 22 00:09:17 2015 +0530 Committer: nisalanirmana <[email protected]> Committed: Mon Jun 22 00:09:17 2015 +0530 ---------------------------------------------------------------------- .../main/java/org/apache/any23/vocab/VCard.java | 5 + .../html/microformats2/HAdrExtractor.java | 120 +++++++++++++++++++ .../microformats2/HAdrExtractorFactory.java | 57 +++++++++ .../html/microformats2/HGeoExtractor.java | 84 +++++++++++++ .../microformats2/HGeoExtractorFactory.java | 57 +++++++++ .../microformats2/annotations/Includes.java | 41 +++++++ .../microformats2/annotations/package-info.java | 24 ++++ .../html/microformats2/package-info.java | 24 ++++ .../html/microformats2/example-mf2-h-adr.html | 27 +++++ .../html/microformats2/example-mf2-h-geo.html | 22 ++++ .../html/microformats2/HAdrExtractorTest.java | 37 ++++++ .../html/microformats2/HGeoExtractorTest.java | 47 ++++++++ .../apache/any23/vocab/RDFSchemaUtilsTest.java | 4 +- .../microformats2/h-adr/h-adr-test.html | 33 +++++ .../microformats2/h-geo/h-geo-test.html | 33 +++++ 15 files changed, 613 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/api/src/main/java/org/apache/any23/vocab/VCard.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/vocab/VCard.java b/api/src/main/java/org/apache/any23/vocab/VCard.java index f43c5eb..10d3c94 100644 --- a/api/src/main/java/org/apache/any23/vocab/VCard.java +++ b/api/src/main/java/org/apache/any23/vocab/VCard.java @@ -59,6 +59,11 @@ public class VCard extends Vocabulary { public final URI agent = createProperty("agent"); /** + * The altitude of a geographic location. + */ + public final URI altitude = createProperty("altitude"); + + /** * The birthday of a person. */ public final URI bday = createProperty("bday"); http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java new file mode 100644 index 0000000..022bf47 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.TagSoupExtractionResult; +import org.apache.any23.extractor.html.microformats2.annotations.Includes; +import org.apache.any23.vocab.VCard; +import org.openrdf.model.BNode; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; + +/** + * Extractor for the <a href="http://microformats.org/wiki/h-adr">h-adr</a> + * microformat. + * + * @author Nisala Nirmana + */ +@Includes( extractors = HGeoExtractor.class ) +public class HAdrExtractor extends EntityBasedMicroformatExtractor { + + private static final VCard vVCARD = VCard.getInstance(); + + private static final String[] addressFields = { + "p-street-address", + "p-extended-address", + "p-locality", + "p-region", + "p-postal-code", + "p-country-name", + "p-geo" + }; + + protected String getBaseClassName() { + return "h-adr"; + } + + @Override + protected void resetExtractor() { + // Empty. + } + + protected boolean extractEntity(Node node, ExtractionResult out) { + if (null == node) return false; + final HTMLDocument document = new HTMLDocument(node); + BNode adr = getBlankNodeFor(node); + out.writeTriple(adr, RDF.TYPE, vVCARD.Address); + final String extractorName = getDescription().getExtractorName(); + for (String field : addressFields) { + HTMLDocument.TextField[] values = document.getPluralTextField(field); + for (HTMLDocument.TextField val : values) { + if(!field.equals("p-geo")) { + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.getProperty(field.replaceFirst("p-", "")), val.value() + ); + }else { + String[] composed = val.value().split(";"); + if (composed.length == 3){ + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.latitude, composed[0] + ); + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.longitude, composed[1] + ); + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.altitude, composed[2] + ); + + }else if (composed.length == 2){ + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.latitude, composed[0] + ); + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.longitude, composed[1] + ); + }else { + //we discard if only length is 1 + } + + } + + } + } + + final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult(); + tser.addResourceRoot( document.getPathToLocalRoot(), adr, this.getClass() ); + + return true; + } + + @Override + public ExtractorDescription getDescription() { + return HAdrExtractorFactory.getDescriptionInstance(); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorFactory.java new file mode 100644 index 0000000..3b28fb5 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Nisala Nirmana + * + */ +public class HAdrExtractorFactory extends SimpleExtractorFactory<HAdrExtractor> implements + ExtractorFactory<HAdrExtractor> { + + public static final String NAME = "html-mf2-h-adr"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "vcard"); + + private static final ExtractorDescription descriptionInstance = new HAdrExtractorFactory(); + + public HAdrExtractorFactory() { + super( + HAdrExtractorFactory.NAME, + HAdrExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-adr.html"); + } + + @Override + public HAdrExtractor createExtractor() { + return new HAdrExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java new file mode 100644 index 0000000..4a1fbfd --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.TagSoupExtractionResult; +import org.apache.any23.vocab.VCard; +import org.openrdf.model.BNode; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; +/** + * Extractor for the <a href="http://microformats.org/wiki/h-geo">h-geo</a> + * microformat. + * + * @author Nisala Nirmana + */ +public class HGeoExtractor extends EntityBasedMicroformatExtractor { + + private static final VCard vVCARD = VCard.getInstance(); + + @Override + public ExtractorDescription getDescription() { + return HGeoExtractorFactory.getDescriptionInstance(); + } + + protected String getBaseClassName() { + return "h-geo"; + } + + @Override + protected void resetExtractor() { + // Empty. + } + + protected boolean extractEntity(Node node, ExtractionResult out) { + if (null == node) return false; + final HTMLDocument document = new HTMLDocument(node); + HTMLDocument.TextField latNode = document.getSingularTextField("p-latitude"); + HTMLDocument.TextField lonNode = document.getSingularTextField("p-longitude"); + HTMLDocument.TextField altNode = document.getSingularTextField("p-altitude"); + String lat = latNode.value(); + String lon = lonNode.value(); + String alt = altNode.value(); + BNode geo = getBlankNodeFor(node); + out.writeTriple(geo, RDF.TYPE, vVCARD.Location); + final String extractorName = getDescription().getExtractorName(); + conditionallyAddStringProperty( + latNode.source(), + geo, vVCARD.latitude , lat + ); + conditionallyAddStringProperty( + lonNode.source(), + geo, vVCARD.longitude, lon + ); + conditionallyAddStringProperty( + altNode.source(), + geo, vVCARD.altitude, alt + ); + + final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult(); + tser.addResourceRoot( document.getPathToLocalRoot(), geo, this.getClass() ); + + return true; + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorFactory.java new file mode 100644 index 0000000..5b60b7d --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Nisala Nirmana + * + */ +public class HGeoExtractorFactory extends SimpleExtractorFactory<HGeoExtractor> implements + ExtractorFactory<HGeoExtractor> { + + public static final String NAME = "html-mf2-h-geo"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "vcard"); + + private static final ExtractorDescription descriptionInstance = new HGeoExtractorFactory(); + + public HGeoExtractorFactory() { + super( + HGeoExtractorFactory.NAME, + HGeoExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-geo.html"); + } + + @Override + public HGeoExtractor createExtractor() { + return new HGeoExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/Includes.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/Includes.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/Includes.java new file mode 100644 index 0000000..ff9d738 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/Includes.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2.annotations; + +import org.apache.any23.extractor.html.MicroformatExtractor; + +import java.lang.annotation.Documented; +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +/** + * This annotation could be used to decorate a {@link MicroformatExtractor} to + * represent which of the other <i>Microformats</i> could it nest. + * + * @author Davide Palmisano ( [email protected] ) + */ +@Documented +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface Includes { + + Class<? extends MicroformatExtractor>[] extractors(); + +} http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/package-info.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/package-info.java new file mode 100644 index 0000000..3311c98 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package contains the annotations needed to describe the + * single nesting relations among different <a href="http://microformats.org/">Microformats</a>. + * + * @see org.apache.any23.extractor.html.MicroformatExtractor + */ +package org.apache.any23.extractor.microformats2.annotations; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/main/java/org/apache/any23/extractor/html/microformats2/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/package-info.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/package-info.java new file mode 100644 index 0000000..b961373 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * + * All the various {@link org.apache.any23.extractor.Extractor} needed to distill <i>RDF</i> + * from <a href="http://microformats.org/">Microformats</a> in HTML pages are contained in this package. + * + */ +package org.apache.any23.extractor.html.microformats2; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-adr.html ---------------------------------------------------------------------- diff --git a/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-adr.html b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-adr.html new file mode 100644 index 0000000..d6f2c06 --- /dev/null +++ b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-adr.html @@ -0,0 +1,27 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<div class="h-adr"> + <span class="p-street-address">349/B</span> + <span class="p-extended-address">Batagama,North</span> + <span class="p-locality">Jaela</span> + <span class="p-region">Western</span> + <span class="p-postal-code">11325</span> + <span class="p-country-name">SL</span></span> + <span class="p-label">349/B,Jaela</span> + <span class="p-geo">51.526421;-0.081067;25</span> +</div> http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-geo.html ---------------------------------------------------------------------- diff --git a/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-geo.html b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-geo.html new file mode 100644 index 0000000..c8b2607 --- /dev/null +++ b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-geo.html @@ -0,0 +1,22 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<div class="h-geo"> + <span>Latitude</span><div class="p-latitude">7.066622</div> + <span>Longitude</span><div class="p-longitude">79.903048</div> + <span>Altitude</span><div class="p-altitude">15</div> +<div> http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java new file mode 100644 index 0000000..0fb3625 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HAdrExtractorTest extends AbstractExtractorTestCase { + protected ExtractorFactory<?> getExtractorFactory() { + return new HAdrExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { + assertExtract("/microformats2/h-adr/h-adr-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 10); + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java new file mode 100644 index 0000000..eba89de --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.apache.any23.extractor.html.microformats2.HGeoExtractorFactory; +import org.apache.any23.vocab.VCard; +import org.junit.Assert; +import org.junit.Test; +import org.openrdf.model.Resource; +import org.openrdf.model.vocabulary.RDF; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +import java.util.List; + + +public class HGeoExtractorTest extends AbstractExtractorTestCase { + + protected ExtractorFactory<?> getExtractorFactory() { + return new HGeoExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { + assertExtract("/microformats2/h-geo/h-geo-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 4); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java index 3971191..b4f8b7a 100644 --- a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java +++ b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java @@ -43,7 +43,7 @@ public class RDFSchemaUtilsTest { */ @Test public void testSerializeVocabulariesNTriples() { - serializeVocabularies(RDFFormat.NTRIPLES, 1918); + serializeVocabularies(RDFFormat.NTRIPLES, 1920); } /** @@ -53,7 +53,7 @@ public class RDFSchemaUtilsTest { */ @Test public void testSerializeVocabulariesRDFXML() { - serializeVocabularies(RDFFormat.RDFXML, 4987); // Effective lines + separators. + serializeVocabularies(RDFFormat.RDFXML, 4992); // Effective lines + separators. } private void serializeVocabularies(RDFFormat format, int expectedLines) { http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html b/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html new file mode 100644 index 0000000..b5c095a --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html @@ -0,0 +1,33 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!DOCTYPE html> +<html> + + <body> + <!-- Microformats 2 --> + <div class="h-adr"> + <span class="p-street-address">349/B</span> + <span class="p-extended-address">Batagama,North</span> + <span class="p-locality">Jaela</span> + <span class="p-region">Western</span> + <span class="p-postal-code">11325</span> + <span class="p-country-name">SL</span></span> + <span class="p-geo">51.526421;-0.081067;25</span> + </div> + </body> + +</html> http://git-wip-us.apache.org/repos/asf/any23/blob/5b10339b/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html b/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html new file mode 100644 index 0000000..c0181fe --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html @@ -0,0 +1,33 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!DOCTYPE html> +<html> + +<body> + <!-- Microformats 2 --> + + <p> + <span class="h-geo"> + <span class="p-latitude">54.155278</span>, + <span class="p-longitude">-2.249722</span> + <span class="p-altitude">694</span> + </span> + </p> + +</body> + +</html>
