Add HCard extractor and complete all the TODOs related to h-card dependencies
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/cf48a5bf Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/cf48a5bf Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/cf48a5bf Branch: refs/heads/master Commit: cf48a5bf88b40bc327108a4daa857e14d914d654 Parents: 417b71a Author: Nisala <[email protected]> Authored: Wed Aug 26 23:41:22 2015 +0530 Committer: Nisala <[email protected]> Committed: Wed Aug 26 23:41:22 2015 +0530 ---------------------------------------------------------------------- .../main/java/org/apache/any23/vocab/HCard.java | 86 ++++ .../html/microformats2/HCardExtractor.java | 450 +++++++++++++++++++ .../microformats2/HCardExtractorFactory.java | 57 +++ .../html/microformats2/HEntryExtractor.java | 20 +- .../html/microformats2/HEventExtractor.java | 64 ++- .../html/microformats2/HProductExtractor.java | 21 +- .../html/microformats2/HResumeExtractor.java | 37 +- .../apache/any23/prefixes/prefixes.properties | 1 + .../html/microformats2/HAdrExtractorTest.java | 2 +- .../html/microformats2/HCardExtractorTest.java | 37 ++ .../html/microformats2/HEntryExtractorTest.java | 2 +- .../html/microformats2/HEventExtractorTest.java | 2 +- .../apache/any23/vocab/RDFSchemaUtilsTest.java | 4 +- .../microformats2/h-card/h-card-test.html | 45 ++ .../microformats2/h-entry/h-entry-test.html | 21 + 15 files changed, 829 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/api/src/main/java/org/apache/any23/vocab/HCard.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/vocab/HCard.java b/api/src/main/java/org/apache/any23/vocab/HCard.java new file mode 100644 index 0000000..b22e58c --- /dev/null +++ b/api/src/main/java/org/apache/any23/vocab/HCard.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.vocab; + +import org.openrdf.model.URI; + +/** + * Vocabulary to map the <a href="http://microformats.org/wiki/hcard">h-card</a> microformat. + * + * @author Nisala Nirmana + */ +public class HCard extends Vocabulary { + public static final String NS = SINDICE.NS + "hcard/"; + + private static HCard instance; + + public static HCard getInstance() { + if(instance == null) { + instance = new HCard(); + } + return instance; + } + + public URI Card = createClass(NS, "Card"); + public URI Address = createClass(NS, "Address"); + public URI Geo = createClass(NS, "Geo"); + + + public URI name = createProperty(NS, "name"); + public URI honorific_prefix = createProperty(NS, "honorific-prefix"); + public URI given_name = createProperty(NS, "given-name"); + public URI additional_name = createProperty(NS, "additional-name"); + public URI family_name = createProperty(NS, "family-name"); + public URI sort_string = createProperty(NS, "sort-string"); + public URI honorific_suffix = createProperty(NS, "honorific-suffix"); + public URI nickname = createProperty(NS, "nickname"); + public URI email = createProperty(NS, "email"); + public URI logo = createProperty(NS, "logo"); + public URI photo = 
createProperty(NS, "photo"); + public URI url = createProperty(NS, "url"); + public URI uid = createProperty(NS, "uid"); + public URI category = createProperty(NS, "category"); + public URI tel = createProperty(NS, "tel"); + public URI note = createProperty(NS, "note"); + public URI bday = createProperty(NS, "bday"); + public URI key = createProperty(NS, "key"); + public URI org = createProperty(NS, "org"); + public URI job_title = createProperty(NS, "job-title"); + public URI role = createProperty(NS, "role"); + public URI impp = createProperty(NS, "impp"); + public URI sex = createProperty(NS, "sex"); + public URI gender_identity = createProperty(NS, "gender-identity"); + public URI anniversary = createProperty(NS, "anniversary"); + public URI geo = createProperty(NS, "geo"); + public URI adr = createProperty(NS, "adr"); + + public URI street_address = createProperty(NS, "street-address"); + public URI extended_address = createProperty(NS, "extended-address"); + public URI locality = createProperty(NS, "locality"); + public URI region = createProperty(NS, "region"); + public URI postal_code = createProperty(NS, "postal-code"); + public URI country_name = createProperty(NS, "country-name"); + + public URI latitude = createProperty(NS, "latitude"); + public URI longitude = createProperty(NS, "longitude"); + public URI altitude = createProperty(NS, "altitude"); + + private HCard() { + super(NS); + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractor.java new file mode 100644 index 0000000..ebdd77b --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractor.java @@ -0,0 +1,450 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.TagSoupExtractionResult; +import org.apache.any23.extractor.html.HTMLDocument; +import org.apache.any23.vocab.HCard; +import org.apache.any23.vocab.VCard; +import org.openrdf.model.BNode; +import org.openrdf.model.Resource; +import org.openrdf.model.URI; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.DomUtils; + +import java.util.List; + + +/** + * Extractor for the <a href="http://microformats.org/wiki/hcard">h-Card</a> + * microformat. 
+ * + * @author Nisala Nirmana + */ +public class HCardExtractor extends EntityBasedMicroformatExtractor { + + private static final HCard vCARD = HCard.getInstance(); + + private static final String[] cardFields = { + "name", + "honorific-prefix", + "given-name", + "additional-name", + "family-name", + "sort-string", + "honorific-suffix", + "nickname", + "email", + "logo", + "photo", + "url", + "uid", + "category", + "tel", + "note", + "bday", + "key", + "org", + "job-title", + "role", + "impp", + "sex", + "gender-identity", + "anniversary", + "adr", + "geo" + }; + + private static final String[] addressFields = { + "street-address", + "extended-address", + "locality", + "region", + "postal-code", + "country-name", + "geo" + }; + + private static final String[] geoFields = { + "latitude", + "longitude", + "altitude" + }; + + + + @Override + public ExtractorDescription getDescription() { + return HCardExtractorFactory.getDescriptionInstance(); + } + + @Override + protected String getBaseClassName() { + return Microformats2Prefixes.CLASS_PREFIX+"card"; + } + + @Override + protected void resetExtractor() { + //empty + } + + @Override + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException { + final BNode card = getBlankNodeFor(node); + conditionallyAddResourceProperty(card, RDF.TYPE, vCARD.Card); + final HTMLDocument fragment = new HTMLDocument(node); + addName(fragment, card); + addHonorificPrefix(fragment, card); + addGivenName(fragment, card); + addAdditionalName(fragment, card); + addFamilyName(fragment, card); + addSortString(fragment, card); + addHonorificSuffix(fragment, card); + addNickname(fragment, card); + addEmails(fragment, card); + addLogo(fragment, card); + addPhoto(fragment, card); + addURLs(fragment, card); + addUID(fragment, card); + addCategories(fragment, card); + addTelephones(fragment, card); + addNotes(fragment, card); + addBday(fragment, card); + addKey(fragment, card); + addOrg(fragment, card); + 
addJobTitle(fragment, card); + addRole(fragment, card); + addImpp(fragment, card); + addSex(fragment, card); + addGenderIdentity(fragment, card); + addAnniversary(fragment, card); + addGeo(fragment, card); + addAdr(fragment, card); + final TagSoupExtractionResult tser = (TagSoupExtractionResult) out; + tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, this.getClass() ); + return true; + } + + public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode card, + ExtractionResult out) + throws ExtractionException { + this.setCurrentExtractionResult(out); + addName(fragment, card); + addHonorificPrefix(fragment, card); + addGivenName(fragment, card); + addAdditionalName(fragment, card); + addFamilyName(fragment, card); + addSortString(fragment, card); + addHonorificSuffix(fragment, card); + addNickname(fragment, card); + addEmails(fragment, card); + addLogo(fragment, card); + addPhoto(fragment, card); + addURLs(fragment, card); + addUID(fragment, card); + addCategories(fragment, card); + addTelephones(fragment, card); + addNotes(fragment, card); + addBday(fragment, card); + addKey(fragment, card); + addOrg(fragment, card); + addJobTitle(fragment, card); + addRole(fragment, card); + addImpp(fragment, card); + addSex(fragment, card); + addGenderIdentity(fragment, card); + addAnniversary(fragment, card); + addGeo(fragment, card); + addAdr(fragment, card); + return card; + } + + + + private void mapFieldWithProperty(HTMLDocument fragment, BNode card, String fieldClass, + URI property) { + HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); + conditionallyAddStringProperty( + title.source(), card, property, title.value() + ); + } + + private void addName(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[0], vCARD.name); + } + + private void addHonorificPrefix(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, 
Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[1], vCARD.honorific_prefix); + } + + private void addGivenName(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[2], vCARD.given_name); + } + + private void addAdditionalName(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[3], vCARD.additional_name); + } + + private void addFamilyName(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[4], vCARD.family_name); + } + + private void addSortString(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[5], vCARD.sort_string); + } + + private void addHonorificSuffix(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[6], vCARD.honorific_suffix); + } + + private void addNickname(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[7], vCARD.nickname); + } + + private void addEmails(HTMLDocument fragment, BNode card) throws ExtractionException { + final HTMLDocument.TextField[] emails = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[8]); + for(HTMLDocument.TextField email : emails) { + addURIProperty(card, vCARD.email, fragment.resolveURI(email.value())); + + } + } + + private void addLogo(HTMLDocument fragment, BNode card) throws ExtractionException { + final HTMLDocument.TextField logo = fragment.getSingularUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[9]); + if(logo.source()==null) + return; + addURIProperty(card, vCARD.logo, fragment.resolveURI(logo.value())); + } + + private void addPhoto(HTMLDocument fragment, BNode card) throws ExtractionException { + 
final HTMLDocument.TextField photo = fragment.getSingularUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[10]); + if(photo.source()==null) + return; + addURIProperty(card, vCARD.photo, fragment.resolveURI(photo.value())); + } + + private void addURLs(HTMLDocument fragment, BNode card) throws ExtractionException { + final HTMLDocument.TextField[] urls = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[11]); + for(HTMLDocument.TextField url : urls) { + addURIProperty(card, vCARD.url, fragment.resolveURI(url.value())); + + } + } + + private void addUID(HTMLDocument fragment, BNode card) throws ExtractionException { + final HTMLDocument.TextField uid = fragment.getSingularUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[12]); + if(uid.source()==null) + return; + addURIProperty(card, vCARD.uid, fragment.resolveURI(uid.value())); + } + + + private void addCategories(HTMLDocument fragment, BNode entry) { + final HTMLDocument.TextField[] categories = fragment.getPluralTextField + (Microformats2Prefixes.PROPERTY_PREFIX + cardFields[13]); + for (HTMLDocument.TextField category : categories) { + conditionallyAddStringProperty( + category.source(), entry, vCARD.category, category.value() + ); + } + } + + private void addTelephones(HTMLDocument fragment, BNode card) { + final HTMLDocument.TextField[] telephones = fragment.getPluralTextField + (Microformats2Prefixes.PROPERTY_PREFIX + cardFields[14]); + for (HTMLDocument.TextField tel : telephones) { + Node attribute=tel.source().getAttributes().getNamedItem("value"); + if (attribute==null){ + conditionallyAddStringProperty( + tel.source(), card, vCARD.tel, tel.value() + ); + }else{ + conditionallyAddStringProperty( + tel.source(), card, vCARD.tel, attribute.getNodeValue() + ); + } + } + } + + private void addNotes(HTMLDocument fragment, BNode entry) { + final HTMLDocument.TextField[] categories = fragment.getPluralTextField + 
(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[15]); + for (HTMLDocument.TextField category : categories) { + conditionallyAddStringProperty( + category.source(), entry, vCARD.note, category.value() + ); + } + } + + private void addBday(HTMLDocument fragment, BNode card) { + final HTMLDocument.TextField bday = fragment.getSingularTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[16]); + if (bday.source() == null) + return; + + Node attribute = bday.source().getAttributes().getNamedItem("datetime"); + if (attribute == null) { + conditionallyAddStringProperty( + bday.source(), + card, vCARD.bday, bday.value() + ); + } else { + conditionallyAddStringProperty( + bday.source(), + card, vCARD.bday, attribute.getNodeValue() + ); + + } + } + + private void addKey(HTMLDocument fragment, BNode card) throws ExtractionException { + final HTMLDocument.TextField uid = fragment.getSingularTextField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[17]); + if(uid.source()==null) + return; + addURIProperty(card, vCARD.key, fragment.resolveURI(uid.value())); + } + + private void addOrg(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[18], vCARD.org); + } + + private void addJobTitle(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[19], vCARD.job_title); + } + + private void addRole(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[20], vCARD.role); + } + + private void addImpp(HTMLDocument fragment, BNode card) throws ExtractionException { + final HTMLDocument.TextField impp = fragment.getSingularTextField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[21]); + if(impp.source()==null) + return; + addURIProperty(card, vCARD.impp, fragment.resolveURI(impp.value())); + } + + private void addSex(HTMLDocument 
fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[22], vCARD.sex); + } + + private void addGenderIdentity(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[23], vCARD.gender_identity); + } + + + private void addAnniversary(HTMLDocument fragment, BNode card) { /* Emits the anniversary under vCARD.anniversary (not vCARD.bday); a "datetime" attribute, when present, takes precedence over the element text. */ + final HTMLDocument.TextField anniversary = fragment.getSingularTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[24]); + if (anniversary.source() == null) + return; + + Node attribute = anniversary.source().getAttributes().getNamedItem("datetime"); + if (attribute == null) { + conditionallyAddStringProperty( + anniversary.source(), + card, vCARD.anniversary, anniversary.value() + ); + } else { + conditionallyAddStringProperty( + anniversary.source(), + card, vCARD.anniversary, attribute.getNodeValue() + ); + + } + } + + private void addAdr(HTMLDocument doc, Resource card) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[25] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[25]); + if (nodes.isEmpty()) + return; + for (Node node : nodes) { + BNode location = valueFactory.createBNode(); + addURIProperty(location, RDF.TYPE, vCARD.Address); + HTMLDocument fragment = new HTMLDocument(node); + for (String field : addressFields) { + HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); + for (HTMLDocument.TextField val : values) { + if(!field.equals("geo")) { + conditionallyAddStringProperty( + val.source(), + location, vCARD.getProperty(field), val.value() + ); + }else { + addGeo(new HTMLDocument(node),card); + } + } + } + } + } + + private void addGeo(HTMLDocument doc, Resource card) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX +
cardFields[26] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[26]); + if (nodes.isEmpty()) + return; + for (Node node : nodes) { + BNode location = valueFactory.createBNode(); + addURIProperty(location, RDF.TYPE, vCARD.Geo); + HTMLDocument fragment = new HTMLDocument(node); + for (String field : geoFields) { + HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); + for (HTMLDocument.TextField val : values) { + Node attribute=val.source().getAttributes().getNamedItem("title"); + if (attribute==null){ + conditionallyAddStringProperty( + val.source(), + location, vCARD.getProperty(field), val.value() + ); + }else{ + conditionallyAddStringProperty( + val.source(), + location, vCARD.getProperty(field), attribute.getNodeValue() + ); + } + } + } + } + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractorFactory.java new file mode 100644 index 0000000..5a7d63e --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Nisala Nirmana + * + */ +public class HCardExtractorFactory extends SimpleExtractorFactory<HCardExtractor> implements + ExtractorFactory<HCardExtractor> { + + public static final String NAME = "html-mf2-h-card"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "hcard"); + + private static final ExtractorDescription descriptionInstance = new HCardExtractorFactory(); + + public HCardExtractorFactory() { + super( + HCardExtractorFactory.NAME, + HCardExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-card.html"); + } + + @Override + public HCardExtractor createExtractor() { + return new HCardExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java 
index 8c0c50f..3a85b5b 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java @@ -55,7 +55,7 @@ public class HEntryExtractor extends EntityBasedMicroformatExtractor { "uid", "syndication", "in-reply-to", - "author", //toDo HCard + "author", "location", }; @@ -96,10 +96,26 @@ public class HEntryExtractor extends EntityBasedMicroformatExtractor { addUID(fragment, entry); addSyndications(fragment, entry); addInReplyTo(fragment, entry); - addLocations(fragment,entry); + addLocations(fragment, entry); + addAuthors(fragment, entry); return true; } + private void addAuthors(HTMLDocument doc, Resource entry) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[10] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode author = valueFactory.createBNode(); + addURIProperty(author, RDF.TYPE, vEntry.author); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), author, + getCurrentExtractionResult()); + } + } + private void mapFieldWithProperty(HTMLDocument fragment, BNode entry, String fieldClass, URI property) { HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java index ea90716..3f4d817 100644 --- 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java @@ -23,6 +23,7 @@ import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.TagSoupExtractionResult; import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; import org.apache.any23.vocab.HEvent; +import org.apache.any23.vocab.VCard; import org.openrdf.model.BNode; import org.openrdf.model.Resource; import org.openrdf.model.URI; @@ -30,6 +31,8 @@ import org.openrdf.model.vocabulary.RDF; import org.w3c.dom.Node; import org.apache.any23.extractor.html.HTMLDocument; +import java.util.List; + import static org.apache.any23.extractor.html.HTMLDocument.TextField; @@ -42,6 +45,7 @@ import static org.apache.any23.extractor.html.HTMLDocument.TextField; public class HEventExtractor extends EntityBasedMicroformatExtractor { private static final HEvent vEvent = HEvent.getInstance(); + private static final VCard vVCARD = VCard.getInstance(); private String[] eventFields = { "name", @@ -52,8 +56,14 @@ public class HEventExtractor extends EntityBasedMicroformatExtractor { "description", "url", "category", - "location", //toDO - "attendee" //toDO + "location", + "attendee" + }; + + private static final String[] geoFields = { + "latitude", + "longitude", + "altitude" }; @@ -85,7 +95,7 @@ public class HEventExtractor extends EntityBasedMicroformatExtractor { addDescription(fragment, event); addURLs(fragment, event); addCategories(fragment, event); - addLocation(fragment, event); + addLocations(fragment, event); return true; } @@ -102,10 +112,26 @@ public class HEventExtractor extends EntityBasedMicroformatExtractor { addDescription(fragment, event); addURLs(fragment, event); addCategories(fragment, event); - addLocation(fragment, event); + addLocations(fragment, event); + addAttendees(fragment,event); return event; } + private void addAttendees(HTMLDocument 
doc, Resource entry) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[9] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode attendee = valueFactory.createBNode(); + addURIProperty(attendee, RDF.TYPE, vEvent.attendee); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), attendee, + getCurrentExtractionResult()); + } + } + private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, URI property) { HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); @@ -204,9 +230,33 @@ public class HEventExtractor extends EntityBasedMicroformatExtractor { } } - private void addLocation(HTMLDocument fragment, BNode event) { - mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX + - eventFields[8], vEvent.location); + private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[8] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "geo"); + if (nodes.isEmpty()) + return; + for (Node node : nodes) { + BNode location = valueFactory.createBNode(); + addURIProperty(location, RDF.TYPE, vEvent.location); + HTMLDocument fragment = new HTMLDocument(node); + for (String field : geoFields) { + HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); + for (HTMLDocument.TextField val : values) { + Node attribute=val.source().getAttributes().getNamedItem("title"); + if (attribute==null){ + conditionallyAddStringProperty( + val.source(), + location, vVCARD.getProperty(field), val.value() + ); + }else{ + 
conditionallyAddStringProperty( + val.source(), + location, vVCARD.getProperty(field), attribute.getNodeValue() + ); + } + } + } + } } } http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java index 0e93935..0673a1d 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java @@ -24,10 +24,13 @@ import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; import org.apache.any23.extractor.html.HTMLDocument; import org.apache.any23.vocab.HProduct; import org.openrdf.model.BNode; +import org.openrdf.model.Resource; import org.openrdf.model.URI; import org.openrdf.model.vocabulary.RDF; import org.w3c.dom.Node; +import java.util.List; + /** * Extractor for the <a href="http://microformats.org/wiki/h-product">h-product</a> * microformat. 
@@ -41,7 +44,7 @@ public class HProductExtractor extends EntityBasedMicroformatExtractor { private static final String[] productFields = { "name", "photo", - "brand", //toDo + "brand", "category", "description", "url", @@ -77,6 +80,7 @@ public class HProductExtractor extends EntityBasedMicroformatExtractor { addURLs(fragment, product); addIdentifiers(fragment, product); addPrice(fragment, product); + addBrand(fragment,product); return true; } @@ -150,4 +154,19 @@ public class HProductExtractor extends EntityBasedMicroformatExtractor { ); } } + + private void addBrand(HTMLDocument doc, Resource product) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + productFields[2] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode brand = valueFactory.createBNode(); + addURIProperty(brand, RDF.TYPE, vProduct.brand); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), brand, + getCurrentExtractionResult()); + } + } } http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java index 44b463d..2026219 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java @@ -21,10 +21,7 @@ import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractionResult; import 
org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.TagSoupExtractionResult; -import org.apache.any23.vocab.DOAC; -import org.apache.any23.vocab.FOAF; import org.apache.any23.vocab.HResume; -import org.apache.commons.lang.UnhandledException; import org.openrdf.model.BNode; import org.openrdf.model.Resource; import org.openrdf.model.vocabulary.RDF; @@ -47,11 +44,11 @@ public class HResumeExtractor extends EntityBasedMicroformatExtractor { private static final String[] resumeFields = { "name", "summary", - "contact",//toDo Hcard + "contact", "education", "experience", "skill", - "affiliation"//toDo Hcard + "affiliation" }; @Override @@ -94,6 +91,36 @@ public class HResumeExtractor extends EntityBasedMicroformatExtractor { return true; } + private void addContacts(HTMLDocument doc, Resource entry) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[2] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode contact = valueFactory.createBNode(); + addURIProperty(contact, RDF.TYPE, vResume.contact); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), contact, + getCurrentExtractionResult()); + } + } + + private void addAffiliations(HTMLDocument doc, Resource entry) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[6] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode affiliation = valueFactory.createBNode(); + addURIProperty(affiliation, 
RDF.TYPE, vResume.affiliation); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), affiliation, + getCurrentExtractionResult()); + } + } + private void addName(HTMLDocument doc, Resource person) { HTMLDocument.TextField name = doc.getSingularTextField( Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[0]); http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties ---------------------------------------------------------------------- diff --git a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties index c7eaf54..2f9183d 100644 --- a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties +++ b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties @@ -33,6 +33,7 @@ wo=http://purl.org/ontology/wo/ skos=http://www.w3.org/2004/02/skos/core# hrecipe=http://sindice.com/hrecipe/ hevent=http://sindice.com/hevent/ +hcard=http://sindice.com/hcard/ hproduct=http://sindice.com/hproduct/ hitem=http://sindice.com/hitem/ hentry=http://sindice.com/hentry/ http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java index 69abb55..e857105 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java @@ -34,4 +34,4 @@ public class HAdrExtractorTest extends AbstractExtractorTestCase { assertModelNotEmpty(); assertStatementsSize(null, null, null, 11); } -} +} \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/test/java/org/apache/any23/extractor/html/microformats2/HCardExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HCardExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HCardExtractorTest.java new file mode 100644 index 0000000..9c9dc06 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HCardExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HCardExtractorTest extends AbstractExtractorTestCase { + protected ExtractorFactory<?> getExtractorFactory() { + return new HCardExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { + assertExtract("/microformats2/h-card/h-card-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 9); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java index cc2974d..96f3a6e 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java @@ -32,6 +32,6 @@ public class HEntryExtractorTest extends AbstractExtractorTestCase { public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { assertExtract("/microformats2/h-entry/h-entry-test.html"); assertModelNotEmpty(); - assertStatementsSize(null, null, null, 10); + assertStatementsSize(null, null, null, 20); } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java ---------------------------------------------------------------------- diff --git 
a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java index 6c13909..70b212e 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java @@ -32,6 +32,6 @@ public class HEventExtractorTest extends AbstractExtractorTestCase { public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { assertExtract("/microformats2/h-event/h-event-test.html"); assertModelNotEmpty(); - assertStatementsSize(null, null, null, 9); + assertStatementsSize(null, null, null, 8); } } http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java index c58e2a1..64fb4b7 100644 --- a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java +++ b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java @@ -43,7 +43,7 @@ public class RDFSchemaUtilsTest { */ @Test public void testSerializeVocabulariesNTriples() { - serializeVocabularies(RDFFormat.NTRIPLES, 2012);//1920 + serializeVocabularies(RDFFormat.NTRIPLES, 2090); } /** @@ -53,7 +53,7 @@ public class RDFSchemaUtilsTest { */ @Test public void testSerializeVocabulariesRDFXML() { - serializeVocabularies(RDFFormat.RDFXML, 5252); // Effective lines + separators. //4992 + serializeVocabularies(RDFFormat.RDFXML, 5453); // Effective lines + separators. 
} private void serializeVocabularies(RDFFormat format, int expectedLines) { http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/test-resources/src/test/resources/microformats2/h-card/h-card-test.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microformats2/h-card/h-card-test.html b/test-resources/src/test/resources/microformats2/h-card/h-card-test.html new file mode 100644 index 0000000..f5ffb56 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-card/h-card-test.html @@ -0,0 +1,45 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<!DOCTYPE html> + +<html> + +<body> + <div class="h-card"> + + + <!-- Microformats 2 --> + + <p class="p-name">Joe Bloggs</p> + + <p><img class="u-photo" src="http://Joe.net/picture.jpg" /></p> + + <p class="u-url" href="http://linkedin.au/Joe/">Professional Profile</p> + + <time class="dt-bday" datetime="1989-10-27">27 Oct 1989</time> + + <p class="p-adr h-adr"> + <span class="p-street-address">17 Austerstræti</span> + <span class="p-locality">Reykjavík</span> + <span class="p-country-name">Iceland</span> + </p> + + + </div> + +</body> + +</html> http://git-wip-us.apache.org/repos/asf/any23/blob/cf48a5bf/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html index f3c8cf7..adc2a1a 100644 --- a/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html +++ b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html @@ -23,6 +23,27 @@ <time class="dt-published" datetime="2012-03-25T17:08:26">March 25th, 2012</time> </p> + <div class="p-author h-card"> + + + <!-- Microformats 2 --> + + <p class="p-name">Joe Bloggs</p> + + <p><img class="u-photo" src="http://Joe.net/picture.jpg" /></p> + + <p class="u-url" href="http://linkedin.au/Joe/">Professional Profile</p> + + <time class="dt-bday" datetime="1989-10-27">27 Oct 1989</time> + + <p class="p-adr h-adr"> + <span class="p-street-address">17 Austerstræti</span> + <span class="p-locality">Reykjavík</span> + <span class="p-country-name">Iceland</span> + </p> + + + </div> <div class="e-content"> <p class="p-summary">Last week the microformats.org community celebrated its 7th birthday at a gathering hosted by Mozilla in
