adding HEntry and HResume extractors
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/417b71a7 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/417b71a7 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/417b71a7 Branch: refs/heads/master Commit: 417b71a757ecb444a98cebeb25f48faa1c27524f Parents: 0008c7c Author: Nisala <[email protected]> Authored: Sun Aug 23 21:39:34 2015 +0530 Committer: Nisala <[email protected]> Committed: Sun Aug 23 21:39:34 2015 +0530 ---------------------------------------------------------------------- .../java/org/apache/any23/vocab/HEntry.java | 60 +++++ .../main/java/org/apache/any23/vocab/HItem.java | 17 ++ .../java/org/apache/any23/vocab/HResume.java | 54 +++++ .../extractor/html/MicroformatExtractor.java | 5 + .../html/microformats2/HEntryExtractor.java | 234 +++++++++++++++++++ .../microformats2/HEntryExtractorFactory.java | 60 +++++ .../html/microformats2/HEventExtractor.java | 17 ++ .../microformats2/HItemExtractorFactory.java | 2 +- .../html/microformats2/HResumeExtractor.java | 162 +++++++++++++ .../microformats2/HResumeExtractorFactory.java | 57 +++++ .../microformats2/Microformats2Prefixes.java | 1 + .../apache/any23/prefixes/prefixes.properties | 2 + .../html/microformats2/HEntryExtractorTest.java | 37 +++ .../microformats2/HProductExtractorTest.java | 2 +- .../microformats2/HResumeExtractorTest.java | 37 +++ .../apache/any23/vocab/RDFSchemaUtilsTest.java | 4 +- .../microformats2/h-entry/h-entry-test.html | 53 +++++ .../microformats2/h-resume/h-resume-test.html | 49 ++++ 18 files changed, 849 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/api/src/main/java/org/apache/any23/vocab/HEntry.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/vocab/HEntry.java 
b/api/src/main/java/org/apache/any23/vocab/HEntry.java new file mode 100644 index 0000000..e63907b --- /dev/null +++ b/api/src/main/java/org/apache/any23/vocab/HEntry.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.vocab; + +import org.openrdf.model.URI; + +/** + * Vocabulary to map the <a href="http://microformats.org/wiki/hentry">h-entry</a> microformat. 
+ * + * @author Nisala Nirmana + */ +public class HEntry extends Vocabulary { + + public static final String NS = SINDICE.NS + "hentry/"; + + private static HEntry instance; + + public static HEntry getInstance() { + if(instance == null) { + instance = new HEntry(); + } + return instance; + } + + public URI Entry = createClass(NS, "Entry"); + public URI author = createClass(NS, "author"); + public URI location = createClass(NS, "location"); + + + public URI name = createProperty(NS, "name"); + public URI summary = createProperty(NS, "summary"); + public URI content = createProperty(NS, "content"); + public URI published = createProperty(NS, "published"); + public URI updated = createProperty(NS, "updated"); + public URI category = createProperty(NS, "category"); + public URI url = createProperty(NS, "url"); + public URI uid = createProperty(NS, "uid"); + public URI syndication = createProperty(NS, "syndication"); + public URI in_reply_to = createProperty(NS, "in-reply-to"); + + private HEntry() { + super(NS); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/api/src/main/java/org/apache/any23/vocab/HItem.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/vocab/HItem.java b/api/src/main/java/org/apache/any23/vocab/HItem.java index db54e65..01bc5a2 100644 --- a/api/src/main/java/org/apache/any23/vocab/HItem.java +++ b/api/src/main/java/org/apache/any23/vocab/HItem.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.any23.vocab; import org.openrdf.model.URI; http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/api/src/main/java/org/apache/any23/vocab/HResume.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/vocab/HResume.java b/api/src/main/java/org/apache/any23/vocab/HResume.java new file mode 100644 index 0000000..1a50157 --- /dev/null +++ b/api/src/main/java/org/apache/any23/vocab/HResume.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.vocab; + +import org.openrdf.model.URI; + +/** + * @author Nisala Nirmana + * + */ +public class HResume extends Vocabulary { + + public static final String NS = SINDICE.NS + "hresume/"; + + private static HResume instance; + + public static HResume getInstance() { + if(instance == null) { + instance = new HResume(); + } + return instance; + } + + public URI Resume = createClass(NS, "Resume"); + public URI education = createClass(NS, "education"); + public URI experience = createClass(NS, "experience"); + public URI contact = createClass(NS, "contact"); + public URI affiliation = createClass(NS, "affiliation"); + + + public URI name = createProperty(NS, "name"); + public URI summary = createProperty(NS, "summary"); + public URI skill = createProperty(NS, "skill"); + + + private HResume() { + super(NS); + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java index 51ee910..4de6e21 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java @@ -113,6 +113,10 @@ public abstract class MicroformatExtractor implements TagSoupDOMExtractor { return out; } + protected void setCurrentExtractionResult(ExtractionResult out) { + this.out = out; + } + protected ExtractionResult openSubResult(ExtractionContext context) { return out.openSubResult(context); } @@ -265,4 +269,5 @@ public abstract class MicroformatExtractor implements TagSoupDOMExtractor { return false; } + } \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java new file mode 100644 index 0000000..8c0c50f --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; +import org.apache.any23.vocab.HEntry; +import org.apache.any23.vocab.VCard; +import org.openrdf.model.BNode; +import org.openrdf.model.URI; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.openrdf.model.Resource; + +import java.util.List; + +/** + * Extractor for the <a href="http://microformats.org/wiki/h-entry">h-entry</a> + * microformat. + * + * @author Nisala Nirmana + */ +public class HEntryExtractor extends EntityBasedMicroformatExtractor { + + private static final HEntry vEntry = HEntry.getInstance(); + private static final VCard vVCARD = VCard.getInstance(); + + private static final String[] entryFields = { + "name", + "summary", + "content", + "published", + "updated", + "category", + "url", + "uid", + "syndication", + "in-reply-to", + "author", //toDo HCard + "location", + + }; + + private static final String[] geoFields = { + "latitude", + "longitude", + "altitude" + }; + + @Override + public ExtractorDescription getDescription() { + return HEntryExtractorFactory.getDescriptionInstance(); + } + + @Override + protected String getBaseClassName() { + return Microformats2Prefixes.CLASS_PREFIX+"entry"; + } + + @Override + protected void resetExtractor() { + // Empty. 
+ } + + @Override + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException { + final BNode entry = getBlankNodeFor(node); + conditionallyAddResourceProperty(entry, RDF.TYPE, vEntry.Entry); + final HTMLDocument fragment = new HTMLDocument(node); + addName(fragment, entry); + addSummary(fragment, entry); + addContent(fragment, entry); + addPublished(fragment, entry); + addUpdated(fragment, entry); + addCategories(fragment, entry); + addURLs(fragment, entry); + addUID(fragment, entry); + addSyndications(fragment, entry); + addInReplyTo(fragment, entry); + addLocations(fragment,entry); + return true; + } + + private void mapFieldWithProperty(HTMLDocument fragment, BNode entry, String fieldClass, + URI property) { + HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); + conditionallyAddStringProperty( + title.source(), entry, property, title.value() + ); + } + + private void addName(HTMLDocument fragment, BNode entry) { + mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + + entryFields[0], vEntry.name); + } + + private void addSummary(HTMLDocument fragment, BNode entry) { + mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + entryFields[1], + vEntry.summary); + } + + private void addContent(HTMLDocument fragment, BNode entry) { + mapFieldWithProperty(fragment, entry, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + entryFields[2], + vEntry.content); + } + + private void addPublished(HTMLDocument fragment, BNode entry) { + final HTMLDocument.TextField[] durations = fragment.getPluralTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[3]); + for(HTMLDocument.TextField duration : durations) { + Node attribute=duration.source().getAttributes().getNamedItem("datetime"); + if (attribute==null){ + conditionallyAddStringProperty( + duration.source(), + entry, vEntry.published, duration.value() + ); + }else{ + conditionallyAddStringProperty( + 
duration.source(), + entry, vEntry.published, attribute.getNodeValue() + ); + } + } + } + + private void addUpdated(HTMLDocument fragment, BNode entry) { + final HTMLDocument.TextField[] durations = fragment.getPluralTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[4]); + for(HTMLDocument.TextField duration : durations) { + Node attribute=duration.source().getAttributes().getNamedItem("datetime"); + if (attribute==null){ + conditionallyAddStringProperty( + duration.source(), + entry, vEntry.updated, duration.value() + ); + }else{ + conditionallyAddStringProperty( + duration.source(), + entry, vEntry.updated, attribute.getNodeValue() + ); + } + } + } + + private void addCategories(HTMLDocument fragment, BNode entry) { + final HTMLDocument.TextField[] categories = fragment.getPluralTextField + (Microformats2Prefixes.PROPERTY_PREFIX + entryFields[5]); + for (HTMLDocument.TextField category : categories) { + conditionallyAddStringProperty( + category.source(), entry, vEntry.category, category.value() + ); + } + } + + private void addURLs(HTMLDocument fragment, BNode entry) throws ExtractionException { + final HTMLDocument.TextField[] urls = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[6]); + for(HTMLDocument.TextField url : urls) { + addURIProperty(entry, vEntry.url, fragment.resolveURI(url.value())); + } + } + + private void addUID(HTMLDocument fragment, BNode entry) throws ExtractionException { + final HTMLDocument.TextField uid = fragment.getSingularTextField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[7]); + if(uid.source()==null) + return; + addURIProperty(entry, vEntry.uid, fragment.resolveURI(uid.value())); + } + + private void addSyndications(HTMLDocument fragment, BNode entry) throws ExtractionException { + final HTMLDocument.TextField[] syndications = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[8]); + for(HTMLDocument.TextField syndication : 
syndications) { + addURIProperty(entry, vEntry.syndication, fragment.resolveURI(syndication.value())); + } + } + + private void addInReplyTo(HTMLDocument fragment, BNode entry) throws ExtractionException { + final HTMLDocument.TextField inReplyTo = fragment.getSingularTextField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[9]); + if(inReplyTo.source()==null) + return; + addURIProperty(entry, vEntry.in_reply_to, fragment.resolveURI(inReplyTo.value())); + } + + private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[11] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "geo"); + if (nodes.isEmpty()) + return; + for (Node node : nodes) { + BNode location = valueFactory.createBNode(); + addURIProperty(location, RDF.TYPE, vEntry.location); + HTMLDocument fragment = new HTMLDocument(node); + for (String field : geoFields) { + HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); + for (HTMLDocument.TextField val : values) { + Node attribute=val.source().getAttributes().getNamedItem("title"); + if (attribute==null){ + conditionallyAddStringProperty( + val.source(), + location, vVCARD.getProperty(field), val.value() + ); + }else{ + conditionallyAddStringProperty( + val.source(), + location, vVCARD.getProperty(field), attribute.getNodeValue() + ); + } + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java new file mode 100644 index 0000000..e2d4556 --- /dev/null +++ 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +import java.util.Arrays; + +/** + * Extractor for the <a href="http://microformats.org/wiki/h-entry">h-entry</a> + * microformat. 
+ * + * @author Nisala Nirmana + */ +public class HEntryExtractorFactory extends SimpleExtractorFactory<HEntryExtractor> implements + ExtractorFactory<HEntryExtractor> { + + public static final String NAME = "html-mf2-h-entry"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "hentry"); + + private static final ExtractorDescription descriptionInstance = new HEntryExtractorFactory(); + + public HEntryExtractorFactory() { + super( + HEntryExtractorFactory.NAME, + HEntryExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-entry.html"); + } + + @Override + public HEntryExtractor createExtractor() { + return new HEntryExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java index ce67d86..ea90716 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java @@ -24,6 +24,7 @@ import org.apache.any23.extractor.TagSoupExtractionResult; import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; import org.apache.any23.vocab.HEvent; import org.openrdf.model.BNode; +import org.openrdf.model.Resource; import org.openrdf.model.URI; import org.openrdf.model.vocabulary.RDF; import org.w3c.dom.Node; @@ -89,6 +90,22 @@ public class HEventExtractor extends EntityBasedMicroformatExtractor { return true; } + public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode event, + 
ExtractionResult out) + throws ExtractionException { + this.setCurrentExtractionResult(out); + addName(fragment, event); + addSummary(fragment, event); + addStart(fragment, event); + addEnd(fragment, event); + addDuration(fragment, event); + addDescription(fragment, event); + addURLs(fragment, event); + addCategories(fragment, event); + addLocation(fragment, event); + return event; + } + private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, URI property) { HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java index 8423686..14f20bd 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java @@ -17,7 +17,7 @@ public class HItemExtractorFactory extends SimpleExtractorFactory<HItemExtractor public static final String NAME = "html-mf2-h-item"; - public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "vcard"); + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "hitem"); private static final ExtractorDescription descriptionInstance = new HItemExtractorFactory(); http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java new file mode 100644 index 0000000..44b463d --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.TagSoupExtractionResult; +import org.apache.any23.vocab.DOAC; +import org.apache.any23.vocab.FOAF; +import org.apache.any23.vocab.HResume; +import org.apache.commons.lang.UnhandledException; +import org.openrdf.model.BNode; +import org.openrdf.model.Resource; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; +import org.apache.any23.extractor.html.DomUtils; +import java.util.List; + +/** + * Extractor for the <a href="http://microformats.org/wiki/hresume">hResume</a> + * microformat. 
+ * + * @author Nisala Nirmana + */ +public class HResumeExtractor extends EntityBasedMicroformatExtractor { + + private static final HResume vResume = HResume.getInstance(); + + private static final String[] resumeFields = { + "name", + "summary", + "contact",//toDo Hcard + "education", + "experience", + "skill", + "affiliation"//toDo Hcard + }; + + @Override + public ExtractorDescription getDescription() { + return HResumeExtractorFactory.getDescriptionInstance(); + } + + @Override + public String getBaseClassName() { + return Microformats2Prefixes.CLASS_PREFIX + "resume"; + } + + @Override + protected void resetExtractor() { + // Empty. + } + + @Override + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException { + if (null == node) return false; + BNode person = getBlankNodeFor(node); + out.writeTriple(person, RDF.TYPE, vResume.Resume); + final HTMLDocument fragment = new HTMLDocument(node); + + addName(fragment, person); + addSummary(fragment, person); + addSkills(fragment, person); + + addExperiences(fragment, person); + addEducations(fragment, person); + + + final TagSoupExtractionResult tser = (TagSoupExtractionResult) out; + tser.addResourceRoot( + DomUtils.getXPathListForNode(node), + person, + this.getClass() + ); + + return true; + } + + private void addName(HTMLDocument doc, Resource person) { + HTMLDocument.TextField name = doc.getSingularTextField( + Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[0]); + conditionallyAddStringProperty( + name.source(), + person, + vResume.name, + name.value() + ); + } + + private void addSummary(HTMLDocument doc, Resource person) { + HTMLDocument.TextField summary = doc.getSingularTextField( + Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[1]); + conditionallyAddStringProperty( + summary.source(), + person, + vResume.summary, + summary.value() + ); + } + + private void addSkills(HTMLDocument doc, Resource person) { + final HTMLDocument.TextField[] skills = 
doc.getPluralTextField( + Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[5]); + for (HTMLDocument.TextField skill : skills) { + conditionallyAddStringProperty( + skill.source(), + person, + vResume.skill, + skill.value() + ); + } + + } + + private void addExperiences(HTMLDocument doc, Resource person) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[4] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event"); + if (nodes.isEmpty()) + return; + HEventExtractorFactory factory = new HEventExtractorFactory(); + HEventExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode event = valueFactory.createBNode(); + addURIProperty(event, RDF.TYPE, vResume.experience); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event, + getCurrentExtractionResult()); + } + } + + private void addEducations(HTMLDocument doc, Resource person) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[3] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event"); + if (nodes.isEmpty()) + return; + HEventExtractorFactory factory = new HEventExtractorFactory(); + HEventExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode event = valueFactory.createBNode(); + addURIProperty(event, RDF.TYPE, vResume.education); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event, + getCurrentExtractionResult()); + } + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java new file mode 100644 index 0000000..a8120eb --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Nisala Nirmana + * + */ +public class HResumeExtractorFactory extends SimpleExtractorFactory<HResumeExtractor> implements + ExtractorFactory<HResumeExtractor> { + + public static final String NAME = "html-mf2-h-resume"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "doac", "foaf"); + + private static final ExtractorDescription descriptionInstance = new HResumeExtractorFactory(); + + public HResumeExtractorFactory() { + super( + HResumeExtractorFactory.NAME, + HResumeExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-resume.html"); + } + + @Override + public HResumeExtractor createExtractor() { + return new HResumeExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java index 18ac1b1..d6b3349 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java @@ -23,4 +23,5 @@ public class Microformats2Prefixes { public static final String URL_PROPERTY_PREFIX = "u-"; public static final String 
EMBEDDED_PROPERTY_PREFIX = "e-"; public static final String TIME_PROPERTY_PREFIX = "dt-"; + public static final String SPACE_SEPARATOR = " "; } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties ---------------------------------------------------------------------- diff --git a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties index 34e3975..c7eaf54 100644 --- a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties +++ b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties @@ -34,6 +34,8 @@ skos=http://www.w3.org/2004/02/skos/core# hrecipe=http://sindice.com/hrecipe/ hevent=http://sindice.com/hevent/ hproduct=http://sindice.com/hproduct/ +hitem=http://sindice.com/hitem/ +hentry=http://sindice.com/hentry/ sindice=http://vocab.sindice.net/ og=http://opengraphprotocol.org/schema/ fb=http://www.facebook.com/2008/fbml# http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java new file mode 100644 index 0000000..cc2974d --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HEntryExtractorTest extends AbstractExtractorTestCase { + protected ExtractorFactory<?> getExtractorFactory() { + return new HEntryExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { + assertExtract("/microformats2/h-entry/h-entry-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 10); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java index 3b46a7a..49c1755 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java @@ -32,6 +32,6 @@ public 
class HProductExtractorTest extends AbstractExtractorTestCase { public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { assertExtract("/microformats2/h-product/h-product-test.html"); assertModelNotEmpty(); - assertStatementsSize(null, null, null, 11); + assertStatementsSize(null, null, null, 6); } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java new file mode 100644 index 0000000..dd2f5d1 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HResumeExtractorTest extends AbstractExtractorTestCase { + protected ExtractorFactory<?> getExtractorFactory() { + return new HResumeExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { + assertExtract("/microformats2/h-resume/h-resume-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 12); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java index b4f8b7a..c58e2a1 100644 --- a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java +++ b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java @@ -43,7 +43,7 @@ public class RDFSchemaUtilsTest { */ @Test public void testSerializeVocabulariesNTriples() { - serializeVocabularies(RDFFormat.NTRIPLES, 1920); + serializeVocabularies(RDFFormat.NTRIPLES, 2012);//1920 } /** @@ -53,7 +53,7 @@ public class RDFSchemaUtilsTest { */ @Test public void testSerializeVocabulariesRDFXML() { - serializeVocabularies(RDFFormat.RDFXML, 4992); // Effective lines + separators. + serializeVocabularies(RDFFormat.RDFXML, 5252); // Effective lines + separators. 
//4992 } private void serializeVocabularies(RDFFormat format, int expectedLines) { http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html new file mode 100644 index 0000000..f3c8cf7 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html @@ -0,0 +1,53 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<!DOCTYPE html> +<html> + +<body> +<div class="h-entry"> + <h1><a class="p-name u-url" href="http://microformats.org/2012/06/25/microformats-org-at-7">microformats.org at 7</a></h1> + + <p>Published + <time class="dt-published" datetime="2012-03-25T17:08:26">March 25th, 2012</time> + </p> + + <div class="e-content"> + <p class="p-summary">Last week the microformats.org community + celebrated its 7th birthday at a gathering hosted by Mozilla in + San Francisco and recognized accomplishments, challenges, and + opportunities.</p> + + <p>The microformats tagline “humans first, machines second” + forms the basis of many of our + <a href="http://microformats.org/wiki/principles">principles</a>, and + in that regard, we’d like to recognize a few people and + thank them for their years of volunteer service </p> + </div> + + <p>Updated + <time class="dt-updated" datetime="2012-06-25T17:08:26">June 25th, 2012</time> + </p> + + <div class="p-location h-geo"> + <p>Location + <abbr class="p-latitude" title="37.408183">N 37° 24.491</abbr>, + <abbr class="p-longitude" title="-122.13855">W 122° 08.313</abbr> + </p> + </div> +</div> + +</body> + +</html> http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html b/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html new file mode 100644 index 0000000..15dd835 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html @@ -0,0 +1,49 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!DOCTYPE html> +<html> + +<body> +<div class="h-resume"> + <p class="p-name">Tim Berners-Lee</p> + + <p class="p-summary">Invented the World Wide Web.</p><hr /> + + <div class="p-education h-event"> + Education : + <time class="dt-start" datetime="1973-09">1973</time> + <time class="dt-end" datetime="1976-06">1976</time> + </div> + + <div class="p-experience h-event"> + <p>Experience : + <time class="dt-start" datetime="2009-01-18">Jan 2009</time> Present + <time class="dt-duration" datetime="P2Y11M">(2 years 11 months)</time> + </p> + </div> + + <div> + Skills: + <ul> + <li class="p-skill">information systems</li> + <li class="p-skill">advocacy</li> + <li class="p-skill">leadership</li> + </ul> + </div> + +</div> +</body> + +</html>
