adding HEntry and HResume extractors

Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/417b71a7
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/417b71a7
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/417b71a7

Branch: refs/heads/master
Commit: 417b71a757ecb444a98cebeb25f48faa1c27524f
Parents: 0008c7c
Author: Nisala <[email protected]>
Authored: Sun Aug 23 21:39:34 2015 +0530
Committer: Nisala <[email protected]>
Committed: Sun Aug 23 21:39:34 2015 +0530

----------------------------------------------------------------------
 .../java/org/apache/any23/vocab/HEntry.java     |  60 +++++
 .../main/java/org/apache/any23/vocab/HItem.java |  17 ++
 .../java/org/apache/any23/vocab/HResume.java    |  54 +++++
 .../extractor/html/MicroformatExtractor.java    |   5 +
 .../html/microformats2/HEntryExtractor.java     | 234 +++++++++++++++++++
 .../microformats2/HEntryExtractorFactory.java   |  60 +++++
 .../html/microformats2/HEventExtractor.java     |  17 ++
 .../microformats2/HItemExtractorFactory.java    |   2 +-
 .../html/microformats2/HResumeExtractor.java    | 162 +++++++++++++
 .../microformats2/HResumeExtractorFactory.java  |  57 +++++
 .../microformats2/Microformats2Prefixes.java    |   1 +
 .../apache/any23/prefixes/prefixes.properties   |   2 +
 .../html/microformats2/HEntryExtractorTest.java |  37 +++
 .../microformats2/HProductExtractorTest.java    |   2 +-
 .../microformats2/HResumeExtractorTest.java     |  37 +++
 .../apache/any23/vocab/RDFSchemaUtilsTest.java  |   4 +-
 .../microformats2/h-entry/h-entry-test.html     |  53 +++++
 .../microformats2/h-resume/h-resume-test.html   |  49 ++++
 18 files changed, 849 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/api/src/main/java/org/apache/any23/vocab/HEntry.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/vocab/HEntry.java b/api/src/main/java/org/apache/any23/vocab/HEntry.java
new file mode 100644
index 0000000..e63907b
--- /dev/null
+++ b/api/src/main/java/org/apache/any23/vocab/HEntry.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.vocab;
+
+import org.openrdf.model.URI;
+
+/**
+ * Vocabulary to map the <a href="http://microformats.org/wiki/hentry">h-entry</a> microformat.
+ *
+ * @author Nisala Nirmana
+ */
+public class HEntry extends Vocabulary {
+
+    public static final String NS = SINDICE.NS + "hentry/";
+
+    private static HEntry instance;
+
+    public static HEntry getInstance() {
+        if(instance == null) {
+            instance = new HEntry();
+        }
+        return instance;
+    }
+
+    public URI Entry  = createClass(NS, "Entry");
+    public URI author   = createClass(NS, "author");
+    public URI location = createClass(NS, "location");
+
+
+    public URI name  = createProperty(NS, "name");
+    public URI summary   = createProperty(NS, "summary");
+    public URI content   = createProperty(NS, "content");
+    public URI published   = createProperty(NS, "published");
+    public URI updated   = createProperty(NS, "updated");
+    public URI category   = createProperty(NS, "category");
+    public URI url   = createProperty(NS, "url");
+    public URI uid  = createProperty(NS, "uid");
+    public URI syndication   = createProperty(NS, "syndication");
+    public URI in_reply_to   = createProperty(NS, "in-reply-to");
+
+    private HEntry() {
+        super(NS);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/api/src/main/java/org/apache/any23/vocab/HItem.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/vocab/HItem.java b/api/src/main/java/org/apache/any23/vocab/HItem.java
index db54e65..01bc5a2 100644
--- a/api/src/main/java/org/apache/any23/vocab/HItem.java
+++ b/api/src/main/java/org/apache/any23/vocab/HItem.java
@@ -1,3 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.any23.vocab;
 
 import org.openrdf.model.URI;

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/api/src/main/java/org/apache/any23/vocab/HResume.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/vocab/HResume.java b/api/src/main/java/org/apache/any23/vocab/HResume.java
new file mode 100644
index 0000000..1a50157
--- /dev/null
+++ b/api/src/main/java/org/apache/any23/vocab/HResume.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.vocab;
+
+import org.openrdf.model.URI;
+
+/**
+ * @author Nisala Nirmana
+ *
+ */
+public class HResume extends Vocabulary {
+
+    public static final String NS = SINDICE.NS + "hresume/";
+
+    private static HResume instance;
+
+    public static HResume getInstance() {
+        if(instance == null) {
+            instance = new HResume();
+        }
+        return instance;
+    }
+
+    public URI Resume  = createClass(NS, "Resume");
+    public URI education   = createClass(NS, "education");
+    public URI experience = createClass(NS, "experience");
+    public URI contact = createClass(NS, "contact");
+    public URI affiliation = createClass(NS, "affiliation");
+
+
+    public URI name  = createProperty(NS, "name");
+    public URI summary   = createProperty(NS, "summary");
+    public URI skill   = createProperty(NS, "skill");
+
+
+    private HResume() {
+        super(NS);
+    }
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java
index 51ee910..4de6e21 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java
@@ -113,6 +113,10 @@ public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
         return out;
     }
 
+    protected void setCurrentExtractionResult(ExtractionResult out) {
+        this.out = out;
+    }
+
     protected ExtractionResult openSubResult(ExtractionContext context) {
         return out.openSubResult(context);
     }
@@ -265,4 +269,5 @@ public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
         return false;
     }
 
+
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java
new file mode 100644
index 0000000..8c0c50f
--- /dev/null
+++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.any23.extractor.html.microformats2;
+
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionResult;
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
+import org.apache.any23.extractor.html.HTMLDocument;
+import org.apache.any23.vocab.HEntry;
+import org.apache.any23.vocab.VCard;
+import org.openrdf.model.BNode;
+import org.openrdf.model.URI;
+import org.openrdf.model.vocabulary.RDF;
+import org.w3c.dom.Node;
+import org.openrdf.model.Resource;
+
+import java.util.List;
+
+/**
+ * Extractor for the <a href="http://microformats.org/wiki/h-entry">h-entry</a>
+ * microformat.
+ *
+ * @author Nisala Nirmana
+ */
+public class HEntryExtractor extends EntityBasedMicroformatExtractor {
+
+    private static final HEntry vEntry = HEntry.getInstance();
+    private static final VCard vVCARD = VCard.getInstance();
+
+    private static final String[] entryFields = {
+            "name",
+            "summary",
+            "content",
+            "published",
+            "updated",
+            "category",
+            "url",
+            "uid",
+            "syndication",
+            "in-reply-to",
+            "author", //toDo HCard
+            "location",
+
+    };
+
+    private static final String[] geoFields = {
+            "latitude",
+            "longitude",
+            "altitude"
+    };
+
+    @Override
+    public ExtractorDescription getDescription() {
+        return HEntryExtractorFactory.getDescriptionInstance();
+    }
+
+    @Override
+    protected String getBaseClassName() {
+        return Microformats2Prefixes.CLASS_PREFIX+"entry";
+    }
+
+    @Override
+    protected void resetExtractor() {
+        // Empty.
+    }
+
+    @Override
+    protected boolean extractEntity(Node node, ExtractionResult out) throws 
ExtractionException {
+        final BNode entry = getBlankNodeFor(node);
+        conditionallyAddResourceProperty(entry, RDF.TYPE, vEntry.Entry);
+        final HTMLDocument fragment = new HTMLDocument(node);
+        addName(fragment, entry);
+        addSummary(fragment, entry);
+        addContent(fragment, entry);
+        addPublished(fragment, entry);
+        addUpdated(fragment, entry);
+        addCategories(fragment, entry);
+        addURLs(fragment, entry);
+        addUID(fragment, entry);
+        addSyndications(fragment, entry);
+        addInReplyTo(fragment, entry);
+        addLocations(fragment,entry);
+        return true;
+    }
+
+    private void mapFieldWithProperty(HTMLDocument fragment, BNode entry, 
String fieldClass,
+                                      URI property) {
+        HTMLDocument.TextField title = 
fragment.getSingularTextField(fieldClass);
+        conditionallyAddStringProperty(
+                title.source(), entry, property, title.value()
+        );
+    }
+
+    private void addName(HTMLDocument fragment, BNode entry) {
+        mapFieldWithProperty(fragment, entry, 
Microformats2Prefixes.PROPERTY_PREFIX +
+                entryFields[0], vEntry.name);
+    }
+
+    private void addSummary(HTMLDocument fragment, BNode entry) {
+        mapFieldWithProperty(fragment, entry, 
Microformats2Prefixes.PROPERTY_PREFIX + entryFields[1],
+                vEntry.summary);
+    }
+
+    private void addContent(HTMLDocument fragment, BNode entry) {
+        mapFieldWithProperty(fragment, entry, 
Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + entryFields[2],
+                vEntry.content);
+    }
+
+    private void addPublished(HTMLDocument fragment, BNode entry) {
+        final HTMLDocument.TextField[] durations = fragment.getPluralTextField(
+                Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[3]);
+        for(HTMLDocument.TextField duration : durations) {
+            Node 
attribute=duration.source().getAttributes().getNamedItem("datetime");
+            if (attribute==null){
+                conditionallyAddStringProperty(
+                        duration.source(),
+                        entry, vEntry.published, duration.value()
+                );
+            }else{
+                conditionallyAddStringProperty(
+                        duration.source(),
+                        entry, vEntry.published, attribute.getNodeValue()
+                );
+            }
+        }
+    }
+
+    private void addUpdated(HTMLDocument fragment, BNode entry) {
+        final HTMLDocument.TextField[] durations = fragment.getPluralTextField(
+                Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[4]);
+        for(HTMLDocument.TextField duration : durations) {
+            Node 
attribute=duration.source().getAttributes().getNamedItem("datetime");
+            if (attribute==null){
+                conditionallyAddStringProperty(
+                        duration.source(),
+                        entry, vEntry.updated, duration.value()
+                );
+            }else{
+                conditionallyAddStringProperty(
+                        duration.source(),
+                        entry, vEntry.updated, attribute.getNodeValue()
+                );
+            }
+        }
+    }
+
+    private void addCategories(HTMLDocument fragment, BNode entry) {
+        final HTMLDocument.TextField[] categories = fragment.getPluralTextField
+                (Microformats2Prefixes.PROPERTY_PREFIX + entryFields[5]);
+        for (HTMLDocument.TextField category : categories) {
+            conditionallyAddStringProperty(
+                    category.source(), entry, vEntry.category, category.value()
+            );
+        }
+    }
+
+    private void addURLs(HTMLDocument fragment, BNode entry) throws 
ExtractionException {
+        final HTMLDocument.TextField[] urls = fragment.getPluralUrlField
+                (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[6]);
+        for(HTMLDocument.TextField url : urls) {
+            addURIProperty(entry, vEntry.url, 
fragment.resolveURI(url.value()));
+        }
+    }
+
+    private void addUID(HTMLDocument fragment, BNode entry) throws 
ExtractionException {
+        final HTMLDocument.TextField uid = fragment.getSingularTextField
+                (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[7]);
+        if(uid.source()==null)
+            return;
+        addURIProperty(entry, vEntry.uid, fragment.resolveURI(uid.value()));
+    }
+
+    private void addSyndications(HTMLDocument fragment, BNode entry) throws 
ExtractionException {
+        final HTMLDocument.TextField[] syndications = 
fragment.getPluralUrlField
+                (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[8]);
+        for(HTMLDocument.TextField syndication : syndications) {
+            addURIProperty(entry, vEntry.syndication, 
fragment.resolveURI(syndication.value()));
+        }
+    }
+
+    private void addInReplyTo(HTMLDocument fragment, BNode entry) throws 
ExtractionException {
+        final HTMLDocument.TextField inReplyTo = fragment.getSingularTextField
+                (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[9]);
+        if(inReplyTo.source()==null)
+            return;
+        addURIProperty(entry, vEntry.in_reply_to, 
fragment.resolveURI(inReplyTo.value()));
+    }
+
+    private void addLocations(HTMLDocument doc, Resource entry) throws 
ExtractionException {
+        List<Node> nodes = 
doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[11] +
+                Microformats2Prefixes.SPACE_SEPARATOR + 
Microformats2Prefixes.CLASS_PREFIX + "geo");
+        if (nodes.isEmpty())
+            return;
+        for (Node node : nodes) {
+            BNode location = valueFactory.createBNode();
+            addURIProperty(location, RDF.TYPE, vEntry.location);
+            HTMLDocument fragment = new HTMLDocument(node);
+            for (String field : geoFields) {
+                HTMLDocument.TextField[] values = 
fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field);
+                for (HTMLDocument.TextField val : values) {
+                    Node 
attribute=val.source().getAttributes().getNamedItem("title");
+                    if (attribute==null){
+                        conditionallyAddStringProperty(
+                                val.source(),
+                                location, vVCARD.getProperty(field), 
val.value()
+                        );
+                    }else{
+                        conditionallyAddStringProperty(
+                                val.source(),
+                                location, vVCARD.getProperty(field), 
attribute.getNodeValue()
+                        );
+                    }
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java
 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java
new file mode 100644
index 0000000..e2d4556
--- /dev/null
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.any23.extractor.html.microformats2;
+
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.SimpleExtractorFactory;
+import org.apache.any23.rdf.PopularPrefixes;
+import org.apache.any23.rdf.Prefixes;
+
+import java.util.Arrays;
+
+/**
+ * Extractor for the <a href="http://microformats.org/wiki/h-entry">h-entry</a>
+ * microformat.
+ *
+ * @author Nisala Nirmana
+ */
+public class HEntryExtractorFactory extends 
SimpleExtractorFactory<HEntryExtractor> implements
+        ExtractorFactory<HEntryExtractor> {
+
+    public static final String NAME = "html-mf2-h-entry";
+
+    public static final Prefixes PREFIXES = 
PopularPrefixes.createSubset("rdf", "hentry");
+
+    private static final ExtractorDescription descriptionInstance = new 
HEntryExtractorFactory();
+
+    public HEntryExtractorFactory() {
+        super(
+                HEntryExtractorFactory.NAME,
+                HEntryExtractorFactory.PREFIXES,
+                Arrays.asList("text/html;q=0.1", 
"application/xhtml+xml;q=0.1"),
+                "example-mf2-h-entry.html");
+    }
+
+    @Override
+    public HEntryExtractor createExtractor() {
+        return new HEntryExtractor();
+    }
+
+    public static ExtractorDescription getDescriptionInstance() {
+        return descriptionInstance;
+    }
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java
 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java
index ce67d86..ea90716 100644
--- 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java
@@ -24,6 +24,7 @@ import org.apache.any23.extractor.TagSoupExtractionResult;
 import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
 import org.apache.any23.vocab.HEvent;
 import org.openrdf.model.BNode;
+import org.openrdf.model.Resource;
 import org.openrdf.model.URI;
 import org.openrdf.model.vocabulary.RDF;
 import org.w3c.dom.Node;
@@ -89,6 +90,22 @@ public class HEventExtractor extends 
EntityBasedMicroformatExtractor {
         return true;
     }
 
+    public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, 
BNode event,
+                                                    ExtractionResult out)
+            throws ExtractionException {
+        this.setCurrentExtractionResult(out);
+        addName(fragment, event);
+        addSummary(fragment, event);
+        addStart(fragment, event);
+        addEnd(fragment, event);
+        addDuration(fragment, event);
+        addDescription(fragment, event);
+        addURLs(fragment, event);
+        addCategories(fragment, event);
+        addLocation(fragment, event);
+        return event;
+    }
+
     private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, 
String fieldClass,
                                       URI property) {
         HTMLDocument.TextField title = 
fragment.getSingularTextField(fieldClass);

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java
 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java
index 8423686..14f20bd 100644
--- 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java
@@ -17,7 +17,7 @@ public class HItemExtractorFactory extends 
SimpleExtractorFactory<HItemExtractor
 
     public static final String NAME = "html-mf2-h-item";
 
-    public static final Prefixes PREFIXES = 
PopularPrefixes.createSubset("rdf", "vcard");
+    public static final Prefixes PREFIXES = 
PopularPrefixes.createSubset("rdf", "hitem");
 
     private static final ExtractorDescription descriptionInstance = new 
HItemExtractorFactory();
 

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java
 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java
new file mode 100644
index 0000000..44b463d
--- /dev/null
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.html.microformats2;
+
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionResult;
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.TagSoupExtractionResult;
+import org.apache.any23.vocab.DOAC;
+import org.apache.any23.vocab.FOAF;
+import org.apache.any23.vocab.HResume;
+import org.apache.commons.lang.UnhandledException;
+import org.openrdf.model.BNode;
+import org.openrdf.model.Resource;
+import org.openrdf.model.vocabulary.RDF;
+import org.w3c.dom.Node;
+import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
+import org.apache.any23.extractor.html.HTMLDocument;
+import org.apache.any23.extractor.html.DomUtils;
+import java.util.List;
+
+/**
+ * Extractor for the <a href="http://microformats.org/wiki/hresume">hResume</a>
+ * microformat.
+ *
+ * @author Nisala Nirmana
+ */
+public class HResumeExtractor extends EntityBasedMicroformatExtractor {
+
+    private static final HResume vResume = HResume.getInstance();
+
+    private static final String[] resumeFields = {
+            "name",
+            "summary",
+            "contact",//toDo Hcard
+            "education",
+            "experience",
+            "skill",
+            "affiliation"//toDo Hcard
+    };
+
+    @Override
+    public ExtractorDescription getDescription() {
+        return HResumeExtractorFactory.getDescriptionInstance();
+    }
+
+    @Override
+    public String getBaseClassName() {
+        return Microformats2Prefixes.CLASS_PREFIX + "resume";
+    }
+
+    @Override
+    protected void resetExtractor() {
+        // Empty.
+    }
+
+    @Override
+    protected boolean extractEntity(Node node, ExtractionResult out) throws 
ExtractionException {
+        if (null == node) return false;
+        BNode person = getBlankNodeFor(node);
+        out.writeTriple(person, RDF.TYPE, vResume.Resume);
+        final HTMLDocument fragment = new HTMLDocument(node);
+
+        addName(fragment, person);
+        addSummary(fragment, person);
+        addSkills(fragment, person);
+
+        addExperiences(fragment, person);
+        addEducations(fragment, person);
+
+
+        final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
+        tser.addResourceRoot(
+                DomUtils.getXPathListForNode(node),
+                person,
+                this.getClass()
+        );
+
+        return true;
+    }
+
+    private void addName(HTMLDocument doc, Resource person) {
+        HTMLDocument.TextField name = doc.getSingularTextField(
+                Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[0]);
+        conditionallyAddStringProperty(
+                name.source(),
+                person,
+                vResume.name,
+                name.value()
+        );
+    }
+
+    private void addSummary(HTMLDocument doc, Resource person) {
+        HTMLDocument.TextField summary = doc.getSingularTextField(
+                Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[1]);
+        conditionallyAddStringProperty(
+                summary.source(),
+                person,
+                vResume.summary,
+                summary.value()
+        );
+    }
+
+    private void addSkills(HTMLDocument doc, Resource person) {
+        final HTMLDocument.TextField[] skills = doc.getPluralTextField(
+                Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[5]);
+        for (HTMLDocument.TextField skill : skills) {
+            conditionallyAddStringProperty(
+                    skill.source(),
+                    person,
+                    vResume.skill,
+                    skill.value()
+            );
+        }
+
+    }
+
+    private void addExperiences(HTMLDocument doc, Resource person) throws 
ExtractionException {
+        List<Node> nodes = 
doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[4] +
+                Microformats2Prefixes.SPACE_SEPARATOR + 
Microformats2Prefixes.CLASS_PREFIX + "event");
+        if (nodes.isEmpty())
+            return;
+        HEventExtractorFactory factory = new HEventExtractorFactory();
+        HEventExtractor extractor = factory.createExtractor();
+        for (Node node : nodes) {
+            BNode event = valueFactory.createBNode();
+            addURIProperty(event, RDF.TYPE, vResume.experience);
+            extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), 
event,
+                    getCurrentExtractionResult());
+        }
+    }
+
+    private void addEducations(HTMLDocument doc, Resource person) throws 
ExtractionException {
+        List<Node> nodes = 
doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[3] +
+                Microformats2Prefixes.SPACE_SEPARATOR + 
Microformats2Prefixes.CLASS_PREFIX + "event");
+        if (nodes.isEmpty())
+            return;
+        HEventExtractorFactory factory = new HEventExtractorFactory();
+        HEventExtractor extractor = factory.createExtractor();
+        for (Node node : nodes) {
+            BNode event = valueFactory.createBNode();
+            addURIProperty(event, RDF.TYPE, vResume.education);
+            extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), 
event,
+                    getCurrentExtractionResult());
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java
 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java
new file mode 100644
index 0000000..a8120eb
--- /dev/null
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.html.microformats2;
+
+import java.util.Arrays;
+
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.SimpleExtractorFactory;
+import org.apache.any23.rdf.PopularPrefixes;
+import org.apache.any23.rdf.Prefixes;
+
+/**
+ * Factory that creates {@link HResumeExtractor} instances (see {@link #createExtractor()}).
+ * @author Nisala Nirmana
+ */
+public class HResumeExtractorFactory extends 
SimpleExtractorFactory<HResumeExtractor> implements
+        ExtractorFactory<HResumeExtractor> {
+
+    public static final String NAME = "html-mf2-h-resume"; // first argument passed to the superclass constructor
+    
+    public static final Prefixes PREFIXES = 
PopularPrefixes.createSubset("rdf", "doac", "foaf"); // second argument passed to the superclass constructor
+
+    private static final ExtractorDescription descriptionInstance = new 
HResumeExtractorFactory(); // shared instance handed out by getDescriptionInstance(); declared after NAME and PREFIXES, which the constructor reads
+    
+    public HResumeExtractorFactory() {
+        super(
+                HResumeExtractorFactory.NAME, 
+                HResumeExtractorFactory.PREFIXES,
+                Arrays.asList("text/html;q=0.1", 
"application/xhtml+xml;q=0.1"), // presumably the supported MIME types with quality values -- confirm against SimpleExtractorFactory
+                "example-mf2-h-resume.html"); // presumably the name of an example input resource -- confirm
+    }
+    
+    @Override
+    public HResumeExtractor createExtractor() {
+        return new HResumeExtractor(); // a new extractor object on every call
+    }
+
+    public static ExtractorDescription getDescriptionInstance() {
+        return descriptionInstance; // always the same statically created factory instance
+    }
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java
 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java
index 18ac1b1..d6b3349 100644
--- 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java
@@ -23,4 +23,5 @@ public class Microformats2Prefixes {
     public static final String URL_PROPERTY_PREFIX = "u-";
     public static final String EMBEDDED_PROPERTY_PREFIX = "e-";
     public static final String TIME_PROPERTY_PREFIX = "dt-";
+    public static final String SPACE_SEPARATOR = " ";
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties
----------------------------------------------------------------------
diff --git 
a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties 
b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties
index 34e3975..c7eaf54 100644
--- a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties
+++ b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties
@@ -34,6 +34,8 @@ skos=http://www.w3.org/2004/02/skos/core#
 hrecipe=http://sindice.com/hrecipe/
 hevent=http://sindice.com/hevent/
 hproduct=http://sindice.com/hproduct/
+hitem=http://sindice.com/hitem/
+hentry=http://sindice.com/hentry/
 sindice=http://vocab.sindice.net/
 og=http://opengraphprotocol.org/schema/
 fb=http://www.facebook.com/2008/fbml#

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java
 
b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java
new file mode 100644
index 0000000..cc2974d
--- /dev/null
+++ 
b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.html.microformats2;
+
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.html.AbstractExtractorTestCase;
+import org.junit.Test;
+import org.openrdf.repository.RepositoryException;
+import org.openrdf.rio.RDFHandlerException;
+
+public class HEntryExtractorTest extends AbstractExtractorTestCase { // runs the extractor from HEntryExtractorFactory against the h-entry HTML fixture
+    protected ExtractorFactory<?> getExtractorFactory() { // presumably consumed by AbstractExtractorTestCase to obtain the extractor -- confirm
+        return new HEntryExtractorFactory();
+    }
+
+    @Test
+    public void testModelNotEmpty() throws RepositoryException, 
RDFHandlerException {
+        assertExtract("/microformats2/h-entry/h-entry-test.html"); // fixture path; presumably resolved from the test resources -- confirm
+        assertModelNotEmpty();
+        assertStatementsSize(null, null, null, 10); // expects 10 statements for the (null, null, null) pattern; null presumably matches anything -- confirm
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java
 
b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java
index 3b46a7a..49c1755 100644
--- 
a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java
+++ 
b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java
@@ -32,6 +32,6 @@ public class HProductExtractorTest extends 
AbstractExtractorTestCase {
     public void testModelNotEmpty() throws RepositoryException, 
RDFHandlerException {
         assertExtract("/microformats2/h-product/h-product-test.html");
         assertModelNotEmpty();
-        assertStatementsSize(null, null, null, 11);
+        assertStatementsSize(null, null, null, 6);
     }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java
 
b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java
new file mode 100644
index 0000000..dd2f5d1
--- /dev/null
+++ 
b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.html.microformats2;
+
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.html.AbstractExtractorTestCase;
+import org.junit.Test;
+import org.openrdf.repository.RepositoryException;
+import org.openrdf.rio.RDFHandlerException;
+
+public class HResumeExtractorTest extends AbstractExtractorTestCase { // runs the extractor from HResumeExtractorFactory against the h-resume HTML fixture
+    protected ExtractorFactory<?> getExtractorFactory() { // presumably consumed by AbstractExtractorTestCase to obtain the extractor -- confirm
+        return new HResumeExtractorFactory();
+    }
+
+    @Test
+    public void testModelNotEmpty() throws RepositoryException, 
RDFHandlerException {
+        assertExtract("/microformats2/h-resume/h-resume-test.html"); // fixture path; presumably resolved from the test resources -- confirm
+        assertModelNotEmpty();
+        assertStatementsSize(null, null, null, 12); // expects 12 statements for the (null, null, null) pattern; null presumably matches anything -- confirm
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java 
b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java
index b4f8b7a..c58e2a1 100644
--- a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java
+++ b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java
@@ -43,7 +43,7 @@ public class RDFSchemaUtilsTest {
      */
     @Test
     public void testSerializeVocabulariesNTriples() {
-        serializeVocabularies(RDFFormat.NTRIPLES, 1920);
+        serializeVocabularies(RDFFormat.NTRIPLES, 2012);//1920
     }
 
     /**
@@ -53,7 +53,7 @@ public class RDFSchemaUtilsTest {
      */
     @Test
     public void testSerializeVocabulariesRDFXML() {
-        serializeVocabularies(RDFFormat.RDFXML, 4992); // Effective lines + 
separators.
+        serializeVocabularies(RDFFormat.RDFXML, 5252); // Effective lines + 
separators. //4992
     }
 
     private void serializeVocabularies(RDFFormat format, int expectedLines) {

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html 
b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html
new file mode 100644
index 0000000..f3c8cf7
--- /dev/null
+++ b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html
@@ -0,0 +1,53 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!DOCTYPE html>
+<html>
+
+<body>
+<div class="h-entry">
+    <h1><a class="p-name u-url" 
href="http://microformats.org/2012/06/25/microformats-org-at-7">microformats.org
 at 7</a></h1>
+       
+       <p>Published 
+        <time class="dt-published" datetime="2012-03-25T17:08:26">March 25th, 
2012</time> 
+    </p>
+       
+    <div class="e-content">
+        <p class="p-summary">Last week the microformats.org community 
+            celebrated its 7th birthday at a gathering hosted by Mozilla in 
+            San Francisco and recognized accomplishments, challenges, and 
+            opportunities.</p>
+
+        <p>The microformats tagline “humans first, machines second” 
+            forms the basis of many of our 
+                               <a 
href="http://microformats.org/wiki/principles">principles</a>, and 
+            in that regard, we’d like to recognize a few people and 
+            thank them for their years of volunteer service </p>
+    </div>
+       
+    <p>Updated 
+        <time class="dt-updated" datetime="2012-06-25T17:08:26">June 25th, 
2012</time> 
+    </p>
+       
+       <div class="p-location h-geo">
+               <p>Location
+                       <abbr class="p-latitude" title="37.408183">N 37° 
24.491</abbr>,  
+                       <abbr class="p-longitude" title="-122.13855">W 122° 
08.313</abbr>
+               </p>
+       </div>
+</div>
+
+</body>
+
+</html>

http://git-wip-us.apache.org/repos/asf/any23/blob/417b71a7/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html 
b/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html
new file mode 100644
index 0000000..15dd835
--- /dev/null
+++ 
b/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html
@@ -0,0 +1,49 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!DOCTYPE html>
+<html>
+
+<body>
+<div class="h-resume">
+    <p class="p-name">Tim Berners-Lee</p>
+
+    <p class="p-summary">Invented the World Wide Web.</p><hr />
+       
+    <div class="p-education h-event">
+       Education : 
+        <time class="dt-start" datetime="1973-09">1973</time> 
+        <time class="dt-end" datetime="1976-06">1976</time>
+    </div>
+
+    <div class="p-experience h-event">
+        <p>Experience :
+            <time class="dt-start" datetime="2009-01-18">Jan 2009</time>  
Present
+            <time class="dt-duration" datetime="P2Y11M">(2 years 11 
month)</time>
+        </p>
+    </div>
+
+    <div>
+        Skills:     
+       <ul>
+               <li class="p-skill">information systems</li>
+               <li class="p-skill">advocacy</li>
+               <li class="p-skill">leadership</li>
+       </ul>
+    </div>   
+
+</div>
+</body>
+
+</html>

Reply via email to