[1/8] any23 git commit: ANY23-226 Extract JSON-LD embedded in HTML

ansell Fri, 20 Mar 2015 22:07:30 -0700

Repository: any23
Updated Branches:
  refs/heads/master f88cc51f3 -> 93c38a69e



ANY23-226 Extract JSON-LD embedded in HTML


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/1e3eb9c3
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/1e3eb9c3
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/1e3eb9c3

Branch: refs/heads/master
Commit: 1e3eb9c31af2f93906eee1081179d73c30a0881b
Parents: f88cc51
Author: Lewis John McGibbney <[email protected]>
Authored: Fri Mar 20 10:55:29 2015 -0500
Committer: Lewis John McGibbney <[email protected]>
Committed: Fri Mar 20 10:55:29 2015 -0500

----------------------------------------------------------------------
 .../apache/any23/extractor/html/DomUtils.java   |  70 ++++++
 .../extractor/html/EmbeddedJSONLDExtractor.java | 242 +++++++++++++++++++
 .../html/EmbeddedJSONLDExtractorFactory.java    |  58 +++++
 .../any23/extractor/rdf/BaseRDFExtractor.java   |   3 +-
 .../extractor/html/example-embedded-jsonld.html |  34 +++
 .../apache/any23/prefixes/prefixes.properties   |   1 +
 .../html/EmbeddedJSONLDExtractorTest.java       |  50 ++++
 .../java/org/apache/any23/plugin/PluginIT.java  |   2 +-
 .../html/html-embedded-jsonld-extractor.html    |  34 +++
 9 files changed, 492 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/1e3eb9c3/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java 
b/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java
index 0fd8cdc..be27fda 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java
@@ -17,6 +17,7 @@
 
 package org.apache.any23.extractor.html;
 
+import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@@ -25,17 +26,25 @@ import org.w3c.dom.traversal.NodeFilter;
 import org.w3c.dom.traversal.NodeIterator;
 
 import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Result;
 import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.TransformerFactoryConfigurationError;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathConstants;
 import javax.xml.xpath.XPathExpressionException;
 import javax.xml.xpath.XPathFactory;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Pattern;
@@ -459,5 +468,66 @@ public class DomUtils {
 
         return result;
     }
+    
+    /**
+     * Given a {@link org.w3c.dom.Document} this method will return an
+     * input stream representing that document.
+     * @param doc the input {@link org.w3c.dom.Document}
+     * @return an {@link java.io.InputStream}
+     */
+    public static InputStream documentToInputStream(Document doc) {
+      DOMSource source = new DOMSource(doc);
+      StringWriter xmlAsWriter = new StringWriter();
+      StreamResult result = new StreamResult(xmlAsWriter);
+      try {
+        TransformerFactory.newInstance().newTransformer().transform(source, 
result);
+      } catch (TransformerConfigurationException e) {
+        throw new RuntimeException("Error within Document to InputStream 
transformation configuration!");
+      } catch (TransformerException e) {
+        throw new RuntimeException("Error whilst transforming the Document to 
InputStream!");
+      } catch (TransformerFactoryConfigurationError e) {
+        throw new RuntimeException("Error within Document to InputStream 
transformation configuration!");
+      }
+       
+      InputStream is = null;
+      try {
+        is = new 
ByteArrayInputStream(xmlAsWriter.toString().getBytes("UTF-8"));
+      } catch (UnsupportedEncodingException e) {
+        e.printStackTrace();
+      }
+      return is;
+    }
+    
+
+    /**
+     * Convert a w3c dom node to a InputStream
+     * @param node
+     * @return
+     * @throws TransformerException
+     */
+    public static InputStream nodeToInputStream(Node node) {
+        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+        Result outputTarget = new StreamResult(outputStream);
+        Transformer t = null;
+        try {
+          t = TransformerFactory.newInstance().newTransformer();
+        } catch (TransformerConfigurationException e) {
+          // TODO Auto-generated catch block
+          e.printStackTrace();
+        } catch (TransformerFactoryConfigurationError e) {
+          // TODO Auto-generated catch block
+          e.printStackTrace();
+        }
+        t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
+        try {
+          t.transform(new DOMSource(node), outputTarget);
+        } catch (TransformerException e) {
+          // TODO Auto-generated catch block
+          e.printStackTrace();
+        }
+        return new ByteArrayInputStream(outputStream.toByteArray());
+    }
+
+
 
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/1e3eb9c3/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
 
b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
new file mode 100644
index 0000000..5506a10
--- /dev/null
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.html;
+
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractionResult;
+import org.apache.any23.extractor.Extractor;
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.rdf.JSONLDExtractor;
+import org.apache.any23.extractor.rdf.JSONLDExtractorFactory;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.vocab.SINDICE;
+import org.openrdf.model.URI;
+import org.openrdf.model.impl.LiteralImpl;
+import org.openrdf.model.impl.URIImpl;
+import org.w3c.dom.Document;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * This extractor represents the HTML script tags used to embed blocks of data 
in documents. 
+ * This way, JSON-LD content can be easily embedded in HTML by placing it in a 
script element 
+ * with the type attribute set to application/ld+json 
+ * according the <a 
href="http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents">JSON-LD
 specification</a>.
+ *
+ */
+public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
+
+    private static final SINDICE vSINDICE = SINDICE.getInstance();
+
+    private URI profile;
+
+    private Map<String, URI> prefixes = new HashMap<String, URI>();
+
+    private String documentLang;
+    
+    private JSONLDExtractor extractor;
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void run(
+            ExtractionParameters extractionParameters,
+            ExtractionContext extractionContext,
+            Document in,
+            ExtractionResult out
+    ) throws IOException, ExtractionException {
+        profile = extractProfile(in);
+        documentLang = getDocumentLanguage(in);
+        extractLinkDefinedPrefixes(in);
+
+        String baseProfile = vSINDICE.NS;
+        if(profile != null) {
+            baseProfile = profile.toString();
+        }
+
+        final URI documentURI = extractionContext.getDocumentURI();
+        Set<JSONLDScript> jsonldScripts = extractJSONLDScript(in, baseProfile, 
extractionParameters, extractionContext, out);
+        for(JSONLDScript jsonldScript : jsonldScripts) {
+            String lang = documentLang;
+            if(jsonldScript.getLang() != null) {
+                lang = jsonldScript.getLang();
+            }
+            out.writeTriple(
+                    documentURI,
+                    jsonldScript.getName(),
+                    new LiteralImpl(jsonldScript.getContent(), lang)
+            );
+        }
+    }
+
+    /**
+     * Returns the {@link Document} language if declared, <code>null</code> 
otherwise.
+     *
+     * @param in a instance of {@link Document}.
+     * @return the language declared, could be <code>null</code>.
+     */
+    private String getDocumentLanguage(Document in) {
+        String lang = DomUtils.find(in, "string(/HTML/@lang)");
+        if (lang.equals("")) {
+            return null;
+        }
+        return lang;
+    }
+
+    private URI extractProfile(Document in) {
+        String profile = DomUtils.find(in, "string(/HTML/@profile)");
+        if (profile.equals("")) {
+            return null;
+        }
+        return new URIImpl(profile);
+    }
+
+    /**
+     * It extracts prefixes defined in the <i>LINK</i> meta tags.
+     *
+     * @param in
+     */
+    private void extractLinkDefinedPrefixes(Document in) {
+        List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
+        for(Node linkNode : linkNodes) {
+            NamedNodeMap attributes = linkNode.getAttributes();
+            String rel = attributes.getNamedItem("rel").getTextContent();
+            String href = attributes.getNamedItem("href").getTextContent();
+            if(rel != null && href !=null && RDFUtils.isAbsoluteURI(href)) {
+                prefixes.put(rel, new URIImpl(href));
+            }
+        }
+    }
+
+    private Set<JSONLDScript> extractJSONLDScript(Document in, String 
baseProfile, ExtractionParameters extractionParameters, 
+            ExtractionContext extractionContext, ExtractionResult out) throws 
IOException, ExtractionException {
+        List<Node> scriptNodes = DomUtils.findAll(in, "/HTML/HEAD/SCRIPT");
+        Set<JSONLDScript> result = new HashSet<JSONLDScript>();
+        extractor = new JSONLDExtractorFactory().createExtractor();
+        for (Node jsonldNode : scriptNodes) {
+            NamedNodeMap attributes = jsonldNode.getAttributes();
+            for (int i = 0; i < attributes.getLength(); i++) {
+              if 
(attributes.item(i).getTextContent().equalsIgnoreCase("application/ld+json")) {
+              extractor.run(extractionParameters, extractionContext, 
DomUtils.nodeToInputStream(jsonldNode), out);
+              }
+            }
+            Node nameAttribute = attributes.getNamedItem("name");
+            Node contentAttribute = attributes.getNamedItem("content");
+            if (nameAttribute == null || contentAttribute == null) {
+                continue;
+            }
+            String name = nameAttribute.getTextContent();
+            String content = contentAttribute.getTextContent();
+            String xpath = DomUtils.getXPathForNode(jsonldNode);
+            URI nameAsURI = getPrefixIfExists(name);
+            if (nameAsURI == null) {
+                nameAsURI = new URIImpl(baseProfile + name);
+             }
+            JSONLDScript jsonldScript = new JSONLDScript(xpath, nameAsURI, 
content);
+            result.add(jsonldScript);
+        }
+        return result;
+    }
+
+    private URI getPrefixIfExists(String name) {
+        String[] split = name.split("\\.");
+        if(split.length == 2 && prefixes.containsKey(split[0])) {
+            return new URIImpl(prefixes.get(split[0]) + split[1]);
+        }
+        return null;
+    }
+
+    @Override
+    public ExtractorDescription getDescription() {
+        return HTMLMetaExtractorFactory.getDescriptionInstance();
+    }
+
+    private class JSONLDScript {
+
+        private String xpath;
+
+        private URI name;
+
+        private String lang;
+
+        private String content;
+
+        public JSONLDScript(String xpath, URI name, String content) {
+            this.xpath = xpath;
+            this.name = name;
+            this.content = content;
+        }
+
+        public JSONLDScript(String xpath, URI name, String content, String 
lang) {
+            this(xpath, name, content);
+            this.lang = lang;
+        }
+
+        public URI getName() {
+            return name;
+        }
+
+        public void setName(URI name) {
+            this.name = name;
+        }
+
+        public String getLang() {
+            return lang;
+        }
+
+        public void setLang(String lang) {
+            this.lang = lang;
+        }
+
+        public String getContent() {
+            return content;
+        }
+
+        public void setContent(String content) {
+            this.content = content;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+
+            JSONLDScript meta = (JSONLDScript) o;
+
+            if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != 
null) return false;
+
+            return true;
+        }
+
+        @Override
+        public int hashCode() {
+            return xpath != null ? xpath.hashCode() : 0;
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/1e3eb9c3/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorFactory.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorFactory.java
 
b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorFactory.java
new file mode 100644
index 0000000..2e7810f
--- /dev/null
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorFactory.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.html;
+
+import java.util.Arrays;
+
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.SimpleExtractorFactory;
+import org.apache.any23.rdf.PopularPrefixes;
+import org.apache.any23.rdf.Prefixes;
+import org.kohsuke.MetaInfServices;
+
+/**
+ *
+ */
+@MetaInfServices(ExtractorFactory.class)
+public class EmbeddedJSONLDExtractorFactory extends 
SimpleExtractorFactory<EmbeddedJSONLDExtractor> implements
+        ExtractorFactory<EmbeddedJSONLDExtractor> {
+
+    public static final String NAME = "html-embedded-jsonld";
+    
+    public static final Prefixes PREFIXES = 
PopularPrefixes.createSubset("jsonld");
+
+    private static final ExtractorDescription descriptionInstance = new 
EmbeddedJSONLDExtractorFactory();
+    
+    public EmbeddedJSONLDExtractorFactory() {
+        super(
+                EmbeddedJSONLDExtractorFactory.NAME, 
+                EmbeddedJSONLDExtractorFactory.PREFIXES,
+                Arrays.asList("text/html;q=0.02", 
"application/xhtml+xml;q=0.02"),
+                "example-embedded-jsonld.html");
+    }
+    
+    @Override
+    public EmbeddedJSONLDExtractor createExtractor() {
+        return new EmbeddedJSONLDExtractor();
+    }
+
+    public static ExtractorDescription getDescriptionInstance() {
+        return descriptionInstance;
+    }
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/1e3eb9c3/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index d26f762..052bfa9 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -100,7 +100,8 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
             
parser.getParserConfig().addNonFatalError(BasicParserSettings.VERIFY_DATATYPE_VALUES);
             
parser.getParserConfig().set(BasicParserSettings.NORMALIZE_DATATYPE_VALUES, 
false);                
             
parser.getParserConfig().addNonFatalError(BasicParserSettings.NORMALIZE_DATATYPE_VALUES);
-            
+            //ByteBuffer seems to represent incorrect content. Need to make 
sure it is the content
+            //of the <script> node and not anything else!
             parser.parse(in, extractionContext.getDocumentURI().stringValue());
         } catch (RDFHandlerException ex) {
             throw new IllegalStateException("Unexpected exception.", ex);

http://git-wip-us.apache.org/repos/asf/any23/blob/1e3eb9c3/core/src/main/resources/org/apache/any23/extractor/html/example-embedded-jsonld.html
----------------------------------------------------------------------
diff --git 
a/core/src/main/resources/org/apache/any23/extractor/html/example-embedded-jsonld.html
 
b/core/src/main/resources/org/apache/any23/extractor/html/example-embedded-jsonld.html
new file mode 100644
index 0000000..09859ff
--- /dev/null
+++ 
b/core/src/main/resources/org/apache/any23/extractor/html/example-embedded-jsonld.html
@@ -0,0 +1,34 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html>
+  <head>
+    <title>Hello World!</title>
+    <meta name="title" content="Embedded JSONLD extractor"/>
+    <!-- As per spec in 
http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents -->
+    <script type="application/ld+json">
+    {
+      "@context": "http://json-ld.org/contexts/person.jsonld",
+      "@id": "http://dbpedia.org/resource/Robert_Millar",
+      "name": "Robert Millar",
+      "born": "1958-09-13",
+      "birthPlace": "http://dbpedia.org/resource/Glasgow"
+    }
+    </script>
+  </head>
+  <h1>Embedded JSONLD Extractor</h1>
+  <p>It extracts only the embedded JSON-LD elements.
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/1e3eb9c3/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties
----------------------------------------------------------------------
diff --git 
a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties 
b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties
index 1151056..58516ec 100644
--- a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties
+++ b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties
@@ -35,3 +35,4 @@ hrecipe=http://sindice.com/hrecipe/
 sindice=http://vocab.sindice.net/
 og=http://opengraphprotocol.org/schema/
 fb=http://www.facebook.com/2008/fbml#
+jsonld=http://www.w3.org/ns/json-ld#

http://git-wip-us.apache.org/repos/asf/any23/blob/1e3eb9c3/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
 
b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
new file mode 100644
index 0000000..b30840c
--- /dev/null
+++ 
b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.extractor.html;
+
+import static org.junit.Assert.*;
+
+import org.apache.any23.extractor.ExtractorFactory;
+import org.junit.Test;
+import org.openrdf.repository.RepositoryException;
+
+/**
+ * @author lmcgibbn
+ *
+ */
+public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase {
+
+  @Test
+  public void testEmbeddedJSONLDInHead() throws RepositoryException {
+    assertExtract("/html/html-embedded-jsonld-extractor.html");
+    assertModelNotEmpty();
+    assertStatementsSize(null, null, null, 7);
+  }
+  
+  @Test
+  public void testSeveralEmbeddedJSONLDInHead() throws RepositoryException {
+    assertExtract("/html/html-embedded-jsonld-extractor.html");
+    assertModelNotEmpty();
+    assertStatementsSize(null, null, null, 7);
+  }
+
+  @Override
+  protected ExtractorFactory<?> getExtractorFactory() {
+    return new EmbeddedJSONLDExtractorFactory();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/1e3eb9c3/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
----------------------------------------------------------------------
diff --git 
a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java 
b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
index cbb9c32..a3e5675 100644
--- 
a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
+++ 
b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
@@ -41,7 +41,7 @@ import static org.junit.Assert.assertTrue;
  */
 public class PluginIT {
 
-    private static final int NUM_OF_EXTRACTORS = 30;
+    private static final int NUM_OF_EXTRACTORS = 31;
 
     private static final String PLUGIN_DIR = "target/plugins-build/";
 

http://git-wip-us.apache.org/repos/asf/any23/blob/1e3eb9c3/test-resources/src/test/resources/html/html-embedded-jsonld-extractor.html
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/html/html-embedded-jsonld-extractor.html 
b/test-resources/src/test/resources/html/html-embedded-jsonld-extractor.html
new file mode 100644
index 0000000..09859ff
--- /dev/null
+++ b/test-resources/src/test/resources/html/html-embedded-jsonld-extractor.html
@@ -0,0 +1,34 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html>
+  <head>
+    <title>Hello World!</title>
+    <meta name="title" content="Embedded JSONLD extractor"/>
+    <!-- As per spec in 
http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents -->
+    <script type="application/ld+json">
+    {
+      "@context": "http://json-ld.org/contexts/person.jsonld",
+      "@id": "http://dbpedia.org/resource/Robert_Millar",
+      "name": "Robert Millar",
+      "born": "1958-09-13",
+      "birthPlace": "http://dbpedia.org/resource/Glasgow"
+    }
+    </script>
+  </head>
+  <h1>Embedded JSONLD Extractor</h1>
+  <p>It extracts only the embedded JSON-LD elements.
+</html>
\ No newline at end of file

[1/8] any23 git commit: ANY23-226 Extract JSON-LD embedded in HTML

Reply via email to