any23 git commit: ANY23-185 Add missing <meta> element attributes to HTMLMetaExtractor

lewismc Sat, 13 Jun 2015 11:09:20 -0700

Repository: any23
Updated Branches:
  refs/heads/master a03bafa9c -> 0d106d4f2



ANY23-185 Add missing <meta> element attributes to HTMLMetaExtractor


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/0d106d4f
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/0d106d4f
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/0d106d4f

Branch: refs/heads/master
Commit: 0d106d4f2aa26de8b2626d2d38991717d1a0a0fe
Parents: a03bafa
Author: Lewis John McGibbney <[email protected]>
Authored: Sat Jun 13 11:08:34 2015 -0700
Committer: Lewis John McGibbney <[email protected]>
Committed: Sat Jun 13 11:08:34 2015 -0700

----------------------------------------------------------------------
 .../any23/extractor/html/HTMLMetaExtractor.java | 88 ++++++++++++++++----
 .../test/java/org/apache/any23/Any23Test.java   |  4 +-
 .../extractor/html/HTMLMetaExtractorTest.java   |  9 +-
 .../html/html-head-link-extractor.html          |  1 -
 ...-meta-extractor-with-mozilla-extensions.html | 34 ++++++++
 5 files changed, 117 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
index 16a0f6c..3e0c84e 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
@@ -81,11 +81,19 @@ public class HTMLMetaExtractor implements 
Extractor.TagSoupDOMExtractor {
             if(meta.getLang() != null) {
                 lang = meta.getLang();
             }
-            out.writeTriple(
-                    documentURI,
-                    meta.getName(),
-                    new LiteralImpl(meta.getContent(), lang)
-            );
+            if(meta.isPragmaDirective){
+                out.writeTriple(
+                        documentURI,
+                        meta.getHttpEquiv(),
+                        new LiteralImpl(meta.getContent(), lang)
+                );
+            }else {
+                out.writeTriple(
+                        documentURI,
+                        meta.getName(),
+                        new LiteralImpl(meta.getContent(), lang)
+                );
+            }
         }
     }
 
@@ -134,19 +142,37 @@ public class HTMLMetaExtractor implements 
Extractor.TagSoupDOMExtractor {
         for (Node metaNode : metaNodes) {
             NamedNodeMap attributes = metaNode.getAttributes();
             Node nameAttribute = attributes.getNamedItem("name");
+            Node httpEquivAttribute = attributes.getNamedItem("http-equiv");
             Node contentAttribute = attributes.getNamedItem("content");
-            if (nameAttribute == null || contentAttribute == null) {
-                continue;
+            if (nameAttribute == null && httpEquivAttribute == null)
+                continue; //support HTML5 meta element nodes that do not have 
both name and http-equiv
+            if (nameAttribute != null || httpEquivAttribute != null){
+                if ( contentAttribute == null ){
+                    continue;
+                }
             }
-            String name = nameAttribute.getTextContent();
-            String content = contentAttribute.getTextContent();
-            String xpath = DomUtils.getXPathForNode(metaNode);
-            URI nameAsURI = getPrefixIfExists(name);
-            if (nameAsURI == null) {
-                nameAsURI = new URIImpl(baseProfile + name);
+            boolean isPragmaDirective = (httpEquivAttribute != null) ? true : 
false;
+            if (isPragmaDirective){
+                String httpEquiv = httpEquivAttribute.getTextContent();
+                String content = contentAttribute.getTextContent();
+                String xpath = DomUtils.getXPathForNode(metaNode);
+                URI httpEquivAsURI = getPrefixIfExists(httpEquiv);
+                if (httpEquivAsURI == null) {
+                    httpEquivAsURI = new URIImpl(baseProfile + httpEquiv);
+                }
+                Meta meta = new Meta(xpath, content, httpEquivAsURI);
+                result.add(meta);
+            } else {
+                String name = nameAttribute.getTextContent();
+                String content = contentAttribute.getTextContent();
+                String xpath = DomUtils.getXPathForNode(metaNode);
+                URI nameAsURI = getPrefixIfExists(name);
+                if (nameAsURI == null) {
+                    nameAsURI = new URIImpl(baseProfile + name);
+                }
+                Meta meta = new Meta(xpath, nameAsURI, content);
+                result.add(meta);
             }
-            Meta meta = new Meta(xpath, nameAsURI, content);
-            result.add(meta);
         }
         return result;
     }
@@ -170,10 +196,26 @@ public class HTMLMetaExtractor implements 
Extractor.TagSoupDOMExtractor {
 
         private URI name;
 
+        private URI httpEquiv;
+
         private String lang;
 
         private String content;
 
+        private boolean isPragmaDirective;
+
+        public Meta(String xpath, String content, URI httpEquiv) {
+            this.xpath = xpath;
+            this.content = content;
+            this.httpEquiv = httpEquiv;
+            this.setPragmaDirective(true);
+        }
+
+        public Meta(String xpath, String content, URI httpEquiv, String lang) {
+            this(xpath,content,httpEquiv);
+            this.lang = lang;
+        }
+
         public Meta(String xpath, URI name, String content) {
             this.xpath = xpath;
             this.name = name;
@@ -185,6 +227,22 @@ public class HTMLMetaExtractor implements 
Extractor.TagSoupDOMExtractor {
             this.lang = lang;
         }
 
+        public boolean isPragmaDirective(){
+            return isPragmaDirective;
+        }
+
+        private void setPragmaDirective(boolean value){
+            this.isPragmaDirective=value;
+        }
+
+        public URI getHttpEquiv(){
+            return httpEquiv;
+        }
+
+        public void setHttpEquiv(URI httpEquiv){
+            this.httpEquiv=httpEquiv;
+        }
+
         public URI getName() {
             return name;
         }

http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/core/src/test/java/org/apache/any23/Any23Test.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/Any23Test.java 
b/core/src/test/java/org/apache/any23/Any23Test.java
index 24bc913..c487ee8 100644
--- a/core/src/test/java/org/apache/any23/Any23Test.java
+++ b/core/src/test/java/org/apache/any23/Any23Test.java
@@ -286,7 +286,7 @@ public class Any23Test extends Any23OnlineTestBase {
 
         final String bufferContent = byteArrayOutputStream.toString();
         logger.debug(bufferContent);
-        Assert.assertSame("Unexpected number of triples.", 16,
+        Assert.assertSame("Unexpected number of triples.", 18,
                 StringUtils.countNL(bufferContent));
 
     }
@@ -368,7 +368,7 @@ public class Any23Test extends Any23OnlineTestBase {
     @Test
     public void testExtractionParametersWithNestingDisabled()
             throws IOException, ExtractionException, TripleHandlerException {
-        final int EXPECTED_TRIPLES = 19;
+        final int EXPECTED_TRIPLES = 20;
         Any23 runner = new Any23();
         DocumentSource source = getDocumentSourceFromResource(
                 "/microformats/nested-microformats-a1.html",

http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java 
b/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java
index b35e33c..854360c 100644
--- 
a/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java
+++ 
b/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java
@@ -40,7 +40,7 @@ public class HTMLMetaExtractorTest extends 
AbstractExtractorTestCase {
        public void testExtractPageMeta() throws Exception {
                assertExtract("/html/html-head-meta-extractor.html");
                assertModelNotEmpty();
-               assertStatementsSize(null, null, null, 7);
+               assertStatementsSize(null, null, null, 10);
                assertContains(new URIImpl("http://bob.example.com/"), new 
URIImpl(
                                "http://purl.org/dc/elements/1.1/title"), 
"XHTML+RDFa example",
                                "en");
@@ -70,4 +70,11 @@ public class HTMLMetaExtractorTest extends 
AbstractExtractorTestCase {
                assertModelEmpty();
        }
 
+       @Test
+       public void testExtractPageMetaWithExtensionsPerMozillaSpecification() 
throws Exception {
+               
assertExtract("/html/html-head-meta-extractor-with-mozilla-extensions.html");
+               assertModelNotEmpty();
+               assertStatementsSize(null, null, null, 2);
+       }
+
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/test-resources/src/test/resources/html/html-head-link-extractor.html
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/html/html-head-link-extractor.html 
b/test-resources/src/test/resources/html/html-head-link-extractor.html
index 86a76d6..59a374a 100644
--- a/test-resources/src/test/resources/html/html-head-link-extractor.html
+++ b/test-resources/src/test/resources/html/html-head-link-extractor.html
@@ -18,7 +18,6 @@
 -->
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
-    <meta http-equiv="content-type" content="text/html;charset=UTF-8"/>
     <title>myExperiment &#45; Workflows &#45; Pathways and Gene annotations 
for QTL region - Mouse (Paul Fisher)
         [Taverna 2 Workflow]</title>
     <link rel="alternate" href="http://www.myexperiment.org/workflows/16.rdf" 
type="application/rdf+xml"

http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html
 
b/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html
new file mode 100644
index 0000000..87a1fac
--- /dev/null
+++ 
b/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html
@@ -0,0 +1,34 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+    <title>test to check meta extraction with missing elements per mozilla 
specification</title>
+    <!-- Defining the charset in HTML4 -->
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+
+    <!-- In HTML5 -->
+    <meta charset="utf-8"/>
+
+    <!-- Redirect page after 3 seconds -->
+    <meta http-equiv="refresh" content="3;url=http://www.mozilla.org/"/>
+</head>
+<body>
+</body>
+</html>
+

any23 git commit: ANY23-185 Add missing <meta> element attributes to HTMLMetaExtractor

Reply via email to