Repository: any23 Updated Branches: refs/heads/master a03bafa9c -> 0d106d4f2
ANY23-185 Add missing <meta> element attributes to HTMLMetaExtractor Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/0d106d4f Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/0d106d4f Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/0d106d4f Branch: refs/heads/master Commit: 0d106d4f2aa26de8b2626d2d38991717d1a0a0fe Parents: a03bafa Author: Lewis John McGibbney <[email protected]> Authored: Sat Jun 13 11:08:34 2015 -0700 Committer: Lewis John McGibbney <[email protected]> Committed: Sat Jun 13 11:08:34 2015 -0700 ---------------------------------------------------------------------- .../any23/extractor/html/HTMLMetaExtractor.java | 88 ++++++++++++++++---- .../test/java/org/apache/any23/Any23Test.java | 4 +- .../extractor/html/HTMLMetaExtractorTest.java | 9 +- .../html/html-head-link-extractor.html | 1 - ...-meta-extractor-with-mozilla-extensions.html | 34 ++++++++ 5 files changed, 117 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java index 16a0f6c..3e0c84e 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java @@ -81,11 +81,19 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor { if(meta.getLang() != null) { lang = meta.getLang(); } - out.writeTriple( - documentURI, - meta.getName(), - new LiteralImpl(meta.getContent(), lang) - ); + if(meta.isPragmaDirective){ + out.writeTriple( + documentURI, + meta.getHttpEquiv(), + new LiteralImpl(meta.getContent(), lang) + ); + }else { + out.writeTriple( + documentURI, + meta.getName(), + new LiteralImpl(meta.getContent(), lang) + ); + } } } @@ -134,19 +142,37 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor { for (Node metaNode : metaNodes) { NamedNodeMap attributes = metaNode.getAttributes(); Node nameAttribute = attributes.getNamedItem("name"); + Node httpEquivAttribute = attributes.getNamedItem("http-equiv"); Node contentAttribute = attributes.getNamedItem("content"); - if (nameAttribute == null || contentAttribute == null) { - continue; + if (nameAttribute == null && httpEquivAttribute == null) + continue; //support HTML5 meta element nodes that do not have both name and http-equiv + if (nameAttribute != null || httpEquivAttribute != null){ + if ( contentAttribute == null ){ + continue; + } } - String name = nameAttribute.getTextContent(); - String content = contentAttribute.getTextContent(); - String xpath = DomUtils.getXPathForNode(metaNode); - URI nameAsURI = getPrefixIfExists(name); - if (nameAsURI == null) { - nameAsURI = new URIImpl(baseProfile + name); + boolean isPragmaDirective = (httpEquivAttribute != null) ? true : false; + if (isPragmaDirective){ + String httpEquiv = httpEquivAttribute.getTextContent(); + String content = contentAttribute.getTextContent(); + String xpath = DomUtils.getXPathForNode(metaNode); + URI httpEquivAsURI = getPrefixIfExists(httpEquiv); + if (httpEquivAsURI == null) { + httpEquivAsURI = new URIImpl(baseProfile + httpEquiv); + } + Meta meta = new Meta(xpath, content, httpEquivAsURI); + result.add(meta); + } else { + String name = nameAttribute.getTextContent(); + String content = contentAttribute.getTextContent(); + String xpath = DomUtils.getXPathForNode(metaNode); + URI nameAsURI = getPrefixIfExists(name); + if (nameAsURI == null) { + nameAsURI = new URIImpl(baseProfile + name); + } + Meta meta = new Meta(xpath, nameAsURI, content); + result.add(meta); } - Meta meta = new Meta(xpath, nameAsURI, content); - result.add(meta); } return result; } @@ -170,10 +196,26 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor { private URI name; + private URI httpEquiv; + private String lang; private String content; + private boolean isPragmaDirective; + + public Meta(String xpath, String content, URI httpEquiv) { + this.xpath = xpath; + this.content = content; + this.httpEquiv = httpEquiv; + this.setPragmaDirective(true); + } + + public Meta(String xpath, String content, URI httpEquiv, String lang) { + this(xpath,content,httpEquiv); + this.lang = lang; + } + public Meta(String xpath, URI name, String content) { this.xpath = xpath; this.name = name; @@ -185,6 +227,22 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor { this.lang = lang; } + public boolean isPragmaDirective(){ + return isPragmaDirective; + } + + private void setPragmaDirective(boolean value){ + this.isPragmaDirective=value; + } + + public URI getHttpEquiv(){ + return httpEquiv; + } + + public void setHttpEquiv(URI httpEquiv){ + this.httpEquiv=httpEquiv; + } + public URI getName() { return name; } http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/core/src/test/java/org/apache/any23/Any23Test.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/Any23Test.java b/core/src/test/java/org/apache/any23/Any23Test.java index 24bc913..c487ee8 100644 --- a/core/src/test/java/org/apache/any23/Any23Test.java +++ b/core/src/test/java/org/apache/any23/Any23Test.java @@ -286,7 +286,7 @@ public class Any23Test extends Any23OnlineTestBase { final String bufferContent = byteArrayOutputStream.toString(); logger.debug(bufferContent); - Assert.assertSame("Unexpected number of triples.", 16, + Assert.assertSame("Unexpected number of triples.", 18, StringUtils.countNL(bufferContent)); } @@ -368,7 +368,7 @@ public class Any23Test extends Any23OnlineTestBase { @Test public void testExtractionParametersWithNestingDisabled() throws IOException, ExtractionException, TripleHandlerException { - final int EXPECTED_TRIPLES = 19; + final int EXPECTED_TRIPLES = 20; Any23 runner = new Any23(); DocumentSource source = getDocumentSourceFromResource( "/microformats/nested-microformats-a1.html", http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java index b35e33c..854360c 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/HTMLMetaExtractorTest.java @@ -40,7 +40,7 @@ public class HTMLMetaExtractorTest extends AbstractExtractorTestCase { public void testExtractPageMeta() throws Exception { assertExtract("/html/html-head-meta-extractor.html"); assertModelNotEmpty(); - assertStatementsSize(null, null, null, 7); + assertStatementsSize(null, null, null, 10); assertContains(new URIImpl("http://bob.example.com/"), new URIImpl( "http://purl.org/dc/elements/1.1/title"), "XHTML+RDFa example", "en"); @@ -70,4 +70,11 @@ public class HTMLMetaExtractorTest extends AbstractExtractorTestCase { assertModelEmpty(); } + @Test + public void testExtractPageMetaWithExtensionsPerMozillaSpecification() throws Exception { + assertExtract("/html/html-head-meta-extractor-with-mozilla-extensions.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 2); + } + } http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/test-resources/src/test/resources/html/html-head-link-extractor.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/html/html-head-link-extractor.html b/test-resources/src/test/resources/html/html-head-link-extractor.html index 86a76d6..59a374a 100644 --- a/test-resources/src/test/resources/html/html-head-link-extractor.html +++ b/test-resources/src/test/resources/html/html-head-link-extractor.html @@ -18,7 +18,6 @@ --> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head> - <meta http-equiv="content-type" content="text/html;charset=UTF-8"/> <title>myExperiment - Workflows - Pathways and Gene annotations for QTL region - Mouse (Paul Fisher) [Taverna 2 Workflow]</title> <link rel="alternate" href="http://www.myexperiment.org/workflows/16.rdf" type="application/rdf+xml" http://git-wip-us.apache.org/repos/asf/any23/blob/0d106d4f/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html b/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html new file mode 100644 index 0000000..87a1fac --- /dev/null +++ b/test-resources/src/test/resources/html/html-head-meta-extractor-with-mozilla-extensions.html @@ -0,0 +1,34 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> +<head> + <title>test to check meta extraction with missing elements per mozilla specification</title> + <!-- Defining the charset in HTML4 --> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> + + <!-- In HTML5 --> + <meta charset="utf-8"/> + + <!-- Redirect page after 3 seconds --> + <meta http-equiv="refresh" content="3;url=http://www.mozilla.org/"/> +</head> +<body> +</body> +</html> +
