Author: jukka
Date: Wed Oct 14 20:36:29 2009
New Revision: 825266

URL: http://svn.apache.org/viewvc?rev=825266&view=rev
Log:
TIKA-311: Broken handling of <a name="..."/> tags

Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
    
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=825266&r1=825265&r2=825266&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
 Wed Oct 14 20:36:29 2009
@@ -174,10 +174,16 @@
                     xhtml.startElement(SAFE_ELEMENTS.get(name));
                 } else if ("A".equals(name)) {
                     String href = atts.getValue("href");
-                    if (href == null) {
-                        href = "";
+                    if (href != null) {
+                        xhtml.startElement("a", "href", href);
+                    } else {
+                        String anchor = atts.getValue("name");
+                        if (anchor != null) {
+                            xhtml.startElement("a", "name", anchor);
+                        } else {
+                            xhtml.startElement("a");
+                        }
                     }
-                    xhtml.startElement("a", "href", href);
                 }
             }
 

Modified: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=825266&r1=825265&r2=825266&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 Wed Oct 14 20:36:29 2009
@@ -39,6 +39,7 @@
     public void testParseAscii() throws Exception {
         String path = "/test-documents/testHTML.html";
         final StringWriter href = new StringWriter();
+        final StringWriter name = new StringWriter();
         ContentHandler body = new BodyContentHandler();
         Metadata metadata = new Metadata();
         InputStream stream = HtmlParserTest.class.getResourceAsStream(path);
@@ -49,7 +50,11 @@
                         String u, String l, String n, Attributes a)
                         throws SAXException {
                     if ("a".equals(l)) {
-                        href.append(a.getValue("href"));
+                        if (a.getValue("href") != null) {
+                            href.append(a.getValue("href"));
+                        } else if (a.getValue("name") != null) {
+                            name.append(a.getValue("name"));
+                        }
                     }
                 }
             };
@@ -66,6 +71,7 @@
         assertEquals("5", metadata.get("refresh"));
 
         assertEquals("http://www.apache.org/", href.toString());
+        assertEquals("test-anchor", name.toString());
 
         String content = body.toString();
         assertTrue(

Modified: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html?rev=825266&r1=825265&r2=825266&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html 
(original)
+++ 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html 
Wed Oct 14 20:36:29 2009
@@ -21,7 +21,7 @@
         <meta http-equiv="refresh" content="5">
     </head>
        <body>
-               <h1>Test Indexation Html</h1>
+               <h1><a name="test-anchor"></a>Test Indexation Html</h1>
                <p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
        </body>
 </html>
\ No newline at end of file


Reply via email to