Author: jukka
Date: Wed Oct 14 20:36:29 2009
New Revision: 825266
URL: http://svn.apache.org/viewvc?rev=825266&view=rev
Log:
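TIKA-311: Fix broken handling of <a name="..."/> tags (anchors without an href now keep their name attribute instead of being emitted with an empty href)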
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=825266&r1=825265&r2=825266&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Wed Oct 14 20:36:29 2009
@@ -174,10 +174,16 @@
xhtml.startElement(SAFE_ELEMENTS.get(name));
} else if ("A".equals(name)) {
String href = atts.getValue("href");
- if (href == null) {
- href = "";
+ if (href != null) {
+ xhtml.startElement("a", "href", href);
+ } else {
+ String anchor = atts.getValue("name");
+ if (anchor != null) {
+ xhtml.startElement("a", "name", anchor);
+ } else {
+ xhtml.startElement("a");
+ }
}
- xhtml.startElement("a", "href", href);
}
}
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=825266&r1=825265&r2=825266&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Wed Oct 14 20:36:29 2009
@@ -39,6 +39,7 @@
public void testParseAscii() throws Exception {
String path = "/test-documents/testHTML.html";
final StringWriter href = new StringWriter();
+ final StringWriter name = new StringWriter();
ContentHandler body = new BodyContentHandler();
Metadata metadata = new Metadata();
InputStream stream = HtmlParserTest.class.getResourceAsStream(path);
@@ -49,7 +50,11 @@
String u, String l, String n, Attributes a)
throws SAXException {
if ("a".equals(l)) {
- href.append(a.getValue("href"));
+ if (a.getValue("href") != null) {
+ href.append(a.getValue("href"));
+ } else if (a.getValue("name") != null) {
+ name.append(a.getValue("name"));
+ }
}
}
};
@@ -66,6 +71,7 @@
assertEquals("5", metadata.get("refresh"));
assertEquals("http://www.apache.org/", href.toString());
+ assertEquals("test-anchor", name.toString());
String content = body.toString();
assertTrue(
Modified:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html?rev=825266&r1=825265&r2=825266&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html
Wed Oct 14 20:36:29 2009
@@ -21,7 +21,7 @@
<meta http-equiv="refresh" content="5">
</head>
<body>
- <h1>Test Indexation Html</h1>
+ <h1><a name="test-anchor"></a>Test Indexation Html</h1>
<p><a href="http://www.apache.org/">Indexation</a> du
fichier</p>
</body>
</html>
\ No newline at end of file