Author: jukka
Date: Sun Dec 13 22:10:33 2009
New Revision: 890130
URL: http://svn.apache.org/viewvc?rev=890130&view=rev
Log:
TIKA-339: HtmlParser & TXTParser should not use language returned by
CharsetDetector if language hint has been provided
Patch by Ken Krugler
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890130&r1=890129&r2=890130&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Sun Dec 13 22:10:33 2009
@@ -106,13 +106,18 @@
if (Charset.isSupported(match.getName())) {
metadata.set(Metadata.CONTENT_ENCODING, match.getName());
- // Is the encoding language-specific (KOI8-R, SJIS, etc.)?
+ // TIKA-339: Don't set language, as it's typically not a very good
+ // guess, and it can create ambiguity if another (better) language
+ // value is specified by a meta tag in the HTML (or via HTTP response
+ // header).
+ /*
String language = match.getLanguage();
if (language != null) {
metadata.set(Metadata.CONTENT_LANGUAGE,
match.getLanguage());
metadata.set(Metadata.LANGUAGE, match.getLanguage());
}
-
+ */
+
break;
}
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=890130&r1=890129&r2=890130&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
Sun Dec 13 22:10:33 2009
@@ -100,8 +100,8 @@
// Is the encoding language-specific (KOI8-R, SJIS, etc.)?
String language = match.getLanguage();
if (language != null) {
- metadata.set(Metadata.CONTENT_LANGUAGE,
match.getLanguage());
- metadata.set(Metadata.LANGUAGE, match.getLanguage());
+ metadata.add(Metadata.CONTENT_LANGUAGE, language);
+ metadata.add(Metadata.LANGUAGE, language);
}
break;
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=890130&r1=890129&r2=890130&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Sun Dec 13 22:10:33 2009
@@ -292,4 +292,19 @@
assertEquals("baz", parts[2]);
}
+ /**
+ * Test case for TIKA-339: Don't use language returned by CharsetDetector
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
+ */
+ public void testIgnoreCharsetDetectorLanguage() throws Exception {
+ String test = "<html><title>Simple
Content</title><body></body></html>";
+ Metadata metadata = new Metadata();
+ metadata.add(Metadata.CONTENT_LANGUAGE, "en");
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
+ }
+
}
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=890130&r1=890129&r2=890130&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Sun Dec 13 22:10:33 2009
@@ -161,4 +161,22 @@
assertEquals(msg, expected, handler.toString());
}
+ /**
+ * Test case for TIKA-339: don't override incoming language
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
+ */
+ public void testRetainIncomingLanguage() throws Exception {
+ final String test = "Simple Content";
+
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.LANGUAGE, "en");
+
+ parser.parse(
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("en", metadata.get(Metadata.LANGUAGE));
+ }
+
}