Author: siren
Date: Mon Oct 15 09:07:31 2007
New Revision: 584811
URL: http://svn.apache.org/viewvc?rev=584811&view=rev
Log:
TIKA-65 - Add encode detection support for HTML parser
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
incubator/tika/trunk/src/test/resources/test-documents/testHTML_utf8.html
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=584811&r1=584810&r2=584811&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Oct 15 09:07:31 2007
@@ -109,3 +109,5 @@
49. TIKA-56 - Mime type detection fails with upper case file extensions such
as "PDF"
(mattmann)
+
+50. TIKA-65 - Add encode detection support for HTML parser (siren)
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=584811&r1=584810&r2=584811&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Mon Oct 15 09:07:31 2007
@@ -18,6 +18,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.io.Reader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -30,14 +31,33 @@
import org.xml.sax.SAXException;
/**
- * Simple HTML parser implemented with NekoHTML.
+ * Simple HTML parser that extracts title.
*/
public class HtmlParser implements Parser {
public void parse(InputStream stream, ContentHandler handler,
Metadata metadata) throws IOException, SAXException, TikaException
{
+
final SAXParser parser = new SAXParser();
- final InputSource source = new InputSource(stream);
+
+ final InputSource source;
+
+ Reader utf8Reader;
+
+ try {
+ utf8Reader = org.apache.tika.utils.Utils.getUTF8Reader(
+ stream, metadata);
+ } catch (TikaException ex) {
+ utf8Reader = null;
+ }
+
+ if (utf8Reader == null) {
+ source = new InputSource(stream);
+ } else {
+ source = new InputSource(utf8Reader);
+ }
+
+
parser.setContentHandler(new TitleExtractingContentHandler(handler,
metadata));
parser.parse(source);
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=584811&r1=584810&r2=584811&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
Mon Oct 15 09:07:31 2007
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser.txt;
-import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
@@ -25,12 +24,10 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.XHTMLContentHandler;
+import org.apache.tika.utils.Utils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import com.ibm.icu.text.CharsetDetector;
-import com.ibm.icu.text.CharsetMatch;
-
/**
* Text parser
*/
@@ -39,38 +36,13 @@
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
- CharsetDetector detector = new CharsetDetector();
-
- // Use the declared character encoding, if available
- String encoding = metadata.get(Metadata.CONTENT_ENCODING);
- if (encoding != null) {
- detector.setDeclaredEncoding(encoding);
- }
-
- // CharsetDetector expects a stream to support marks
- if (!stream.markSupported()) {
- stream = new BufferedInputStream(stream);
- }
-
- detector.setText(stream);
-
- CharsetMatch match = detector.detect();
- if (match == null) {
- throw new TikaException("Unable to detect character encoding");
- }
-
+
+ Reader reader = Utils.getUTF8Reader(stream, metadata);
metadata.set(Metadata.CONTENT_TYPE, "text/plain");
- metadata.set(Metadata.CONTENT_ENCODING, match.getName());
- String language = match.getLanguage();
- if (language != null) {
- metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
- metadata.set(Metadata.LANGUAGE, match.getLanguage());
- }
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
- Reader reader = match.getReader();
char[] buffer = new char[4096];
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
xhtml.characters(buffer, 0, n);
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=584811&r1=584810&r2=584811&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Mon Oct
15 09:07:31 2007
@@ -16,6 +16,7 @@
*/
package org.apache.tika.utils;
+import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@@ -24,18 +25,25 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.log4j.Logger;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.HttpHeaders;
+import org.apache.tika.metadata.Metadata;
import org.jdom.Document;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
/**
* Class util
*
@@ -119,6 +127,47 @@
logger.error(ex.getMessage());
}
+ }
+
+ /**
+ * Try to detect encoding from inputstream and return a UTF-8
+ * Reader. A metadata hint can be submitted as part of {@link Metadata}
+ * under key {@link HttpHeaders#CONTENT_ENCODING}.
+ *
+ * After succesfull detection, fills Metadata with detected content encoding
+ * and content language ({@link HttpHeaders#CONTENT_LANGUAGE}).
+ *
+ * @return Reader to utf8 encoded reader.
+ */
+ public static Reader getUTF8Reader(InputStream stream, Metadata metadata)
throws TikaException, IOException{
+ CharsetDetector detector = new CharsetDetector();
+
+ // Use the declared character encoding, if available
+ String encoding = metadata.get(Metadata.CONTENT_ENCODING);
+ if (encoding != null) {
+ detector.setDeclaredEncoding(encoding);
+ }
+
+ // CharsetDetector expects a stream to support marks
+ if (!stream.markSupported()) {
+ stream = new BufferedInputStream(stream);
+ }
+
+ detector.setText(stream);
+
+ CharsetMatch match = detector.detect();
+ if (match == null) {
+ throw new TikaException("Unable to detect character encoding");
+ }
+
+ metadata.set(Metadata.CONTENT_ENCODING, match.getName());
+ String language = match.getLanguage();
+ if (language != null) {
+ metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
+ metadata.set(Metadata.LANGUAGE, match.getLanguage());
+ }
+
+ return match.getReader();
}
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=584811&r1=584810&r2=584811&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
(original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Mon Oct
15 09:07:31 2007
@@ -175,19 +175,6 @@
ParserConfig config = tc.getParserConfig("text/html");
Parser parser = ParserFactory.getParser(config);
assertNotNull(parser);
-
- Metadata metadata = new Metadata();
- InputStream stream = new FileInputStream(file);
- try {
- parser.parse(stream, new DefaultHandler(), metadata);
- } finally {
- stream.close();
- }
- assertEquals("Title : Test Indexation Html",
metadata.get(Metadata.TITLE));
-
- final String text = metadata.toString();
- final String expected = "Test Indexation Html";
- assertTrue("text contains '" + expected + "'",
text.contains(expected));
}
public void testZipExtraction() throws Exception {
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=584811&view=auto
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(added)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Mon Oct 15 09:07:31 2007
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.WriteOutContentHandler;
+import org.xml.sax.SAXException;
+
+public class HtmlParserTest extends TestCase {
+
+ private Parser parser = new HtmlParser();
+
+ private static InputStream getStream(String name) {
+ return Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream(name);
+ }
+
+ public void testParseAscii() throws IOException, SAXException,
+ TikaException {
+
+ StringWriter writer = new StringWriter();
+ Metadata metadata = new Metadata();
+
+ parser.parse(getStream("test-documents/testHTML.html"),
+ new WriteOutContentHandler(writer), metadata);
+ String content = writer.toString();
+
+ assertTrue("Did not contain expected text:"
+ + "Title : Test Indexation Html", content
+ .contains("Title : Test Indexation Html"));
+
+ assertTrue("Did not contain expected text:" + "Test Indexation Html",
+ content.contains("Test Indexation Html"));
+
+ assertTrue("Did not contain expected text:" + "Indexation du fichier",
+ content.contains("Indexation du fichier"));
+
+ }
+
+ public void testParseUTF8() throws IOException, SAXException,
TikaException {
+
+ StringWriter writer = new StringWriter();
+ Metadata metadata = new Metadata();
+
+ parser.parse(getStream("test-documents/testHTML_utf8.html"),
+ new WriteOutContentHandler(writer), metadata);
+ String content = writer.toString();
+
+ assertTrue("Did not contain expected text:"
+ + "Title : Tilte with UTF-8 chars öäå", content
+ .contains("Title : Tilte with UTF-8 chars öäå"));
+
+ assertTrue("Did not contain expected text:"
+ + "Content with UTF-8 chars", content
+ .contains("Content with UTF-8 chars"));
+
+ assertTrue("Did not contain expected text:" + "åäö", content
+ .contains("åäö"));
+
+ }
+
+ public void testParseEmpty() throws Exception {
+ Metadata metadata = new Metadata();
+ StringWriter writer = new StringWriter();
+ parser.parse(new ByteArrayInputStream(new byte[0]),
+ new WriteOutContentHandler(writer), metadata);
+ String content = writer.toString();
+ assertEquals("", content);
+ }
+
+}
Added: incubator/tika/trunk/src/test/resources/test-documents/testHTML_utf8.html
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testHTML_utf8.html?rev=584811&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testHTML_utf8.html
(added)
+++ incubator/tika/trunk/src/test/resources/test-documents/testHTML_utf8.html
Mon Oct 15 09:07:31 2007
@@ -0,0 +1,9 @@
+<html>
+ <head>
+ <title>Title : Tilte with UTF-8 chars öäå</title>
+ </head>
+ <body>
+ <h1>Content with UTF-8 chars</h1>
+ <p>åäö</p>
+ </body>
+</html>
\ No newline at end of file