Xiaohong Yang created TIKA-4245:
-----------------------------------
Summary: Tika does not get html content properly
Key: TIKA-4245
URL: https://issues.apache.org/jira/browse/TIKA-4245
Project: Tika
Issue Type: Bug
Reporter: Xiaohong Yang
Attachments: Sample html file and tika config xml.zip
We use org.apache.tika.parser.AutoDetectParser to get the content of HTML
files, and we found that it does not extract the content of the sample file
properly.
Following is the sample code and attached is the tika-config.xml and the sample
html file. The content extracted with Tika reads
"㱨瑭氠硭汮猺景㴢桴瑰㨯⽷睷㌮潲术ㄹ㤹⽘卌⽆潲浡琢㸍ਉ़桥慤㸼䵅呁瑴瀭敱畩瘽≃潮瑥湴ⵔ祰攢潮瑥湴㴢瑥硴…". That is different
from the native file.
The operating system is Ubuntu 20.04. Java version is 21. Tika version is
2.9.2.
import java.io.File;
import java.io.FileInputStream;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.commons.io.FileUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
/**
 * Reproduction program for TIKA-4245: extracts the body text of a sample HTML
 * file with {@link AutoDetectParser} and prints it to standard output.
 *
 * <p>The extraction is run twice: once collecting the text in memory and once
 * writing it to a file on disk that is read back afterwards.
 */
public class ExtractTxtFromHtml {

    /** HTML document whose text is extracted. */
    private static final Path INPUT_FILE =
            Paths.get("/home/ubuntu/testdirs/testdir_html/451434.html");

    /** Destination of the extracted text in large-file mode. */
    private static final Path OUTPUT_FILE =
            Paths.get("/home/ubuntu/testdirs/testdir_html/tika_parse_output.txt");

    /** Tika configuration used to build the parser. */
    private static final String CONFIG_FILE =
            "/home/ubuntu/testdirs/testdir_html/tika-config.xml";

    /**
     * Runs the extraction in in-memory mode, then in large-file mode.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        extactText(false);
        extactText(true);
    }

    /**
     * Parses the sample file and prints {@code "content = " + text}.
     *
     * <p>Any exception is caught and its stack trace printed, so one failing
     * mode does not prevent the other from running.
     *
     * @param largeFile when {@code true}, the handler writes to
     *     {@link #OUTPUT_FILE} and the text is read back from that file;
     *     when {@code false}, the text is accumulated in memory
     */
    static void extactText(boolean largeFile) {
        try {
            TikaConfig config = new TikaConfig(CONFIG_FILE);
            Parser autoDetectParser = new AutoDetectParser(config);
            ParseContext context = new ParseContext();
            context.set(TikaConfig.class, config);
            Metadata metadata = new Metadata();

            String content;
            if (largeFile) {
                // Write Tika output to disk. Both resources are closed before the
                // file is read back, so everything the handler emitted is on disk
                // regardless of whether the handler flushed the writer itself.
                try (PrintWriter outputFileWriter = new PrintWriter(
                                Files.newBufferedWriter(OUTPUT_FILE, StandardCharsets.UTF_8));
                        FileInputStream inputData = new FileInputStream(INPUT_FILE.toFile())) {
                    BodyContentHandler handler = new BodyContentHandler(outputFileWriter);
                    autoDetectParser.parse(inputData, handler, metadata, context);
                    // PrintWriter never throws on write failure; surface it explicitly.
                    if (outputFileWriter.checkError()) {
                        throw new IllegalStateException(
                                "Failed writing extracted text to " + OUTPUT_FILE);
                    }
                }
                // Read with the same charset the file was written in.
                content = FileUtils.readFileToString(OUTPUT_FILE.toFile(), StandardCharsets.UTF_8);
            } else {
                // Stream it in memory; per Tika's Javadoc a write limit of -1
                // disables the limit on the in-memory buffer.
                try (FileInputStream inputData = new FileInputStream(INPUT_FILE.toFile())) {
                    BodyContentHandler handler = new BodyContentHandler(-1);
                    autoDetectParser.parse(inputData, handler, metadata, context);
                    content = handler.toString();
                }
            }
            System.out.println("content = " + content);
        } catch (Exception ex) {
            // Top-level boundary of a reproduction script: report and continue.
            ex.printStackTrace();
        }
    }
}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)