[ https://issues.apache.org/jira/browse/TIKA-4245?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17841221#comment-17841221 ]
Tim Allison edited comment on TIKA-4245 at 4/26/24 1:23 PM: ------------------------------------------------------------ Oops, sorry. I didn't realize you sent your tika-config.xml. Yes, one option is to turn off the HtmlEncodingDetector. I confirmed that works on _this_ file. Separately, see slide 19 of this presentation for some examples of when the HtmlEncodingDetector is a bad idea: https://www.slideshare.net/slideshow/evaluating-text-extraction-at-scale-a-case-study-from-apache-tika/238979661 was (Author: talli...@mitre.org): Oops, sorry. I didn't realize you sent your tika-config.xml. Y, one option is to turn off the HtmlEncodingDetector. > Tika does not get html content properly > ---------------------------------------- > > Key: TIKA-4245 > URL: https://issues.apache.org/jira/browse/TIKA-4245 > Project: Tika > Issue Type: Bug > Reporter: Xiaohong Yang > Priority: Major > Attachments: Sample html file and tika config xml.zip > > > We use org.apache.tika.parser.AutoDetectParser to get the content of html > files. And we found out that it does not get the content of the sample file > properly. > Following is the sample code and attached is the tika-config.xml and the > sample html file. The content extracted with Tika reads > "㱨瑭氠硭汮猺景㴢桴瑰㨯⽷睷㌮潲术ㄹ㤹⽘卌⽆潲浡琢㸍ਉ़桥慤㸼䵅呁瑴瀭敱畩瘽≃潮瑥湴ⵔ祰攢潮瑥湴㴢瑥硴…". That is different > from the native file. > > > The operating system is Ubuntu 20.04. Java version is 21. Tika version is > 2.9.2. 
> {code:java} > import org.apache.commons.io.FileUtils; > import org.apache.tika.config.TikaConfig; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.parser.AutoDetectParser; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.Parser; > import org.apache.tika.sax.BodyContentHandler; > > import java.io.File; > import java.io.FileInputStream; > import java.io.PrintWriter; > import java.nio.file.Files; > import java.nio.file.Path; > import java.nio.file.Paths; > > public class ExtractTxtFromHtml { > private static final Path inputFile = new > File("/home/ubuntu/testdirs/testdir_html/451434.html").toPath(); > > public static void main(String args[]) { > extactText(false); > extactText(true); > } > > static void extactText(boolean largeFile) { > PrintWriter outputFileWriter = null; > try { > BodyContentHandler handler; > Path outputFilePath = null; > > if (largeFile) { > // write tika output to disk > outputFilePath = > Paths.get("/home/ubuntu/testdirs/testdir_html/tika_parse_output.txt"); > outputFileWriter = new > PrintWriter(Files.newOutputStream(outputFilePath)); > handler = new BodyContentHandler(outputFileWriter); > } else { > // stream it in memory > handler = new BodyContentHandler(-1); > } > > Metadata metadata = new Metadata(); > FileInputStream inputData = new > FileInputStream(inputFile.toFile()); > TikaConfig config = new > TikaConfig("/home/ubuntu/testdirs/testdir_html/tika-config.xml"); > Parser autoDetectParser = new AutoDetectParser(config); > ParseContext context = new ParseContext(); > context.set(TikaConfig.class, config); > autoDetectParser.parse(inputData, handler, metadata, context); > > String content; > if (largeFile) { > content = FileUtils.readFileToString(outputFilePath.toFile()); > } > else { > content = handler.toString(); > } > System.out.println("content = " + content); > } > catch(Exception ex) { > ex.printStackTrace(); > } finally { > if (outputFileWriter != null) { > 
outputFileWriter.close(); > } > } > } > } > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010)