[
https://issues.apache.org/jira/browse/TIKA-2080?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Tim Allison closed TIKA-2080.
-----------------------------
Resolution: Not A Problem
> PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters
> correctly
> -------------------------------------------------------------------------------------
>
> Key: TIKA-2080
> URL: https://issues.apache.org/jira/browse/TIKA-2080
> Project: Tika
> Issue Type: Bug
> Components: parser
> Affects Versions: 1.13
> Environment: Windows 8.1, jdk1.8.0_102
> Reporter: Kaleb Akalework
> Attachments: nihao2.pdf
>
>
> I'm trying to use Tika to parse PDF files that contain Japanese and Chinese
> characters, but for some reason it does not parse them correctly. Every character
> that is extracted is changed to the first letter in the line. For example if
> the document contains 早上好, the extracted text will correctly know that
> it has 3 characters, but all 3 characters will be 早早早; the last two characters
> are replaced by the first character. This same string is correctly parsed
> in a Word document. The following is the Java sample code I am using
> (don't forget to change the filename):
> package kaleb;
> import java.io.BufferedReader;
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.InputStreamReader;
> import java.io.StringWriter;
> import java.nio.charset.Charset;
> import java.nio.charset.CharsetEncoder;
> import org.apache.commons.io.IOUtils;
> import org.apache.commons.io.input.ReaderInputStream;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.detect.Detector;
> import org.apache.tika.exception.TikaException;
> import org.apache.tika.io.TemporaryResources;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.parser.CompositeParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.pdf.PDFParser;
> import org.apache.tika.sax.BodyContentHandler;
> import org.apache.tika.sax.ContentHandlerDecorator;
> import org.apache.tika.parser.pdf.PDFParser;
> import org.xml.sax.SAXException;
> public class TestTika {
>
>
>
> /** character limit */
> private static int parserCharLimit = 10 * 1024 * 1024;
> public static int getParserCharLimit() {
> return parserCharLimit;
> }
> public static void setParserCharLimit(int l) {
> parserCharLimit = l;
> }
> private static StringBuilder sb = null;
>
> private static ContentHandlerDecorator handler = new
> ContentHandlerDecorator() {
> private void ensureLimit() throws SAXException {
> if (sb.length() > parserCharLimit) {
> throw new MaxContentExceededException(
> "Your document contained more than "
> +parserCharLimit+" characters: "+sb.length());
> }
> }
> @Override
> public void characters(char[] ch, int start, int length) throws
> SAXException {
> if (length == 5)
> length *=2;
> sb.append(ch, start, length );
> ensureLimit();
> }
> @Override
> public void ignorableWhitespace(char[] ch, int start,
> int length) throws SAXException {
> if (sb.length() > 0)
> sb.append(ch, start, length);
> ensureLimit();
> }
> };
>
> public static class MaxContentExceededException extends SAXException {
> public MaxContentExceededException() { super(); }
> public MaxContentExceededException(Exception e) { super(e); }
> public MaxContentExceededException(String message, Exception e) {
> super(message, e); }
> public MaxContentExceededException(String message) {super(message);}
> }
>
> public static void myTika() throws Exception{
> TikaConfig tikaConfig = null;
>
> try{
>
> InputStream stream = new FileInputStream(new
> File(("C:\\Users\\kaleba\\workspace\\TestingStuff\\src\\kaleb\\tika-config.xml")));
> try {
> tikaConfig = new TikaConfig(stream);
> } catch (IOException | SAXException | TikaException e) {
> tikaConfig = TikaConfig.getDefaultConfig();
> } finally {
> try { stream.close(); } catch (IOException e) { }
> }
> }catch(Exception e){}
>
> /** default Tika detector */
> Detector tikaDetector = tikaConfig.getDetector();
> /** default Tika parser */
> CompositeParser tikaParser = new
> CompositeParser(tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser());
> TemporaryResources tmp = new TemporaryResources();
> InputStream stream = new FileInputStream(new
> File("C:\\Users\\kaleba\\Desktop\\Chin.docx"));
>
> TikaInputStream tis = TikaInputStream.get(stream, tmp);
>
> String type ="";
> // TODO: TIKA-216: Zip bomb prevention: use SecureContentHandler
> instead??
> Metadata metadata = new Metadata();
> ParseContext context = new ParseContext();
> context.set(org.apache.tika.parser.Parser.class, tikaParser);
> try {
> // TODO: limit by content type to reduce dependencies?
> // https://tika.apache.org/1.10/parser_guide.html
>
> type = tikaDetector.detect(tis, metadata).toString();
> metadata.set(Metadata.CONTENT_TYPE, type);
> }
> catch(Exception e){}
> sb = new StringBuilder();
> tikaParser.parse(tis, handler, metadata, context);
>
>
>
> String s = sb.toString();
>
> int i= 1;
> }
> public static void main(String[] args) {
> // TODO Auto-generated method stub
>
> /*try{
> File initialFile = new
> File("C:\\Users\\kaleba\\Desktop\\UnicodeTest.pdf");
> InputStream targetStream = new FileInputStream(initialFile);
> String s = parse(targetStream,null, null);
> int i=1;
> }
> catch (Exception e){}*/
> /* TestTika tk = new TestTika();
> tk.setFilePath("C:\\Users\\kaleba\\Desktop\\Rus3.pdf");
> try{
> System.out.println(tk.ToText());
> }
> catch(Exception e){}*/
> try{
> myTika();
> }
> catch (Exception e){
> System.out.print(e.getMessage());
> }
> }
> }
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)