[ 
https://issues.apache.org/jira/browse/TIKA-3187?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Akhil Poshetty updated TIKA-3187:
---------------------------------
    Attachment:     (was: greek_characters_symbol_font.doc)

> Tika cannot parse the characters which appear from the symbol font (Microsoft 
> word)
> -----------------------------------------------------------------------------------
>
>                 Key: TIKA-3187
>                 URL: https://issues.apache.org/jira/browse/TIKA-3187
>             Project: Tika
>          Issue Type: Bug
>          Components: tika-batch, tika-dl, tika-eval
>    Affects Versions: 1.24.1
>         Environment: OS : Windows 7,
> Software Platform : Java
> HardWare : 64 bit machine with 16 GB ram
>            Reporter: Akhil Poshetty
>            Priority: Blocker
>              Labels: Tika, tika, tika,text-extraction, tika-parsers
>         Attachments: greek_characters_input.PNG, greek_characters_output.PNG
>
>
> I am trying to parse a Microsoft Word document (.doc) that contains 
> characters from the Symbol font. Symbol is a special font in Microsoft Word 
> that contains math and Greek characters.
>  The code I am using to parse the document is below:
> {code:java}
>    import org.apache.commons.io.FileUtils;
>       import org.apache.tika.metadata.Metadata;
>       import org.apache.tika.parser.AutoDetectParser;
>       import org.apache.tika.parser.ParseContext;
>       import org.apache.tika.parser.Parser;
>       import org.apache.tika.sax.BodyContentHandler;
>       import java.io.File;
>       import java.io.FileInputStream;
>       import java.nio.charset.StandardCharsets;
>     public class Tika {
>     public static void main(String[] args) {
>         try {
>             String inputPath = args[0];
>             String outputPath = args[1];
>             File f = new File(inputPath);
>             System.out.println("path is : " + f.getAbsoluteFile());
>             FileInputStream fileInputStream = new FileInputStream(f);
>             Parser parser = new AutoDetectParser();
>             BodyContentHandler handler = new BodyContentHandler(-1);
>             ParseContext parseContext = new ParseContext();
>             parseContext.set(Parser.class, parser);
>             Metadata metadatafromtika = new Metadata();
>             metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");
>             parser.parse(fileInputStream, handler, metadatafromtika, 
> parseContext);
>             String text = handler.toString();
>             System.out.println("done parsing for file : " + 
> f.getAbsolutePath());
>             System.out.println("text is : \n" + text);
>             byte[] bytes = text.getBytes();
>             String encodedText = new String(bytes, StandardCharsets.UTF_8);
>             System.out.println("encoded text is : " + encodedText);
>             FileUtils.writeStringToFile(new File(outputPath + File.separator 
> + f.getName() + "_content.txt"),
>                 text, "UTF-8");
>         }
>         catch (Exception e) {
>             e.printStackTrace();
>         }
>     }
> }
> {code}
>  
> The dependencies I am using are:
> {code:xml}
> <dependencies>
> <dependency>
>   <groupId>org.apache.tika</groupId>
>   <artifactId>tika-parsers</artifactId>
>   <version>1.18</version>
> </dependency>
> <dependency>
>   <groupId>commons-collections</groupId>
>   <artifactId>commons-collections</artifactId>
>   <version>3.2.1</version>
> </dependency>
> <dependency>
>   <groupId>org.apache.logging.log4j</groupId>
>   <artifactId>log4j-core</artifactId>
>   <version>2.9.1</version>
> </dependency>
> <dependency>
>   <groupId>org.antlr</groupId>
>   <artifactId>ST4</artifactId>
>   <version>4.0.8</version>
> </dependency>
> <dependency>
>   <groupId>org.postgresql</groupId>
>   <artifactId>postgresql</artifactId>
>   <version>42.1.4</version>
> </dependency>
> <dependency>
>   <groupId>com.zaxxer</groupId>
>   <artifactId>HikariCP</artifactId>
>   <version>2.7.2</version>
> </dependency>
> <dependency>
>   <groupId>commons-dbutils</groupId>
>   <artifactId>commons-dbutils</artifactId>
>   <version>1.6</version>
> </dependency>
> <dependency>
>   <groupId>commons-io</groupId>
>   <artifactId>commons-io</artifactId>
>   <version>2.5</version>
> </dependency>
> <dependency>
>   <groupId>org.json</groupId>
>   <artifactId>json</artifactId>
>   <version>20171018</version>
> </dependency>
> <dependency>
>   <groupId>org.apache.hive</groupId>
>   <artifactId>hive-jdbc</artifactId>
>   <version>1.1.0-cdh5.10.1</version>
> </dependency>
> <dependency>
>   <groupId>org.apache.hadoop</groupId>
>   <artifactId>hadoop-common</artifactId>
>   <version>2.6.0-cdh5.10.1</version>
> </dependency>
> <dependency>
>   <groupId>org.apache.hadoop</groupId>
>   <artifactId>hadoop-hdfs</artifactId>
>   <version>2.6.0-cdh5.10.1</version>
> </dependency>
> <dependency>
>   <groupId>org.apache.hadoop</groupId>
>   <artifactId>hadoop-mapreduce-client-core</artifactId>
>   <version>2.6.0-cdh5.10.1</version>
> </dependency>
> <dependency>
>   <groupId>org.apache.hadoop</groupId>
>   <artifactId>hadoop-tools</artifactId>
>   <version>2.6.0-mr1-cdh5.10.1</version>
> </dependency>
> <dependency>
>   <groupId>org.apache.htrace</groupId>
>   <artifactId>htrace-core4</artifactId>
>   <version>4.0.1-incubating</version>
> </dependency>
> <dependency>
>   <groupId>com.google.code.gson</groupId>
>   <artifactId>gson</artifactId>
>   <version>2.8.1</version>
> </dependency>
> <dependency>
>   <groupId>com.levigo.jbig2</groupId>
>   <artifactId>levigo-jbig2-imageio</artifactId>
>   <version>1.6.5</version>
> </dependency>
> <dependency>
>   <groupId>com.github.jai-imageio</groupId>
>   <artifactId>jai-imageio-core</artifactId>
>   <version>1.3.1</version>
> </dependency>
> <dependency>
>   <groupId>com.fasterxml.jackson.core</groupId>
>   <artifactId>jackson-core</artifactId>
>   <version>2.9.5</version>
> </dependency>
> </dependencies>
> {code}
> I have attached the input and output files.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to