Repository: tika Updated Branches: refs/heads/master 91cdce43d -> 1aff6380d
TIKA-2174 -- add .ppm to tesseract Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1aff6380 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1aff6380 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1aff6380 Branch: refs/heads/master Commit: 1aff6380d46b9104835909c31e7f2f36f621eca0 Parents: 91cdce4 Author: tballison <[email protected]> Authored: Thu Nov 10 08:03:29 2016 -0500 Committer: tballison <[email protected]> Committed: Thu Nov 10 08:03:29 2016 -0500 ---------------------------------------------------------------------- CHANGES.txt | 5 +++++ .../java/org/apache/tika/parser/ocr/TesseractOCRParser.java | 2 +- .../java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/1aff6380/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 6cd2de8..9e62fc7 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,7 +1,12 @@ Release 1.15 - ?? + * Add extraction of .jpx inline images from PDFs (TIKA-2175). + + * Add .jpx, .jp2, .ppm to formats handled by Tesseract (TIKA-2174). + * Upgrade SQLite "provided" dependency to 3.15.1. + Release 1.14 - 10/19/2016 * Extract all headers from MSG/RFC822 (TIKA-2122). http://git-wip-us.apache.org/repos/asf/tika/blob/1aff6380/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 90fe18c..ff9a755 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -96,7 +96,7 @@ public class TesseractOCRParser extends AbstractParser { new HashSet<MediaType>(Arrays.asList(new MediaType[] { MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"), MediaType.image("x-ms-bmp"), MediaType.image("gif"), MediaType.APPLICATION_XML.image("jp2"), - MediaType.image("jpx") + MediaType.image("jpx"), MediaType.image("x-portable-pixmap") }))); private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>(); http://git-wip-us.apache.org/repos/asf/tika/blob/1aff6380/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 92b1560..7607427 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -244,4 +244,6 @@ public class TesseractOCRParserTest extends TikaTest { assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); assertEquals("72 dots per inch", m.get("Y Resolution")); } + + //TODO: add unit tests for jp2/jpx/ppm TIKA-2174 }
