Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE Wed Jan 6 03:50:50 2016 @@ -0,0 +1,14 @@ +APACHE TIKA SUBCOMPONENTS + +Apache Tika includes a number of subcomponents with separate copyright notices +and license terms. Your use of these subcomponents is subject to the terms and +conditions of the following licenses. + +Two photos in test-documents (testWebp_Alpha_Lossy.webp and testWebp_Alpha_Lossless.webp) + are in the public domain. These files were retrieved from: + https://github.com/drewnoakes/metadata-extractor-images/tree/master/webp + These photos are also available here: + https://developers.google.com/speed/webp/gallery2#webp_links + Credits for the photo: + "Free Stock Photo in High Resolution - Yellow Rose 3 - Flowers" + Image Author: Jon Sullivan
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.font; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.List; +import java.util.Set; + +import org.apache.fontbox.afm.AFMParser; +import org.apache.fontbox.afm.FontMetric; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Parser for AFM Font Files + */ +public class AdobeFontMetricParser extends AbstractParser { + /** Serial version UID */ + private static final long serialVersionUID = -4820306522217196835L; + + private static final MediaType AFM_TYPE = + MediaType.application( "x-font-adobe-metric" ); + + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(AFM_TYPE); + + // TIKA-1325 Replace these with properties, from a well known standard + static final String MET_AVG_CHAR_WIDTH = "AvgCharacterWidth"; + static final String MET_DOC_VERSION = "DocVersion"; + static final String MET_PS_NAME = "PSName"; + static final String MET_FONT_NAME = "FontName"; + static final String MET_FONT_FULL_NAME = "FontFullName"; + static final String MET_FONT_FAMILY_NAME = "FontFamilyName"; + static final String MET_FONT_SUB_FAMILY_NAME = "FontSubFamilyName"; + static final String MET_FONT_VERSION = "FontVersion"; + static final String MET_FONT_WEIGHT = "FontWeight"; + static final String MET_FONT_NOTICE = "FontNotice"; + static final String MET_FONT_UNDERLINE_THICKNESS = "FontUnderlineThickness"; + + public Set<MediaType> getSupportedTypes( ParseContext context ) { + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, + Metadata 
metadata, ParseContext context) + throws IOException, SAXException, TikaException { + FontMetric fontMetrics; + AFMParser parser = new AFMParser( stream ); + + // Have FontBox process the file + parser.parse(); + fontMetrics = parser.getResult(); + + // Get the comments in the file to display in xhtml + List<String> comments = fontMetrics.getComments(); + + // Get the creation date + extractCreationDate( metadata, comments ); + + metadata.set( Metadata.CONTENT_TYPE, AFM_TYPE.toString() ); + metadata.set( TikaCoreProperties.TITLE, fontMetrics.getFullName() ); + + // Add metadata associated with the font type + addMetadataByString( metadata, MET_AVG_CHAR_WIDTH, Float.toString( fontMetrics.getAverageCharacterWidth() ) ); + addMetadataByString( metadata, MET_DOC_VERSION, Float.toString( fontMetrics.getAFMVersion() ) ); + addMetadataByString( metadata, MET_FONT_NAME, fontMetrics.getFontName() ); + addMetadataByString( metadata, MET_FONT_FULL_NAME, fontMetrics.getFullName() ); + addMetadataByString( metadata, MET_FONT_FAMILY_NAME, fontMetrics.getFamilyName() ); + addMetadataByString( metadata, MET_FONT_VERSION, fontMetrics.getFontVersion() ); + addMetadataByString( metadata, MET_FONT_WEIGHT, fontMetrics.getWeight() ); + addMetadataByString( metadata, MET_FONT_NOTICE, fontMetrics.getNotice() ); + addMetadataByString( metadata, MET_FONT_UNDERLINE_THICKNESS, Float.toString( fontMetrics.getUnderlineThickness() ) ); + + // Output the remaining comments as text + XHTMLContentHandler xhtml = new XHTMLContentHandler( handler, metadata ); + xhtml.startDocument(); + + // Display the comments + if (comments.size() > 0) { + xhtml.element( "h1", "Comments" ); + xhtml.startElement("div", "class", "comments"); + for (String comment : comments) { + xhtml.element( "p", comment ); + } + xhtml.endElement("div"); + } + + xhtml.endDocument(); + } + + private void addMetadataByString( Metadata metadata, String name, String value ) { + // Add metadata if an appropriate value is passed + if 
(value != null) { + metadata.add( name, value ); + } + } + + private void addMetadataByProperty( Metadata metadata, Property property, String value ) { + // Add metadata if an appropriate value is passed + if (value != null) + { + metadata.set( property, value ); + } + } + + + private void extractCreationDate( Metadata metadata, List<String> comments ) { + String date = null; + + for (String value : comments) { + // Look for the creation date + if( value.matches( ".*Creation\\sDate.*" ) ) { + date = value.substring( value.indexOf( ":" ) + 2 ); + comments.remove( value ); + + break; + } + } + + // If appropriate date then store as metadata + if( date != null ) { + addMetadataByProperty( metadata, Metadata.CREATION_DATE, date ); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.font; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.fontbox.ttf.NameRecord; +import org.apache.fontbox.ttf.NamingTable; +import org.apache.fontbox.ttf.TTFParser; +import org.apache.fontbox.ttf.TrueTypeFont; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Parser for TrueType font files (TTF). 
+ */ +public class TrueTypeParser extends AbstractParser { + + /** Serial version UID */ + private static final long serialVersionUID = 44788554612243032L; + + private static final MediaType TYPE = + MediaType.application("x-font-ttf"); + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(TYPE); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + TikaInputStream tis = TikaInputStream.cast(stream); + + // Ask FontBox to parse the file for us + TrueTypeFont font; + TTFParser parser = new TTFParser(); + if (tis != null && tis.hasFile()) { + font = parser.parseTTF(tis.getFile()); + } else { + font = parser.parseTTF(stream); + } + + // Report the details of the font + metadata.set(Metadata.CONTENT_TYPE, TYPE.toString()); + metadata.set(TikaCoreProperties.CREATED, + font.getHeader().getCreated()); + metadata.set(TikaCoreProperties.MODIFIED, + font.getHeader().getModified()); + metadata.set(AdobeFontMetricParser.MET_DOC_VERSION, + Float.toString(font.getHeader().getVersion())); + + // Pull out the naming info + NamingTable fontNaming = font.getNaming(); + for (NameRecord nr : fontNaming.getNameRecords()) { + if (nr.getNameId() == NameRecord.NAME_FONT_FAMILY_NAME) { + metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString()); + } + if (nr.getNameId() == NameRecord.NAME_FONT_SUB_FAMILY_NAME) { + metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString()); + } + if (nr.getNameId() == NameRecord.NAME_FULL_FONT_NAME) { + metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString()); + metadata.set(TikaCoreProperties.TITLE, nr.getString()); + } + if (nr.getNameId() == NameRecord.NAME_POSTSCRIPT_NAME) { + metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString()); + } + if (nr.getNameId() == 
NameRecord.NAME_COPYRIGHT) { + metadata.set("Copyright", nr.getString()); + } + if (nr.getNameId() == NameRecord.NAME_TRADEMARK) { + metadata.set("Trademark", nr.getString()); + } + } + + // For now, we only output metadata, no textual contents + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.endDocument(); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.ocr; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.Locale; +import java.util.Properties; + +/** + * Configuration for TesseractOCRParser. + * + * This allows you to enable TesseractOCRParser and set its parameters: + * <p> + * TesseractOCRConfig config = new TesseractOCRConfig();<br> + * config.setTesseractPath(tesseractFolder);<br> + * parseContext.set(TesseractOCRConfig.class, config);<br> + * </p> + * + * Parameters can also be set by either editing the existing TesseractOCRConfig.properties file in + * tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it by creating your own + * and placing it in the package org/apache/tika/parser/ocr on the classpath. + * + */ +public class TesseractOCRConfig implements Serializable{ + + private static final long serialVersionUID = -4861942486845757891L; + + // Path to tesseract installation folder, if not on system path. + private String tesseractPath = ""; + + // Path to the 'tessdata' folder, which contains language files and config files. + private String tessdataPath = ""; + + // Language dictionary to be used. + private String language = "eng"; + + // Tesseract page segmentation mode. + private String pageSegMode = "1"; + + // Minimum file size to submit file to ocr. + private int minFileSizeToOcr = 0; + + // Maximum file size to submit file to ocr. + private int maxFileSizeToOcr = Integer.MAX_VALUE; + + // Maximum time (seconds) to wait for the ocring process termination + private int timeout = 120; + + /** + * Default constructor. + */ + public TesseractOCRConfig() { + init(this.getClass().getResourceAsStream("TesseractOCRConfig.properties")); + } + + /** + * Loads properties from InputStream and then tries to close InputStream. + * If there is an IOException, this silently swallows the exception + * and goes back to the default. 
+ * + * @param is + */ + public TesseractOCRConfig(InputStream is) { + init(is); + } + + private void init(InputStream is) { + if (is == null) { + return; + } + Properties props = new Properties(); + try { + props.load(is); + } catch (IOException e) { + } finally { + if (is != null) { + try { + is.close(); + } catch (IOException e) { + //swallow + } + } + } + + setTesseractPath( + getProp(props, "tesseractPath", getTesseractPath())); + setTessdataPath( + getProp(props, "tessdataPath", getTessdataPath())); + setLanguage( + getProp(props, "language", getLanguage())); + setPageSegMode( + getProp(props, "pageSegMode", getPageSegMode())); + setMinFileSizeToOcr( + getProp(props, "minFileSizeToOcr", getMinFileSizeToOcr())); + setMaxFileSizeToOcr( + getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr())); + setTimeout( + getProp(props, "timeout", getTimeout())); + + } + + /** @see #setTesseractPath(String tesseractPath)*/ + public String getTesseractPath() { + return tesseractPath; + } + + /** + * Set the path to the Tesseract executable, needed if it is not on system path. + * <p> + * Note that if you set this value, it is highly recommended that you also + * set the path to the 'tessdata' folder using {@link #setTessdataPath}. + * </p> + */ + public void setTesseractPath(String tesseractPath) { + if(!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator)) + tesseractPath += File.separator; + + this.tesseractPath = tesseractPath; + } + + /** @see #setTessdataPath(String tessdataPath) */ + public String getTessdataPath() { + return tessdataPath; + } + + /** + * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such + * as on Windows), this folder is found in the Tesseract installation, but in other cases + * (such as when Tesseract is built from source), it may be located elsewhere. 
+ */ + public void setTessdataPath(String tessdataPath) { + if(!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator)) + tessdataPath += File.separator; + + this.tessdataPath = tessdataPath; + } + + /** @see #setLanguage(String language)*/ + public String getLanguage() { + return language; + } + + /** + * Set tesseract language dictionary to be used. Default is "eng". + * Multiple languages may be specified, separated by plus characters. + */ + public void setLanguage(String language) { + if (!language.matches("([A-Za-z](\\+?))*")) { + throw new IllegalArgumentException("Invalid language code"); + } + this.language = language; + } + + /** @see #setPageSegMode(String pageSegMode)*/ + public String getPageSegMode() { + return pageSegMode; + } + + /** + * Set tesseract page segmentation mode. + * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection) + */ + public void setPageSegMode(String pageSegMode) { + if (!pageSegMode.matches("[1-9]|10")) { + throw new IllegalArgumentException("Invalid language code"); + } + this.pageSegMode = pageSegMode; + } + + /** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/ + public int getMinFileSizeToOcr() { + return minFileSizeToOcr; + } + + /** + * Set minimum file size to submit file to ocr. + * Default is 0. + */ + public void setMinFileSizeToOcr(int minFileSizeToOcr) { + this.minFileSizeToOcr = minFileSizeToOcr; + } + + /** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/ + public int getMaxFileSizeToOcr() { + return maxFileSizeToOcr; + } + + /** + * Set maximum file size to submit file to ocr. + * Default is Integer.MAX_VALUE. + */ + public void setMaxFileSizeToOcr(int maxFileSizeToOcr) { + this.maxFileSizeToOcr = maxFileSizeToOcr; + } + + /** + * Set maximum time (seconds) to wait for the ocring process to terminate. + * Default value is 120s. 
+ */ + public void setTimeout(int timeout) { + this.timeout = timeout; + } + + /** @see #setTimeout(int timeout)*/ + public int getTimeout() { + return timeout; + } + + /** + * Get property from the properties file passed in. + * @param properties properties file to read from. + * @param property the property to fetch. + * @param defaultMissing default parameter to use. + * @return the value. + */ + private int getProp(Properties properties, String property, int defaultMissing) { + String p = properties.getProperty(property); + if (p == null || p.isEmpty()){ + return defaultMissing; + } + try { + return Integer.parseInt(p); + } catch (Throwable ex) { + throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid integer value", + property), ex); + } + } + + /** + * Get property from the properties file passed in. + * @param properties properties file to read from. + * @param property the property to fetch. + * @param defaultMissing default parameter to use. + * @return the value. 
+ */ + private String getProp(Properties properties, String property, String defaultMissing) { + return properties.getProperty(property, defaultMissing); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.ocr; + +import javax.imageio.ImageIO; + +import java.awt.Image; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.FutureTask; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MediaTypeRegistry; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.parser.image.ImageParser; +import org.apache.tika.parser.image.TiffParser; +import org.apache.tika.parser.jpeg.JpegParser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser, + * create a {@link TesseractOCRConfig} object and pass it through a + * ParseContext. 
Tesseract-ocr must be installed and on system path or the path + * to its root folder must be provided: + * <p> + * TesseractOCRConfig config = new TesseractOCRConfig();<br> + * //Needed if tesseract is not on system path<br> + * config.setTesseractPath(tesseractFolder);<br> + * parseContext.set(TesseractOCRConfig.class, config);<br> + * </p> + * + * + */ +public class TesseractOCRParser extends AbstractParser { + private static final long serialVersionUID = -8167538283213097265L; + private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig(); + private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( + new HashSet<MediaType>(Arrays.asList(new MediaType[] { + MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"), + MediaType.image("x-ms-bmp"), MediaType.image("gif") + }))); + private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>(); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + // If Tesseract is installed, offer our supported image types + TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG); + if (hasTesseract(config)) + return SUPPORTED_TYPES; + + // Otherwise don't advertise anything, so the other image parsers + // can be selected instead + return Collections.emptySet(); + } + + private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) { + String tessdataPrefix = "TESSDATA_PREFIX"; + Map<String, String> env = pb.environment(); + + if (!config.getTessdataPath().isEmpty()) { + env.put(tessdataPrefix, config.getTessdataPath()); + } + else if(!config.getTesseractPath().isEmpty()) { + env.put(tessdataPrefix, config.getTesseractPath()); + } + } + + private boolean hasTesseract(TesseractOCRConfig config) { + // Fetch where the config says to find Tesseract + String tesseract = config.getTesseractPath() + getTesseractProg(); + + // Have we already checked for a copy of Tesseract there? 
+ if (TESSERACT_PRESENT.containsKey(tesseract)) { + return TESSERACT_PRESENT.get(tesseract); + } + + // Try running Tesseract from there, and see if it exists + works + String[] checkCmd = { tesseract }; + boolean hasTesseract = ExternalParser.check(checkCmd); + TESSERACT_PRESENT.put(tesseract, hasTesseract); + return hasTesseract; + + } + + public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + + TemporaryResources tmp = new TemporaryResources(); + FileOutputStream fos = null; + TikaInputStream tis = null; + try { + int w = image.getWidth(null); + int h = image.getHeight(null); + BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB); + File file = tmp.createTemporaryFile(); + fos = new FileOutputStream(file); + ImageIO.write(bImage, "png", fos); + tis = TikaInputStream.get(file); + parse(tis, handler, metadata, context); + + } finally { + tmp.dispose(); + if (tis != null) + tis.close(); + if (fos != null) + fos.close(); + } + + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG); + + // If Tesseract is not on the path with the current config, do not try to run OCR + // getSupportedTypes shouldn't have listed us as handling it, so this should only + // occur if someone directly calls this parser, not via DefaultParser or similar + if (! 
hasTesseract(config)) + return; + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + + TemporaryResources tmp = new TemporaryResources(); + File output = null; + try { + TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); + File input = tikaStream.getFile(); + long size = tikaStream.getLength(); + + if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) { + + output = tmp.createTemporaryFile(); + doOCR(input, output, config); + + // Tesseract appends .txt to output file name + output = new File(output.getAbsolutePath() + ".txt"); + + if (output.exists()) + extractOutput(new FileInputStream(output), xhtml); + + } + + // Temporary workaround for TIKA-1445 - until we can specify + // composite parsers with strategies (eg Composite, Try In Turn), + // always send the image onwards to the regular parser to have + // the metadata for them extracted as well + _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context); + } finally { + tmp.dispose(); + if (output != null) { + output.delete(); + } + } + } + // TIKA-1445 workaround parser + private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser(); + private static class CompositeImageParser extends CompositeParser { + private static final long serialVersionUID = -2398203346206381382L; + private static List<Parser> imageParsers = Arrays.asList(new Parser[]{ + new ImageParser(), new JpegParser(), new TiffParser() + }); + CompositeImageParser() { + super(new MediaTypeRegistry(), imageParsers); + } + } + + /** + * Run external tesseract-ocr process. 
+ * + * @param input + * File to be ocred + * @param output + * File to collect ocr result + * @param config + * Configuration of tesseract-ocr engine + * @throws TikaException + * if the extraction timed out + * @throws IOException + * if an input error occurred + */ + private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException { + String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", + config.getLanguage(), "-psm", config.getPageSegMode() }; + + ProcessBuilder pb = new ProcessBuilder(cmd); + setEnv(config, pb); + final Process process = pb.start(); + + process.getOutputStream().close(); + InputStream out = process.getInputStream(); + InputStream err = process.getErrorStream(); + + logStream("OCR MSG", out, input); + logStream("OCR ERROR", err, input); + + FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() { + public Integer call() throws Exception { + return process.waitFor(); + } + }); + + Thread waitThread = new Thread(waitTask); + waitThread.start(); + + try { + waitTask.get(config.getTimeout(), TimeUnit.SECONDS); + + } catch (InterruptedException e) { + waitThread.interrupt(); + process.destroy(); + Thread.currentThread().interrupt(); + throw new TikaException("TesseractOCRParser interrupted", e); + + } catch (ExecutionException e) { + // should not be thrown + + } catch (TimeoutException e) { + waitThread.interrupt(); + process.destroy(); + throw new TikaException("TesseractOCRParser timeout", e); + } + + } + + /** + * Reads the contents of the given stream and write it to the given XHTML + * content handler. The stream is closed once fully processed. 
+ * + * @param stream + * Stream where is the result of ocr + * @param xhtml + * XHTML content handler + * @throws SAXException + * if the XHTML SAX events could not be handled + * @throws IOException + * if an input error occurred + */ + private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException { + + xhtml.startDocument(); + xhtml.startElement("div"); + try (Reader reader = new InputStreamReader(stream, UTF_8)) { + char[] buffer = new char[1024]; + for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { + if (n > 0) + xhtml.characters(buffer, 0, n); + } + } + xhtml.endElement("div"); + xhtml.endDocument(); + } + + /** + * Starts a thread that reads the contents of the standard output or error + * stream of the given process to not block the process. The stream is closed + * once fully processed. + */ + private void logStream(final String logType, final InputStream stream, final File file) { + new Thread() { + public void run() { + Reader reader = new InputStreamReader(stream, UTF_8); + StringBuilder out = new StringBuilder(); + char[] buffer = new char[1024]; + try { + for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) + out.append(buffer, 0, n); + } catch (IOException e) { + + } finally { + IOUtils.closeQuietly(stream); + } + + String msg = out.toString(); + LogFactory.getLog(TesseractOCRParser.class).debug(msg); + } + }.start(); + } + + static String getTesseractProg() { + return System.getProperty("os.name").startsWith("Windows") ? 
"tesseract.exe" : "tesseract"; + } + +} Modified: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&r1=1723222&r2=1723223&view=diff ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016 @@ -14,6 +14,8 @@ # limitations under the License. +org.apache.tika.parser.font.AdobeFontMetricParser +org.apache.tika.parser.font.TrueTypeParser org.apache.tika.parser.image.BPGParser org.apache.tika.parser.image.ImageParser org.apache.tika.parser.image.PSDParser @@ -22,6 +24,7 @@ org.apache.tika.parser.image.WebPParser org.apache.tika.parser.jpeg.JpegParser org.apache.tika.parser.audio.AudioParser org.apache.tika.parser.audio.MidiParser +org.apache.tika.parser.ocr.TesseractOCRParser org.apache.tika.parser.mp3.Mp3Parser org.apache.tika.parser.mp4.MP4Parser org.apache.tika.parser.video.FLVParser Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties Wed Jan 6 03:50:50 2016 @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +tesseractPath= +language=eng +pageSegMode=1 +maxFileSizeToOcr=2147483647 +minFileSizeToOcr=0 +timeout=120 \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.font; + +import static org.apache.tika.TikaTest.assertContains; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_FAMILY_NAME; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_FULL_NAME; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_NAME; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_VERSION; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_WEIGHT; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_PS_NAME; +import static org.junit.Assert.assertEquals; + +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing various different font files. 
+ */ +public class FontParsersTest { + @Test + public void testAdobeFontMetricParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + + try (TikaInputStream stream = TikaInputStream.get( + FontParsersTest.class.getResource("/test-documents/testAFM.afm"))) { + parser.parse(stream, handler, metadata, context); + } + + assertEquals("application/x-font-adobe-metric", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("TestFullName", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Fri Jul 15 17:50:51 2011", metadata.get(Metadata.CREATION_DATE)); + + assertEquals("TestFontName", metadata.get(MET_FONT_NAME)); + assertEquals("TestFullName", metadata.get(MET_FONT_FULL_NAME)); + assertEquals("TestSymbol", metadata.get(MET_FONT_FAMILY_NAME)); + + assertEquals("Medium", metadata.get(MET_FONT_WEIGHT)); + assertEquals("001.008", metadata.get(MET_FONT_VERSION)); + + String content = handler.toString(); + + // Test that the comments got extracted + assertContains("Comments", content); + assertContains("This is a comment in a sample file", content); + assertContains("UniqueID 12345", content); + } + + @Test + public void testTTFParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + //Open Sans font is ASL 2.0 according to + //http://www.google.com/fonts/specimen/Open+Sans + //...despite the copyright in the file's metadata. 
+ + try (TikaInputStream stream = TikaInputStream.get( + FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) { + parser.parse(stream, handler, metadata, context); + } + + assertEquals("application/x-font-ttf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Open Sans Bold", metadata.get(TikaCoreProperties.TITLE)); + + assertEquals("2010-12-30T11:04:00Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("2010-12-30T11:04:00Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2011-05-05T12:37:53Z", metadata.get(TikaCoreProperties.MODIFIED)); + + assertEquals("Open Sans Bold", metadata.get(MET_FONT_NAME)); + assertEquals("Open Sans", metadata.get(MET_FONT_FAMILY_NAME)); + assertEquals("Bold", metadata.get(MET_FONT_SUB_FAMILY_NAME)); + assertEquals("OpenSans-Bold", metadata.get(MET_PS_NAME)); + + assertEquals("Digitized", metadata.get("Copyright").substring(0, 9)); + assertEquals("Open Sans", metadata.get("Trademark").substring(0, 9)); + + // Not extracted + assertEquals(null, metadata.get(MET_FONT_FULL_NAME)); + assertEquals(null, metadata.get(MET_FONT_WEIGHT)); + assertEquals(null, metadata.get(MET_FONT_VERSION)); + + // Currently, the parser doesn't extract any contents + String content = handler.toString(); + assertEquals("", content); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java Wed Jan 6 
03:50:50 2016 @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ocr; + +import org.apache.tika.TikaTest; +import org.junit.Test; + +import java.io.File; +import java.io.InputStream; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +public class TesseractOCRConfigTest extends TikaTest { + + @Test + public void testNoConfig() throws Exception { + TesseractOCRConfig config = new TesseractOCRConfig(); + assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath()); + assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath()); + assertEquals("Invalid default language value", "eng", config.getLanguage()); + assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode()); + assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr()); + assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr()); + assertEquals("Invalid default timeout value", 120, config.getTimeout()); + } + + @Test + public void testPartialConfig() throws Exception { + + InputStream stream = 
TesseractOCRConfigTest.class.getResourceAsStream( + "/test-properties/TesseractOCRConfig-partial.properties"); + + TesseractOCRConfig config = new TesseractOCRConfig(stream); + assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath()); + assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath()); + assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage()); + assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode()); + assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr()); + assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr()); + assertEquals("Invalid overridden timeout value", 240, config.getTimeout()); + } + + /** + * Loads the "full" test properties file and checks that every setting + * reports the overridden value rather than its default. + */ + @Test + public void testFullConfig() throws Exception { + + InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream( + "/test-properties/TesseractOCRConfig-full.properties"); + + TesseractOCRConfig config = new TesseractOCRConfig(stream); + assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath()); + assertEquals("Invalid overridden tessdataPath value", "/usr/local/share" + File.separator, config.getTessdataPath()); + assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage()); + assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode()); + assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr()); + assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr()); + assertEquals("Invalid overridden timeout value", 240, config.getTimeout()); + } + + @Test(expected=IllegalArgumentException.class) + public void testValidateLanguage() { + TesseractOCRConfig config = new TesseractOCRConfig(); + config.setLanguage("eng"); + config.setLanguage("eng+fra"); + assertTrue("Couldn't set valid values", true); 
+ config.setLanguage("rm -Rf *"); + } + + @Test(expected=IllegalArgumentException.class) + public void testValidatePageSegMode() { + TesseractOCRConfig config = new TesseractOCRConfig(); + config.setPageSegMode("0"); + config.setPageSegMode("10"); + assertTrue("Couldn't set valid values", true); + config.setPageSegMode("11"); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.ocr; + +import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.junit.Assume.assumeTrue; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.io.InputStream; +import java.util.List; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.parser.image.ImageParser; +import org.apache.tika.parser.mail.RFC822Parser; +import org.apache.tika.parser.pdf.PDFParserConfig; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.junit.Test; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + +public class TesseractOCRParserTest extends TikaTest { + + public static boolean canRun() { + TesseractOCRConfig config = new TesseractOCRConfig(); + TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest(); + return tesseractOCRTest.canRun(config); + } + + private boolean canRun(TesseractOCRConfig config) { + String[] checkCmd = {config.getTesseractPath() + getTesseractProg()}; + // If Tesseract is not on the path, do not run the test. 
+ return ExternalParser.check(checkCmd); + } + + /* + Check that if Tesseract is not found, the TesseractOCRParser claims to not support + any file types. So, the standard image parser is called instead. + */ + @Test + public void offersNoTypesIfNotFound() throws Exception { + TesseractOCRParser parser = new TesseractOCRParser(); + DefaultParser defaultParser = new DefaultParser(); + MediaType png = MediaType.image("png"); + + // With an invalid path, will offer no types + TesseractOCRConfig invalidConfig = new TesseractOCRConfig(); + invalidConfig.setTesseractPath("/made/up/path"); + + ParseContext parseContext = new ParseContext(); + parseContext.set(TesseractOCRConfig.class, invalidConfig); + + // No types offered + assertEquals(0, parser.getSupportedTypes(parseContext).size()); + + // And DefaultParser won't use us + assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass()); + } + + /* + If Tesseract is found, test we retrieve the proper number of supporting Parsers. + */ + @Test + public void offersTypesIfFound() throws Exception { + TesseractOCRParser parser = new TesseractOCRParser(); + DefaultParser defaultParser = new DefaultParser(); + + ParseContext parseContext = new ParseContext(); + MediaType png = MediaType.image("png"); + + // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG. + assumeTrue(canRun()); + + assertEquals(5, parser.getSupportedTypes(parseContext).size()); + assertTrue(parser.getSupportedTypes(parseContext).contains(png)); + + // DefaultParser will now select the TesseractOCRParser. 
+ assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass()); + } + + @Test + public void testPDFOCR() throws Exception { + String resource = "/test-documents/testOCR.pdf"; + String[] nonOCRContains = new String[0]; + testBasicOCR(resource, nonOCRContains, 2); + } + + @Test + public void testDOCXOCR() throws Exception { + String resource = "/test-documents/testOCR.docx"; + String[] nonOCRContains = { + "This is some text.", + "Here is an embedded image:" + }; + testBasicOCR(resource, nonOCRContains, 3); + } + + @Test + public void testPPTXOCR() throws Exception { + String resource = "/test-documents/testOCR.pptx"; + String[] nonOCRContains = { + "This is some text" + }; + testBasicOCR(resource, nonOCRContains, 3); + } + + private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception { + TesseractOCRConfig config = new TesseractOCRConfig(); + Parser parser = new RecursiveParserWrapper(new AutoDetectParser(), + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); + + PDFParserConfig pdfConfig = new PDFParserConfig(); + pdfConfig.setExtractInlineImages(true); + + ParseContext parseContext = new ParseContext(); + parseContext.set(TesseractOCRConfig.class, config); + parseContext.set(Parser.class, parser); + parseContext.set(PDFParserConfig.class, pdfConfig); + + try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) { + parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext); + } + List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata(); + assertEquals(numMetadatas, metadataList.size()); + + StringBuilder contents = new StringBuilder(); + for (Metadata m : metadataList) { + contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT)); + } + if (canRun()) { + assertTrue(contents.toString().contains("Happy New Year 2003!")); + } + for (String needle : nonOCRContains) { + 
assertContains(needle, contents.toString()); + } + assertTrue(metadataList.get(0).names().length > 10); + assertTrue(metadataList.get(1).names().length > 10); + //test at least one value + assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName")); + } + + @Test + public void testSingleImage() throws Exception { + assumeTrue(canRun()); + String xml = getXML("testOCR.jpg").xml; + assertContains("OCR Testing", xml); + } + + @Test + public void getNormalMetadataToo() throws Exception { + //this should be successful whether or not TesseractOCR is installed/active + //If tesseract is installed, the internal metadata extraction parser should + //work; and if tesseract isn't installed, the regular parsers should take over. + + //gif + Metadata m = getXML("testGIF.gif").metadata; + assertTrue(m.names().length > 20); + assertEquals("RGB", m.get("Chroma ColorSpaceType")); + + //jpg + m = getXML("testOCR.jpg").metadata; + assertEquals("136", m.get(Metadata.IMAGE_WIDTH)); + assertEquals("66", m.get(Metadata.IMAGE_LENGTH)); + assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE)); + assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL)); + assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS)); + + //bmp + m = getXML("testBMP.bmp").metadata; + assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); + + //png + m = getXML("testPNG.png").metadata; + assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); + assertEquals("UnsignedIntegral", m.get("Data SampleFormat")); + + //tiff + m = getXML("testTIFF.tif").metadata; + assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); + assertEquals("72 dots per inch", m.get("Y Resolution")); + } + + @Test + public void testMultipart() { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = 
getStream("test-documents/testRFC822-multipart"); + ContentHandler handler = mock(XHTMLContentHandler.class); + + try { + parser.parse(stream, handler, metadata, new ParseContext()); + verify(handler).startDocument(); + int bodyExpectedTimes = 4, multipackExpectedTimes = 5; + // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked. + // But, different versions of Tesseract lead to a different number of invocations. So, we + // only verify the handler if Tesseract cannot run. + if (!TesseractOCRParserTest.canRun()) { + verify(handler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class)); + verify(handler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div"); + } + } catch (Exception e) { + fail("Exception thrown: " + e.getMessage()); + } + + //repeat, this time looking at content + parser = new RFC822Parser(); + metadata = new Metadata(); + stream = getStream("test-documents/testRFC822-multipart"); + handler = new BodyContentHandler(); + try { + parser.parse(stream, handler, metadata, new ParseContext()); + //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode + String bodyText = handler.toString(); + assertTrue(bodyText.contains("body 1")); + assertTrue(bodyText.contains("body 2")); + assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif + } catch (Exception e) { + fail("Exception thrown: " + e.getMessage()); + } + } + + private static InputStream getStream(String name) { + InputStream stream = Thread.currentThread().getContextClassLoader() + .getResourceAsStream(name); + assertNotNull("Test file not found " + name, stream); + return stream; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml?rev=1723223&view=auto 
============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml Wed Jan 6 03:50:50 2016 @@ -0,0 +1,106 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. 
--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-office-module</artifactId> + <name>Apache Tika Office Module</name> + <url>http://tika.apache.org/</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi-scratchpad</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi-ooxml</artifactId> + <version>${poi.version}</version> + <exclusions> + <exclusion> + <groupId>stax</groupId> + <artifactId>stax-api</artifactId> + </exclusion> + <exclusion> + <groupId>xml-apis</groupId> + <artifactId>xml-apis</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>com.healthmarketscience.jackcess</groupId> + <artifactId>jackcess</artifactId> + <version>2.1.2</version> + </dependency> + <dependency> + <groupId>com.healthmarketscience.jackcess</groupId> + <artifactId>jackcess-encrypt</artifactId> + <version>2.1.1</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-package-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + 
<groupId>${project.groupId}</groupId> + <artifactId>tika-web-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.chm.accessor.DirectoryListingEntry; +import org.apache.tika.parser.chm.core.ChmExtractor; +import org.apache.tika.parser.html.HtmlParser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class ChmParser extends AbstractParser { + + /** Serial version UID */ + private static final long serialVersionUID = 5938777307516469802L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("vnd.ms-htmlhelp"), + MediaType.application("chm"), + MediaType.application("x-chm")))); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + ChmExtractor chmExtractor = new ChmExtractor(stream); + + // metadata + metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp"); + + // content + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + for (DirectoryListingEntry entry : chmExtractor.getChmDirList().getDirectoryListingEntryList()) { + 
final String entryName = entry.getName(); + if (entryName.endsWith(".html") + || entryName.endsWith(".htm") + ) { +// AttributesImpl attrs = new AttributesImpl(); +// attrs.addAttribute("", "name", "name", "String", entryName); +// xhtml.startElement("", "document", "document", attrs); + + byte[] data = chmExtractor.extractChmEntry(entry); + + parsePage(data, xhtml); + +// xhtml.endElement("", "", "document"); + } + } + + xhtml.endDocument(); + } + + + /** + * Parses one extracted HTML page and forwards its body content to the + * given handler. + * + * @param byteObject + * raw bytes of the HTML entry + * @param xhtml + * handler that receives the body content of the page + * @throws SAXException + * if the handler fails to process a SAX event; propagated + * unchanged so that callers of parse() see it as declared + * @throws TikaException + * if the HTML parser cannot parse the page + */ + private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException, SAXException { + Metadata metadata = new Metadata(); + HtmlParser htmlParser = new HtmlParser(); + ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml)); + ParseContext context = new ParseContext(); + try (InputStream stream = new ByteArrayInputStream(byteObject)) { + htmlParser.parse(stream, handler, metadata, context); + } catch (IOException ignored) { + // Pushback overflow from tagsoup: deliberately ignored so the + // remaining pages are still processed + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.accessor; + +import java.io.Serializable; + +import org.apache.tika.exception.TikaException; + +/** + * + * Defines an accessor interface + * + * @param <T> + */ +public interface ChmAccessor<T> extends Serializable { + /** + * Parses chm accessor + * + * @param data + * chm file + * @param chmAccessor + * @throws TikaException + */ + void parse(byte[] data, T chmAccessor) throws TikaException; +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,398 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.parser.chm.accessor;

import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.core.ChmCommons;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Holds the directory listing entries of a chm file.
 *
 * <p>The entries are read once, in the constructor, by walking the chain of
 * directory chunks that starts at the ITSP header's head index. While reading,
 * the list positions of the first entries whose names contain
 * {@link ChmConstants#CONTROL_DATA} and {@link ChmConstants#RESET_TABLE} are
 * remembered. Enumeration is best effort: if a chunk cannot be read, the
 * problem is logged and the entries collected so far are kept.</p>
 */
public class ChmDirectoryListingSet {
    private static final Logger LOG =
            Logger.getLogger(ChmDirectoryListingSet.class.getName());

    private List<DirectoryListingEntry> dlel;
    /** Raw chm bytes; only held while the constructor enumerates the listing. */
    private byte[] data;
    /** Read cursor inside the directory chunk that is currently decoded. */
    private int placeHolder = -1;
    private long dataOffset = -1;
    private int controlDataIndex = -1;
    private int resetTableIndex = -1;

    private boolean isNotControlDataFound = true;
    private boolean isNotResetTableFound = true;

    /** Header of the directory chunk that is currently decoded. */
    private ChmPmglHeader pmglHeader;

    /**
     * Constructs chm directory listing set
     *
     * @param data
     *            bytes of the chm file, must not be null
     * @param chmItsHeader
     *            itsf header, supplies the directory and data offsets
     * @param chmItspHeader
     *            itsp header, supplies the head index, header length and
     *            block length
     * @throws TikaException
     *             if {@code data} fails the not-null assertion
     */
    public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
            ChmItspHeader chmItspHeader) throws TikaException {
        setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
        ChmCommons.assertByteArrayNotNull(data);
        this.data = data;
        enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
    }

    @Override
    public String toString() {
        return "list:=" + getDirectoryListingEntryList().toString()
                + System.getProperty("line.separator")
                + "number of list items:="
                + getDirectoryListingEntryList().size();
    }

    /**
     * Returns control data index that located in List
     *
     * @return control data index, or -1 if no such entry was found
     */
    public int getControlDataIndex() {
        return controlDataIndex;
    }

    /**
     * Sets control data index
     *
     * @param controlDataIndex
     */
    protected void setControlDataIndex(int controlDataIndex) {
        this.controlDataIndex = controlDataIndex;
    }

    /**
     * Return index of reset table
     *
     * @return reset table index, or -1 if no such entry was found
     */
    public int getResetTableIndex() {
        return resetTableIndex;
    }

    /**
     * Sets reset table index
     *
     * @param resetTableIndex
     */
    protected void setResetTableIndex(int resetTableIndex) {
        this.resetTableIndex = resetTableIndex;
    }

    /**
     * Enumerates chm directory listing entries by following the chain of
     * directory chunks until a negative next-block index is reached.
     *
     * @param chmItsHeader
     *            chm itsf header
     * @param chmItspHeader
     *            chm itsp header
     */
    private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
            ChmItspHeader chmItspHeader) {
        try {
            final int blockLen = (int) chmItspHeader.getBlock_len();
            final int dirOffset = (int) (chmItsHeader.getDirOffset()
                    + chmItspHeader.getHeader_len());
            this.dataOffset = chmItsHeader.getDataOffset();

            // A damaged file may link its chunks in a circle; remember the
            // visited blocks so that such a file cannot loop forever.
            final Set<Integer> visitedBlocks = new HashSet<Integer>();
            int block = chmItspHeader.getIndex_head();
            while (block >= 0) {
                if (!visitedBlocks.add(block)) {
                    throw new ChmParsingException(
                            "Cyclic chain of directory blocks at block " + block);
                }
                int start = block * blockLen + dirOffset;
                byte[] dirChunk =
                        ChmCommons.copyOfRange(data, start, start + blockLen);

                pmglHeader = new ChmPmglHeader();
                pmglHeader.parse(dirChunk, pmglHeader);
                enumerateOneSegment(dirChunk);

                block = pmglHeader.getBlockNext();
            }
        } catch (Exception e) {
            // Parsing boundary for untrusted input: malformed data surfaces as
            // checked and unchecked exceptions alike. Keep the entries read so
            // far, as before, but report the problem through the logger
            // instead of printing a stack trace.
            LOG.log(Level.WARNING,
                    "Failed to enumerate chm directory listing", e);
        } finally {
            // The raw bytes are no longer needed once enumeration is over.
            this.data = null;
        }
    }

    /**
     * Remembers the list position of the first entry whose name contains the
     * control data marker. Must be called before the entry is added.
     *
     * @param dle
     *            chm directory listing entry
     */
    private void checkControlData(DirectoryListingEntry dle) {
        if (isNotControlDataFound
                && dle.getName().contains(ChmConstants.CONTROL_DATA)) {
            setControlDataIndex(getDirectoryListingEntryList().size());
            isNotControlDataFound = false;
        }
    }

    /**
     * Remembers the list position of the first entry whose name contains the
     * reset table marker. Must be called before the entry is added.
     *
     * @param dle
     *            chm directory listing entry
     */
    private void checkResetTable(DirectoryListingEntry dle) {
        if (isNotResetTableFound
                && dle.getName().contains(ChmConstants.RESET_TABLE)) {
            setResetTableIndex(getDirectoryListingEntryList().size());
            isNotResetTableFound = false;
        }
    }

    /**
     * Tells whether {@code data} begins with the characters of {@code prefix},
     * comparing each byte with the corresponding char.
     *
     * @param data
     *            bytes to inspect
     * @param prefix
     *            expected leading characters
     * @return true if every char of the prefix equals the byte at the same
     *         position; false otherwise, including when either argument is
     *         null or {@code data} is shorter than the prefix
     */
    public static final boolean startsWith(byte[] data, String prefix) {
        if (data == null || prefix == null || data.length < prefix.length()) {
            return false;
        }
        for (int i = 0; i < prefix.length(); i++) {
            if (data[i] != prefix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Enumerates chm directory listing entries in single chm segment
     *
     * @param dir_chunk
     *            bytes of one directory chunk; null is ignored
     * @throws ChmParsingException
     *             if the chunk carries neither marker or an entry is
     *             malformed
     */
    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
        if (dir_chunk == null) {
            return;
        }
        if (startsWith(dir_chunk, ChmConstants.CHM_PMGI_MARKER)) {
            return; // PMGI chunks are skipped
        }
        if (!startsWith(dir_chunk, ChmConstants.PMGL)) {
            throw new ChmParsingException("Bad dir entry block.");
        }

        // Entries stop where the free space reported by the chunk header
        // begins; never read past the chunk even if that value is bogus.
        final long entriesEnd = Math.min((long) dir_chunk.length,
                dir_chunk.length - pmglHeader.getFreeSpace());

        placeHolder = ChmConstants.CHM_PMGL_LEN;
        while (placeHolder > 0 && placeHolder < entriesEnd) {
            // The name length is stored in the same variable-length form as
            // offset and length. The previous inline decoder compared a signed
            // byte with 0x80 (never true) and masked the whole sum, so names
            // of 128 bytes or more were read with a wrong length.
            int nameLength = getEncint(dir_chunk);
            if (nameLength < 0 || nameLength > dir_chunk.length - placeHolder) {
                throw new ChmParsingException("Bad data of a string length.");
            }

            DirectoryListingEntry dle = new DirectoryListingEntry();
            dle.setNameLength(nameLength);
            dle.setName(new String(dir_chunk, placeHolder, nameLength, UTF_8));

            checkControlData(dle);
            checkResetTable(dle);
            placeHolder += nameLength;

            /* Sets entry type: a zero byte after the name means uncompressed */
            boolean uncompressed = placeHolder < dir_chunk.length
                    && dir_chunk[placeHolder] == 0;
            dle.setEntryType(uncompressed
                    ? ChmCommons.EntryType.UNCOMPRESSED
                    : ChmCommons.EntryType.COMPRESSED);
            placeHolder++;

            dle.setOffset(getEncint(dir_chunk));
            dle.setLength(getEncint(dir_chunk));
            getDirectoryListingEntryList().add(dle);
        }
    }

    /**
     * Decodes the variable-length integer at the read cursor and moves the
     * cursor behind it. Each byte contributes its low seven bits, most
     * significant group first; a set high bit means another byte follows.
     *
     * @param data_chunk
     *            chunk to read from
     * @return the low 32 bits of the decoded value, or 0 without moving the
     *         cursor if the cursor is already at or behind the end of the
     *         chunk
     * @throws ChmParsingException
     *             if the chunk ends in the middle of the integer
     */
    private int getEncint(byte[] data_chunk) throws ChmParsingException {
        if (placeHolder >= data_chunk.length) {
            return 0;
        }
        int value = 0;
        byte current;
        do {
            if (placeHolder >= data_chunk.length) {
                throw new ChmParsingException(
                        "Truncated encoded integer in directory chunk.");
            }
            current = data_chunk[placeHolder++];
            value = (value << 7) | (current & 0x7f);
        } while (current < 0); // negative byte == high bit set == continue
        return value;
    }

    /**
     * Sets chm directory listing entry list
     *
     * @param dlel
     *            chm directory listing entry list
     */
    public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
        this.dlel = dlel;
    }

    /**
     * Returns chm directory listing entry list
     *
     * @return List<DirectoryListingEntry>
     */
    public List<DirectoryListingEntry> getDirectoryListingEntryList() {
        return dlel;
    }

    /**
     * Returns data offset
     *
     * @return dataOffset as read from the itsf header, or -1 if it was never
     *         set
     */
    public long getDataOffset() {
        return dataOffset;
    }
}
