Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Fri May 29 14:36:21 2015 @@ -17,33 +17,32 @@ package org.apache.tika.parser.pdf; * limitations under the License. */ -import org.apache.pdfbox.util.PDFTextStripper; - import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.util.Locale; import java.util.Properties; +import org.apache.pdfbox.util.PDFTextStripper; + /** * Config for PDFParser. - * + * <p/> * This allows parameters to be set programmatically: * <ol> * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li> * <li>Constructor of PDFParser</li> * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li> * </ol> - * + * <p/> * Parameters can also be set by modifying the PDFParserConfig.properties file, * which lives in the expected places, in trunk: * tika-parsers/src/main/resources/org/apache/tika/parser/pdf - * + * <p/> * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar: * org/apache/tika/parser/pdf - * */ -public class PDFParserConfig implements Serializable{ +public class PDFParserConfig implements Serializable { private static final long serialVersionUID = 6492570218190936986L; @@ -63,7 +62,7 @@ public class PDFParserConfig implements //True if we should use PDFBox's NonSequentialParser private boolean useNonSequentialParser = false; - + //True if acroform content should be extracted private boolean extractAcroFormContent = true; @@ -73,10 +72,10 @@ public class PDFParserConfig implements //True if inline images (as identified by their object id within //a pdf file) should only be extracted once. private boolean extractUniqueInlineImagesOnly = true; - + //The character width-based tolerance value used to estimate where spaces in text should be added private Float averageCharTolerance; - + //The space width-based tolerance value used to estimate where spaces in text should be added private Float spacingTolerance; @@ -90,7 +89,7 @@ public class PDFParserConfig implements * Loads properties from InputStream and then tries to close InputStream. * If there is an IOException, this silently swallows the exception * and goes back to the default. - * + * * @param is */ public PDFParserConfig(InputStream is) { @@ -109,7 +108,7 @@ public class PDFParserConfig implements } catch (IOException e) { } finally { if (is != null) { - try{ + try { is.close(); } catch (IOException e) { //swallow @@ -119,26 +118,26 @@ public class PDFParserConfig implements setEnableAutoSpace( getProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace())); setSuppressDuplicateOverlappingText( - getProp(props.getProperty("suppressDuplicateOverlappingText"), + getProp(props.getProperty("suppressDuplicateOverlappingText"), getSuppressDuplicateOverlappingText())); setExtractAnnotationText( - getProp(props.getProperty("extractAnnotationText"), + getProp(props.getProperty("extractAnnotationText"), getExtractAnnotationText())); setSortByPosition( - getProp(props.getProperty("sortByPosition"), + getProp(props.getProperty("sortByPosition"), getSortByPosition())); setUseNonSequentialParser( - getProp(props.getProperty("useNonSequentialParser"), + getProp(props.getProperty("useNonSequentialParser"), getUseNonSequentialParser())); setExtractAcroFormContent( getProp(props.getProperty("extractAcroFormContent"), - getExtractAcroFormContent())); + getExtractAcroFormContent())); setExtractInlineImages( getProp(props.getProperty("extractInlineImages"), - getExtractInlineImages())); + getExtractInlineImages())); setExtractUniqueInlineImagesOnly( getProp(props.getProperty("extractUniqueInlineImagesOnly"), - getExtractUniqueInlineImagesOnly())); + getExtractUniqueInlineImagesOnly())); boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false); boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true); @@ -151,10 +150,10 @@ public class PDFParserConfig implements accessChecker = new AccessChecker(allowExtractionForAccessibility); } } - + /** * Configures the given pdf2XHTML. - * + * * @param pdf2XHTML */ public void configure(PDF2XHTML pdf2XHTML) { @@ -174,108 +173,118 @@ public class PDFParserConfig implements pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText()); } - + /** + * @see #setExtractAcroFormContent(boolean) + */ + public boolean getExtractAcroFormContent() { + return extractAcroFormContent; + } + /** * If true (the default), extract content from AcroForms * at the end of the document. - * + * * @param extractAcroFormContent */ public void setExtractAcroFormContent(boolean extractAcroFormContent) { this.extractAcroFormContent = extractAcroFormContent; - + } - /** @see #setExtractAcroFormContent(boolean) */ - public boolean getExtractAcroFormContent() { - return extractAcroFormContent; + /** + * @see #setExtractInlineImages(boolean) + */ + public boolean getExtractInlineImages() { + return extractInlineImages; } /** * If true, extract inline embedded OBXImages. * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain - * thousands of embedded images totaling > 2.5 GB. Also, at least as of PDFBox 1.8.5, + * thousands of embedded images totaling > 2.5 GB. Also, at least as of PDFBox 1.8.5, * there can be surprisingly large memory consumption and/or out of memory errors. * Set to <code>true</code> with caution. - * <p> + * <p/> * The default is <code>false</code>. - * <p> + * <p/> * See also: {@see #setExtractUniqueInlineImagesOnly(boolean)}; - * + * * @param extractInlineImages */ public void setExtractInlineImages(boolean extractInlineImages) { - this.extractInlineImages = extractInlineImages; + this.extractInlineImages = extractInlineImages; } - /** @see #setExtractInlineImages(boolean) */ - public boolean getExtractInlineImages() { - return extractInlineImages; + /** + * @see #setExtractUniqueInlineImagesOnly(boolean) + */ + public boolean getExtractUniqueInlineImagesOnly() { + return extractUniqueInlineImagesOnly; } /** * Multiple pages within a PDF file might refer to the same underlying image. * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, the * parser will call the EmbeddedExtractor each time the image appears on a page. - * This might be desired for some use cases. However, to avoid duplication of + * This might be desired for some use cases. However, to avoid duplication of * extracted images, set this to <code>true</code>. The default is <code>true</code>. - * <p> - * Note that uniqueness is determined only by the underlying PDF COSObject id, not by + * <p/> + * Note that uniqueness is determined only by the underlying PDF COSObject id, not by * file hash or similar equality metric. - * If the PDF actually contains multiple copies of the same image + * If the PDF actually contains multiple copies of the same image * -- all with different object ids -- then all images will be extracted. - * <p> - * For this parameter to have any effect, {@link #extractInlineImages} must be + * <p/> + * For this parameter to have any effect, {@link #extractInlineImages} must be * set to <code>true</code>. - * + * * @param extractUniqueInlineImagesOnly */ public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) { this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly; - - } - /** @see #setExtractUniqueInlineImagesOnly(boolean) */ - public boolean getExtractUniqueInlineImagesOnly() { - return extractUniqueInlineImagesOnly; } - - /** @see #setEnableAutoSpace(boolean) */ + /** + * @see #setEnableAutoSpace(boolean) + */ public boolean getEnableAutoSpace() { return enableAutoSpace; } /** - * If true (the default), the parser should estimate - * where spaces should be inserted between words. For - * many PDFs this is necessary as they do not include - * explicit whitespace characters. + * If true (the default), the parser should estimate + * where spaces should be inserted between words. For + * many PDFs this is necessary as they do not include + * explicit whitespace characters. */ public void setEnableAutoSpace(boolean enableAutoSpace) { this.enableAutoSpace = enableAutoSpace; } - /** @see #setSuppressDuplicateOverlappingText(boolean)*/ + /** + * @see #setSuppressDuplicateOverlappingText(boolean) + */ public boolean getSuppressDuplicateOverlappingText() { return suppressDuplicateOverlappingText; } /** - * If true, the parser should try to remove duplicated - * text over the same region. This is needed for some - * PDFs that achieve bolding by re-writing the same - * text in the same area. Note that this can - * slow down extraction substantially (PDFBOX-956) and - * sometimes remove characters that were not in fact - * duplicated (PDFBOX-1155). By default this is disabled. + * If true, the parser should try to remove duplicated + * text over the same region. This is needed for some + * PDFs that achieve bolding by re-writing the same + * text in the same area. Note that this can + * slow down extraction substantially (PDFBOX-956) and + * sometimes remove characters that were not in fact + * duplicated (PDFBOX-1155). By default this is disabled. */ public void setSuppressDuplicateOverlappingText( boolean suppressDuplicateOverlappingText) { this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText; } - /** @see #setExtractAnnotationText(boolean)*/ + /** + * @see #setExtractAnnotationText(boolean) + */ public boolean getExtractAnnotationText() { return extractAnnotationText; } @@ -287,24 +296,29 @@ public class PDFParserConfig implements public void setExtractAnnotationText(boolean extractAnnotationText) { this.extractAnnotationText = extractAnnotationText; } - /** @see #setSortByPosition(boolean)*/ + + /** + * @see #setSortByPosition(boolean) + */ public boolean getSortByPosition() { return sortByPosition; } /** - * If true, sort text tokens by their x/y position - * before extracting text. This may be necessary for - * some PDFs (if the text tokens are not rendered "in - * order"), while for other PDFs it can produce the - * wrong result (for example if there are 2 columns, - * the text will be interleaved). Default is false. + * If true, sort text tokens by their x/y position + * before extracting text. This may be necessary for + * some PDFs (if the text tokens are not rendered "in + * order"), while for other PDFs it can produce the + * wrong result (for example if there are 2 columns, + * the text will be interleaved). Default is false. */ public void setSortByPosition(boolean sortByPosition) { this.sortByPosition = sortByPosition; } - /** @see #setUseNonSequentialParser(boolean)*/ + /** + * @see #setUseNonSequentialParser(boolean) + */ public boolean getUseNonSequentialParser() { return useNonSequentialParser; } @@ -312,18 +326,21 @@ public class PDFParserConfig implements /** * If true, uses PDFBox's non-sequential parser. * The non-sequential parser should be much faster than the traditional - * full doc parser. However, until PDFBOX-XXX is fixed, + * full doc parser. However, until PDFBOX-XXX is fixed, * the non-sequential parser fails * to extract some document metadata. - * <p> + * <p/> * Default is false (use the traditional parser) + * * @param useNonSequentialParser */ public void setUseNonSequentialParser(boolean useNonSequentialParser) { this.useNonSequentialParser = useNonSequentialParser; } - /** @see #setAverageCharTolerance(Float)*/ + /** + * @see #setAverageCharTolerance(Float) + */ public Float getAverageCharTolerance() { return averageCharTolerance; } @@ -335,7 +352,9 @@ public class PDFParserConfig implements this.averageCharTolerance = averageCharTolerance; } - /** @see #setSpacingTolerance(Float)*/ + /** + * @see #setSpacingTolerance(Float) + */ public Float getSpacingTolerance() { return spacingTolerance; } @@ -347,16 +366,16 @@ public class PDFParserConfig implements this.spacingTolerance = spacingTolerance; } - public void setAccessChecker(AccessChecker accessChecker) { - this.accessChecker = accessChecker; - } - public AccessChecker getAccessChecker() { return accessChecker; } - private boolean getProp(String p, boolean defaultMissing){ - if (p == null){ + public void setAccessChecker(AccessChecker accessChecker) { + this.accessChecker = accessChecker; + } + + private boolean getProp(String p, boolean defaultMissing) { + if (p == null) { return defaultMissing; } if (p.toLowerCase(Locale.ROOT).equals("true")) { @@ -375,7 +394,7 @@ public class PDFParserConfig implements result = prime * result + ((averageCharTolerance == null) ? 0 : averageCharTolerance - .hashCode()); + .hashCode()); result = prime * result + (enableAutoSpace ? 1231 : 1237); result = prime * result + (extractAcroFormContent ? 1231 : 1237); result = prime * result + (extractAnnotationText ? 1231 : 1237);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java Fri May 29 14:36:21 2015 @@ -59,7 +59,7 @@ class GroupState { list = other.list; listLevel = other.listLevel; fontCharset = other.fontCharset; - depth = 1+other.depth; + depth = 1 + other.depth; pictDepth = other.pictDepth > 0 ? other.pictDepth + 1 : 0; //do not inherit object, sn, sv or sp Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java Fri May 29 14:36:21 2015 @@ -29,8 +29,7 @@ public class ListDescriptor { public boolean isStyle; public int[] numberType = new int[9]; - public boolean isUnordered(int level) - { + public boolean isUnordered(int level) { return numberType[level] == NUMBER_TYPE_BULLET; } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java Fri May 29 14:36:21 2015 @@ -14,143 +14,134 @@ package org.apache.tika.parser.rtf; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.Detector; -import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; -import org.apache.tika.io.FilenameUtils; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.RTFMetadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MimeType; -import org.apache.tika.mime.MimeTypeException; -import org.apache.tika.mime.MimeTypes; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.EmbeddedContentHandler; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; + */ + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.FilenameUtils; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.RTFMetadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * This class buffers data from embedded objects and pictures. - * * <p/> - * + * <p/> + * <p/> * When the parser has finished an object or picture and called * {@link #handleCompletedObject()}, this will write the object * to the {@link #handler}. - * * <p/> - * + * <p/> + * <p/> * This (in combination with TextExtractor) expects basically a flat parse. It will pull out * all pict whether they are tied to objdata or are intended * to be standalone. - * + * <p/> * <p/> * This tries to pull metadata around a pict that is encoded * with {sp {sn} {sv}} types of data. This information * sometimes contains the name and even full file path of the original file. - * - */ class RTFEmbObjHandler { - + */ +class RTFEmbObjHandler { + private static final String EMPTY_STRING = ""; - - private enum EMB_STATE { - PICT, //recording pict data - OBJDATA, //recording objdata - NADA - }; - + private final ContentHandler handler; + + + private final ParseContext context; + private final ByteArrayOutputStream os; //high hex cached for writing hexpair chars (data) private int hi = -1; - private int thumbCount = 0; //don't need atomic, do need mutable private AtomicInteger unknownFilenameCount = new AtomicInteger(); - private boolean inObject = false; - private String sv = EMPTY_STRING; private String sn = EMPTY_STRING; - private StringBuilder sb = new StringBuilder(); - - private final ContentHandler handler; private Metadata metadata; - private final ParseContext context; - - private final ByteArrayOutputStream os; private EMB_STATE state = EMB_STATE.NADA; - protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) { this.handler = handler; this.context = context; os = new ByteArrayOutputStream(); } + protected void startPict() { state = EMB_STATE.PICT; metadata = new Metadata(); } - + protected void startObjData() { state = EMB_STATE.OBJDATA; metadata = new Metadata(); } - + protected void startSN() { sb.setLength(0); sb.append(RTFMetadata.RTF_PICT_META_PREFIX); } - + protected void endSN() { sn = sb.toString(); } - + protected void startSV() { sb.setLength(0); } - + protected void endSV() { sv = sb.toString(); } - + //end metadata pair protected void endSP() { metadata.add(sn, sv); } - - protected void setInObject(boolean v) { - inObject = v; - } - + protected boolean getInObject() { return inObject; } - + + protected void setInObject(boolean v) { + inObject = v; + } + protected void writeMetadataChar(char c) { sb.append(c); } - + protected void writeHexChar(int b) throws IOException, TikaException { //if not hexchar, ignore //white space is common if (TextExtractor.isHexChar(b)) { if (hi == -1) { - hi = 16*TextExtractor.hexValue(b); + hi = 16 * TextExtractor.hexValue(b); } else { - long sum = hi+TextExtractor.hexValue(b); + long sum = hi + TextExtractor.hexValue(b); if (sum > Integer.MAX_VALUE || sum < 0) { throw new IOException("hex char to byte overflow"); } - - os.write((int)sum); - + + os.write((int) sum); + hi = -1; } return; @@ -159,80 +150,80 @@ import org.xml.sax.SAXException; throw new TikaException("hit end of stream before finishing byte pair"); } } - - + protected void writeBytes(InputStream is, int len) throws IOException, TikaException { if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) { throw new IOException("length of bytes to read out of bounds: " + len); } - + byte[] bytes = new byte[len]; int bytesRead = is.read(bytes); if (bytesRead < len) { throw new TikaException("unexpected end of file: need " + len + - " bytes of binary data, found " + (len-bytesRead)); + " bytes of binary data, found " + (len - bytesRead)); } os.write(bytes); } - + /** * Call this when the objdata/pict has completed + * * @throws IOException * @throws SAXException * @throws TikaException */ protected void handleCompletedObject() throws IOException, SAXException, TikaException { - EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); - - if (embeddedExtractor == null) { - embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); - } - - byte[] bytes = os.toByteArray(); - if (state == EMB_STATE.OBJDATA) { - RTFObjDataParser objParser = new RTFObjDataParser(); - try{ - byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount); - extractObj(objBytes, handler, embeddedExtractor, metadata); - } catch (IOException e) { - //swallow. If anything goes wrong, ignore. - } - } else if (state == EMB_STATE.PICT) { - String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX+"wzDescription"); - if (filePath != null && filePath.length() > 0){ - metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath); - metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); - } - metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject)); - extractObj(bytes, handler, embeddedExtractor, metadata); - - } else if (state == EMB_STATE.NADA) { - //swallow...no start for pict or embed?! - } - reset(); + EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); + + if (embeddedExtractor == null) { + embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); + } + + byte[] bytes = os.toByteArray(); + if (state == EMB_STATE.OBJDATA) { + RTFObjDataParser objParser = new RTFObjDataParser(); + try { + byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount); + extractObj(objBytes, handler, embeddedExtractor, metadata); + } catch (IOException e) { + //swallow. If anything goes wrong, ignore. + } + } else if (state == EMB_STATE.PICT) { + String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription"); + if (filePath != null && filePath.length() > 0) { + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath); + metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); + } + metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject)); + extractObj(bytes, handler, embeddedExtractor, metadata); + + } else if (state == EMB_STATE.NADA) { + //swallow...no start for pict or embed?! + } + reset(); } - + private void extractObj(byte[] bytes, ContentHandler handler, - EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata) - throws SAXException, IOException, TikaException { - + EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata) + throws SAXException, IOException, TikaException { + if (bytes == null) { return; } - + metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length)); - + if (embeddedExtractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = TikaInputStream.get(bytes); if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) { String extension = getExtension(stream, metadata); stream.reset(); if (inObject && state == EMB_STATE.PICT) { - metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_"+thumbCount++ + extension); + metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension); metadata.set(RTFMetadata.THUMBNAIL, "true"); } else { - metadata.set(Metadata.RESOURCE_NAME_KEY, "file_"+unknownFilenameCount.getAndIncrement() + -extension); + metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + + extension); } } try { @@ -245,7 +236,7 @@ extension); } } } - + private String getExtension(TikaInputStream is, Metadata metadata) { String cType = metadata.get(Metadata.CONTENT_TYPE); TikaConfig config = getConfig(); @@ -260,12 +251,12 @@ extension); } catch (IOException e) { //swallow } catch (MimeTypeException e) { - + } } return ".bin"; } - + private TikaConfig getConfig() { TikaConfig config = context.get(TikaConfig.class); if (config == null) { @@ -273,7 +264,7 @@ extension); } return config; } - + /** * reset state after each object. * Do not reset unknown file number. @@ -287,4 +278,10 @@ extension); sn = EMPTY_STRING; sb.setLength(0); } + + private enum EMB_STATE { + PICT, //recording pict data + OBJDATA, //recording objdata + NADA + } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java Fri May 29 14:36:21 2015 @@ -43,37 +43,35 @@ import org.apache.tika.parser.microsoft. /** * Many thanks to Simon Mourier for: - * http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf + * http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf * and for granting permission to use his code in Tika. - * */ class RTFObjDataParser { private final static int[] INT_LE_POWS = new int[]{ - 1, 256, 65536, 16777216 + 1, 256, 65536, 16777216 }; private final static String WIN_ASCII = "WINDOWS-1252"; /** * Parses the embedded object/pict string - * + * * @param bytes actual bytes (already converted from the * hex pair string stored in the embedded object data into actual bytes or read * as raw binary bytes) * @return a SimpleRTFEmbObj or null * @throws IOException if there are any surprise surprises during parsing */ - + /** - * * @param bytes - * @param metadata incoming metadata - * @param unknownFilenameCount + * @param metadata incoming metadata + * @param unknownFilenameCount * @return byte[] for contents of obj data * @throws IOException */ - protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount) + protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException { ByteArrayInputStream is = new ByteArrayInputStream(bytes); long version = readUInt(is); @@ -84,9 +82,9 @@ class RTFObjDataParser { if (formatId != 2L) { return null; } - String className = readLengthPrefixedAnsiString(is).trim(); + String className = readLengthPrefixedAnsiString(is).trim(); String topicName = readLengthPrefixedAnsiString(is).trim(); - String itemName = readLengthPrefixedAnsiString(is).trim(); + String itemName = readLengthPrefixedAnsiString(is).trim(); if (className != null && className.length() > 0) { metadata.add(RTFMetadata.EMB_CLASS, className); @@ -103,19 +101,19 @@ class RTFObjDataParser { //readBytes tests for reading too many bytes byte[] embObjBytes = readBytes(is, dataSz); - if (className.toLowerCase(Locale.ROOT).equals("package")){ + if (className.toLowerCase(Locale.ROOT).equals("package")) { return handlePackage(embObjBytes, metadata); } else if (className.toLowerCase(Locale.ROOT).equals("pbrush")) { //simple bitmap bytes return embObjBytes; } else { ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes); - if (NPOIFSFileSystem.hasPOIFSHeader(embIs)){ - try{ + if (NPOIFSFileSystem.hasPOIFSHeader(embIs)) { + try { return handleEmbeddedPOIFS(embIs, metadata, unknownFilenameCount); } catch (IOException e) { //swallow - } + } } } return embObjBytes; @@ -124,8 +122,8 @@ class RTFObjDataParser { //will throw IOException if not actually POIFS //can return null byte[] - private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, - AtomicInteger unknownFilenameCount) + private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, + AtomicInteger unknownFilenameCount) throws IOException { NPOIFSFileSystem fs = null; @@ -140,7 +138,7 @@ class RTFObjDataParser { return ret; } - if (root.hasEntry("Package")){ + if (root.hasEntry("Package")) { Entry ooxml = root.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); @@ -163,9 +161,9 @@ class RTFObjDataParser { DocumentEntry contentsEntry; try { - contentsEntry = (DocumentEntry)root.getEntry("CONTENTS"); + contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { - contentsEntry = (DocumentEntry)root.getEntry("Contents"); + contentsEntry = (DocumentEntry) root.getEntry("Contents"); } DocumentInputStream inp = null; @@ -184,7 +182,7 @@ class RTFObjDataParser { is.reset(); IOUtils.copy(is, out); ret = out.toByteArray(); - metadata.set(Metadata.RESOURCE_NAME_KEY, "file_"+unknownFilenameCount.getAndIncrement() + "."+type.getExtension()); + metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); } } @@ -197,12 +195,11 @@ class RTFObjDataParser { } - /** - * can return null if there is a linked object + * can return null if there is a linked object * instead of an embedded file */ - private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException { + private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException { //now parse the package header ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes); readUShort(is); @@ -231,16 +228,16 @@ class RTFObjDataParser { try { long unicodeLen = readUInt(is); - for (int i = 0; i < unicodeLen; i++){ + for (int i = 0; i < unicodeLen; i++) { int lo = is.read(); int hi = is.read(); - int sum = lo+256*hi; - if (hi == -1 || lo == -1){ + int sum = lo + 256 * hi; + if (hi == -1 || lo == -1) { //stream ran out; empty SB and stop unicodeFilePath.setLength(0); break; } - unicodeFilePath.append((char)sum); + unicodeFilePath.append((char) sum); } } catch (IOException e) { //swallow; the unicode file path is optional and might not happen @@ -248,7 +245,7 @@ class RTFObjDataParser { } String fileNameToUse = ""; String pathToUse = ""; - if (unicodeFilePath.length() > 0){ + if (unicodeFilePath.length() > 0) { String p = unicodeFilePath.toString(); fileNameToUse = p; pathToUse = p; @@ -265,21 +262,21 @@ class RTFObjDataParser { private int readUShort(InputStream is) throws IOException { int lo = is.read(); - int hi = is.read()*256; + int hi = is.read() * 256; if (lo == -1 || hi == -1) { throw new IOException("Hit end of stream before reading little endian unsigned short."); } - return hi+lo; + return hi + lo; } private long readUInt(InputStream is) throws IOException { long sum = 0; - for (int i = 0; i < 4; i++){ + for (int i = 0; i < 4; i++) { int v = is.read(); if (v == -1) { throw new IOException("Hit end of stream before finishing little endian unsigned int."); } - sum += v*(long)INT_LE_POWS[i]; + sum += v * (long) INT_LE_POWS[i]; } return sum; } @@ -288,7 +285,7 @@ class RTFObjDataParser { StringBuilder sb = new StringBuilder(); int c = is.read(); while (c > 0) { - sb.append((char)c); + sb.append((char) c); c = is.read(); } if (c == -1) { @@ -319,13 +316,13 @@ class RTFObjDataParser { return bytes; } - + private byte[] initByteArray(long len) throws IOException { if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) { throw new IOException("Requested length for reading bytes is out of bounds: " + len); } - return new byte[(int)len]; - + return new byte[(int) len]; + } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Fri May 29 14:36:21 2015 @@ -43,17 +43,21 @@ public class RTFParser extends AbstractP private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("rtf")); - - public Set<MediaType> getSupportedTypes(ParseContext context) { - return SUPPORTED_TYPES; - } - /** * maximum number of bytes per embedded object/pict (default: 20MB) */ private static int EMB_OBJ_MAX_BYTES = 20 * 1024 * 1024; //20MB /** + * See {@link #setMaxBytesForEmbeddedObject(int)}. + * + * @return maximum number of bytes allowed for an embedded object. + */ + public static int getMaxBytesForEmbeddedObject() { + return EMB_OBJ_MAX_BYTES; + } + + /** * Bytes for embedded objects are currently cached in memory. * If something goes wrong during the parsing of an embedded object, * it is possible that a read length may be crazily too long @@ -66,19 +70,14 @@ public class RTFParser extends AbstractP EMB_OBJ_MAX_BYTES = max; } - /** - * See {@link #setMaxBytesForEmbeddedObject(int)}. - * - * @return maximum number of bytes allowed for an embedded object. - */ - public static int getMaxBytesForEmbeddedObject() { - return EMB_OBJ_MAX_BYTES; + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; } public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { TaggedInputStream tagged = new TaggedInputStream(stream); try { XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Fri May 29 14:36:21 2015 @@ -54,15 +54,6 @@ import org.xml.sax.SAXException; final class TextExtractor { private static final Charset ASCII = Charset.forName("US-ASCII"); - - private static Charset getCharset(String name) { - try { - return CharsetUtils.forName(name); - } catch (Exception e) { - return ASCII; - } - } - private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252"); private static final Charset MAC_ROMAN = getCharset("MacRoman"); private static final Charset SHIFT_JIS = getCharset("Shift_JIS"); @@ -131,100 +122,17 @@ final class TextExtractor { private static final Charset BIG5 = getCharset("Big5"); private static final Charset GB2312 = getCharset("GB2312"); private static final Charset MS949 = getCharset("ms949"); - - private int written = 0; - // Hold pending bytes (encoded in the current charset) - // for text output: - private byte[] pendingBytes = new byte[16]; - private int pendingByteCount; - private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes); - - // Holds pending chars for text output - private char[] pendingChars = new char[10]; - private int pendingCharCount; - - // Holds chars for a still-being-tokenized control word - private byte[] pendingControl = new byte[10]; - private int pendingControlCount; - - // Used when we decode bytes -> chars using CharsetDecoder: - private final char[] outputArray = new char[128]; - private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray); - - // Reused when possible: - private CharsetDecoder decoder; - private Charset lastCharset; - - private Charset globalCharset = WINDOWS_1252; - private int globalDefaultFont = -1; - private int curFontID = -1; - - // Holds the font table from this RTF doc, mapping - // the font number (from \fN control word) to the - // corresponding charset: - private final Map<Integer, Charset> fontToCharset = - new HashMap<Integer, Charset>(); - - // Group stack: when we open a new group, we push - // the previous group state onto the stack; when we - // close the group, we restore it - private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>(); - - // Current group state; in theory this initial - // GroupState is unused because the RTF doc should - // immediately open the top group (start with {): - private GroupState groupState = new GroupState(); - - private boolean inHeader = true; - private int fontTableState; - private int fontTableDepth; - - // Non null if we are processing metadata (title, - // keywords, etc.) inside the info group: - private Property nextMetaData; - private boolean inParagraph; - - // Non-zero if we are processing inside a field destination: - private int fieldState; - - // Non-zero list index - private int pendingListEnd; - private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>(); - private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>(); - private Map<Integer, ListDescriptor> currentListTable; - private ListDescriptor currentList; - private int listTableLevel = -1; - private boolean ignoreLists; - - // Non-null if we've seen the url for a HYPERLINK but not yet - // its text: - private String pendingURL; - - private final StringBuilder pendingBuffer = new StringBuilder(); - - // Used to process the sub-groups inside the upr - // group: - private int uprState = -1; - - private final XHTMLContentHandler out; - private final Metadata metadata; - private final RTFEmbObjHandler embObjHandler; - - // Used when extracting CREATION date: - private int year, month, day, hour, minute; - - // How many next ansi chars we should skip; this - // is 0 except when we are still in the "ansi - // shadow" after seeing a unicode escape, at which - // point it's set to the last ucN skip we had seen: - int ansiSkip = 0; - // The RTF doc has a "font table" that assigns ords // (f0, f1, f2, etc.) to fonts and charsets, using the // \fcharsetN control word. This mapping maps from the // N to corresponding Java charset: private static final Map<Integer, Charset> FCHARSET_MAP = new HashMap<Integer, Charset>(); + // The RTF may specify the \ansicpgN charset in the + // header; this maps the N to the corresponding Java + // character set: + private static final Map<Integer, Charset> ANSICPG_MAP = + new HashMap<Integer, Charset>(); static { FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI @@ -267,15 +175,10 @@ final class TextExtractor { FCHARSET_MAP.put(255, CP850); // OEM } - // The RTF may specify the \ansicpgN charset in the - // header; this maps the N to the corresponding Java - // character set: - private static final Map<Integer, Charset> ANSICPG_MAP = - new HashMap<Integer, Charset>(); static { ANSICPG_MAP.put(437, CP4372); // US IBM ANSICPG_MAP.put(708, ISO_8859_6); // Arabic (ASMO 708) - + ANSICPG_MAP.put(709, WINDOWS_709); // Arabic (ASMO 449+, BCON V4) ANSICPG_MAP.put(710, WINDOWS_710); // Arabic (transparent Arabic) ANSICPG_MAP.put(710, WINDOWS_711); // Arabic (Nafitha Enhanced) @@ -331,30 +234,99 @@ final class TextExtractor { ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi } + // Used when we decode bytes -> chars using CharsetDecoder: + private final char[] outputArray = new char[128]; + private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray); + // Holds the font table from this RTF doc, mapping + // the font number (from \fN control word) to the + // corresponding charset: + private final Map<Integer, Charset> fontToCharset = + new HashMap<Integer, Charset>(); + // Group stack: when we open a new group, we push + // the previous group state onto the stack; when we + // close the group, we restore it + private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>(); + private final StringBuilder pendingBuffer = new StringBuilder(); + private final XHTMLContentHandler out; + private final Metadata metadata; + private final RTFEmbObjHandler embObjHandler; + // How many next ansi chars we should skip; this + // is 0 except when we are still in the "ansi + // shadow" after seeing a unicode escape, at which + // point it's set to the last ucN skip we had seen: + int ansiSkip = 0; + private int written = 0; + // Hold pending bytes (encoded in the current charset) + // for text output: + private byte[] pendingBytes = new byte[16]; + private int pendingByteCount; + private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes); + // Holds pending chars for text output + private char[] pendingChars = new char[10]; + private int pendingCharCount; + // Holds chars for a still-being-tokenized control word + private byte[] pendingControl = new byte[10]; + private int pendingControlCount; + // Reused when possible: + private CharsetDecoder decoder; + private Charset lastCharset; + private Charset globalCharset = WINDOWS_1252; + private int globalDefaultFont = -1; + private int curFontID = -1; + // Current group state; in theory this initial + // GroupState is unused because the RTF doc should + // immediately open the top group (start with {): + private GroupState groupState = new GroupState(); + private boolean inHeader = true; + private int fontTableState; + private int fontTableDepth; + // Non null if we are processing metadata (title, + // keywords, etc.) inside the info group: + private Property nextMetaData; + private boolean inParagraph; + // Non-zero if we are processing inside a field destination: + private int fieldState; + // Non-zero list index + private int pendingListEnd; + private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>(); + private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>(); + private Map<Integer, ListDescriptor> currentListTable; + private ListDescriptor currentList; + private int listTableLevel = -1; + private boolean ignoreLists; + // Non-null if we've seen the url for a HYPERLINK but not yet + // its text: + private String pendingURL; + // Used to process the sub-groups inside the upr + // group: + private int uprState = -1; + // Used when extracting CREATION date: + private int year, month, day, hour, minute; + public TextExtractor(XHTMLContentHandler out, Metadata metadata, - RTFEmbObjHandler embObjHandler) { + RTFEmbObjHandler embObjHandler) { this.metadata = metadata; this.out = out; this.embObjHandler = embObjHandler; } - public boolean isIgnoringLists() { - return ignoreLists; - } - - public void setIgnoreLists(boolean ignore) { - this.ignoreLists = ignore; + private static Charset getCharset(String name) { + try { + return CharsetUtils.forName(name); + } catch (Exception e) { + return ASCII; + } } protected static boolean isHexChar(int ch) { return (ch >= '0' && ch <= '9') || - (ch >= 'a' && ch <= 'f') || - (ch >= 'A' && ch <= 'F'); + (ch >= 'a' && ch <= 'f') || + (ch >= 'A' && ch <= 'F'); } private static boolean isAlpha(int ch) { return (ch >= 'a' && ch <= 'z') || - (ch >= 'A' && ch <= 'Z'); + (ch >= 'A' && ch <= 'Z'); } private static boolean isDigit(int ch) { @@ -372,6 +344,14 @@ final class TextExtractor { } } + public boolean isIgnoringLists() { + return ignoreLists; + } + + public void setIgnoreLists(boolean ignore) { + this.ignoreLists = ignore; + } + // Push pending bytes or pending chars: private void pushText() throws IOException, SAXException, TikaException { if (pendingByteCount != 0) { @@ -391,27 +371,27 @@ final class TextExtractor { pushChars(); } if (groupState.pictDepth > 0) { - embObjHandler.writeMetadataChar((char)b); + embObjHandler.writeMetadataChar((char) b); } else { // Save the byte in pending buffer: if (pendingByteCount == pendingBytes.length) { // Gradual but exponential growth: - final byte[] newArray = new byte[(int) (pendingBytes.length*1.25)]; + final byte[] newArray = new byte[(int) (pendingBytes.length * 1.25)]; System.arraycopy(pendingBytes, 0, newArray, 0, pendingBytes.length); pendingBytes = newArray; pendingByteBuffer = ByteBuffer.wrap(pendingBytes); } pendingBytes[pendingByteCount++] = (byte) b; - } + } } - // Buffers a byte as part of a control word: + // Buffers a byte as part of a control word: private void addControl(int b) { assert isAlpha(b); // Save the byte in pending buffer: if (pendingControlCount == pendingControl.length) { // Gradual but exponential growth: - final byte[] newArray = new byte[(int) (pendingControl.length*1.25)]; + final byte[] newArray = new byte[(int) (pendingControl.length * 1.25)]; System.arraycopy(pendingControl, 0, newArray, 0, pendingControl.length); pendingControl = newArray; } @@ -431,7 +411,7 @@ final class TextExtractor { } else { if (pendingCharCount == pendingChars.length) { // Gradual but exponential growth: - final char[] newArray = new char[(int) (pendingChars.length*1.25)]; + final char[] newArray = new char[(int) (pendingChars.length * 1.25)]; System.arraycopy(pendingChars, 0, newArray, 0, pendingChars.length); pendingChars = newArray; } @@ -458,7 +438,7 @@ final class TextExtractor { // }; extract(new PushbackInputStream(in, 2)); } - + private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException { out.startDocument(); @@ -471,7 +451,7 @@ final class TextExtractor { } else if (b == '{') { pushText(); processGroupStart(in); - } else if (b == '}') { + } else if (b == '}') { pushText(); processGroupEnd(); if (groupStates.isEmpty()) { @@ -479,9 +459,9 @@ final class TextExtractor { break; } } else if (groupState.objdata == true || - groupState.pictDepth == 1) { + groupState.pictDepth == 1) { embObjHandler.writeHexChar(b); - } else if (b != '\r' && b != '\n' + } else if (b != '\r' && b != '\n' && (!groupState.ignore || nextMetaData != null || groupState.sn == true || groupState.sv == true)) { // Linefeed and carriage return are not @@ -497,7 +477,7 @@ final class TextExtractor { endParagraph(false); out.endDocument(); } - + private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException { int b = in.read(); if (b == '\'') { @@ -505,16 +485,16 @@ final class TextExtractor { parseHexChar(in); } else if (isAlpha(b)) { // control word - parseControlWord((char)b, in); + parseControlWord((char) b, in); } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') { // escaped char addOutputByte(b); } else if (b != -1) { // control symbol, eg \* or \~ - processControlSymbol((char)b); + processControlSymbol((char) b); } } - + private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException { int hex1 = in.read(); if (!isHexChar(hex1)) { @@ -522,7 +502,7 @@ final class TextExtractor { in.unread(hex1); return; } - + int hex2 = in.read(); if (!isHexChar(hex2)) { // TODO: log a warning here, somehow? @@ -531,7 +511,7 @@ final class TextExtractor { in.unread(hex2); return; } - + if (ansiSkip != 0) { // Skip this ansi char since we are // still in the shadow of a unicode @@ -539,19 +519,19 @@ final class TextExtractor { ansiSkip--; } else { // Unescape: - addOutputByte(16*hexValue(hex1) + hexValue(hex2)); + addOutputByte(16 * hexValue(hex1) + hexValue(hex2)); } } private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException { addControl(firstChar); - + int b = in.read(); while (isAlpha(b)) { addControl(b); b = in.read(); } - + boolean hasParam = false; boolean negParam = false; if (b == '-') { @@ -567,14 +547,14 @@ final class TextExtractor { hasParam = true; b = in.read(); } - + // space is consumed as part of the // control word, but is not added to the // control word if (b != ' ') { in.unread(b); } - + if (hasParam) { if (negParam) { param = -param; @@ -583,7 +563,7 @@ final class TextExtractor { } else { processControlWord(); } - + pendingControlCount = 0; } @@ -602,7 +582,7 @@ final class TextExtractor { } if (inList() && pendingListEnd != groupState.list) { startList(groupState.list); - } + } if (inList()) { out.startElement("li"); } else { @@ -738,7 +718,7 @@ final class TextExtractor { if (pendingControlCount != s.length()) { return false; } - for(int idx=0;idx<pendingControlCount;idx++) { + for (int idx = 0; idx < pendingControlCount; idx++) { assert isAlpha(s.charAt(idx)); if (((byte) s.charAt(idx)) != pendingControl[idx]) { return false; @@ -748,26 +728,26 @@ final class TextExtractor { } private void processControlSymbol(char ch) throws IOException, SAXException, TikaException { - switch(ch) { - case '~': - // Non-breaking space -> unicode NON-BREAKING SPACE - addOutputChar('\u00a0'); - break; - case '*': - // Ignorable destination (control words defined after - // the 1987 RTF spec). These are already handled by - // processGroupStart() - break; - case '-': - // Optional hyphen -> unicode SOFT HYPHEN - addOutputChar('\u00ad'); - break; - case '_': - // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN - addOutputChar('\u2011'); - break; - default: - break; + switch (ch) { + case '~': + // Non-breaking space -> unicode NON-BREAKING SPACE + addOutputChar('\u00a0'); + break; + case '*': + // Ignorable destination (control words defined after + // the 1987 RTF spec). These are already handled by + // processGroupStart() + break; + case '-': + // Optional hyphen -> unicode SOFT HYPHEN + addOutputChar('\u00ad'); + break; + case '_': + // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN + addOutputChar('\u2011'); + break; + default: + break; } } @@ -890,8 +870,8 @@ final class TextExtractor { currentList.templateID = param; } else if (equals("levelnfc") || equals("levelnfcn")) { //sanity check to make sure list information isn't corrupt - if (listTableLevel > -1 && - listTableLevel < currentList.numberType.length ) { + if (listTableLevel > -1 && + listTableLevel < currentList.numberType.length) { currentList.numberType[listTableLevel] = param; } } @@ -963,7 +943,7 @@ final class TextExtractor { } else if (equals("bin")) { if (param >= 0) { if (groupState.pictDepth == 1) { - try{ + try { embObjHandler.writeBytes(in, param); } catch (IOException e) { //param was out of bounds or something went wrong during writing. @@ -977,7 +957,7 @@ final class TextExtractor { while (bytesToRead > 0) { int r = in.read(tmpArray, 0, Math.min(bytesToRead, tmpArray.length)); if (r < 0) { - throw new TikaException("unexpected end of file: need " + param + " bytes of binary data, found " + (param-bytesToRead)); + throw new TikaException("unexpected end of file: need " + param + " bytes of binary data, found " + (param - bytesToRead)); } bytesToRead -= r; } @@ -1005,6 +985,7 @@ final class TextExtractor { /** * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to determine the list * type for the given <code>listID</code>. + * * @param listID The ID of the list. * @throws IOException * @throws SAXException @@ -1019,6 +1000,7 @@ final class TextExtractor { /** * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to determine the list * type for the given <code>listID</code>. + * * @param listID The ID of the list. * @throws IOException * @throws SAXException @@ -1051,11 +1033,11 @@ final class TextExtractor { if (inHeader) { if (equals("ansi")) { globalCharset = WINDOWS_1252; - } else if (equals("pca")) { + } else if (equals("pca")) { globalCharset = CP850; - } else if (equals("pc")) { + } else if (equals("pc")) { globalCharset = CP437; - } else if (equals("mac")) { + } else if (equals("mac")) { globalCharset = MAC_ROMAN; } @@ -1319,13 +1301,13 @@ final class TextExtractor { // Make new GroupState groupState = new GroupState(groupState); - assert groupStates.size() == groupState.depth: "size=" + groupStates.size() + " depth=" + groupState.depth; + assert groupStates.size() == groupState.depth : "size=" + groupStates.size() + " depth=" + groupState.depth; if (uprState == 0) { uprState = 1; groupState.ignore = true; } - + // Check for ignorable groups. Note that // sometimes we un-ignore within this group, eg // when handling upr escape. @@ -1335,7 +1317,7 @@ final class TextExtractor { if (b3 == '*') { groupState.ignore = true; } - in.unread(b3); + in.unread(b3); } in.unread(b2); } @@ -1346,7 +1328,7 @@ final class TextExtractor { if (nextMetaData != null) { if (nextMetaData == TikaCoreProperties.CREATED) { Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT); - cal.set(year, month-1, day, hour, minute, 0); + cal.set(year, month - 1, day, hour, minute, 0); metadata.set(nextMetaData, cal.getTime()); } else if (nextMetaData.isMultiValuePermitted()) { metadata.add(nextMetaData, pendingBuffer.toString()); @@ -1360,7 +1342,7 @@ final class TextExtractor { assert groupState.depth > 0; ansiSkip = 0; - + if (groupState.objdata == true) { embObjHandler.handleCompletedObject(); groupState.objdata = false; @@ -1391,7 +1373,7 @@ final class TextExtractor { // bold changed: if (groupState.italic) { if (!outerGroupState.italic || - groupState.bold != outerGroupState.bold) { + groupState.bold != outerGroupState.bold) { end("i"); groupState.italic = false; } @@ -1425,9 +1407,9 @@ final class TextExtractor { final boolean isLocalLink = s.contains("\\l "); int idx = s.indexOf('"'); if (idx != -1) { - int idx2 = s.indexOf('"', 1+idx); + int idx2 = s.indexOf('"', 1 + idx); if (idx2 != -1) { - s = s.substring(1+idx, idx2); + s = s.substring(1 + idx, idx2); } } pendingURL = (isLocalLink ? "#" : "") + s;
