a...

tallison Fri, 29 May 2015 07:37:49 -0700

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
 Fri May 29 14:36:21 2015
@@ -17,33 +17,32 @@ package org.apache.tika.parser.pdf;
  * limitations under the License.
  */
 
-import org.apache.pdfbox.util.PDFTextStripper;
-
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Serializable;
 import java.util.Locale;
 import java.util.Properties;
 
+import org.apache.pdfbox.util.PDFTextStripper;
+
 /**
  * Config for PDFParser.
- * 
+ * <p/>
  * This allows parameters to be set programmatically:
  * <ol>
  * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
  * <li>Constructor of PDFParser</li>
  * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li>
  * </ol>
- * 
+ * <p/>
  * Parameters can also be set by modifying the PDFParserConfig.properties file,
  * which lives in the expected places, in trunk:
  * tika-parsers/src/main/resources/org/apache/tika/parser/pdf
- * 
+ * <p/>
  * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
  * org/apache/tika/parser/pdf
- *
  */
-public class PDFParserConfig implements Serializable{
+public class PDFParserConfig implements Serializable {
 
     private static final long serialVersionUID = 6492570218190936986L;
 
@@ -63,7 +62,7 @@ public class PDFParserConfig implements
 
     //True if we should use PDFBox's NonSequentialParser
     private boolean useNonSequentialParser = false;
-    
+
     //True if acroform content should be extracted
     private boolean extractAcroFormContent = true;
 
@@ -73,10 +72,10 @@ public class PDFParserConfig implements
     //True if inline images (as identified by their object id within
     //a pdf file) should only be extracted once.
     private boolean extractUniqueInlineImagesOnly = true;
-    
+
     //The character width-based tolerance value used to estimate where spaces in text should be added
     private Float averageCharTolerance;
-    
+
     //The space width-based tolerance value used to estimate where spaces in text should be added
     private Float spacingTolerance;
 
@@ -90,7 +89,7 @@ public class PDFParserConfig implements
      * Loads properties from InputStream and then tries to close InputStream.
      * If there is an IOException, this silently swallows the exception
      * and goes back to the default.
-     * 
+     *
      * @param is
      */
     public PDFParserConfig(InputStream is) {
@@ -109,7 +108,7 @@ public class PDFParserConfig implements
         } catch (IOException e) {
         } finally {
             if (is != null) {
-                try{
+                try {
                     is.close();
                 } catch (IOException e) {
                     //swallow
@@ -119,26 +118,26 @@ public class PDFParserConfig implements
         setEnableAutoSpace(
                 getProp(props.getProperty("enableAutoSpace"), 
getEnableAutoSpace()));
         setSuppressDuplicateOverlappingText(
-                getProp(props.getProperty("suppressDuplicateOverlappingText"), 
+                getProp(props.getProperty("suppressDuplicateOverlappingText"),
                         getSuppressDuplicateOverlappingText()));
         setExtractAnnotationText(
-                getProp(props.getProperty("extractAnnotationText"), 
+                getProp(props.getProperty("extractAnnotationText"),
                         getExtractAnnotationText()));
         setSortByPosition(
-                getProp(props.getProperty("sortByPosition"), 
+                getProp(props.getProperty("sortByPosition"),
                         getSortByPosition()));
         setUseNonSequentialParser(
-                getProp(props.getProperty("useNonSequentialParser"), 
+                getProp(props.getProperty("useNonSequentialParser"),
                         getUseNonSequentialParser()));
         setExtractAcroFormContent(
                 getProp(props.getProperty("extractAcroFormContent"),
-                getExtractAcroFormContent()));
+                        getExtractAcroFormContent()));
         setExtractInlineImages(
                 getProp(props.getProperty("extractInlineImages"),
-                getExtractInlineImages()));
+                        getExtractInlineImages()));
         setExtractUniqueInlineImagesOnly(
                 getProp(props.getProperty("extractUniqueInlineImagesOnly"),
-                getExtractUniqueInlineImagesOnly()));
+                        getExtractUniqueInlineImagesOnly()));
 
         boolean checkExtractAccessPermission = 
getProp(props.getProperty("checkExtractAccessPermission"), false);
         boolean allowExtractionForAccessibility = 
getProp(props.getProperty("allowExtractionForAccessibility"), true);
@@ -151,10 +150,10 @@ public class PDFParserConfig implements
             accessChecker = new AccessChecker(allowExtractionForAccessibility);
         }
     }
-    
+
     /**
      * Configures the given pdf2XHTML.
-     * 
+     *
      * @param pdf2XHTML
      */
     public void configure(PDF2XHTML pdf2XHTML) {
@@ -174,108 +173,118 @@ public class PDFParserConfig implements
         
pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
     }
 
-    
+    /**
+     * @see #setExtractAcroFormContent(boolean)
+     */
+    public boolean getExtractAcroFormContent() {
+        return extractAcroFormContent;
+    }
+
     /**
      * If true (the default), extract content from AcroForms
      * at the end of the document.
-     * 
+     *
      * @param extractAcroFormContent
      */
     public void setExtractAcroFormContent(boolean extractAcroFormContent) {
         this.extractAcroFormContent = extractAcroFormContent;
-        
+
     }
 
-    /** @see #setExtractAcroFormContent(boolean) */
-    public boolean getExtractAcroFormContent() {
-        return extractAcroFormContent;
+    /**
+     * @see #setExtractInlineImages(boolean)
+     */
+    public boolean getExtractInlineImages() {
+        return extractInlineImages;
     }
 
     /**
      * If true, extract inline embedded OBXImages.
      * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain
-     * thousands of embedded images totaling > 2.5 GB.  Also, at least as of 
PDFBox 1.8.5, 
+     * thousands of embedded images totaling > 2.5 GB.  Also, at least as of 
PDFBox 1.8.5,
      * there can be surprisingly large memory consumption and/or out of memory errors.
      * Set to <code>true</code> with caution.
-     * <p>
+     * <p/>
      * The default is <code>false</code>.
-     * <p>
+     * <p/>
      * See also: {@see #setExtractUniqueInlineImagesOnly(boolean)};
-     * 
+     *
      * @param extractInlineImages
      */
     public void setExtractInlineImages(boolean extractInlineImages) {
-        this.extractInlineImages = extractInlineImages;        
+        this.extractInlineImages = extractInlineImages;
     }
 
-    /** @see #setExtractInlineImages(boolean) */
-    public boolean getExtractInlineImages() {
-        return extractInlineImages;
+    /**
+     * @see #setExtractUniqueInlineImagesOnly(boolean)
+     */
+    public boolean getExtractUniqueInlineImagesOnly() {
+        return extractUniqueInlineImagesOnly;
     }
 
     /**
      * Multiple pages within a PDF file might refer to the same underlying image.
      * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, the
      * parser will call the EmbeddedExtractor each time the image appears on a page.
-     * This might be desired for some use cases.  However, to avoid 
duplication of 
+     * This might be desired for some use cases.  However, to avoid 
duplication of
      * extracted images, set this to <code>true</code>.  The default is <code>true</code>.
-     * <p>
-     * Note that uniqueness is determined only by the underlying PDF COSObject 
id, not by 
+     * <p/>
+     * Note that uniqueness is determined only by the underlying PDF COSObject 
id, not by
      * file hash or similar equality metric.
-     * If the PDF actually contains multiple copies of the same image 
+     * If the PDF actually contains multiple copies of the same image
      * -- all with different object ids -- then all images will be extracted.
-     * <p>
-     * For this parameter to have any effect, {@link #extractInlineImages} 
must be 
+     * <p/>
+     * For this parameter to have any effect, {@link #extractInlineImages} 
must be
      * set to <code>true</code>.
-     * 
+     *
      * @param extractUniqueInlineImagesOnly
      */
     public void setExtractUniqueInlineImagesOnly(boolean 
extractUniqueInlineImagesOnly) {
         this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
-        
-    }
 
-    /** @see #setExtractUniqueInlineImagesOnly(boolean) */
-    public boolean getExtractUniqueInlineImagesOnly() {
-        return extractUniqueInlineImagesOnly;
     }
 
-
-    /** @see #setEnableAutoSpace(boolean) */
+    /**
+     * @see #setEnableAutoSpace(boolean)
+     */
     public boolean getEnableAutoSpace() {
         return enableAutoSpace;
     }
 
     /**
-     *  If true (the default), the parser should estimate
-     *  where spaces should be inserted between words.  For
-     *  many PDFs this is necessary as they do not include
-     *  explicit whitespace characters.
+     * If true (the default), the parser should estimate
+     * where spaces should be inserted between words.  For
+     * many PDFs this is necessary as they do not include
+     * explicit whitespace characters.
      */
     public void setEnableAutoSpace(boolean enableAutoSpace) {
         this.enableAutoSpace = enableAutoSpace;
     }
 
-    /** @see #setSuppressDuplicateOverlappingText(boolean)*/
+    /**
+     * @see #setSuppressDuplicateOverlappingText(boolean)
+     */
     public boolean getSuppressDuplicateOverlappingText() {
         return suppressDuplicateOverlappingText;
     }
 
     /**
-     *  If true, the parser should try to remove duplicated
-     *  text over the same region.  This is needed for some
-     *  PDFs that achieve bolding by re-writing the same
-     *  text in the same area.  Note that this can
-     *  slow down extraction substantially (PDFBOX-956) and
-     *  sometimes remove characters that were not in fact
-     *  duplicated (PDFBOX-1155).  By default this is disabled.
+     * If true, the parser should try to remove duplicated
+     * text over the same region.  This is needed for some
+     * PDFs that achieve bolding by re-writing the same
+     * text in the same area.  Note that this can
+     * slow down extraction substantially (PDFBOX-956) and
+     * sometimes remove characters that were not in fact
+     * duplicated (PDFBOX-1155).  By default this is disabled.
      */
     public void setSuppressDuplicateOverlappingText(
             boolean suppressDuplicateOverlappingText) {
         this.suppressDuplicateOverlappingText = 
suppressDuplicateOverlappingText;
     }
 
-    /** @see #setExtractAnnotationText(boolean)*/
+    /**
+     * @see #setExtractAnnotationText(boolean)
+     */
     public boolean getExtractAnnotationText() {
         return extractAnnotationText;
     }
@@ -287,24 +296,29 @@ public class PDFParserConfig implements
     public void setExtractAnnotationText(boolean extractAnnotationText) {
         this.extractAnnotationText = extractAnnotationText;
     }
-    /** @see #setSortByPosition(boolean)*/
+
+    /**
+     * @see #setSortByPosition(boolean)
+     */
     public boolean getSortByPosition() {
         return sortByPosition;
     }
 
     /**
-     *  If true, sort text tokens by their x/y position
-     *  before extracting text.  This may be necessary for
-     *  some PDFs (if the text tokens are not rendered "in
-     *  order"), while for other PDFs it can produce the
-     *  wrong result (for example if there are 2 columns,
-     *  the text will be interleaved).  Default is false.
+     * If true, sort text tokens by their x/y position
+     * before extracting text.  This may be necessary for
+     * some PDFs (if the text tokens are not rendered "in
+     * order"), while for other PDFs it can produce the
+     * wrong result (for example if there are 2 columns,
+     * the text will be interleaved).  Default is false.
      */
     public void setSortByPosition(boolean sortByPosition) {
         this.sortByPosition = sortByPosition;
     }
 
-    /** @see #setUseNonSequentialParser(boolean)*/
+    /**
+     * @see #setUseNonSequentialParser(boolean)
+     */
     public boolean getUseNonSequentialParser() {
         return useNonSequentialParser;
     }
@@ -312,18 +326,21 @@ public class PDFParserConfig implements
     /**
      * If true, uses PDFBox's non-sequential parser.
      * The non-sequential parser should be much faster than the traditional
-     * full doc parser.  However, until PDFBOX-XXX is fixed, 
+     * full doc parser.  However, until PDFBOX-XXX is fixed,
      * the non-sequential parser fails
      * to extract some document metadata.
-     * <p>
+     * <p/>
      * Default is false (use the traditional parser)
+     *
      * @param useNonSequentialParser
      */
     public void setUseNonSequentialParser(boolean useNonSequentialParser) {
         this.useNonSequentialParser = useNonSequentialParser;
     }
 
-    /** @see #setAverageCharTolerance(Float)*/
+    /**
+     * @see #setAverageCharTolerance(Float)
+     */
     public Float getAverageCharTolerance() {
         return averageCharTolerance;
     }
@@ -335,7 +352,9 @@ public class PDFParserConfig implements
         this.averageCharTolerance = averageCharTolerance;
     }
 
-    /** @see #setSpacingTolerance(Float)*/
+    /**
+     * @see #setSpacingTolerance(Float)
+     */
     public Float getSpacingTolerance() {
         return spacingTolerance;
     }
@@ -347,16 +366,16 @@ public class PDFParserConfig implements
         this.spacingTolerance = spacingTolerance;
     }
 
-    public void setAccessChecker(AccessChecker accessChecker) {
-        this.accessChecker = accessChecker;
-    }
-
     public AccessChecker getAccessChecker() {
         return accessChecker;
     }
 
-    private boolean getProp(String p, boolean defaultMissing){
-        if (p == null){
+    public void setAccessChecker(AccessChecker accessChecker) {
+        this.accessChecker = accessChecker;
+    }
+
+    private boolean getProp(String p, boolean defaultMissing) {
+        if (p == null) {
             return defaultMissing;
         }
         if (p.toLowerCase(Locale.ROOT).equals("true")) {
@@ -375,7 +394,7 @@ public class PDFParserConfig implements
         result = prime
                 * result
                 + ((averageCharTolerance == null) ? 0 : averageCharTolerance
-                        .hashCode());
+                .hashCode());
         result = prime * result + (enableAutoSpace ? 1231 : 1237);
         result = prime * result + (extractAcroFormContent ? 1231 : 1237);
         result = prime * result + (extractAnnotationText ? 1231 : 1237);


Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
 Fri May 29 14:36:21 2015
@@ -59,7 +59,7 @@ class GroupState {
         list = other.list;
         listLevel = other.listLevel;
         fontCharset = other.fontCharset;
-        depth = 1+other.depth;
+        depth = 1 + other.depth;
         pictDepth = other.pictDepth > 0 ? other.pictDepth + 1 : 0;
         //do not inherit object, sn, sv or sp
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
 Fri May 29 14:36:21 2015
@@ -29,8 +29,7 @@ public class ListDescriptor {
     public boolean isStyle;
     public int[] numberType = new int[9];
 
-    public boolean isUnordered(int level)
-    {
+    public boolean isUnordered(int level) {
         return numberType[level] == NUMBER_TYPE_BULLET;
     }
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
 Fri May 29 14:36:21 2015
@@ -14,143 +14,134 @@ package org.apache.tika.parser.rtf;
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- */ 
-import java.io.ByteArrayOutputStream; 
-import java.io.IOException; 
-import java.io.InputStream; 
-import java.util.concurrent.atomic.AtomicInteger; 
-import org.apache.tika.config.TikaConfig; 
-import org.apache.tika.detect.Detector; 
-import org.apache.tika.exception.TikaException; 
-import org.apache.tika.extractor.EmbeddedDocumentExtractor; 
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; 
-import org.apache.tika.io.FilenameUtils; 
-import org.apache.tika.io.TikaInputStream; 
-import org.apache.tika.metadata.Metadata; 
-import org.apache.tika.metadata.RTFMetadata; 
-import org.apache.tika.mime.MediaType; 
-import org.apache.tika.mime.MimeType; 
-import org.apache.tika.mime.MimeTypeException; 
-import org.apache.tika.mime.MimeTypes; 
-import org.apache.tika.parser.ParseContext; 
-import org.apache.tika.sax.EmbeddedContentHandler; 
-import org.xml.sax.ContentHandler; 
-import org.xml.sax.SAXException; 
+ */
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.FilenameUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
  * This class buffers data from embedded objects and pictures.
- *
  * <p/>
- *
+ * <p/>
+ * <p/>
  * When the parser has finished an object or picture and called
  * {@link #handleCompletedObject()}, this will write the object
  * to the {@link #handler}.
- *
  * <p/>
- *
+ * <p/>
+ * <p/>
  * This (in combination with TextExtractor) expects basically a flat parse.  It will pull out
  * all pict whether they are tied to objdata or are intended
  * to be standalone.
- *
+ * <p/>
  * <p/>
  * This tries to pull metadata around a pict that is encoded
  * with {sp {sn} {sv}} types of data.  This information
  * sometimes contains the name and even full file path of the original file.
- *
- */ class RTFEmbObjHandler {
-    
+ */
+class RTFEmbObjHandler {
+
     private static final String EMPTY_STRING = "";
-    
-    private enum EMB_STATE {
-      PICT, //recording pict data
-      OBJDATA, //recording objdata
-      NADA
-    };
-    
+    private final ContentHandler handler;
+
+
+    private final ParseContext context;
+    private final ByteArrayOutputStream os;
     //high hex cached for writing hexpair chars (data)
     private int hi = -1;
-    
     private int thumbCount = 0;
     //don't need atomic, do need mutable
     private AtomicInteger unknownFilenameCount = new AtomicInteger();
-    
     private boolean inObject = false;
-    
     private String sv = EMPTY_STRING;
     private String sn = EMPTY_STRING;
-    
     private StringBuilder sb = new StringBuilder();
-    
-    private final ContentHandler handler;
     private Metadata metadata;
-    private final ParseContext context;
-    
-    private final ByteArrayOutputStream os;
     private EMB_STATE state = EMB_STATE.NADA;
-    
     protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, 
ParseContext context) {
         this.handler = handler;
         this.context = context;
         os = new ByteArrayOutputStream();
     }
+
     protected void startPict() {
         state = EMB_STATE.PICT;
         metadata = new Metadata();
     }
-    
+
     protected void startObjData() {
         state = EMB_STATE.OBJDATA;
         metadata = new Metadata();
     }
-    
+
     protected void startSN() {
         sb.setLength(0);
         sb.append(RTFMetadata.RTF_PICT_META_PREFIX);
     }
-    
+
     protected void endSN() {
         sn = sb.toString();
     }
-    
+
     protected void startSV() {
         sb.setLength(0);
     }
-    
+
     protected void endSV() {
         sv = sb.toString();
     }
-    
+
     //end metadata pair
     protected void endSP() {
         metadata.add(sn, sv);
     }
-    
-    protected void setInObject(boolean v) {
-        inObject = v;
-    }
-    
+
     protected boolean getInObject() {
         return inObject;
     }
-    
+
+    protected void setInObject(boolean v) {
+        inObject = v;
+    }
+
     protected void writeMetadataChar(char c) {
         sb.append(c);
     }
-    
+
     protected void writeHexChar(int b) throws IOException, TikaException {
         //if not hexchar, ignore
         //white space is common
         if (TextExtractor.isHexChar(b)) {
             if (hi == -1) {
-                hi = 16*TextExtractor.hexValue(b);
+                hi = 16 * TextExtractor.hexValue(b);
             } else {
-                long sum = hi+TextExtractor.hexValue(b);
+                long sum = hi + TextExtractor.hexValue(b);
                 if (sum > Integer.MAX_VALUE || sum < 0) {
                     throw new IOException("hex char to byte overflow");
                 }
-                
-                os.write((int)sum);
-                
+
+                os.write((int) sum);
+
                 hi = -1;
             }
             return;
@@ -159,80 +150,80 @@ import org.xml.sax.SAXException;
             throw new TikaException("hit end of stream before finishing byte 
pair");
         }
     }
-    
-    
+
     protected void writeBytes(InputStream is, int len) throws IOException, 
TikaException {
         if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
             throw new IOException("length of bytes to read out of bounds: " + 
len);
         }
-        
+
         byte[] bytes = new byte[len];
         int bytesRead = is.read(bytes);
         if (bytesRead < len) {
             throw new TikaException("unexpected end of file: need " + len +
-                   " bytes of binary data, found " + (len-bytesRead));
+                    " bytes of binary data, found " + (len - bytesRead));
         }
         os.write(bytes);
     }
-    
+
     /**
      * Call this when the objdata/pict has completed
+     *
      * @throws IOException
      * @throws SAXException
      * @throws TikaException
      */
     protected void handleCompletedObject() throws IOException, SAXException, 
TikaException {
-       EmbeddedDocumentExtractor embeddedExtractor = 
context.get(EmbeddedDocumentExtractor.class);
-       
-       if (embeddedExtractor == null) {
-           embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
-       }
-       
-       byte[] bytes = os.toByteArray();
-       if (state == EMB_STATE.OBJDATA) {
-           RTFObjDataParser objParser = new RTFObjDataParser();
-           try{
-               byte[] objBytes = objParser.parse(bytes, metadata, 
unknownFilenameCount);
-               extractObj(objBytes, handler, embeddedExtractor, metadata);
-           } catch (IOException e) {
-              //swallow.  If anything goes wrong, ignore.
-           }
-       } else if (state == EMB_STATE.PICT) {
-           String filePath = 
metadata.get(RTFMetadata.RTF_PICT_META_PREFIX+"wzDescription");
-           if (filePath != null && filePath.length() > 0){
-               metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath);
-               metadata.set(Metadata.RESOURCE_NAME_KEY, 
FilenameUtils.getName(filePath));
-           }
-           metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
-           extractObj(bytes, handler, embeddedExtractor, metadata);
-           
-       } else if (state == EMB_STATE.NADA) {
-           //swallow...no start for pict or embed?!
-       }
-       reset();
+        EmbeddedDocumentExtractor embeddedExtractor = 
context.get(EmbeddedDocumentExtractor.class);
+
+        if (embeddedExtractor == null) {
+            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+        }
+
+        byte[] bytes = os.toByteArray();
+        if (state == EMB_STATE.OBJDATA) {
+            RTFObjDataParser objParser = new RTFObjDataParser();
+            try {
+                byte[] objBytes = objParser.parse(bytes, metadata, 
unknownFilenameCount);
+                extractObj(objBytes, handler, embeddedExtractor, metadata);
+            } catch (IOException e) {
+                //swallow.  If anything goes wrong, ignore.
+            }
+        } else if (state == EMB_STATE.PICT) {
+            String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + 
"wzDescription");
+            if (filePath != null && filePath.length() > 0) {
+                metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath);
+                metadata.set(Metadata.RESOURCE_NAME_KEY, 
FilenameUtils.getName(filePath));
+            }
+            metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
+            extractObj(bytes, handler, embeddedExtractor, metadata);
+
+        } else if (state == EMB_STATE.NADA) {
+            //swallow...no start for pict or embed?!
+        }
+        reset();
     }
-    
+
     private void extractObj(byte[] bytes, ContentHandler handler,
-            EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata)
-                    throws SAXException, IOException, TikaException {
-        
+                            EmbeddedDocumentExtractor embeddedExtractor, 
Metadata metadata)
+            throws SAXException, IOException, TikaException {
+
         if (bytes == null) {
             return;
         }
-        
+
         metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));
-        
+
         if (embeddedExtractor.shouldParseEmbedded(metadata)) {
             TikaInputStream stream = TikaInputStream.get(bytes);
             if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) {
                 String extension = getExtension(stream, metadata);
                 stream.reset();
                 if (inObject && state == EMB_STATE.PICT) {
-                    metadata.set(Metadata.RESOURCE_NAME_KEY, 
"thumbnail_"+thumbCount++ + extension);
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + 
thumbCount++ + extension);
                     metadata.set(RTFMetadata.THUMBNAIL, "true");
                 } else {
-                    metadata.set(Metadata.RESOURCE_NAME_KEY, 
"file_"+unknownFilenameCount.getAndIncrement() + 
-extension);
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + 
unknownFilenameCount.getAndIncrement() +
+                            extension);
                 }
             }
             try {
@@ -245,7 +236,7 @@ extension);
             }
         }
     }
-    
+
     private String getExtension(TikaInputStream is, Metadata metadata) {
         String cType = metadata.get(Metadata.CONTENT_TYPE);
         TikaConfig config = getConfig();
@@ -260,12 +251,12 @@ extension);
             } catch (IOException e) {
                 //swallow
             } catch (MimeTypeException e) {
-                
+
             }
         }
         return ".bin";
     }
-    
+
     private TikaConfig getConfig() {
         TikaConfig config = context.get(TikaConfig.class);
         if (config == null) {
@@ -273,7 +264,7 @@ extension);
         }
         return config;
     }
-    
+
     /**
      * reset state after each object.
      * Do not reset unknown file number.
@@ -287,4 +278,10 @@ extension);
         sn = EMPTY_STRING;
         sb.setLength(0);
     }
+
+    private enum EMB_STATE {
+        PICT, //recording pict data
+        OBJDATA, //recording objdata
+        NADA
+    }
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
 Fri May 29 14:36:21 2015
@@ -43,37 +43,35 @@ import org.apache.tika.parser.microsoft.
 
 /**
  * Many thanks to Simon Mourier for:
- * 
http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf
 
+ * 
http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf
  * and for granting permission to use his code in Tika.
- * 
  */
 class RTFObjDataParser {
 
     private final static int[] INT_LE_POWS = new int[]{
-        1, 256, 65536, 16777216
+            1, 256, 65536, 16777216
     };
 
     private final static String WIN_ASCII = "WINDOWS-1252";
 
     /**
      * Parses the embedded object/pict string
-     * 
+     *
      * @param bytes actual bytes (already converted from the 
      *  hex pair string stored in the embedded object data into actual bytes or read
      *  as raw binary bytes)
      * @return a SimpleRTFEmbObj or null
      * @throws IOException if there are any surprise surprises during parsing
      */
-    
+
     /**
-     * 
      * @param bytes
-     * @param metadata incoming metadata
-     * @param unknownFilenameCount 
+     * @param metadata             incoming metadata
+     * @param unknownFilenameCount
      * @return byte[] for contents of obj data
      * @throws IOException
      */
-    protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger 
unknownFilenameCount) 
+    protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger 
unknownFilenameCount)
             throws IOException {
         ByteArrayInputStream is = new ByteArrayInputStream(bytes);
         long version = readUInt(is);
@@ -84,9 +82,9 @@ class RTFObjDataParser {
         if (formatId != 2L) {
             return null;
         }
-        String className = readLengthPrefixedAnsiString(is).trim(); 
+        String className = readLengthPrefixedAnsiString(is).trim();
         String topicName = readLengthPrefixedAnsiString(is).trim();
-        String itemName = readLengthPrefixedAnsiString(is).trim(); 
+        String itemName = readLengthPrefixedAnsiString(is).trim();
 
         if (className != null && className.length() > 0) {
             metadata.add(RTFMetadata.EMB_CLASS, className);
@@ -103,19 +101,19 @@ class RTFObjDataParser {
         //readBytes tests for reading too many bytes
         byte[] embObjBytes = readBytes(is, dataSz);
 
-        if (className.toLowerCase(Locale.ROOT).equals("package")){
+        if (className.toLowerCase(Locale.ROOT).equals("package")) {
             return handlePackage(embObjBytes, metadata);
         } else if (className.toLowerCase(Locale.ROOT).equals("pbrush")) {
             //simple bitmap bytes
             return embObjBytes;
         } else {
             ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes);
-            if (NPOIFSFileSystem.hasPOIFSHeader(embIs)){
-                try{
+            if (NPOIFSFileSystem.hasPOIFSHeader(embIs)) {
+                try {
                     return handleEmbeddedPOIFS(embIs, metadata, 
unknownFilenameCount);
                 } catch (IOException e) {
                     //swallow
-                }   
+                }
             }
         }
         return embObjBytes;
@@ -124,8 +122,8 @@ class RTFObjDataParser {
 
     //will throw IOException if not actually POIFS
     //can return null byte[]
-    private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, 
-            AtomicInteger unknownFilenameCount) 
+    private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata,
+                                       AtomicInteger unknownFilenameCount)
             throws IOException {
 
         NPOIFSFileSystem fs = null;
@@ -140,7 +138,7 @@ class RTFObjDataParser {
                 return ret;
             }
 
-            if (root.hasEntry("Package")){
+            if (root.hasEntry("Package")) {
                 Entry ooxml = root.getEntry("Package");
                 TikaInputStream stream = TikaInputStream.get(new 
DocumentInputStream((DocumentEntry) ooxml));
 
@@ -163,9 +161,9 @@ class RTFObjDataParser {
 
                     DocumentEntry contentsEntry;
                     try {
-                        contentsEntry = 
(DocumentEntry)root.getEntry("CONTENTS");
+                        contentsEntry = (DocumentEntry) 
root.getEntry("CONTENTS");
                     } catch (FileNotFoundException ioe) {
-                        contentsEntry = 
(DocumentEntry)root.getEntry("Contents");
+                        contentsEntry = (DocumentEntry) 
root.getEntry("Contents");
                     }
 
                     DocumentInputStream inp = null;
@@ -184,7 +182,7 @@ class RTFObjDataParser {
                     is.reset();
                     IOUtils.copy(is, out);
                     ret = out.toByteArray();
-                    metadata.set(Metadata.RESOURCE_NAME_KEY, 
"file_"+unknownFilenameCount.getAndIncrement() + "."+type.getExtension());
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + 
unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                     metadata.set(Metadata.CONTENT_TYPE, 
type.getType().toString());
                 }
             }
@@ -197,12 +195,11 @@ class RTFObjDataParser {
     }
 
 
-
     /**
-     * can return null if there is a linked object 
+     * can return null if there is a linked object
      * instead of an embedded file
      */
-    private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws 
IOException { 
+    private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws 
IOException {
         //now parse the package header
         ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes);
         readUShort(is);
@@ -231,16 +228,16 @@ class RTFObjDataParser {
         try {
             long unicodeLen = readUInt(is);
 
-            for (int i = 0; i < unicodeLen; i++){
+            for (int i = 0; i < unicodeLen; i++) {
                 int lo = is.read();
                 int hi = is.read();
-                int sum = lo+256*hi;
-                if (hi == -1 || lo == -1){
+                int sum = lo + 256 * hi;
+                if (hi == -1 || lo == -1) {
                     //stream ran out; empty SB and stop
                     unicodeFilePath.setLength(0);
                     break;
                 }
-                unicodeFilePath.append((char)sum);
+                unicodeFilePath.append((char) sum);
             }
         } catch (IOException e) {
             //swallow; the unicode file path is optional and might not happen
@@ -248,7 +245,7 @@ class RTFObjDataParser {
         }
         String fileNameToUse = "";
         String pathToUse = "";
-        if (unicodeFilePath.length() > 0){
+        if (unicodeFilePath.length() > 0) {
             String p = unicodeFilePath.toString();
             fileNameToUse = p;
             pathToUse = p;
@@ -265,21 +262,21 @@ class RTFObjDataParser {
 
     private int readUShort(InputStream is) throws IOException {
         int lo = is.read();
-        int hi = is.read()*256;
+        int hi = is.read() * 256;
         if (lo == -1 || hi == -1) {
             throw new IOException("Hit end of stream before reading little 
endian unsigned short.");
         }
-        return hi+lo;
+        return hi + lo;
     }
 
     private long readUInt(InputStream is) throws IOException {
         long sum = 0;
-        for (int i = 0; i < 4; i++){
+        for (int i = 0; i < 4; i++) {
             int v = is.read();
             if (v == -1) {
                 throw new IOException("Hit end of stream before finishing 
little endian unsigned int.");
             }
-            sum += v*(long)INT_LE_POWS[i];
+            sum += v * (long) INT_LE_POWS[i];
         }
         return sum;
     }
@@ -288,7 +285,7 @@ class RTFObjDataParser {
         StringBuilder sb = new StringBuilder();
         int c = is.read();
         while (c > 0) {
-            sb.append((char)c);
+            sb.append((char) c);
             c = is.read();
         }
         if (c == -1) {
@@ -319,13 +316,13 @@ class RTFObjDataParser {
 
         return bytes;
     }
-    
+
     private byte[] initByteArray(long len) throws IOException {
         if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
             throw new IOException("Requested length for reading bytes is out 
of bounds: " + len);
         }
-        return new byte[(int)len];
-        
+        return new byte[(int) len];
+
     }
 }
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java 
Fri May 29 14:36:21 2015
@@ -43,17 +43,21 @@ public class RTFParser extends AbstractP
 
     private static final Set<MediaType> SUPPORTED_TYPES =
             Collections.singleton(MediaType.application("rtf"));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
     /**
      * maximum number of bytes per embedded object/pict (default: 20MB)
      */
     private static int EMB_OBJ_MAX_BYTES = 20 * 1024 * 1024; //20MB
 
     /**
+     * See {@link #setMaxBytesForEmbeddedObject(int)}.
+     *
+     * @return maximum number of bytes allowed for an embedded object.
+     */
+    public static int getMaxBytesForEmbeddedObject() {
+        return EMB_OBJ_MAX_BYTES;
+    }
+
+    /**
      * Bytes for embedded objects are currently cached in memory.
      * If something goes wrong during the parsing of an embedded object,
      * it is possible that a read length may be crazily too long
@@ -66,19 +70,14 @@ public class RTFParser extends AbstractP
         EMB_OBJ_MAX_BYTES = max;
     }
 
-    /**
-     * See {@link #setMaxBytesForEmbeddedObject(int)}.
-     *
-     * @return maximum number of bytes allowed for an embedded object.
-     */
-    public static int getMaxBytesForEmbeddedObject() {
-        return EMB_OBJ_MAX_BYTES;
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
     }
 
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
-        throws IOException, SAXException, TikaException {
+            throws IOException, SAXException, TikaException {
         TaggedInputStream tagged = new TaggedInputStream(stream);
         try {
             XHTMLContentHandler xhtmlHandler = new 
XHTMLContentHandler(handler, metadata);

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 Fri May 29 14:36:21 2015
@@ -54,15 +54,6 @@ import org.xml.sax.SAXException;
 final class TextExtractor {
 
     private static final Charset ASCII = Charset.forName("US-ASCII");
-
-    private static Charset getCharset(String name) {
-        try {
-            return CharsetUtils.forName(name);
-        } catch (Exception e) {
-            return ASCII;
-        }
-    }
-
     private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
     private static final Charset MAC_ROMAN = getCharset("MacRoman");
     private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
@@ -131,100 +122,17 @@ final class TextExtractor {
     private static final Charset BIG5 = getCharset("Big5");
     private static final Charset GB2312 = getCharset("GB2312");
     private static final Charset MS949 = getCharset("ms949");
-
-    private int written = 0;
-    // Hold pending bytes (encoded in the current charset)
-    // for text output:
-    private byte[] pendingBytes = new byte[16];
-    private int pendingByteCount;
-    private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
-      
-    // Holds pending chars for text output
-    private char[] pendingChars = new char[10];
-    private int pendingCharCount;
-
-    // Holds chars for a still-being-tokenized control word
-    private byte[] pendingControl = new byte[10];
-    private int pendingControlCount;
-
-    // Used when we decode bytes -> chars using CharsetDecoder:
-    private final char[] outputArray = new char[128];
-    private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
-
-    // Reused when possible:
-    private CharsetDecoder decoder;
-    private Charset lastCharset;
-
-    private Charset globalCharset = WINDOWS_1252;
-    private int globalDefaultFont = -1;
-    private int curFontID = -1;
-
-    // Holds the font table from this RTF doc, mapping
-    // the font number (from \fN control word) to the
-    // corresponding charset:
-    private final Map<Integer, Charset> fontToCharset =
-            new HashMap<Integer, Charset>();
-
-    // Group stack: when we open a new group, we push
-    // the previous group state onto the stack; when we
-    // close the group, we restore it
-    private final LinkedList<GroupState> groupStates = new 
LinkedList<GroupState>();
-
-    // Current group state; in theory this initial
-    // GroupState is unused because the RTF doc should
-    // immediately open the top group (start with {):
-    private GroupState groupState = new GroupState();
-
-    private boolean inHeader = true;
-    private int fontTableState;
-    private int fontTableDepth;
-
-    // Non null if we are processing metadata (title,
-    // keywords, etc.) inside the info group:
-    private Property nextMetaData;
-    private boolean inParagraph;
-
-    // Non-zero if we are processing inside a field destination:
-    private int fieldState;
-
-    // Non-zero list index
-    private int pendingListEnd;
-    private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, 
ListDescriptor>();
-    private Map<Integer, ListDescriptor> listOverrideTable = new 
HashMap<Integer, ListDescriptor>();
-    private Map<Integer, ListDescriptor> currentListTable;
-    private ListDescriptor currentList;
-    private int listTableLevel = -1;
-    private boolean ignoreLists;
-
-    // Non-null if we've seen the url for a HYPERLINK but not yet
-    // its text:
-    private String pendingURL;
-
-    private final StringBuilder pendingBuffer = new StringBuilder();
-
-    // Used to process the sub-groups inside the upr
-    // group:
-    private int uprState = -1;
-
-    private final XHTMLContentHandler out;
-    private final Metadata metadata;
-    private final RTFEmbObjHandler embObjHandler;
-
-    // Used when extracting CREATION date:
-    private int year, month, day, hour, minute;
-
-    // How many next ansi chars we should skip; this
-    // is 0 except when we are still in the "ansi
-    // shadow" after seeing a unicode escape, at which
-    // point it's set to the last ucN skip we had seen:
-    int ansiSkip = 0;
-
     // The RTF doc has a "font table" that assigns ords
     // (f0, f1, f2, etc.) to fonts and charsets, using the
     // \fcharsetN control word.  This mapping maps from the
     // N to corresponding Java charset:
     private static final Map<Integer, Charset> FCHARSET_MAP =
             new HashMap<Integer, Charset>();
+    // The RTF may specify the \ansicpgN charset in the
+    // header; this maps the N to the corresponding Java
+    // character set:
+    private static final Map<Integer, Charset> ANSICPG_MAP =
+            new HashMap<Integer, Charset>();
 
     static {
         FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
@@ -267,15 +175,10 @@ final class TextExtractor {
         FCHARSET_MAP.put(255, CP850); // OEM
     }
 
-    // The RTF may specify the \ansicpgN charset in the
-    // header; this maps the N to the corresponding Java
-    // character set:
-    private static final Map<Integer, Charset> ANSICPG_MAP =
-            new HashMap<Integer, Charset>();
     static {
         ANSICPG_MAP.put(437, CP4372);   // US IBM
         ANSICPG_MAP.put(708, ISO_8859_6);   // Arabic (ASMO 708)
-      
+
         ANSICPG_MAP.put(709, WINDOWS_709);  // Arabic (ASMO 449+, BCON V4)
         ANSICPG_MAP.put(710, WINDOWS_710);  // Arabic (transparent Arabic)
         ANSICPG_MAP.put(710, WINDOWS_711);  // Arabic (Nafitha Enhanced)
@@ -331,30 +234,99 @@ final class TextExtractor {
         ANSICPG_MAP.put(57011, WINDOWS_57011);   // Punjabi
     }
 
+    // Used when we decode bytes -> chars using CharsetDecoder:
+    private final char[] outputArray = new char[128];
+    private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
+    // Holds the font table from this RTF doc, mapping
+    // the font number (from \fN control word) to the
+    // corresponding charset:
+    private final Map<Integer, Charset> fontToCharset =
+            new HashMap<Integer, Charset>();
+    // Group stack: when we open a new group, we push
+    // the previous group state onto the stack; when we
+    // close the group, we restore it
+    private final LinkedList<GroupState> groupStates = new 
LinkedList<GroupState>();
+    private final StringBuilder pendingBuffer = new StringBuilder();
+    private final XHTMLContentHandler out;
+    private final Metadata metadata;
+    private final RTFEmbObjHandler embObjHandler;
+    // How many next ansi chars we should skip; this
+    // is 0 except when we are still in the "ansi
+    // shadow" after seeing a unicode escape, at which
+    // point it's set to the last ucN skip we had seen:
+    int ansiSkip = 0;
+    private int written = 0;
+    // Hold pending bytes (encoded in the current charset)
+    // for text output:
+    private byte[] pendingBytes = new byte[16];
+    private int pendingByteCount;
+    private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+    // Holds pending chars for text output
+    private char[] pendingChars = new char[10];
+    private int pendingCharCount;
+    // Holds chars for a still-being-tokenized control word
+    private byte[] pendingControl = new byte[10];
+    private int pendingControlCount;
+    // Reused when possible:
+    private CharsetDecoder decoder;
+    private Charset lastCharset;
+    private Charset globalCharset = WINDOWS_1252;
+    private int globalDefaultFont = -1;
+    private int curFontID = -1;
+    // Current group state; in theory this initial
+    // GroupState is unused because the RTF doc should
+    // immediately open the top group (start with {):
+    private GroupState groupState = new GroupState();
+    private boolean inHeader = true;
+    private int fontTableState;
+    private int fontTableDepth;
+    // Non null if we are processing metadata (title,
+    // keywords, etc.) inside the info group:
+    private Property nextMetaData;
+    private boolean inParagraph;
+    // Non-zero if we are processing inside a field destination:
+    private int fieldState;
+    // Non-zero list index
+    private int pendingListEnd;
+    private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, 
ListDescriptor>();
+    private Map<Integer, ListDescriptor> listOverrideTable = new 
HashMap<Integer, ListDescriptor>();
+    private Map<Integer, ListDescriptor> currentListTable;
+    private ListDescriptor currentList;
+    private int listTableLevel = -1;
+    private boolean ignoreLists;
+    // Non-null if we've seen the url for a HYPERLINK but not yet
+    // its text:
+    private String pendingURL;
+    // Used to process the sub-groups inside the upr
+    // group:
+    private int uprState = -1;
+    // Used when extracting CREATION date:
+    private int year, month, day, hour, minute;
+
     public TextExtractor(XHTMLContentHandler out, Metadata metadata,
-            RTFEmbObjHandler embObjHandler) {
+                         RTFEmbObjHandler embObjHandler) {
         this.metadata = metadata;
         this.out = out;
         this.embObjHandler = embObjHandler;
     }
 
-    public boolean isIgnoringLists() {
-        return ignoreLists;
-    }
-
-    public void setIgnoreLists(boolean ignore) {
-        this.ignoreLists = ignore;
+    private static Charset getCharset(String name) {
+        try {
+            return CharsetUtils.forName(name);
+        } catch (Exception e) {
+            return ASCII;
+        }
     }
 
     protected static boolean isHexChar(int ch) {
         return (ch >= '0' && ch <= '9') ||
-            (ch >= 'a' && ch <= 'f') ||
-            (ch >= 'A' && ch <= 'F');
+                (ch >= 'a' && ch <= 'f') ||
+                (ch >= 'A' && ch <= 'F');
     }
 
     private static boolean isAlpha(int ch) {
         return (ch >= 'a' && ch <= 'z') ||
-            (ch >= 'A' && ch <= 'Z');
+                (ch >= 'A' && ch <= 'Z');
     }
 
     private static boolean isDigit(int ch) {
@@ -372,6 +344,14 @@ final class TextExtractor {
         }
     }
 
+    public boolean isIgnoringLists() {
+        return ignoreLists;
+    }
+
+    public void setIgnoreLists(boolean ignore) {
+        this.ignoreLists = ignore;
+    }
+
     // Push pending bytes or pending chars:
     private void pushText() throws IOException, SAXException, TikaException {
         if (pendingByteCount != 0) {
@@ -391,27 +371,27 @@ final class TextExtractor {
             pushChars();
         }
         if (groupState.pictDepth > 0) {
-            embObjHandler.writeMetadataChar((char)b);
+            embObjHandler.writeMetadataChar((char) b);
         } else {
             // Save the byte in pending buffer:
             if (pendingByteCount == pendingBytes.length) {
                 // Gradual but exponential growth:
-                final byte[] newArray = new byte[(int) 
(pendingBytes.length*1.25)];
+                final byte[] newArray = new byte[(int) (pendingBytes.length * 
1.25)];
                 System.arraycopy(pendingBytes, 0, newArray, 0, 
pendingBytes.length);
                 pendingBytes = newArray;
                 pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
             }
             pendingBytes[pendingByteCount++] = (byte) b;
-       }
+        }
     }
 
-   // Buffers a byte as part of a control word:
+    // Buffers a byte as part of a control word:
     private void addControl(int b) {
         assert isAlpha(b);
         // Save the byte in pending buffer:
         if (pendingControlCount == pendingControl.length) {
             // Gradual but exponential growth:
-            final byte[] newArray = new byte[(int) 
(pendingControl.length*1.25)];
+            final byte[] newArray = new byte[(int) (pendingControl.length * 
1.25)];
             System.arraycopy(pendingControl, 0, newArray, 0, 
pendingControl.length);
             pendingControl = newArray;
         }
@@ -431,7 +411,7 @@ final class TextExtractor {
         } else {
             if (pendingCharCount == pendingChars.length) {
                 // Gradual but exponential growth:
-                final char[] newArray = new char[(int) 
(pendingChars.length*1.25)];
+                final char[] newArray = new char[(int) (pendingChars.length * 
1.25)];
                 System.arraycopy(pendingChars, 0, newArray, 0, 
pendingChars.length);
                 pendingChars = newArray;
             }
@@ -458,7 +438,7 @@ final class TextExtractor {
 //        };
         extract(new PushbackInputStream(in, 2));
     }
-    
+
     private void extract(PushbackInputStream in) throws IOException, 
SAXException, TikaException {
         out.startDocument();
 
@@ -471,7 +451,7 @@ final class TextExtractor {
             } else if (b == '{') {
                 pushText();
                 processGroupStart(in);
-             } else if (b == '}') {
+            } else if (b == '}') {
                 pushText();
                 processGroupEnd();
                 if (groupStates.isEmpty()) {
@@ -479,9 +459,9 @@ final class TextExtractor {
                     break;
                 }
             } else if (groupState.objdata == true ||
-                groupState.pictDepth == 1) {
+                    groupState.pictDepth == 1) {
                 embObjHandler.writeHexChar(b);
-            } else if (b != '\r' && b != '\n' 
+            } else if (b != '\r' && b != '\n'
                     && (!groupState.ignore || nextMetaData != null ||
                     groupState.sn == true || groupState.sv == true)) {
                 // Linefeed and carriage return are not
@@ -497,7 +477,7 @@ final class TextExtractor {
         endParagraph(false);
         out.endDocument();
     }
-    
+
     private void parseControlToken(PushbackInputStream in) throws IOException, 
SAXException, TikaException {
         int b = in.read();
         if (b == '\'') {
@@ -505,16 +485,16 @@ final class TextExtractor {
             parseHexChar(in);
         } else if (isAlpha(b)) {
             // control word
-            parseControlWord((char)b, in);
+            parseControlWord((char) b, in);
         } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == 
'\n') {
             // escaped char
             addOutputByte(b);
         } else if (b != -1) {
             // control symbol, eg \* or \~
-            processControlSymbol((char)b);
+            processControlSymbol((char) b);
         }
     }
-    
+
     private void parseHexChar(PushbackInputStream in) throws IOException, 
SAXException, TikaException {
         int hex1 = in.read();
         if (!isHexChar(hex1)) {
@@ -522,7 +502,7 @@ final class TextExtractor {
             in.unread(hex1);
             return;
         }
-        
+
         int hex2 = in.read();
         if (!isHexChar(hex2)) {
             // TODO: log a warning here, somehow?
@@ -531,7 +511,7 @@ final class TextExtractor {
             in.unread(hex2);
             return;
         }
-        
+
         if (ansiSkip != 0) {
             // Skip this ansi char since we are
             // still in the shadow of a unicode
@@ -539,19 +519,19 @@ final class TextExtractor {
             ansiSkip--;
         } else {
             // Unescape:
-            addOutputByte(16*hexValue(hex1) + hexValue(hex2));
+            addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
         }
     }
 
     private void parseControlWord(int firstChar, PushbackInputStream in) 
throws IOException, SAXException, TikaException {
         addControl(firstChar);
-        
+
         int b = in.read();
         while (isAlpha(b)) {
             addControl(b);
             b = in.read();
         }
-        
+
         boolean hasParam = false;
         boolean negParam = false;
         if (b == '-') {
@@ -567,14 +547,14 @@ final class TextExtractor {
             hasParam = true;
             b = in.read();
         }
-        
+
         // space is consumed as part of the
         // control word, but is not added to the
         // control word
         if (b != ' ') {
             in.unread(b);
         }
-        
+
         if (hasParam) {
             if (negParam) {
                 param = -param;
@@ -583,7 +563,7 @@ final class TextExtractor {
         } else {
             processControlWord();
         }
-        
+
         pendingControlCount = 0;
     }
 
@@ -602,7 +582,7 @@ final class TextExtractor {
             }
             if (inList() && pendingListEnd != groupState.list) {
                 startList(groupState.list);
-            }            
+            }
             if (inList()) {
                 out.startElement("li");
             } else {
@@ -738,7 +718,7 @@ final class TextExtractor {
         if (pendingControlCount != s.length()) {
             return false;
         }
-        for(int idx=0;idx<pendingControlCount;idx++) {
+        for (int idx = 0; idx < pendingControlCount; idx++) {
             assert isAlpha(s.charAt(idx));
             if (((byte) s.charAt(idx)) != pendingControl[idx]) {
                 return false;
@@ -748,26 +728,26 @@ final class TextExtractor {
     }
 
     private void processControlSymbol(char ch) throws IOException, 
SAXException, TikaException {
-        switch(ch) {
-        case '~':
-            // Non-breaking space -> unicode NON-BREAKING SPACE
-            addOutputChar('\u00a0');
-            break;
-        case '*':
-            // Ignorable destination (control words defined after
-            // the 1987 RTF spec). These are already handled by
-            // processGroupStart()
-            break;
-        case '-':
-            // Optional hyphen -> unicode SOFT HYPHEN
-            addOutputChar('\u00ad');
-            break;
-        case '_':
-            // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
-            addOutputChar('\u2011');
-            break;
-        default:
-            break;
+        switch (ch) {
+            case '~':
+                // Non-breaking space -> unicode NON-BREAKING SPACE
+                addOutputChar('\u00a0');
+                break;
+            case '*':
+                // Ignorable destination (control words defined after
+                // the 1987 RTF spec). These are already handled by
+                // processGroupStart()
+                break;
+            case '-':
+                // Optional hyphen -> unicode SOFT HYPHEN
+                addOutputChar('\u00ad');
+                break;
+            case '_':
+                // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
+                addOutputChar('\u2011');
+                break;
+            default:
+                break;
         }
     }
 
@@ -890,8 +870,8 @@ final class TextExtractor {
                     currentList.templateID = param;
                 } else if (equals("levelnfc") || equals("levelnfcn")) {
                     //sanity check to make sure list information isn't corrupt
-                    if (listTableLevel > -1 && 
-                        listTableLevel < currentList.numberType.length ) {
+                    if (listTableLevel > -1 &&
+                            listTableLevel < currentList.numberType.length) {
                         currentList.numberType[listTableLevel] = param;
                     }
                 }
@@ -963,7 +943,7 @@ final class TextExtractor {
         } else if (equals("bin")) {
             if (param >= 0) {
                 if (groupState.pictDepth == 1) {
-                    try{
+                    try {
                         embObjHandler.writeBytes(in, param);
                     } catch (IOException e) {
                         //param was out of bounds or something went wrong 
during writing.
@@ -977,7 +957,7 @@ final class TextExtractor {
                     while (bytesToRead > 0) {
                         int r = in.read(tmpArray, 0, Math.min(bytesToRead, 
tmpArray.length));
                         if (r < 0) {
-                            throw new TikaException("unexpected end of file: 
need " + param + " bytes of binary data, found " + (param-bytesToRead));
+                            throw new TikaException("unexpected end of file: 
need " + param + " bytes of binary data, found " + (param - bytesToRead));
                         }
                         bytesToRead -= r;
                     }
@@ -1005,6 +985,7 @@ final class TextExtractor {
     /**
      * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to 
determine the list
      * type for the given <code>listID</code>.
+     *
      * @param listID The ID of the list.
      * @throws IOException
      * @throws SAXException
@@ -1019,6 +1000,7 @@ final class TextExtractor {
     /**
      * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to 
determine the list
      * type for the given <code>listID</code>.
+     *
      * @param listID The ID of the list.
      * @throws IOException
      * @throws SAXException
@@ -1051,11 +1033,11 @@ final class TextExtractor {
         if (inHeader) {
             if (equals("ansi")) {
                 globalCharset = WINDOWS_1252;
-            } else if (equals("pca")) { 
+            } else if (equals("pca")) {
                 globalCharset = CP850;
-            } else if (equals("pc")) { 
+            } else if (equals("pc")) {
                 globalCharset = CP437;
-            } else if (equals("mac")) { 
+            } else if (equals("mac")) {
                 globalCharset = MAC_ROMAN;
             }
 
@@ -1319,13 +1301,13 @@ final class TextExtractor {
 
         // Make new GroupState
         groupState = new GroupState(groupState);
-        assert groupStates.size() == groupState.depth: "size=" + 
groupStates.size() + " depth=" + groupState.depth;
+        assert groupStates.size() == groupState.depth : "size=" + 
groupStates.size() + " depth=" + groupState.depth;
 
         if (uprState == 0) {
             uprState = 1;
             groupState.ignore = true;
         }
-        
+
         // Check for ignorable groups. Note that
         // sometimes we un-ignore within this group, eg
         // when handling upr escape.
@@ -1335,7 +1317,7 @@ final class TextExtractor {
             if (b3 == '*') {
                 groupState.ignore = true;
             }
-               in.unread(b3);
+            in.unread(b3);
         }
         in.unread(b2);
     }
@@ -1346,7 +1328,7 @@ final class TextExtractor {
             if (nextMetaData != null) {
                 if (nextMetaData == TikaCoreProperties.CREATED) {
                     Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT);
-                    cal.set(year, month-1, day, hour, minute, 0);
+                    cal.set(year, month - 1, day, hour, minute, 0);
                     metadata.set(nextMetaData, cal.getTime());
                 } else if (nextMetaData.isMultiValuePermitted()) {
                     metadata.add(nextMetaData, pendingBuffer.toString());
@@ -1360,7 +1342,7 @@ final class TextExtractor {
 
         assert groupState.depth > 0;
         ansiSkip = 0;
-        
+
         if (groupState.objdata == true) {
             embObjHandler.handleCompletedObject();
             groupState.objdata = false;
@@ -1391,7 +1373,7 @@ final class TextExtractor {
             // bold changed:
             if (groupState.italic) {
                 if (!outerGroupState.italic ||
-                    groupState.bold != outerGroupState.bold) {
+                        groupState.bold != outerGroupState.bold) {
                     end("i");
                     groupState.italic = false;
                 }
@@ -1425,9 +1407,9 @@ final class TextExtractor {
                 final boolean isLocalLink = s.contains("\\l ");
                 int idx = s.indexOf('"');
                 if (idx != -1) {
-                    int idx2 = s.indexOf('"', 1+idx);
+                    int idx2 = s.indexOf('"', 1 + idx);
                     if (idx2 != -1) {
-                        s = s.substring(1+idx, idx2);
+                        s = s.substring(1 + idx, idx2);
                     }
                 }
                 pendingURL = (isLocalLink ? "#" : "") + s;

svn commit: r1682489 [6/14] - in /tika/trunk: tika-parsers/src/main/java/org/apache/tika/parser/html/ tika-parsers/src/main/java/org/apache/tika/parser/image/ tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/ tika-parsers/src/main/java/org/a...

Reply via email to