Author: tallison
Date: Tue Jul 21 01:34:25 2015
New Revision: 1692042

URL: http://svn.apache.org/r1692042
Log:
TIKA-1678 -- initial commit.  Need to wait for fix to PDFBOX-2896 to generate 
test file.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/
    tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/
    
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java?rev=1692042&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java
 Tue Jul 21 01:34:25 2015
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pdfbox.pdfparser;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.pdfbox.cos.COSString;
+
+/**
+ * In fairly rare cases, a PDF's XMP will contain a string that has
+ * incorrectly been encoded with PDFEncoding: octal escapes for non-ascii and
+ * ascii for ascii, e.g. "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"
+ * <p>
+ * This class can be used to decode those strings.
+ * <p>
+ * See TIKA-1678.  Many thanks to Andrew Jackson for raising this issue
+ * and Tilman Hausherr for the solution.
+ * <p>
+ * Unfortunately, because {@link BaseParser#parseCOSString()} is protected, we
+ * had to put this in o.a.pdfbox.pdfparser and not in o.a.t.parser.pdf.
+ */
+public class PDFOctalUnicodeDecoder {
+
+    private static final String[] PDF_ENCODING_BOMS = {
+            "\\376\\377", //UTF-16BE
+            "\\377\\376", //UTF-16LE
+            "\\357\\273\\277"//UTF-8
+    };
+
+    /**
+     * Does this string start with an octal-encoded UTF BOM?
+     * Call this statically to decide whether decoding is worthwhile.
+     * @param s string to test; null or fewer than 8 chars (shortest BOM) is false
+     * @return true if s starts with an escaped UTF-16BE, UTF-16LE or UTF-8 BOM
+     */
+    public static boolean shouldDecode(String s) {
+        if (s == null || s.length() < 8) {
+            return false;
+        }
+        for (String bom : PDF_ENCODING_BOMS) {
+            if (s.startsWith(bom)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Decodes an octal-encoded string by wrapping it in parentheses and
+     * parsing it as a {@link COSString}.  A value that fails
+     * {@link #shouldDecode(String)} (including null) is returned as is.
+     * @param value the string to decode
+     * @return the decoded string, or value itself if it could not be parsed
+     */
+    public String decode(String value) {
+        if (!shouldDecode(value)) {
+            return value;
+        }
+        try {
+            byte[] bytes = ("(" + value + ")").getBytes("ISO-8859-1");
+            COSStringParser p = new COSStringParser(new ByteArrayInputStream(bytes));
+            COSString cosString = p.parseCOSString();
+            if (cosString != null) {
+                return cosString.getString();
+            }
+        } catch (IOException ignored) {
+            //oh well, we tried: fall through and return value unchanged
+        }
+        return value;
+    }
+
+    /** Minimal BaseParser subclass for calling parseCOSString(); needs no outer instance. */
+    private static final class COSStringParser extends BaseParser {
+        private COSStringParser(InputStream buffer) throws IOException {
+            super(buffer);
+        }
+    }
+}

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1692042&r1=1692041&r2=1692042&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java 
Tue Jul 21 01:34:25 2015
@@ -37,6 +37,7 @@ import org.apache.pdfbox.exceptions.Cryp
 import org.apache.pdfbox.io.RandomAccess;
 import org.apache.pdfbox.io.RandomAccessBuffer;
 import org.apache.pdfbox.io.RandomAccessFile;
+import org.apache.pdfbox.pdfparser.PDFOctalUnicodeDecoder;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
@@ -347,7 +348,7 @@ public class PDFParser extends AbstractP
         //if schema is null, just go with pdfBoxBaseline
         if (schema == null) {
             if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-                metadata.set(property, pdfBoxBaseline);
+                addMetadata(metadata, property, pdfBoxBaseline);
             }
             return;
         }
@@ -360,7 +361,7 @@ public class PDFParser extends AbstractP
                 if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
                     continue;
                 }
-                metadata.add(property, value);
+                addMetadata(metadata, property, value);
                 if (!property.isMultiValuePermitted()) {
                     return;
                 }
@@ -375,7 +376,7 @@ public class PDFParser extends AbstractP
                     return;
                 }
             }
-            metadata.add(property, pdfBoxBaseline);
+            addMetadata(metadata, property, pdfBoxBaseline);
         }
     }
 
@@ -446,14 +447,26 @@ public class PDFParser extends AbstractP
 
     private void addMetadata(Metadata metadata, Property property, String 
value) {
         if (value != null) {
-            metadata.add(property, value);
+            String decoded = decode(value);
+            if (property.isMultiValuePermitted() || metadata.get(property) == 
null) {
+                metadata.add(property, decoded);
+            }
+            //silently skip adding property that already exists if multiple 
values are not permitted
         }
     }
 
     private void addMetadata(Metadata metadata, String name, String value) {
         if (value != null) {
-            metadata.add(name, value);
+            metadata.add(name, decode(value));
+        }
+    }
+
+    private String decode(String value) {
+        if (PDFOctalUnicodeDecoder.shouldDecode(value)) {
+            PDFOctalUnicodeDecoder d = new PDFOctalUnicodeDecoder();
+            return d.decode(value);
         }
+        return value;
     }
 
     private void addMetadata(Metadata metadata, String name, Calendar value) {


Reply via email to