Author: tallison
Date: Tue Jul 21 01:34:25 2015
New Revision: 1692042
URL: http://svn.apache.org/r1692042
Log:
TIKA-1678 -- initial commit. Need to wait for fix to PDFBOX-2896 to generate
test file.
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java?rev=1692042&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/pdfbox/pdfparser/PDFOctalUnicodeDecoder.java
Tue Jul 21 01:34:25 2015
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pdfbox.pdfparser;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.pdfbox.cos.COSString;
+
+/**
+ * In fairly rare cases, a PDF's XMP will contain a string that
+ * has incorrectly been encoded with PDFEncoding: an octal escape for each
+ * non-ascii byte and plain ascii for ascii, e.g.
+ * {@code "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"}
+ * <p>
+ * This class can be used to decode those strings.
+ * <p>
+ * See TIKA-1678. Many thanks to Andrew Jackson for raising this issue
+ * and Tilman Hausherr for the solution.
+ * <p>
+ * Unfortunately because {@link BaseParser#parseCOSString()} is protected, we
+ * had to put this in o.a.pdfbox.pdfparser and not in o.a.t.parser.pdf
+ */
+public class PDFOctalUnicodeDecoder {
+
+    /**
+     * Byte-order marks as they look when each byte has been written out as a
+     * literal backslash followed by three octal digits.
+     */
+    private static final String[] PDF_ENCODING_BOMS = {
+        "\\376\\377",      //UTF-16BE
+        "\\377\\376",      //UTF-16LE
+        "\\357\\273\\277"  //UTF-8
+    };
+
+    /**
+     * Does this string start with an octal-encoded UTF BOM?
+     * Call this statically to determine if you should bother creating a new
+     * decoder to parse it.
+     *
+     * @param s candidate string; may be null
+     * @return true if <code>s</code> is non-null and begins with one of the
+     *         octal-encoded BOMs, false otherwise
+     */
+    public static boolean shouldDecode(String s) {
+        if (s == null) {
+            return false;
+        }
+        //startsWith already rejects anything shorter than the BOM itself,
+        //so no separate length check is needed
+        for (String bom : PDF_ENCODING_BOMS) {
+            if (s.startsWith(bom)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * This assumes that {@link #shouldDecode(String)} has been called
+     * and has returned true. If you run this on a non-octal encoded string,
+     * disaster will happen!
+     *
+     * @param value octal-encoded string to decode; may be null
+     * @return the string produced by parsing <code>value</code> as a
+     *         parenthesized COSString; <code>value</code> itself if parsing
+     *         yields nothing or throws an IOException; null if
+     *         <code>value</code> is null
+     */
+    public String decode(String value) {
+        if (value == null) {
+            //concatenating null below would feed the literal text "(null)"
+            //to the parser
+            return null;
+        }
+        try {
+            //wrap the value in parentheses before handing it to parseCOSString
+            byte[] bytes = ("(" + value + ")").getBytes("ISO-8859-1");
+            InputStream is = new ByteArrayInputStream(bytes);
+            COSStringParser p = new COSStringParser(is);
+            COSString cosString = p.parseCOSString();
+            if (cosString != null) {
+                return cosString.getString();
+            }
+        } catch (IOException e) {
+            //oh well, we tried; deliberately best-effort, fall through
+        }
+        //just return value if something went wrong
+        return value;
+    }
+
+    /**
+     * Bare subclass of {@link BaseParser} that exists only so this class can
+     * construct a parser and call {@link BaseParser#parseCOSString()}.
+     * Static because it needs nothing from the enclosing decoder instance.
+     */
+    private static class COSStringParser extends BaseParser {
+
+        private COSStringParser(InputStream buffer) throws IOException {
+            super(buffer);
+        }
+    }
+}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1692042&r1=1692041&r2=1692042&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Tue Jul 21 01:34:25 2015
@@ -37,6 +37,7 @@ import org.apache.pdfbox.exceptions.Cryp
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessFile;
+import org.apache.pdfbox.pdfparser.PDFOctalUnicodeDecoder;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
@@ -347,7 +348,7 @@ public class PDFParser extends AbstractP
//if schema is null, just go with pdfBoxBaseline
if (schema == null) {
if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
- metadata.set(property, pdfBoxBaseline);
+ addMetadata(metadata, property, pdfBoxBaseline);
}
return;
}
@@ -360,7 +361,7 @@ public class PDFParser extends AbstractP
if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
continue;
}
- metadata.add(property, value);
+ addMetadata(metadata, property, value);
if (!property.isMultiValuePermitted()) {
return;
}
@@ -375,7 +376,7 @@ public class PDFParser extends AbstractP
return;
}
}
- metadata.add(property, pdfBoxBaseline);
+ addMetadata(metadata, property, pdfBoxBaseline);
}
}
@@ -446,14 +447,26 @@ public class PDFParser extends AbstractP
private void addMetadata(Metadata metadata, Property property, String
value) {
if (value != null) {
- metadata.add(property, value);
+ String decoded = decode(value);
+ if (property.isMultiValuePermitted() || metadata.get(property) ==
null) {
+ metadata.add(property, decoded);
+ }
+ //silently skip adding property that already exists if multiple values are not permitted
}
}
private void addMetadata(Metadata metadata, String name, String value) {
if (value != null) {
- metadata.add(name, value);
+ metadata.add(name, decode(value));
+ }
+ }
+
+ private String decode(String value) {
+ if (PDFOctalUnicodeDecoder.shouldDecode(value)) {
+ PDFOctalUnicodeDecoder d = new PDFOctalUnicodeDecoder();
+ return d.decode(value);
}
+ return value;
}
private void addMetadata(Metadata metadata, String name, Calendar value) {