This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 90b63b5 TIKA-2998-- allow users to extract font names
90b63b5 is described below
commit 90b63b5f13509a8653c0efc9a69fab4b8cd7295f
Author: tballison <[email protected]>
AuthorDate: Fri Nov 22 14:35:51 2019 -0500
TIKA-2998-- allow users to extract font names
---
.../main/java/org/apache/tika/metadata/FONT.java | 28 ++++++++++++++++++++++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 25 +++++++++++++++++++
.../java/org/apache/tika/parser/pdf/PDFParser.java | 5 ++++
.../apache/tika/parser/pdf/PDFParserConfig.java | 17 +++++++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 11 +++++++++
5 files changed, 86 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/FONT.java
b/tika-core/src/main/java/org/apache/tika/metadata/FONT.java
new file mode 100644
index 0000000..71bb505
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/FONT.java
@@ -0,0 +1,28 @@
+package org.apache.tika.metadata; /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public interface FONT {
+
+ String PREFIX_FONT_META = "font"; // prefix used to build font metadata key names
+
+ /**
+ * Name of a font used in a file. Declared as a text bag; the PDF parser adds one value per distinct font name.
+ */
+ Property FONT_NAME = Property.internalTextBag(PREFIX_FONT_META +
+ TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "name");
+
+}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 32ed174..476601c 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -33,14 +33,18 @@ import java.nio.file.Path;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
+import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
@@ -87,6 +91,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.FONT;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -158,6 +163,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
int unmappedUnicodeCharsPerPage = 0;
int totalCharsPerPage = 0;
+ private final Set<String> fontNames = new HashSet<>();
+
AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler,
ParseContext context, Metadata metadata,
PDFParserConfig config) throws IOException {
this.pdDocument = pdDocument;
@@ -453,6 +460,19 @@ class AbstractPDF2XHTML extends PDFTextStripper {
} catch (IOException e) {
handleCatchableIOE(e);
}
+
+ if (config.getExtractFontNames()) {
+
+ for (COSName n : page.getResources().getFontNames()) {
+ PDFont font = page.getResources().getFont(n);
+ if (font != null && font.getFontDescriptor() != null) {
+ String fontName = font.getFontDescriptor().getFontName();
+ if (fontName != null) {
+ fontNames.add(fontName);
+ }
+ }
+ }
+ }
}
private void handleWidget(PDAnnotationWidget widget) throws TikaException,
SAXException, IOException {
@@ -581,6 +601,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to end a document", e);
}
+ if (fontNames.size() > 0) {
+ for (String fontName : fontNames) {
+ metadata.add(FONT.FONT_NAME, fontName);
+ }
+ }
}
void extractBookmarkText() throws SAXException, IOException, TikaException
{
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 75fe92d..a63754e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -491,6 +491,11 @@ public class PDFParser extends AbstractParser implements
Initializable {
}
@Field
+ void setExtractFontNames(boolean extractFontNames) { // delegates to defaultConfig; see PDFParserConfig#setExtractFontNames
+ defaultConfig.setExtractFontNames(extractFontNames);
+ }
+
+ @Field
void setSetKCMS(boolean setKCMS) {
defaultConfig.setSetKCMS(setKCMS);
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index b5d54d0..44324c2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -140,6 +140,8 @@ public class PDFParserConfig implements Serializable {
private boolean extractActions = false;
+ private boolean extractFontNames = false;
+
private long maxMainMemoryBytes = -1;
private boolean setKCMS = false;
@@ -203,6 +205,10 @@ public class PDFParserConfig implements Serializable {
setExtractUniqueInlineImagesOnly(
getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
getExtractUniqueInlineImagesOnly()));
+ setExtractFontNames(
+ getBooleanProp(props.getProperty("extractFontNames"),
+ getExtractFontNames()));
+
setIfXFAExtractOnlyXFA(
getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
@@ -318,6 +324,17 @@ public class PDFParserConfig implements Serializable {
this.extractBookmarksText = extractBookmarksText;
}
+ /**
+ * Sets whether font names should be extracted into a metadata field. The default is <code>false</code>.
+ * @param extractFontNames <code>true</code> to extract font names; <code>false</code> to skip extraction
+ */
+ public void setExtractFontNames(boolean extractFontNames) {
+ this.extractFontNames = extractFontNames;
+ }
+
+ public boolean getExtractFontNames() { // current value of the flag; see setExtractFontNames(boolean)
+ return extractFontNames;
+ }
/**
* @see #setExtractInlineImages(boolean)
*/
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 489ccfa..18d4f0c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -48,6 +48,7 @@ import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.FONT;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -150,6 +151,16 @@ public class PDFParserTest extends TikaTest {
}
@Test
+ public void testFontNameExtraction() throws Exception { // font names should appear in metadata once the option is enabled
+ PDFParserConfig config = new PDFParserConfig();
+ config.setExtractFontNames(true); // extraction is off by default, so opt in explicitly
+ ParseContext pc = new ParseContext();
+ pc.set(PDFParserConfig.class, config);
+ XMLResult r = getXML("testPDFVarious.pdf", pc);
+ assertContains("ABCDEE+Calibri", r.metadata.get(FONT.FONT_NAME)); // NOTE(review): get(...) presumably returns a single value while names are collected in a HashSet; confirm this does not depend on iteration order
+ }
+
+ @Test
public void testPdfParsingMetadataOnly() throws Exception {
Metadata metadata = getXML("testPDF.pdf").metadata;