This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 50e2e6940b997934e4b172bfd317775caa5ccd01 Author: tballison <[email protected]> AuthorDate: Fri Nov 22 14:35:51 2019 -0500 TIKA-2998-- allow users to extract font names --- .../main/java/org/apache/tika/metadata/FONT.java | 28 ++++++++++++++++++++++ .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 25 +++++++++++++++++++ .../java/org/apache/tika/parser/pdf/PDFParser.java | 5 ++++ .../apache/tika/parser/pdf/PDFParserConfig.java | 17 +++++++++++++ .../org/apache/tika/parser/pdf/PDFParserTest.java | 11 +++++++++ 5 files changed, 86 insertions(+) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/FONT.java b/tika-core/src/main/java/org/apache/tika/metadata/FONT.java new file mode 100644 index 0000000..71bb505 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/FONT.java @@ -0,0 +1,28 @@ +package org.apache.tika.metadata; /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public interface FONT { + + String PREFIX_FONT_META = "font"; + + /** + * Basic name of a font used in a file + */ + Property FONT_NAME = Property.internalTextBag(PREFIX_FONT_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "name"); + +} diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 2fbdbd2..19972c4 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -33,14 +33,18 @@ import java.nio.file.Path; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; +import java.util.HashSet; import java.util.List; import java.util.ListIterator; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import org.apache.commons.io.IOExceptionWithCause; import org.apache.commons.io.IOUtils; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; @@ -87,6 +91,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.FONT; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PDF; import org.apache.tika.metadata.TikaCoreProperties; @@ -158,6 +163,8 @@ class AbstractPDF2XHTML extends PDFTextStripper { int unmappedUnicodeCharsPerPage = 0; int totalCharsPerPage = 0; + private final Set<String> fontNames = new HashSet<>(); + AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws IOException { this.pdDocument = pdDocument; @@ -453,6 +460,19 @@ class AbstractPDF2XHTML extends PDFTextStripper { } catch (IOException e) { handleCatchableIOE(e); } + + if (config.getExtractFontNames()) { + + for (COSName n : page.getResources().getFontNames()) { + PDFont font = page.getResources().getFont(n); + if (font != null && font.getFontDescriptor() != null) { + String fontName = font.getFontDescriptor().getFontName(); + if (fontName != null) { + fontNames.add(fontName); + } + } + } + } } private void handleWidget(PDAnnotationWidget widget) throws TikaException, SAXException, IOException { @@ -581,6 +601,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a document", e); } + if (fontNames.size() > 0) { + for (String fontName : fontNames) { + metadata.add(FONT.FONT_NAME, fontName); + } + } } void extractBookmarkText() throws SAXException, IOException, TikaException { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index b6ee44c..d2839fa 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -503,6 +503,11 @@ public class PDFParser extends AbstractParser implements Initializable { } @Field + void setExtractFontNames(boolean extractFontNames) { + defaultConfig.setExtractFontNames(extractFontNames); + } + + @Field void setSetKCMS(boolean setKCMS) { defaultConfig.setSetKCMS(setKCMS); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 7f4a751..339ce5d 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -141,6 +141,8 @@ public class PDFParserConfig implements Serializable { private boolean extractActions = false; + private boolean extractFontNames = false; + private long maxMainMemoryBytes = -1; private boolean setKCMS = false; @@ -204,6 +206,10 @@ public class PDFParserConfig implements Serializable { setExtractUniqueInlineImagesOnly( getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"), getExtractUniqueInlineImagesOnly())); + setExtractFontNames( + getBooleanProp(props.getProperty("extractFontNames"), + getExtractFontNames())); + setIfXFAExtractOnlyXFA( getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"), @@ -318,6 +324,17 @@ public class PDFParserConfig implements Serializable { this.extractBookmarksText = extractBookmarksText; } + /** + * Extract font names into a metadata field + * @param extractFontNames + */ + public void setExtractFontNames(boolean extractFontNames) { + this.extractFontNames = extractFontNames; + } + + public boolean getExtractFontNames() { + return extractFontNames; + } /** * @see #setExtractInlineImages(boolean) */ diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 50e5e38..5649d56 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -47,6 +47,7 @@ import org.apache.tika.extractor.ContainerExtractor; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.extractor.ParserContainerExtractor; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.FONT; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.PDF; @@ -148,6 +149,16 @@ public class PDFParserTest extends TikaTest { } @Test + public void testFontNameExtraction() throws Exception { + PDFParserConfig config = new PDFParserConfig(); + config.setExtractFontNames(true); + ParseContext pc = new ParseContext(); + pc.set(PDFParserConfig.class, config); + XMLResult r = getXML("testPDFVarious.pdf", pc); + assertContains("ABCDEE+Calibri", r.metadata.get(FONT.FONT_NAME)); + } + + @Test public void testPdfParsingMetadataOnly() throws Exception { Metadata metadata = getXML("testPDF.pdf").metadata;
