This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new f30180baa TIKA-4033 -- improve metadata handling for incremental updates. Generalize incremental updates to "version" and avoid use of synthetic name for resourceName. (#1121) f30180baa is described below commit f30180baa6b7da6ad56ff1e4f5f963d0fc7f4e84 Author: Tim Allison <talli...@apache.org> AuthorDate: Tue May 9 10:10:19 2023 -0400 TIKA-4033 -- improve metadata handling for incremental updates. Generalize incremental updates to "version" and avoid use of synthetic name for resourceName. (#1121) --- .../src/main/java/org/apache/tika/metadata/PDF.java | 20 ++++++++++++-------- .../org/apache/tika/metadata/TikaCoreProperties.java | 14 ++++++++++++++ .../apache/tika/parser/RecursiveParserWrapper.java | 2 ++ .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 1 - .../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +- .../tika/parser/pdf/PDFIncrementalUpdatesTest.java | 16 +++++++++++----- .../org/apache/tika/parser/pdf/PDFParserTest.java | 12 ++++++++---- 7 files changed, 48 insertions(+), 19 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index 0739855fc..c2baca0e8 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -28,12 +28,6 @@ public interface PDF { String PDFAID_PREFIX = "pdfaid" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; - /** - * Incremental updates as extracted by the StartXRefScanner. See - * that class for limitations. - */ - Property PDF_INCREMENTAL_UPDATES = Property.externalInteger(PDF_PREFIX + "incrementalUpdates"); - /** * Number of %%EOF as extracted by the StartXRefScanner. See * that class for limitations. @@ -203,6 +197,16 @@ public interface PDF { * This value is populated with the parse incremental updates feature is selected * in the PDFParser. */ - Property INCREMENTAL_UPDATE_NUMBER = Property.internalInteger(PDF_PREFIX + - "incrementalUpdateNumber"); + Property INCREMENTAL_UPDATE_NUMBER = + Property.composite(Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"), + new Property[]{ TikaCoreProperties.VERSION_NUMBER }); + + /** + * Incremental updates as extracted by the StartXRefScanner. See + * that class for limitations. + */ + Property PDF_INCREMENTAL_UPDATE_COUNT = + Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), + new Property[]{ TikaCoreProperties.VERSION_COUNT }); + } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index 95cf0a035..b49068c39 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -322,6 +322,20 @@ public interface TikaCoreProperties { //is the file encrypted Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted"); + /** + * General metadata key for the count of non-final versions available within a file. This + * was added initially to support generalizing incremental updates in PDF. + */ + Property VERSION_COUNT = Property.externalInteger(TIKA_META_PREFIX + "versionCount"); + + /** + * General metadata key for the version number of a given file that contains + * earlier versions within it. This number is 0-indexed for the earliest version. + * The latest version does not have this metadata value. This was added initially + * to support generalizing incremental updates in PDF. + */ + Property VERSION_NUMBER = Property.externalInteger(TIKA_META_PREFIX + "versionNumber"); + /** * A file might contain different types of embedded documents. * The most common is the ATTACHMENT. diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index 101aa3395..483181b0a 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -188,6 +188,8 @@ public class RecursiveParserWrapper extends ParserDecorator { objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); } else if (metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID) != null) { objectName = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID); + } else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) { + objectName = "version-number-" + metadata.get(TikaCoreProperties.VERSION_NUMBER); } else { objectName = "embedded-" + (++state.unknownCount); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 5b2d4a659..3bd1a90a8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -1100,7 +1100,6 @@ class AbstractPDF2XHTML extends PDFTextStripper { updateMetadata.set(PDF.INCREMENTAL_UPDATE_NUMBER, count); updateMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.VERSION.toString()); - updateMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "incremental-update-" + count); if (embeddedDocumentExtractor.shouldParseEmbedded(updateMetadata)) { try (InputStream tis = TikaInputStream.get(update)) { context.set(IsIncrementalUpdate.class, IsIncrementalUpdate.IS_INCREMENTAL_UPDATE); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 5cd074c21..07c4e2d9e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -279,7 +279,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia //don't count the last xref as an incremental update startXrefs--; } - metadata.set(PDF.PDF_INCREMENTAL_UPDATES, startXrefs); + metadata.set(PDF.PDF_INCREMENTAL_UPDATE_COUNT, startXrefs); if (localConfig.isParseIncrementalUpdates()) { try { parseContext.set(IncrementalUpdateRecord.class, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java index 86b4b6081..7b6c88626 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java @@ -67,7 +67,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest { List<Metadata> metadataList = getRecursiveMetadata( "testPDF_incrementalUpdates.pdf", parseContext); - assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES)); + assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT)); + assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT)); long[] expected = new long[]{16242, 41226, 64872}; long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS); assertEquals(3, eofs.length); @@ -155,7 +156,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest { "testPDF_incrementalUpdates.pdf", parseContext); assertEquals(3, metadataList.size()); - assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES)); + assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT)); + assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT)); long[] expected = new long[]{16242, 41226, 64872}; long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS); assertEquals(3, eofs.length); @@ -170,9 +172,13 @@ public class PDFIncrementalUpdatesTest extends TikaTest { assertNull(metadataList.get(0).get(PDF.INCREMENTAL_UPDATE_NUMBER)); assertEquals(0, metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER)); assertEquals(1, metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER)); - assertEquals("incremental-update-0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertEquals("incremental-update-1", - metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("/version-number-0", + metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + assertEquals("/version-number-1", + metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + + assertNull(metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertNull(metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(), metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index ab0340afe..78f54c4f1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -477,7 +477,9 @@ public class PDFParserTest extends TikaTest { parseContext.set(PDFParserConfig.class, pdfParserConfig); List<Metadata> metadataList = getRecursiveMetadata("test-incremental-updates.eml", parseContext); assertEquals(4, metadataList.size()); - assertEquals(2, metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATES)); + assertEquals(2, metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT)); + assertEquals(2, + metadataList.get(3).getInt(TikaCoreProperties.VERSION_COUNT)); long[] expected = new long[]{16242, 41226, 64872}; long[] eofs = metadataList.get(3).getLongValues(PDF.EOF_OFFSETS); assertEquals(3, eofs.length); @@ -494,9 +496,11 @@ public class PDFParserTest extends TikaTest { assertNull(metadataList.get(3).get(PDF.INCREMENTAL_UPDATE_NUMBER)); assertEquals(0, metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER)); assertEquals(1, metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER)); - assertEquals("incremental-update-0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertEquals("incremental-update-1", - metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + + assertEquals("/testPDF_incrementalUpdates.pdf/version-number-0", + metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + assertEquals("/testPDF_incrementalUpdates.pdf/version-number-1", + metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(), metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));