This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new f30180baa TIKA-4033 -- improve metadata handling for incremental
updates. Generalize incremental updates to "version" and avoid use of
synthetic name for resourceName. (#1121)
f30180baa is described below
commit f30180baa6b7da6ad56ff1e4f5f963d0fc7f4e84
Author: Tim Allison <[email protected]>
AuthorDate: Tue May 9 10:10:19 2023 -0400
TIKA-4033 -- improve metadata handling for incremental updates. Generalize
incremental updates to "version" and avoid use of synthetic name for
resourceName. (#1121)
---
.../src/main/java/org/apache/tika/metadata/PDF.java | 20 ++++++++++++--------
.../org/apache/tika/metadata/TikaCoreProperties.java | 14 ++++++++++++++
.../apache/tika/parser/RecursiveParserWrapper.java | 2 ++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 1 -
.../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +-
.../tika/parser/pdf/PDFIncrementalUpdatesTest.java | 16 +++++++++++-----
.../org/apache/tika/parser/pdf/PDFParserTest.java | 12 ++++++++----
7 files changed, 48 insertions(+), 19 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 0739855fc..c2baca0e8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -28,12 +28,6 @@ public interface PDF {
String PDFAID_PREFIX = "pdfaid" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
- /**
- * Incremental updates as extracted by the StartXRefScanner. See
- * that class for limitations.
- */
- Property PDF_INCREMENTAL_UPDATES = Property.externalInteger(PDF_PREFIX +
"incrementalUpdates");
-
/**
* Number of %%EOF as extracted by the StartXRefScanner. See
* that class for limitations.
@@ -203,6 +197,16 @@ public interface PDF {
* This value is populated when the parse incremental updates feature is
selected
* in the PDFParser.
*/
- Property INCREMENTAL_UPDATE_NUMBER = Property.internalInteger(PDF_PREFIX +
- "incrementalUpdateNumber");
+ Property INCREMENTAL_UPDATE_NUMBER =
+ Property.composite(Property.internalInteger(PDF_PREFIX +
"incrementalUpdateNumber"),
+ new Property[]{ TikaCoreProperties.VERSION_NUMBER });
+
+ /**
+ * Incremental updates as extracted by the StartXRefScanner. See
+ * that class for limitations.
+ */
+ Property PDF_INCREMENTAL_UPDATE_COUNT =
+ Property.composite( Property.externalInteger(PDF_PREFIX +
"incrementalUpdateCount"),
+ new Property[]{ TikaCoreProperties.VERSION_COUNT });
+
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 95cf0a035..b49068c39 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -322,6 +322,20 @@ public interface TikaCoreProperties {
//is the file encrypted
Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX +
"encrypted");
+ /**
+ * General metadata key for the count of non-final versions available
within a file. This
+ * was added initially to support generalizing incremental updates in PDF.
+ */
+ Property VERSION_COUNT = Property.externalInteger(TIKA_META_PREFIX +
"versionCount");
+
+ /**
+ * General metadata key for the version number of a given file that
contains
+ * earlier versions within it. This number is 0-indexed for the earliest
version.
+ * The latest version does not have this metadata value. This was added
initially
+ * to support generalizing incremental updates in PDF.
+ */
+ Property VERSION_NUMBER = Property.externalInteger(TIKA_META_PREFIX +
"versionNumber");
+
/**
* A file might contain different types of embedded documents.
* The most common is the ATTACHMENT.
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 101aa3395..483181b0a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -188,6 +188,8 @@ public class RecursiveParserWrapper extends ParserDecorator
{
objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
} else if (metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID)
!= null) {
objectName =
metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
+ } else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) {
+ objectName = "version-number-" +
metadata.get(TikaCoreProperties.VERSION_NUMBER);
} else {
objectName = "embedded-" + (++state.unknownCount);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 5b2d4a659..3bd1a90a8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -1100,7 +1100,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
updateMetadata.set(PDF.INCREMENTAL_UPDATE_NUMBER, count);
updateMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.VERSION.toString());
- updateMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
"incremental-update-" + count);
if (embeddedDocumentExtractor.shouldParseEmbedded(updateMetadata))
{
try (InputStream tis = TikaInputStream.get(update)) {
context.set(IsIncrementalUpdate.class,
IsIncrementalUpdate.IS_INCREMENTAL_UPDATE);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 5cd074c21..07c4e2d9e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -279,7 +279,7 @@ public class PDFParser extends AbstractParser implements
RenderingParser, Initia
//don't count the last xref as an incremental update
startXrefs--;
}
- metadata.set(PDF.PDF_INCREMENTAL_UPDATES, startXrefs);
+ metadata.set(PDF.PDF_INCREMENTAL_UPDATE_COUNT, startXrefs);
if (localConfig.isParseIncrementalUpdates()) {
try {
parseContext.set(IncrementalUpdateRecord.class,
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
index 86b4b6081..7b6c88626 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
@@ -67,7 +67,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata(
"testPDF_incrementalUpdates.pdf",
parseContext);
- assertEquals(2,
metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+ assertEquals(2,
metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
+ assertEquals(2,
metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
long[] expected = new long[]{16242, 41226, 64872};
long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
assertEquals(3, eofs.length);
@@ -155,7 +156,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
"testPDF_incrementalUpdates.pdf",
parseContext);
assertEquals(3, metadataList.size());
- assertEquals(2,
metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+ assertEquals(2,
metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
+ assertEquals(2,
metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
long[] expected = new long[]{16242, 41226, 64872};
long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
assertEquals(3, eofs.length);
@@ -170,9 +172,13 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
assertNull(metadataList.get(0).get(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(0,
metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(1,
metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
- assertEquals("incremental-update-0",
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals("incremental-update-1",
- metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("/version-number-0",
+
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("/version-number-1",
+
metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+
+
assertNull(metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
assertNull(metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ab0340afe..78f54c4f1 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -477,7 +477,9 @@ public class PDFParserTest extends TikaTest {
parseContext.set(PDFParserConfig.class, pdfParserConfig);
List<Metadata> metadataList =
getRecursiveMetadata("test-incremental-updates.eml", parseContext);
assertEquals(4, metadataList.size());
- assertEquals(2,
metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+ assertEquals(2,
metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
+ assertEquals(2,
+ metadataList.get(3).getInt(TikaCoreProperties.VERSION_COUNT));
long[] expected = new long[]{16242, 41226, 64872};
long[] eofs = metadataList.get(3).getLongValues(PDF.EOF_OFFSETS);
assertEquals(3, eofs.length);
@@ -494,9 +496,11 @@ public class PDFParserTest extends TikaTest {
assertNull(metadataList.get(3).get(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(0,
metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(1,
metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
- assertEquals("incremental-update-0",
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals("incremental-update-1",
- metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
+ assertEquals("/testPDF_incrementalUpdates.pdf/version-number-0",
+
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("/testPDF_incrementalUpdates.pdf/version-number-1",
+
metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));