This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 31c1a08ad TIKA-4410 (#2226) -- improve feature extraction from xlsx
31c1a08ad is described below
commit 31c1a08ad1d08fdc088dc5cbb28f18363414543e
Author: Tim Allison <[email protected]>
AuthorDate: Fri May 30 09:41:52 2025 -0400
TIKA-4410 (#2226) -- improve feature extraction from xlsx
---
.../main/java/org/apache/tika/metadata/Office.java | 18 +++++++
.../apache/tika/metadata/TikaCoreProperties.java | 7 ++-
.../metadata/writefilter/StandardWriteFilter.java | 2 +-
.../org/apache/tika/pipes/PipesClientTest.java | 5 +-
.../parser/microsoft/AbstractPOIFSExtractor.java | 8 +--
.../microsoft/ooxml/CommentPersonHandler.java | 47 ++++++++++++++++
.../parser/microsoft/ooxml/OPCPackageWrapper.java | 1 +
.../ooxml/XSSFBExcelExtractorDecorator.java | 3 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 62 ++++++++++++++++++++--
.../microsoft/POIContainerExtractionTest.java | 7 +--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 12 +++--
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 3 +-
12 files changed, 149 insertions(+), 26 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index aa4b9f002..9d5442b67 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -152,4 +152,22 @@ public interface Office {
Property PROG_ID = Property.internalText("msoffice:progID");
Property OCX_NAME = Property.internalText("msoffice:ocxName");
+
+ Property EMBEDDED_STORAGE_CLASS_ID =
Property.internalText("msoffice:embeddedStorageClassId");
+
+ Property HAS_HIDDEN_SHEETS =
Property.internalBoolean("msoffice:excel:has-hidden-sheets");
+
+ Property HAS_VERY_HIDDEN_SHEETS =
Property.internalBoolean("msoffice:excel:has-very-hidden-sheets");
+
+ Property HIDDEN_SHEET_NAMES =
Property.internalTextBag("msoffice:excel:hidden-sheet-names");
+ Property VERY_HIDDEN_SHEET_NAMES =
Property.internalTextBag("msoffice:excel:very-hidden-sheet-names");
+
+ Property PROTECTED_WORKSHEET =
Property.internalBoolean("msoffice:excel:protected-worksheet");
+
+ Property WORKBOOK_CODENAME =
Property.internalText("msoffice:excel:workbook-codename");
+
+ Property HAS_COMMENTS = Property.internalBoolean("msoffice:has-comments");
+
+ Property COMMENT_PERSONS =
Property.internalTextBag("msoffice:comment-person-display-name");
+
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 3d7d34d4e..7e36624c5 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -175,10 +175,9 @@ public interface TikaCoreProperties {
Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW =
Property.externalRealSeq(TIKA_META_PREFIX +
"detected_language_confidence_raw");
- String RESOURCE_NAME_KEY = "resourceName";
- String PROTECTED = "protected";
- String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
- String EMBEDDED_STORAGE_CLASS_ID = "embeddedStorageClassId";
+ Property RESOURCE_NAME_KEY = Property.internalText(TIKA_META_PREFIX +
"resourceName");
+ Property EMBEDDED_RELATIONSHIP_ID = Property.internalText(TIKA_META_PREFIX
+ "embeddedRelationshipId");
+
String EMBEDDED_RESOURCE_TYPE_KEY = "embeddedResourceType";
/**
* Some file formats can store information about their original
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
index a245e8d2c..38763d079 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
@@ -81,7 +81,7 @@ public class StandardWriteFilter implements
MetadataWriteFilter, Serializable {
ALWAYS_SET_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE.getName());
ALWAYS_SET_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_HINT.getName());
ALWAYS_SET_FIELDS.add(TikaCoreProperties.TIKA_CONTENT.getName());
- ALWAYS_SET_FIELDS.add(TikaCoreProperties.RESOURCE_NAME_KEY);
+ ALWAYS_SET_FIELDS.add(TikaCoreProperties.RESOURCE_NAME_KEY.getName());
ALWAYS_SET_FIELDS.add(AccessPermissions.EXTRACT_CONTENT.getName());
ALWAYS_SET_FIELDS.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY.getName());
ALWAYS_SET_FIELDS.add(Metadata.CONTENT_DISPOSITION);
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java
b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java
index 35d52fc4a..2a3f71138 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java
@@ -29,6 +29,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.MockUpperCaseFilter;
@@ -63,7 +64,7 @@ public class PipesClientTest {
Assertions.assertNotNull(pipesResult.getEmitData().getMetadataList());
Assertions.assertEquals(1,
pipesResult.getEmitData().getMetadataList().size());
Metadata metadata = pipesResult.getEmitData().getMetadataList().get(0);
- Assertions.assertEquals("testOverlappingText.pdf",
metadata.get("resourceName"));
+ Assertions.assertEquals("testOverlappingText.pdf",
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
}
@Test
@@ -77,7 +78,7 @@ public class PipesClientTest {
Assertions.assertNotNull(pipesResult.getEmitData().getMetadataList());
Assertions.assertEquals(1,
pipesResult.getEmitData().getMetadataList().size());
Metadata metadata = pipesResult.getEmitData().getMetadataList().get(0);
- Assertions.assertEquals("TESTOVERLAPPINGTEXT.PDF",
metadata.get("resourceName"));
+ Assertions.assertEquals("TESTOVERLAPPINGTEXT.PDF",
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 8910b1c00..39d5a0f0a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -126,7 +126,7 @@ abstract class AbstractPOIFSExtractor {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID,
relationshipID);
}
if (storageClassID != null) {
-
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
+ embeddedMetadata.set(Office.EMBEDDED_STORAGE_CLASS_ID,
storageClassID.toString());
}
if (mediaType != null) {
@@ -200,7 +200,7 @@ abstract class AbstractPOIFSExtractor {
// What kind of document is it?
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID,
dir.getName());
if (dir.getStorageClsid() != null) {
- metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
+ metadata.set(Office.EMBEDDED_STORAGE_CLASS_ID,
dir.getStorageClsid().toString());
}
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
@@ -383,7 +383,7 @@ abstract class AbstractPOIFSExtractor {
return;
}
if (dir.getStorageClsid() != null) {
- metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
+ metadata.set(Office.EMBEDDED_STORAGE_CLASS_ID,
dir.getStorageClsid().toString());
}
embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, outputHtml);
@@ -398,7 +398,7 @@ abstract class AbstractPOIFSExtractor {
try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
tis.setOpenContainer(dir);
if (dir.getStorageClsid() != null) {
- metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
+ metadata.set(Office.EMBEDDED_STORAGE_CLASS_ID,
dir.getStorageClsid().toString());
}
embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata,
outputHtml);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentPersonHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentPersonHandler.java
new file mode 100644
index 000000000..c7efda1ae
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentPersonHandler.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.utils.StringUtils;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * SAX handler that records the <code>displayName</code> attribute of every
+ * <code>person</code> element it encounters as a value of
+ * {@link Office#COMMENT_PERSONS} on the supplied metadata object.
+ * Elements whose display name is blank are skipped.
+ */
+public class CommentPersonHandler extends DefaultHandler {
+
+ private final Metadata metadata;
+
+ CommentPersonHandler(Metadata metadata) {
+ this.metadata = metadata;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ //only <person/> elements are of interest, e.g.
+ //<person displayName="Wiley Coyote" id="{11111111-2234-2342-2342-23498237923}" userId="55bbdf23486284" providerId="Windows Live"/>
+ //what else do we want?
+ if (!"person".equals(localName)) {
+ return;
+ }
+ String personName = XMLReaderUtils.getAttrValue("displayName", atts);
+ if (StringUtils.isBlank(personName)) {
+ return;
+ }
+ metadata.add(Office.COMMENT_PERSONS, personName);
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
index 2cfd24f92..34834a416 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
@@ -29,6 +29,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
*/
public class OPCPackageWrapper implements Closeable {
+ public static final String PERSON_RELATION =
"http://schemas.microsoft.com/office/2017/10/relationships/person";
private final OPCPackage opcPackage;
public OPCPackageWrapper(OPCPackage opcPackage) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
index 77000b9a9..51a30cdc9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -39,6 +39,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -64,7 +65,7 @@ public class XSSFBExcelExtractorDecorator extends
XSSFExcelExtractorDecorator {
this.metadata = metadata;
this.parseContext = context;
- metadata.set(TikaCoreProperties.PROTECTED, "false");
+ metadata.set(Office.PROTECTED_WORKSHEET, false);
super.getXHTML(handler, metadata, context);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index da5357937..97c629b6a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -35,6 +35,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
@@ -68,11 +69,13 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
@@ -122,7 +125,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
this.metadata = metadata;
this.parseContext = context;
- metadata.set(TikaCoreProperties.PROTECTED, "false");
+ metadata.set(Office.PROTECTED_WORKSHEET, "false");
super.getXHTML(handler, metadata, context);
}
@@ -148,7 +151,6 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
} catch (OpenXML4JException e) {
throw new XmlException(e);
}
-
while (iter.hasNext()) {
SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config,
xhtml);
PackagePart sheetPart = null;
@@ -159,6 +161,9 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
sheetParts.add(sheetPart);
Comments comments = iter.getSheetComments();
+ if (comments != null && comments.getNumberOfComments() > 0) {
+ metadata.set(Office.HAS_COMMENTS, true);
+ }
// Start, and output the sheet name
xhtml.startElement("div");
@@ -201,13 +206,44 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
//consider adding this back to POI
try (InputStream wbData = xssfReader.getWorkbookData()) {
XMLReaderUtils
- .parseSAX(wbData, new AbsPathExtractorHandler(),
+ .parseSAX(wbData, new WorkbookMetadataHandler(),
parseContext);
} catch (InvalidFormatException | TikaException e) {
//swallow
}
+ try {
+ getPersons(container, metadata);
+ } catch (InvalidFormatException | TikaException | IOException |
SAXException e) {
+ //swallow
+ }
}
+ /**
+ * Finds the package's core document (workbook) part, follows its
+ * relationships of type {@link OPCPackageWrapper#PERSON_RELATION} and
+ * SAX-parses every related part with a {@link CommentPersonHandler}
+ * that writes into the given metadata. Returns quietly if the core
+ * relationship, the workbook part or the relationship collection is null.
+ */
+ private void getPersons(OPCPackage container, Metadata metadata) throws TikaException, InvalidFormatException,
+ IOException, SAXException {
+ PackageRelationshipCollection coreRels = container.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
+ PackageRelationship coreRel = coreRels.getRelationship(0);
+ if (coreRel == null) {
+ return;
+ }
+ // the workbook part is the target of the core document relationship
+ PackagePart workbook = container.getPart(coreRel);
+ if (workbook == null) {
+ return;
+ }
+ PackageRelationshipCollection personRels = workbook.getRelationshipsByType(OPCPackageWrapper.PERSON_RELATION);
+ if (personRels == null) {
+ return;
+ }
+ for (PackageRelationship personRel : personRels) {
+ PackagePart persons = workbook.getRelatedPart(personRel);
+ if (persons != null) {
+ try (InputStream personStream = persons.getInputStream()) {
+ CommentPersonHandler personHandler = new CommentPersonHandler(metadata);
+ XMLReaderUtils.parseSAX(personStream, personHandler, parseContext);
+ }
+ }
+ }
+ }
protected void addDrawingHyperLinks(PackagePart sheetPart) {
try {
@@ -355,7 +391,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
sheetInputStream.close();
if (handler.hasProtection) {
- metadata.set(TikaCoreProperties.PROTECTED, "true");
+ metadata.set(Office.PROTECTED_WORKSHEET, true);
}
} catch (TikaException e) {
throw new RuntimeException("SAX parser appears to be broken - " +
e.getMessage());
@@ -590,7 +626,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
- private class AbsPathExtractorHandler extends DefaultHandler {
+ private class WorkbookMetadataHandler extends DefaultHandler {
@Override
public void startElement(String uri, String localName, String qName,
Attributes atts)
throws SAXException {
@@ -604,7 +640,23 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
return;
}
}
+ } else if ("sheet".equals(localName)) {
+ //record hidden / veryHidden sheets by name
+ String n = XMLReaderUtils.getAttrValue("name", atts);
+ String state = XMLReaderUtils.getAttrValue("state", atts);
+ if ("hidden".equals(state)) {
+ metadata.set(Office.HAS_HIDDEN_SHEETS, true);
+ metadata.add(Office.HIDDEN_SHEET_NAMES, n);
+ } else if ("veryHidden".equals(state)) {
+ metadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
+ //add, not set: a workbook can contain several veryHidden sheets and
+ //set() would keep only the last name
+ metadata.add(Office.VERY_HIDDEN_SHEET_NAMES, n);
+ }
+ } else if ("workbookPr".equals(localName)) {
+ String codeName = XMLReaderUtils.getAttrValue("codeName",
atts);
+ if (!StringUtils.isBlank(codeName)) {
+ metadata.set(Office.WORKBOOK_CODENAME, codeName);
+ }
+ }
+ // file version? <fileVersion appName="xl" lastEdited="7"
lowestEdited="7" rupBuild="28526"/>
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index ba8fb1485..b0080ee76 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -31,6 +31,7 @@ import
org.apache.tika.detect.microsoft.POIFSContainerDetector;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -136,13 +137,13 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
List<Metadata> list = getRecursiveMetadata("testWORD_embeded.doc");
//.docx
assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}",
-
list.get(10).get(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID));
+ list.get(10).get(Office.EMBEDDED_STORAGE_CLASS_ID));
//_1345471035.ppt
assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}",
-
list.get(14).get(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID));
+ list.get(14).get(Office.EMBEDDED_STORAGE_CLASS_ID));
//_1345470949.xls
assertEquals("{00020820-0000-0000-C000-000000000046}",
-
list.get(16).get(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID));
+ list.get(16).get(Office.EMBEDDED_STORAGE_CLASS_ID));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 3a74cabeb..9559e73c2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -116,7 +116,7 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
assertNotContained("9.0", content);
assertContains("196", content);
assertNotContained("196.0", content);
- assertEquals("false", metadata.get(TikaCoreProperties.PROTECTED));
+ assertEquals("false", metadata.get(Office.PROTECTED_WORKSHEET));
}
@@ -206,7 +206,7 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
assertNotContained("10.0", content);
assertContains("cb=sum", content);
assertNotContained("13.0", content);
- assertEquals("false", metadata.get(TikaCoreProperties.PROTECTED));
+ assertEquals("false", metadata.get(Office.PROTECTED_WORKSHEET));
}
@@ -482,7 +482,7 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("true", metadata.get(TikaCoreProperties.PROTECTED));
+ assertEquals("true", metadata.get(Office.PROTECTED_WORKSHEET));
}
@@ -497,7 +497,7 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("true",
xmlResult.metadata.get(TikaCoreProperties.PROTECTED));
+ assertEquals("true",
xmlResult.metadata.get(Office.PROTECTED_WORKSHEET));
assertContains("Office", xmlResult.xml);
}
@@ -1393,11 +1393,13 @@ public class OOXMLParserTest extends
MultiThreadedTikaTest {
public void testMacroinXlsm() throws Exception {
//test default is "don't extract macros"
- for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xlsm"))
{
+ List<Metadata> metadataList =
getRecursiveMetadata("testEXCEL_macro.xlsm");
+ for (Metadata metadata : metadataList) {
if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
fail("Shouldn't have extracted macros as default");
}
}
+ assertEquals("ThisWorkbook",
metadataList.get(0).get(Office.WORKBOOK_CODENAME));
//now test that they were extracted
ParseContext context = new ParseContext();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
index 9046f4951..640ea0104 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -28,6 +28,7 @@ import org.apache.tika.TikaTest;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
public class TruncatedOOXMLTest extends TikaTest {
@@ -53,7 +54,7 @@ public class TruncatedOOXMLTest extends TikaTest {
metadataList.forEach(m -> {
System.out.println("depth: " + m.get("X-TIKA:embedded_depth"));
System.out.println("relid: " + m.get("embeddedRelationshipId"));
- System.out.println("res: " + m.get("resourceName"));
+ System.out.println("res: " +
m.get(TikaCoreProperties.RESOURCE_NAME_KEY));
System.out.println("cont: " + m.get("X-TIKA:content"));
});