This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new d375b4c14 TIKA-4410 (#2226)
d375b4c14 is described below
commit d375b4c145a97451069a5f0dd30ede98165c990d
Author: tallison <[email protected]>
AuthorDate: Tue Jun 3 08:49:38 2025 -0400
TIKA-4410 (#2226)
TIKA-4430 -- improve extraction of metadata from xls
---
.../main/java/org/apache/tika/metadata/Office.java | 21 +++
.../tika/parser/microsoft/ExcelExtractor.java | 81 ++++++++++-
.../microsoft/ooxml/CommentPersonHandler.java | 47 +++++++
.../parser/microsoft/ooxml/OPCPackageWrapper.java | 3 +
.../ooxml/XSSFExcelExtractorDecorator.java | 149 ++++++++++++++++++++-
.../tika/parser/microsoft/ExcelParserTest.java | 15 +++
.../test-documents/testEXCEL_extra_metadata.xls | Bin 0 -> 12800 bytes
7 files changed, 309 insertions(+), 7 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 2a9e428eb..39607445f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -184,4 +184,25 @@ public interface Office {
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance");
Property MAPI_IS_FLAGGED = Property.internalBoolean(PREFIX_DOC_META +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-is-flagged");
+
+ Property HAS_HIDDEN_SHEETS =
Property.internalBoolean("msoffice:excel:has-hidden-sheets");
+
+ Property HAS_HIDDEN_COLUMNS =
Property.internalBoolean("msoffice:excel:has-hidden-cols");
+
+ Property HAS_HIDDEN_ROWS =
Property.internalBoolean("msoffice:excel:has-hidden-rows");
+
+ Property HAS_VERY_HIDDEN_SHEETS =
Property.internalBoolean("msoffice:excel:has-very-hidden-sheets");
+
+ Property HIDDEN_SHEET_NAMES =
Property.internalTextBag("msoffice:excel:hidden-sheet-names");
+
+ Property VERY_HIDDEN_SHEET_NAMES =
Property.internalTextBag("msoffice:excel:very-hidden-sheet-names");
+
+ Property PROTECTED_WORKSHEET =
Property.internalBoolean("msoffice:excel:protected-worksheet");
+
+ Property WORKBOOK_CODENAME =
Property.internalText("msoffice:excel:workbook-codename");
+
+ Property HAS_COMMENTS = Property.internalBoolean("msoffice:has-comments");
+
+ Property COMMENT_PERSONS =
Property.internalTextBag("msoffice:comment-person-display-name");
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 2aac29d91..41a1a840e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -24,8 +24,10 @@ import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
+import java.util.TreeSet;
import org.apache.poi.ddf.EscherBSERecord;
import org.apache.poi.ddf.EscherBlipRecord;
@@ -39,6 +41,7 @@ import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.ColumnInfoRecord;
import org.apache.poi.hssf.record.CountryRecord;
import org.apache.poi.hssf.record.DateWindow1904Record;
import org.apache.poi.hssf.record.DrawingGroupRecord;
@@ -51,9 +54,12 @@ import org.apache.poi.hssf.record.HeaderRecord;
import org.apache.poi.hssf.record.HyperlinkRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.ProtectRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.RowRecord;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.record.TextObjectRecord;
@@ -73,8 +79,10 @@ import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
/**
* Excel parser implementation which uses POI's Event API
@@ -188,6 +196,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
new TikaHSSFListener(workbookEntryName, xhtml, locale, this,
officeParserConfig);
listener.processFile(root, isListenForAllRecords());
listener.throwStoredException();
+ updateMetadata(listener);
for (Entry entry : root) {
if (entry.getName().startsWith("MBD") && entry instanceof
DirectoryEntry) {
@@ -200,6 +209,36 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
}
}
+ /**
+  * Copies the workbook-level findings gathered by the listener into the parent metadata.
+  * Boolean flags are written only when the listener observed the condition; name
+  * collections are added value by value, followed by the matching "has" flag.
+  *
+  * @param listener listener that has already processed the workbook records
+  */
+ private void updateMetadata(TikaHSSFListener listener) {
+     // simple flags
+     if (listener.hasProtectedSheet) {
+         parentMetadata.set(Office.PROTECTED_WORKSHEET, true);
+     }
+     if (listener.hasHiddenColumn) {
+         parentMetadata.set(Office.HAS_HIDDEN_COLUMNS, true);
+     }
+     if (listener.hasHiddenRow) {
+         parentMetadata.set(Office.HAS_HIDDEN_ROWS, true);
+     }
+
+     // comment authors
+     boolean anyAuthors = !listener.commentAuthors.isEmpty();
+     if (anyAuthors) {
+         listener.commentAuthors.forEach(
+                 person -> parentMetadata.add(Office.COMMENT_PERSONS, person));
+         parentMetadata.set(Office.HAS_COMMENTS, true);
+     }
+
+     // hidden sheets
+     boolean anyHidden = !listener.hiddenSheets.isEmpty();
+     if (anyHidden) {
+         listener.hiddenSheets.forEach(
+                 name -> parentMetadata.add(Office.HIDDEN_SHEET_NAMES, name));
+         parentMetadata.set(Office.HAS_HIDDEN_SHEETS, true);
+     }
+
+     // very hidden sheets
+     boolean anyVeryHidden = !listener.veryHiddenSheets.isEmpty();
+     if (anyVeryHidden) {
+         listener.veryHiddenSheets.forEach(
+                 name -> parentMetadata.add(Office.VERY_HIDDEN_SHEET_NAMES, name));
+         parentMetadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
+     }
+ }
+
// ======================================================================
/**
@@ -266,7 +305,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
* depend on continue records that aren't always
* contiguous. Collect them for later processing.
*/
- private List<DrawingGroupRecord> drawingGroups = new ArrayList<>();
+ private final List<DrawingGroupRecord> drawingGroups = new
ArrayList<>();
+
+ private final List<String> hiddenSheets = new ArrayList<>();
+ private final List<String> veryHiddenSheets = new ArrayList<>();
+ private final Set<String> commentAuthors = new TreeSet<>();
+ private boolean hasHiddenColumn = false;
+ private boolean hasHiddenRow = false;
+ private boolean hasProtectedSheet = false;
/**
* Construct a new listener instance outputting parsed data to
@@ -328,6 +374,10 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
hssfRequest.addListener(formatListener, FormatRecord.sid);
hssfRequest.addListener(formatListener,
ExtendedFormatRecord.sid);
hssfRequest.addListener(formatListener,
DrawingGroupRecord.sid);
+ hssfRequest.addListener(formatListener, ProtectRecord.sid);
+ hssfRequest.addListener(formatListener, ColumnInfoRecord.sid);
+ hssfRequest.addListener(formatListener, RowRecord.sid);
+ hssfRequest.addListener(formatListener, NoteRecord.sid);
if (extractor.officeParserConfig.isIncludeHeadersAndFooters())
{
hssfRequest.addListener(formatListener, HeaderRecord.sid);
hssfRequest.addListener(formatListener, FooterRecord.sid);
@@ -419,6 +469,12 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
case BoundSheetRecord.sid: // Worksheet index record
BoundSheetRecord boundSheetRecord = (BoundSheetRecord)
record;
+ if (boundSheetRecord.isHidden()) {
+ hiddenSheets.add(boundSheetRecord.getSheetname());
+ }
+ if (boundSheetRecord.isVeryHidden()) {
+ veryHiddenSheets.add(boundSheetRecord.getSheetname());
+ }
sheetNames.add(boundSheetRecord.getSheetname());
break;
@@ -524,6 +580,28 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
addTextCell(record, footerRecord.getText());
}
break;
+ case ProtectRecord.sid:
+ if (((ProtectRecord)record).getProtect()) {
+ //TODO -- associate this worksheet name
+ hasProtectedSheet = true;
+ }
+ break;
+ case ColumnInfoRecord.sid:
+ if (((ColumnInfoRecord)record).getHidden()) {
+ hasHiddenColumn = true;
+ }
+ break;
+ case NoteRecord.sid:
+ String author = ((NoteRecord)record).getAuthor();
+ if (!StringUtils.isBlank(author)) {
+ commentAuthors.add(author);
+ }
+ break;
+ case RowRecord.sid:
+ if (((RowRecord)record).getZeroHeight()) {
+ hasHiddenRow = true;
+ }
+ break;
}
previousSid = record.getSid();
@@ -680,7 +758,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
@Override
public void processRecord(Record record) {
-// System.out.println(record.getClass() + " :
"+record.toString());
super.processRecord(record);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentPersonHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentPersonHandler.java
new file mode 100644
index 000000000..c7efda1ae
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentPersonHandler.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.utils.StringUtils;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * SAX handler that records the {@code displayName} attribute of every
+ * {@code person} element it sees as an {@link Office#COMMENT_PERSONS} value.
+ */
+public class CommentPersonHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+
+    /**
+     * @param metadata metadata object that receives the person display names
+     */
+    CommentPersonHandler(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    /**
+     * Adds the display name of a {@code person} element to the metadata; every other
+     * element, and any person without a non-blank display name, is ignored.
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        //what else do we want?
+        //<person displayName="Wiley Coyote" id="{11111111-2234-2342-2342-23498237923}" userId="55bbdf23486284" providerId="Windows Live"/>
+        if (!"person".equals(localName)) {
+            return;
+        }
+        String personName = XMLReaderUtils.getAttrValue("displayName", atts);
+        if (StringUtils.isBlank(personName)) {
+            return;
+        }
+        metadata.add(Office.COMMENT_PERSONS, personName);
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
index 2cfd24f92..1fb0b8e40 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
@@ -29,6 +29,9 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
*/
public class OPCPackageWrapper implements Closeable {
+ public static final String PERSON_RELATION =
"http://schemas.microsoft.com/office/2017/10/relationships/person";
+ public static final String THREADED_COMMENT_RELATION =
"http://schemas.microsoft.com/office/2017/10/relationships/threadedComment";
+
private final OPCPackage opcPackage;
public OPCPackageWrapper(OPCPackage opcPackage) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index da5357937..873242927 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -35,6 +35,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
@@ -68,11 +69,13 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
@@ -159,9 +162,12 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
sheetParts.add(sheetPart);
Comments comments = iter.getSheetComments();
+ if (comments != null && comments.getNumberOfComments() > 0) {
+ metadata.set(Office.HAS_COMMENTS, true);
+ }
// Start, and output the sheet name
- xhtml.startElement("div");
+ xhtml.startElement("div", "class", "sheet");
xhtml.element("h1", iter.getSheetName());
// Extract the main sheet contents
@@ -169,9 +175,14 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
xhtml.startElement("tbody");
processSheet(sheetExtractor, comments, styles, strings,
stream);
+ try {
+ getThreadedComments(container, sheetPart, xhtml);
+ } catch (InvalidFormatException | TikaException | IOException
e) {
+ //swallow
+ }
+ xhtml.endElement("tbody");
+ xhtml.endElement("table");
}
- xhtml.endElement("tbody");
- xhtml.endElement("table");
// Output any headers and footers
// (Need to process the sheet to get them, so we can't
@@ -201,13 +212,63 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
//consider adding this back to POI
try (InputStream wbData = xssfReader.getWorkbookData()) {
XMLReaderUtils
- .parseSAX(wbData, new AbsPathExtractorHandler(),
+ .parseSAX(wbData, new WorkbookMetadataHandler(),
parseContext);
} catch (InvalidFormatException | TikaException e) {
//swallow
}
+ try {
+ getPersons(container, metadata);
+ } catch (InvalidFormatException | TikaException | IOException |
SAXException e) {
+ //swallow
+ }
+
}
+ /**
+  * Parses every part related to the sheet through the threaded-comment relationship
+  * and lets {@link ThreadedCommentHandler} write the comment text to the xhtml handler.
+  *
+  * @param container the package being parsed (not used by this method)
+  * @param sheetPart the sheet whose threaded comments should be emitted
+  * @param xhtml     handler that receives the comment markup
+  */
+ private void getThreadedComments(OPCPackage container, PackagePart sheetPart, XHTMLContentHandler xhtml)
+         throws TikaException, InvalidFormatException, SAXException, IOException {
+     //consider caching the person id -> person names in getPersons and injecting that into the xhtml per comment?
+     PackageRelationshipCollection threadedRels =
+             sheetPart.getRelationshipsByType(OPCPackageWrapper.THREADED_COMMENT_RELATION);
+     if (threadedRels != null && !threadedRels.isEmpty()) {
+         for (PackageRelationship threadedRel : threadedRels) {
+             PackagePart commentsPart = sheetPart.getRelatedPart(threadedRel);
+             if (commentsPart != null) {
+                 try (InputStream commentStream = commentsPart.getInputStream()) {
+                     XMLReaderUtils.parseSAX(commentStream, new ThreadedCommentHandler(xhtml), parseContext);
+                 }
+             }
+         }
+     }
+ }
+
+ /**
+  * Locates the workbook part through the package's core-document relationship and
+  * parses every part it references via the person relationship with a
+  * {@link CommentPersonHandler}, which adds the person names to the metadata.
+  *
+  * @param container the package being parsed
+  * @param metadata  metadata object handed to the person handler
+  */
+ private void getPersons(OPCPackage container, Metadata metadata)
+         throws TikaException, InvalidFormatException, IOException, SAXException {
+     PackageRelationship workbookRel = container
+             .getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT)
+             .getRelationship(0);
+     if (workbookRel == null) {
+         return;
+     }
+     // Get the part that holds the workbook
+     PackagePart workbook = container.getPart(workbookRel);
+     if (workbook == null) {
+         return;
+     }
+     PackageRelationshipCollection personRels =
+             workbook.getRelationshipsByType(OPCPackageWrapper.PERSON_RELATION);
+     if (personRels == null) {
+         return;
+     }
+     for (PackageRelationship personRel : personRels) {
+         PackagePart persons = workbook.getRelatedPart(personRel);
+         if (persons != null) {
+             try (InputStream personStream = persons.getInputStream()) {
+                 XMLReaderUtils.parseSAX(personStream, new CommentPersonHandler(metadata), parseContext);
+             }
+         }
+     }
+ }
protected void addDrawingHyperLinks(PackagePart sheetPart) {
try {
@@ -357,6 +418,12 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
if (handler.hasProtection) {
metadata.set(TikaCoreProperties.PROTECTED, "true");
}
+ if (handler.hasHiddenColumn) {
+ metadata.set(Office.HAS_HIDDEN_COLUMNS, true);
+ }
+ if (handler.hasHiddenRow) {
+ metadata.set(Office.HAS_HIDDEN_ROWS, true);
+ }
} catch (TikaException e) {
throw new RuntimeException("SAX parser appears to be broken - " +
e.getMessage());
}
@@ -536,6 +603,8 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
protected static class XSSFSheetInterestingPartsCapturer extends
DefaultHandler {
private ContentHandler delegate;
private boolean hasProtection = false;
+ private boolean hasHiddenRow = false;
+ private boolean hasHiddenColumn = false;
protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
this.delegate = delegate;
@@ -546,6 +615,18 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
if ("sheetProtection".equals(qName)) {
hasProtection = true;
}
+ if (! hasHiddenRow && "row".equals(localName)) {
+ String v = atts.getValue("hidden");
+ if ("true".equals(v) || "1".equals(v)) {
+ hasHiddenRow = true;
+ }
+ }
+ if (! hasHiddenColumn && "col".equals(localName)) {
+ String v = atts.getValue("hidden");
+ if ("true".equals(v) || "1".equals(v)) {
+ hasHiddenColumn = true;
+ }
+ }
delegate.startElement(uri, localName, qName, atts);
}
@@ -590,7 +671,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
- private class AbsPathExtractorHandler extends DefaultHandler {
+ private class WorkbookMetadataHandler extends DefaultHandler {
@Override
public void startElement(String uri, String localName, String qName,
Attributes atts)
throws SAXException {
@@ -604,6 +685,64 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
return;
}
}
+ } else if ("sheet".equals(localName)) {
+ // Record hidden / very-hidden sheets by name. Both name properties are text
+ // bags, so each name must be appended with add(); set() would replace the
+ // names collected from earlier <sheet> elements.
+ String n = XMLReaderUtils.getAttrValue("name", atts);
+ String state = XMLReaderUtils.getAttrValue("state", atts);
+ if ("hidden".equals(state)) {
+     metadata.set(Office.HAS_HIDDEN_SHEETS, true);
+     metadata.add(Office.HIDDEN_SHEET_NAMES, n);
+ } else if ("veryHidden".equals(state)) {
+     metadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
+     metadata.add(Office.VERY_HIDDEN_SHEET_NAMES, n);
+ }
+ } else if ("workbookPr".equals(localName)) {
+ String codeName = XMLReaderUtils.getAttrValue("codeName",
atts);
+ if (!StringUtils.isBlank(codeName)) {
+ metadata.set(Office.WORKBOOK_CODENAME, codeName);
+ }
+ }
+ // file version? <fileVersion appName="xl" lastEdited="7"
lowestEdited="7" rupBuild="28526"/>
+ }
+ }
+
+ /**
+  * SAX handler for a threaded-comments part: buffers the character data of each
+  * {@code text} element and, when the element ends, writes it to the xhtml handler as
+  * {@code <div class="threaded-comment"><p>...</p></div>}.
+  */
+ private static class ThreadedCommentHandler extends DefaultHandler {
+     private final XHTMLContentHandler xhtml;
+     // Character data of the text element currently being read.
+     private final StringBuilder sb = new StringBuilder();
+     // True only between the start and end tags of a text element.
+     private boolean inText = false;
+
+     public ThreadedCommentHandler(XHTMLContentHandler xhtml) {
+         this.xhtml = xhtml;
+     }
+
+     @Override
+     public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+         if ("text".equals(localName)) {
+             inText = true;
+         }
+     }
+
+     @Override
+     public void endElement(String uri, String localName, String qName) throws SAXException {
+         if ("text".equals(localName)) {
+             xhtml.startElement("div", "class", "threaded-comment");
+             xhtml.startElement("p");
+             xhtml.characters(sb.toString());
+             xhtml.endElement("p");
+             xhtml.endElement("div");
+             sb.setLength(0);
+             // Stop buffering; otherwise character data that follows this element
+             // would be prepended to the next comment that is emitted.
+             inText = false;
+         }
+     }
+
+     @Override
+     public void characters(char[] ch, int start, int length) throws SAXException {
+         if (inText) {
+             sb.append(ch, start, length);
+         }
+     }
+
+     @Override
+     public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+         if (inText) {
+             sb.append(ch, start, length);
          }
      }
  }
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 600194407..a90d79445 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.fail;
import java.io.InputStream;
import java.text.DecimalFormatSymbols;
+import java.util.List;
import java.util.Locale;
import org.apache.poi.util.LocaleUtil;
@@ -577,4 +578,18 @@ public class ExcelParserTest extends TikaTest {
assertContains("1996-08-10", xml);
}
}
+
+ /**
+  * Checks the workbook-level metadata extracted from an .xls file: comment authors,
+  * hidden rows/columns, sheet protection and hidden / very-hidden sheets.
+  */
+ @Test
+ public void testExtraMetadata() throws Exception {
+     List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_extra_metadata.xls");
+     Metadata m = metadataList.get(0);
+     assertEquals("Unknown Author", m.getValues(Office.COMMENT_PERSONS)[0]);
+     assertEquals("true", m.get(Office.HAS_HIDDEN_COLUMNS));
+     assertEquals("true", m.get(Office.HAS_HIDDEN_ROWS));
+     assertEquals("true", m.get(Office.PROTECTED_WORKSHEET));
+     assertEquals("hidden-sheet", m.getValues(Office.HIDDEN_SHEET_NAMES)[0]);
+     assertEquals("very-hidden-sheet", m.getValues(Office.VERY_HIDDEN_SHEET_NAMES)[0]);
+     assertEquals("true", m.get(Office.HAS_COMMENTS));
+     // the sheet-level flags are set together with the sheet names checked above
+     assertEquals("true", m.get(Office.HAS_HIDDEN_SHEETS));
+     assertEquals("true", m.get(Office.HAS_VERY_HIDDEN_SHEETS));
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls
new file mode 100644
index 000000000..e624857b7
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls
differ