This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4091 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0e1386bed265f5c7979ece23e2b6513b743dbe6e Author: tallison <talli...@apache.org> AuthorDate: Tue Jun 20 12:09:47 2023 -0400 TIKA-4091 -- improve OLE entry processing to be case insensitive where possible --- .../detect/microsoft/POIFSContainerDetector.java | 174 ++++++++++++++++----- .../apache/tika/parser/microsoft/OfficeParser.java | 28 +++- .../tika/parser/microsoft/SummaryExtractor.java | 18 ++- .../tika/parser/microsoft/OLE2CasingTest.java | 73 +++++++++ .../casing/protected_normal_case.docx | Bin 0 -> 17920 bytes .../casing/protected_upper_case.docx | Bin 0 -> 17920 bytes .../test-documents/casing/simple_lower_case.doc | Bin 0 -> 27136 bytes .../test-documents/casing/simple_normal_case.doc | Bin 0 -> 27136 bytes .../test-documents/casing/simple_upper_case.doc | Bin 0 -> 27136 bytes 9 files changed, 241 insertions(+), 52 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java index d0571110c..675326c2d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java @@ -26,6 +26,7 @@ import java.nio.file.Path; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; +import java.util.Locale; import java.util.Set; import java.util.regex.Pattern; @@ -43,6 +44,7 @@ import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.microsoft.OfficeParser; /** * A detector that works on a POIFS OLE2 document @@ -153,6 +155,67 @@ public class POIFSContainerDetector implements Detector { * Serial version UID */ private static final long serialVersionUID = -3028021741663605293L; + + //We need to have uppercase for finding/comparison, but we want to maintain + //the most common general casing for these items + + private static final String ENCRYPTED_PACKAGE = "EncryptedPackage".toUpperCase(Locale.US); + + private static final String ENCRYPTED_INFO = "EncryptionInfo".toUpperCase(Locale.US); + + private static final String SW_DOC_CONTENT_MGR = "SwDocContentMgr".toUpperCase(Locale.US); + + private static final String SW_DOC_MGR_TEMP_STORAGE = "SwDocMgrTempStorage".toUpperCase(Locale.US); + + private static final String STAR_CALC_DOCUMENT = "StarCalcDocument".toUpperCase(Locale.US); + + private static final String STAR_WRITER_DOCUMENT = "StarWriterDocument".toUpperCase(Locale.US); + + private static final String STAR_DRAW_DOCUMENT_3 = "StarDrawDocument3".toUpperCase(Locale.US); + + private static final String WKS_SSWORK_BOOK = "WksSSWorkBook".toUpperCase(Locale.US); + + private static final String DATA_SPACES = "\u0006DataSpaces".toUpperCase(Locale.US); + + private static final String DRM_ENCRYPTED_DATA_SPACE = "DRMEncryptedDataSpace".toUpperCase(Locale.US); + + private static final String WORD_DOCUMENT = "WordDocument".toUpperCase(Locale.US); + + private static final String QUILL = "Quill".toUpperCase(Locale.US); + + private static final String POWERPOINT_DOCUMENT = "PowerPoint Document".toUpperCase(Locale.US); + + private static final String VISIO_DOCUMENT = "VisioDocument".toUpperCase(Locale.US); + + private static final String OLE10_NATIVE_STRING = "\u0001Ole10Native".toUpperCase(Locale.US); + + private static final String MAT_OST = "MatOST".toUpperCase(Locale.US); + + private static final String CONTENTS = "CONTENTS".toUpperCase(Locale.US); + + private static final String SPELLING = "SPELLING".toUpperCase(Locale.US); + + private static final String OBJ_INFO = "\u0003ObjInfo".toUpperCase(Locale.US); + + private static final String COMP_OBJ_STRING = "\u0001CompObj".toUpperCase(Locale.US); + + private static final String PROPS = "Props".toUpperCase(Locale.US); + + private static final String PROPS_9 = "Props9".toUpperCase(Locale.US); + + private static final String PROPS_12 = "Props12".toUpperCase(Locale.US); + + private static final String EQUATION_NATIVE = "Equation Native".toUpperCase(Locale.US); + + private static final String LAYER = "Layer".toUpperCase(Locale.US); + + private static final String DGN_MF = "Dgn~Mf".toUpperCase(Locale.US); + + private static final String DGN_S = "Dgn~S".toUpperCase(Locale.US); + private static final String DGN_H = "Dgn~H".toUpperCase(Locale.US); + + private static final String SUBSTG_1 = "__substg1.0_".toUpperCase(Locale.US); + /** * An ASCII String "StarImpress" */ @@ -201,16 +264,25 @@ public class POIFSContainerDetector implements Detector { * detection may need access to the root {@link DirectoryEntry} of that file * for best results. The entry can be given as a second, optional argument. * - * @param names + * <p/> + * Following + * + * <a href="https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-cfb/60fe8611-66c3-496b-b70d-a504c94c9ace">2.6.1 of MS-CFB </a>, + * The detection is performed on case insensitive entry names. + * + * @param anyCaseNames * @param root * @return */ - public static MediaType detect(Set<String> names, DirectoryEntry root) { - if (names == null || names.size() == 0) { + public static MediaType detect(Set<String> anyCaseNames, DirectoryEntry root) { + if (anyCaseNames == null || anyCaseNames.size() == 0) { return OLE; } + + Set<String> ucNames = upperCase(anyCaseNames); + for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) { - if (names.contains(workbookEntryName)) { + if (ucNames.contains(workbookEntryName)) { MediaType tmp = processCompObjFormatType(root); if (tmp.equals(MS_GRAPH_CHART)) { return MS_GRAPH_CHART; @@ -218,14 +290,14 @@ public class POIFSContainerDetector implements Detector { return XLS; } } - if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) { + if (ucNames.contains(SW_DOC_CONTENT_MGR) && ucNames.contains(SW_DOC_MGR_TEMP_STORAGE)) { return SLDWORKS; - } else if (names.contains("StarCalcDocument")) { + } else if (ucNames.contains(STAR_CALC_DOCUMENT)) { // Star Office Calc return SDC; - } else if (names.contains("StarWriterDocument")) { + } else if (ucNames.contains(STAR_WRITER_DOCUMENT)) { return SDW; - } else if (names.contains("StarDrawDocument3")) { + } else if (ucNames.contains(STAR_DRAW_DOCUMENT_3)) { if (root == null) { /* * This is either StarOfficeDraw or StarOfficeImpress, we have @@ -239,16 +311,16 @@ public class POIFSContainerDetector implements Detector { } else { return processCompObjFormatType(root); } - } else if (names.contains("WksSSWorkBook")) { + } else if (ucNames.contains(WKS_SSWORK_BOOK)) { // This check has to be before names.contains("Workbook") // Works 7.0 spreadsheet files contain both // we want to avoid classifying this as Excel return XLR; - } else if (names.contains("Book")) { + } else if (ucNames.contains("BOOK")) { // Excel 95 or older, we won't be able to parse this.... return XLS; - } else if (names.contains("EncryptedPackage")) { - if (names.contains("EncryptionInfo")) { + } else if (ucNames.contains(ENCRYPTED_PACKAGE)) { + if (ucNames.contains(ENCRYPTED_INFO)) { // This is a protected OOXML document, which is an OLE2 file // with an Encrypted Stream which holds the OOXML data // Without decrypting the stream, we can't tell what kind of @@ -261,12 +333,12 @@ public class POIFSContainerDetector implements Detector { //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces") //See TIKA-2982 return OOXML_PROTECTED; - } else if (names.contains("\u0006DataSpaces")) { + } else if (ucNames.contains(DATA_SPACES)) { //Try to look for the DRMEncrypted type (TIKA-3666); as of 5.2.0, this is not // supported by POI, but we should still detect it. //Do we also want to look for "DRMEncryptedTransform"? - if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) { + if (findRecursively(root, DRM_ENCRYPTED_DATA_SPACE, 0, 10)) { return DRM_ENCRYPTED; } else { return OLE; @@ -274,25 +346,25 @@ public class POIFSContainerDetector implements Detector { } else { return OLE; } - } else if (names.contains("WordDocument")) { + } else if (ucNames.contains(WORD_DOCUMENT)) { return DOC; - } else if (names.contains("Quill")) { + } else if (ucNames.contains(QUILL)) { return PUB; - } else if (names.contains("PowerPoint Document")) { + } else if (ucNames.contains(POWERPOINT_DOCUMENT)) { return PPT; - } else if (names.contains("VisioDocument")) { + } else if (ucNames.contains(VISIO_DOCUMENT)) { return VSD; - } else if (names.contains("\u0001Ole10Native")) { + } else if (ucNames.contains(OLE10_NATIVE_STRING)) { return OLE10_NATIVE; - } else if (names.contains("MatOST")) { + } else if (ucNames.contains(MAT_OST)) { // this occurs on older Works Word Processor files (versions 3.0 and 4.0) return WPS; - } else if (names.contains("CONTENTS") && names.contains("SPELLING")) { + } else if (ucNames.contains(CONTENTS) && ucNames.contains(SPELLING)) { // Newer Works files return WPS; - } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) { + } else if (ucNames.contains(CONTENTS) && ucNames.contains(OBJ_INFO)) { return COMP_OBJ; - } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) { + } else if (ucNames.contains(CONTENTS) && ucNames.contains(COMP_OBJ_STRING)) { // CompObj is a general kind of OLE2 embedding, but this may be an old Works file // If we have the Directory, check if (root != null) { @@ -307,33 +379,33 @@ public class POIFSContainerDetector implements Detector { // Assume it's a general CompObj embedded resource return COMP_OBJ; } - } else if (names.contains("CONTENTS")) { + } else if (ucNames.contains(CONTENTS)) { // CONTENTS without SPELLING nor CompObj normally means some sort // of embedded non-office file inside an OLE2 document // This is most commonly triggered on nested directories return OLE; - } else if (names.contains("\u0001CompObj") && - (names.contains("Props") || names.contains("Props9") || - names.contains("Props12"))) { + } else if (ucNames.contains(COMP_OBJ_STRING) && + (ucNames.contains(PROPS) || ucNames.contains(PROPS_9) || + ucNames.contains(PROPS_12))) { // Could be Project, look for common name patterns - for (String name : names) { + for (String name : ucNames) { if (mppDataMatch.matcher(name).matches()) { return MPP; } } - } else if (names.contains("Equation Native")) { + } else if (ucNames.contains(EQUATION_NATIVE)) { return MS_EQUATION; - } else if (names.contains("Layer")) { + } else if (ucNames.contains(LAYER)) { //in one test file, also saw LayerSmallImage and LayerLargeImage //maybe add those if we get false positives? //in other test files there was a single entry for "Layer" return ESRI_LAYER; - } else if (names.contains("Dgn~Mf") && names.contains("Dgn~S") && - names.contains("Dgn~H")) { + } else if (ucNames.contains(DGN_MF) && ucNames.contains(DGN_S) && + ucNames.contains(DGN_H)) { return DGN_8; } else { - for (String name : names) { - if (name.startsWith("__substg1.0_")) { + for (String name : ucNames) { + if (name.startsWith(SUBSTG_1)) { return MSG; } } @@ -344,12 +416,28 @@ public class POIFSContainerDetector implements Detector { return OLE; } + private static Set<String> upperCase(Set<String> names) { + Set<String> uc = new HashSet<>(names.size()); + for (String s : names) { + uc.add(s.toUpperCase(Locale.US)); + } + return uc; + } + + /** + * + * @param entry entry to search + * @param targetName Upper cased target name + * @param depth current depth + * @param maxDepth maximum allowed depth + * @return + */ private static boolean findRecursively(Entry entry, String targetName, int depth, int maxDepth) { if (entry == null) { return false; } - if (entry.getName().equals(targetName)) { + if (entry.getName().toUpperCase(Locale.US).equals(targetName)) { return true; } if (depth >= maxDepth) { @@ -374,13 +462,9 @@ public class POIFSContainerDetector implements Detector { */ private static MediaType processCompObjFormatType(DirectoryEntry root) { try { - - if (!root.hasEntry("\u0001CompObj")) { - return OLE; - } - Entry e = root.getEntry("\u0001CompObj"); - if (e != null && e.isDocumentEntry()) { - DocumentNode dn = (DocumentNode) e; + Entry entry = OfficeParser.getUCEntry(root, COMP_OBJ_STRING); + if (entry != null && entry.isDocumentEntry()) { + DocumentNode dn = (DocumentNode) entry; DocumentInputStream stream = new DocumentInputStream(dn); byte[] bytes = IOUtils.toByteArray(stream); /* @@ -414,6 +498,7 @@ public class POIFSContainerDetector implements Detector { return OLE; } + // poor man's search for byte arrays, replace with some library call if // you know one without adding new dependencies private static boolean arrayContains(byte[] larger, byte[] smaller) { @@ -434,6 +519,11 @@ public class POIFSContainerDetector implements Detector { return false; } + /** + * These are the literal top level names in the root. These are not uppercased + * @param root + * @return + */ private static Set<String> getTopLevelNames(DirectoryNode root) { Set<String> names = new HashSet<>(); for (Entry entry : root) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index 8d938fdbe..e862260f6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -16,6 +16,7 @@ */ package org.apache.tika.parser.microsoft; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; @@ -24,6 +25,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.Locale; import java.util.Map; import java.util.Set; @@ -240,10 +242,10 @@ public class OfficeParser extends AbstractOfficeParser { extractor.parse(xhtml); break; case ENCRYPTED: - EncryptionInfo info = new EncryptionInfo(root); - Decryptor d = Decryptor.getInstance(info); try { + EncryptionInfo info = new EncryptionInfo(root); + Decryptor d = Decryptor.getInstance(info); // By default, use the default Office Password String password = Decryptor.DEFAULT_PASSWORD; @@ -271,6 +273,10 @@ public class OfficeParser extends AbstractOfficeParser { } } catch (GeneralSecurityException ex) { throw new EncryptedDocumentException(ex); + } catch (FileNotFoundException ex) { + //this can happen because POI may not support case-insensitive ole2 object + //lookups + throw new EncryptedDocumentException(ex); } break; case DRMENCRYPTED: @@ -351,4 +357,22 @@ public class OfficeParser extends AbstractOfficeParser { } } + /** + * Looks for entry within root (non-recursive) that has an upper-cased + * name that equals ucTarget + * @param root + * @param ucTarget + * @return + */ + public static Entry getUCEntry(DirectoryEntry root, String ucTarget) { + Iterator<Entry> it = root.getEntries(); + while (it.hasNext()) { + Entry e = it.next(); + if (e.getName().toUpperCase(Locale.US).equals(ucTarget)) { + return e; + } + } + return null; + } + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java index e71812204..89f1913fa 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java @@ -16,11 +16,11 @@ */ package org.apache.tika.parser.microsoft; -import java.io.FileNotFoundException; import java.io.IOException; import java.util.Arrays; import java.util.Date; import java.util.HashSet; +import java.util.Locale; import java.util.Set; import org.apache.poi.hpsf.CustomProperties; @@ -53,10 +53,11 @@ import org.apache.tika.utils.StringUtils; public class SummaryExtractor { private static final Logger LOG = LoggerFactory.getLogger(AbstractPOIFSExtractor.class); - private static final String SUMMARY_INFORMATION = SummaryInformation.DEFAULT_STREAM_NAME; + private static final String SUMMARY_INFORMATION = + SummaryInformation.DEFAULT_STREAM_NAME.toUpperCase(Locale.US); private static final String DOCUMENT_SUMMARY_INFORMATION = - DocumentSummaryInformation.DEFAULT_STREAM_NAME; + DocumentSummaryInformation.DEFAULT_STREAM_NAME.toUpperCase(Locale.US); private final Metadata metadata; @@ -99,18 +100,19 @@ public class SummaryExtractor { private void parseSummaryEntryIfExists(DirectoryNode root, String entryName) throws IOException, TikaException { try { - if (!root.hasEntry(entryName)) { - return; - } DocumentEntry entry = null; try { - entry = (DocumentEntry) root.getEntry(entryName); - } catch (FileNotFoundException | IllegalArgumentException e) { + entry = (DocumentEntry) OfficeParser.getUCEntry(root, entryName); + } catch (IllegalArgumentException e) { //POI throws these if there is a key in the entries map //but the entry is null return; } + if (entry == null) { + return; + } + PropertySet properties = new PropertySet(new DocumentInputStream(entry)); if (properties.isSummaryInformation()) { parse(new SummaryInformation(properties)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java new file mode 100644 index 000000000..7f38caf76 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import org.apache.tika.TikaTest; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; + +public class OLE2CasingTest extends TikaTest { + + final static Set<String> IGNORE_FIELDS = new HashSet<>(); + + static { + IGNORE_FIELDS.add(TikaCoreProperties.PARSE_TIME_MILLIS.getName()); + } + + @Test + public void testEncrypted() throws Exception { + Assertions.assertThrows(EncryptedDocumentException.class, () -> { + getXML("casing/protected_normal_case.docx"); + }); + Assertions.assertThrows(EncryptedDocumentException.class, () -> { + getXML("casing/protected_upper_case.docx"); + }); + } + + @Test + @Disabled("until POI can handle case insensitive entry lookups") + public void testBasic() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("casing/simple_normal_case.doc"); + assertCloseEnough(metadataList, getRecursiveMetadata("casing/simple_lower_case.doc")); + assertCloseEnough(metadataList, getRecursiveMetadata("casing/simple_upper_case.doc")); + } + + private void assertCloseEnough(List<Metadata> expected, List<Metadata> test) { + for (int i = 0; i < expected.size(); i++) { + assertCloseEnough(expected.get(i), test.get(i)); + } + } + + private void assertCloseEnough(Metadata expected, Metadata test) { + for (String n : expected.names()) { + if (! IGNORE_FIELDS.contains(n)) { + assertArrayEquals(expected.getValues(n), test.getValues(n)); + } + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/protected_normal_case.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/protected_normal_case.docx new file mode 100644 index 000000000..2c68ef9e0 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/protected_normal_case.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/protected_upper_case.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/protected_upper_case.docx new file mode 100644 index 000000000..20b9fcdad Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/protected_upper_case.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/simple_lower_case.doc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/simple_lower_case.doc new file mode 100644 index 000000000..f9f2c6ed1 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/simple_lower_case.doc differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/simple_normal_case.doc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/simple_normal_case.doc new file mode 100644 index 000000000..e79a62102 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/simple_normal_case.doc differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/simple_upper_case.doc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/simple_upper_case.doc new file mode 100644 index 000000000..b09ec382c Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/casing/simple_upper_case.doc differ