Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Checks the public getters of {@link ChmItspHeader} against the
+ * verification points declared in {@link TestParameters}.
+ */
+public class TestChmItspHeader {
+    private ChmItspHeader chmItspHeader = null;
+
+    /**
+     * Parses the ITSF header from the start of the sample CHM bytes and then
+     * parses the ITSP header found at the directory offset the ITSF header
+     * reports.
+     */
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+
+        ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
+        // NOTE(review): the end index is CHM_ITSF_V3_LEN - 1; confirm this
+        // matches how ChmCommons.copyOfRange interprets its upper bound.
+        chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+
+        chmItspHeader = new ChmItspHeader();
+        int dirOffset = (int) chmItsfHeader.getDirOffset();
+        chmItspHeader.parse(ChmCommons.copyOfRange(data, dirOffset,
+                dirOffset + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+    }
+
+    @Test
+    public void testGetBlock_len() {
+        assertEquals(TestParameters.VP_BLOCK_LENGTH,
+                chmItspHeader.getBlock_len());
+    }
+
+    @Test
+    public void testGetBlockidx_intvl() {
+        assertEquals(TestParameters.VP_BLOCK_INDEX_INTERVAL,
+                chmItspHeader.getBlockidx_intvl());
+    }
+
+    @Test
+    public void testGetHeader_len() {
+        assertEquals(TestParameters.VP_ITSP_HEADER_LENGTH,
+                chmItspHeader.getHeader_len());
+    }
+
+    @Test
+    public void testGetIndex_depth() {
+        assertEquals(TestParameters.VP_INDEX_DEPTH,
+                chmItspHeader.getIndex_depth());
+    }
+
+    @Test
+    public void testGetIndex_head() {
+        assertEquals(TestParameters.VP_INDEX_HEAD,
+                chmItspHeader.getIndex_head());
+    }
+
+    @Test
+    public void testGetIndex_root() {
+        assertEquals(TestParameters.VP_INDEX_ROOT,
+                chmItspHeader.getIndex_root());
+    }
+
+    @Test
+    public void testGetLang_id() {
+        assertEquals(TestParameters.VP_LANGUAGE_ID,
+                chmItspHeader.getLang_id());
+    }
+
+    @Test
+    public void testGetNum_blocks() {
+        assertEquals(TestParameters.VP_UNKNOWN_NUM_BLOCKS,
+                chmItspHeader.getNum_blocks());
+    }
+
+    @Test
+    public void testGetUnknown_000c() {
+        assertEquals(TestParameters.VP_ITSP_UNKNOWN_000C,
+                chmItspHeader.getUnknown_000c());
+    }
+
+    @Test
+    public void testGetUnknown_0024() {
+        assertEquals(TestParameters.VP_ITSP_UNKNOWN_0024,
+                chmItspHeader.getUnknown_0024());
+    }
+
+    // Name now matches the getter under test (was "testGetUnknown_002").
+    @Test
+    public void testGetUnknown_002c() {
+        assertEquals(TestParameters.VP_ITSP_UNKNOWN_002C,
+                chmItspHeader.getUnknown_002c());
+    }
+
+    /** Only the length of the byte array is verified, not its content. */
+    @Test
+    public void testGetUnknown_0044() {
+        assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
+                chmItspHeader.getUnknown_0044().length);
+    }
+
+    @Test
+    public void testGetVersion() {
+        assertEquals(TestParameters.VP_ITSP_VERSION,
+                chmItspHeader.getVersion());
+    }
+
+    @Test
+    public void testGetSignature() {
+        assertEquals(TestParameters.VP_ISTP_SIGNATURE, new String(
+                chmItspHeader.getSignature(), UTF_8));
+    }
+
+    /** Only the length of the byte array is verified, not its content. */
+    @Test
+    public void testGetSystem_uuid() {
+        assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
+                chmItspHeader.getSystem_uuid().length);
+    }
+
+    @Test
+    public void testToString() {
+        assertTrue(chmItspHeader.toString().contains(
+                TestParameters.VP_ISTP_SIGNATURE));
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        chmItspHeader = null;
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.lzx.ChmLzxState;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Exercises construction and {@code toString()} of {@link ChmLzxState},
+ * using the window size read from the sample CHM's LZXC control data.
+ */
+public class TestChmLzxState {
+    private ChmLzxState chmLzxState;
+    private int windowSize;
+
+    /**
+     * Parses the ITSF and ITSP headers, the directory listing and the LZXC
+     * control data of the sample CHM bytes, and stores the control data's
+     * window size in {@link #windowSize}.
+     */
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+
+        /* Creates and parses itsf header */
+        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
+        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+
+        /* Creates and parses itsp block */
+        ChmItspHeader chmItspHeader = new ChmItspHeader();
+        int dirOffset = (int) chmItsHeader.getDirOffset();
+        chmItspHeader.parse(ChmCommons.copyOfRange(data, dirOffset,
+                dirOffset + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+
+        /* Creating instance of ChmDirListingContainer */
+        ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
+                data, chmItsHeader, chmItspHeader);
+        int indexOfControlData = ChmCommons.indexOf(
+                chmDirListCont.getDirectoryListingEntryList(),
+                ChmConstants.CONTROL_DATA);
+
+        /* Position in the raw bytes located by searching for the LZXC marker */
+        int lzxcOffset = ChmCommons.indexOfResetTableBlock(data,
+                ChmConstants.LZXC.getBytes(UTF_8));
+        // Fail with a clear message here instead of handing a null chunk
+        // to parse() when the marker is not found.
+        assertTrue("LZXC block not found in test CHM data", lzxcOffset > 0);
+
+        int controlDataLength = chmDirListCont.getDirectoryListingEntryList()
+                .get(indexOfControlData).getLength();
+        byte[] dirChunk = ChmCommons.copyOfRange(data, lzxcOffset,
+                lzxcOffset + controlDataLength);
+
+        ChmLzxcControlData clcd = new ChmLzxcControlData();
+        clcd.parse(dirChunk, clcd);
+        windowSize = (int) clcd.getWindowSize();
+    }
+
+    @Test
+    public void testChmLzxStateConstructor() throws TikaException {
+        chmLzxState = new ChmLzxState(windowSize);
+        assertNotNull(chmLzxState);
+    }
+
+    @Test
+    public void testToString() throws TikaException {
+        // JUnit 4 runs every test method on a fresh instance of this class,
+        // so the field is never pre-populated by another test; build the
+        // state directly rather than invoking a sibling @Test method.
+        chmLzxState = new ChmLzxState(windowSize);
+        assertTrue(chmLzxState.toString().length() > 20);
+    }
+
+    // TODO add more tests
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Checks the public getters of {@link ChmLzxcControlData} against the
+ * verification points declared in {@link TestParameters}.
+ */
+public class TestChmLzxcControlData {
+    private ChmLzxcControlData chmLzxcControlData = null;
+
+    /**
+     * Parses the ITSF and ITSP headers and the directory listing of the
+     * sample CHM bytes, then parses the LZXC control data block into
+     * {@link #chmLzxcControlData}.
+     */
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+
+        /* Creates and parses itsf header */
+        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
+        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+
+        /* Creates and parses itsp block */
+        ChmItspHeader chmItspHeader = new ChmItspHeader();
+        int dirOffset = (int) chmItsHeader.getDirOffset();
+        chmItspHeader.parse(ChmCommons.copyOfRange(data, dirOffset,
+                dirOffset + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+
+        /* Creating instance of ChmDirListingContainer */
+        ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
+                data, chmItsHeader, chmItspHeader);
+        int indexOfControlData = chmDirListCont.getControlDataIndex();
+
+        /* Position in the raw bytes located by searching for the LZXC marker */
+        int lzxcOffset = ChmCommons.indexOfResetTableBlock(data,
+                ChmConstants.LZXC.getBytes(UTF_8));
+        // Fail with a clear message here instead of handing a null chunk
+        // to parse() when the marker is not found.
+        assertTrue("LZXC block not found in test CHM data", lzxcOffset > 0);
+
+        int controlDataLength = chmDirListCont.getDirectoryListingEntryList()
+                .get(indexOfControlData).getLength();
+        byte[] dirChunk = ChmCommons.copyOfRange(data, lzxcOffset,
+                lzxcOffset + controlDataLength);
+
+        /* Creates and parses control block */
+        chmLzxcControlData = new ChmLzxcControlData();
+        chmLzxcControlData.parse(dirChunk, chmLzxcControlData);
+    }
+
+    @Test
+    public void testConstructorNotNull() {
+        assertNotNull(chmLzxcControlData);
+    }
+
+    @Test
+    public void testGetResetInterval() {
+        assertEquals(TestParameters.VP_RESET_INTERVAL,
+                chmLzxcControlData.getResetInterval());
+    }
+
+    @Test
+    public void testGetSize() {
+        assertEquals(TestParameters.VP_CONTROL_DATA_SIZE,
+                chmLzxcControlData.getSize());
+    }
+
+    @Test
+    public void testGetUnknown_18() {
+        assertEquals(TestParameters.VP_UNKNOWN_18,
+                chmLzxcControlData.getUnknown_18());
+    }
+
+    @Test
+    public void testGetVersion() {
+        assertEquals(TestParameters.VP_CONTROL_DATA_VERSION,
+                chmLzxcControlData.getVersion());
+    }
+
+    @Test
+    public void testGetWindowSize() {
+        assertEquals(TestParameters.VP_WINDOW_SIZE,
+                chmLzxcControlData.getWindowSize());
+    }
+
+    @Test
+    public void testGetWindowsPerReset() {
+        assertEquals(TestParameters.VP_WINDOWS_PER_RESET,
+                chmLzxcControlData.getWindowsPerReset());
+    }
+
+    @Test
+    public void testGetToString() {
+        assertTrue(chmLzxcControlData.toString().contains(
+                TestParameters.VP_CONTROL_DATA_SIGNATURE));
+    }
+
+    /** Verifies the length of the signature byte array. */
+    @Test
+    public void testGetSignature() {
+        assertEquals(
+                TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes(UTF_8).length,
+                chmLzxcControlData.getSignature().length);
+    }
+
+    /**
+     * Verifies the signature bytes decode to the expected text. Replaces the
+     * former "testGetSignaure", which was an exact copy of the length check.
+     */
+    @Test
+    public void testGetSignatureValue() {
+        assertEquals(TestParameters.VP_CONTROL_DATA_SIGNATURE, new String(
+                chmLzxcControlData.getSignature(), UTF_8));
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Checks the getters of {@link ChmLzxcResetTable} against the verification
+ * points declared in {@link TestParameters}.
+ */
+public class TestChmLzxcResetTable {
+    private ChmLzxcResetTable chmLzxcResetTable = null;
+
+    /**
+     * Walks the sample CHM bytes in order: ITSF header, ITSP header,
+     * directory listing, LZXC control data, and finally the reset table that
+     * the tests inspect.
+     */
+    @Before
+    public void setUp() throws Exception {
+        final byte[] chm = TestParameters.chmData;
+
+        /* ITSF header, read from offset 0 */
+        final ChmItsfHeader itsf = new ChmItsfHeader();
+        itsf.parse(
+                ChmCommons.copyOfRange(chm, 0, ChmConstants.CHM_ITSF_V3_LEN - 1),
+                itsf);
+
+        /* ITSP block, read from the directory offset given by the ITSF header */
+        final ChmItspHeader itsp = new ChmItspHeader();
+        final int dirStart = (int) itsf.getDirOffset();
+        itsp.parse(
+                ChmCommons.copyOfRange(chm, dirStart,
+                        dirStart + ChmConstants.CHM_ITSP_V1_LEN),
+                itsp);
+
+        /* Directory listing built from both headers */
+        final ChmDirectoryListingSet listing = new ChmDirectoryListingSet(
+                chm, itsf, itsp);
+        final int controlDataIdx = listing.getControlDataIndex();
+
+        /* Control data chunk starts where the LZXC marker is found */
+        final int lzxcPos = ChmCommons.indexOfResetTableBlock(chm,
+                ChmConstants.LZXC.getBytes(UTF_8));
+        byte[] chunk = null;
+        if (lzxcPos > 0) {
+            final int controlLen = listing.getDirectoryListingEntryList()
+                    .get(controlDataIdx).getLength();
+            chunk = ChmCommons.copyOfRange(chm, lzxcPos, lzxcPos + controlLen);
+        }
+
+        final ChmLzxcControlData control = new ChmLzxcControlData();
+        control.parse(chunk, control);
+
+        /* Reset table chunk: data offset plus the entry's own offset */
+        final int resetTableIdx = listing.getResetTableIndex();
+        chmLzxcResetTable = new ChmLzxcResetTable();
+
+        final int from = (int) listing.getDataOffset()
+                + listing.getDirectoryListingEntryList()
+                        .get(resetTableIdx).getOffset();
+
+        ChmAssert.assertCopyingDataIndex(from, chm.length);
+
+        final int to = from
+                + listing.getDirectoryListingEntryList()
+                        .get(resetTableIdx).getLength();
+        chunk = ChmCommons.copyOfRange(chm, from, to);
+
+        chmLzxcResetTable.parse(chunk, chmLzxcResetTable);
+    }
+
+    @Test
+    public void testGetBlockAddress() {
+        final int addressCount = chmLzxcResetTable.getBlockAddress().length;
+        assertEquals(TestParameters.VP_RESET_TABLE_BA, addressCount);
+    }
+
+    @Test
+    public void testGetBlockCount() {
+        assertEquals(
+                TestParameters.VP_RESET_TABLE_BA,
+                chmLzxcResetTable.getBlockCount());
+    }
+
+    @Test
+    public void testGetBlockLen() {
+        assertEquals(
+                TestParameters.VP_RES_TBL_BLOCK_LENGTH,
+                chmLzxcResetTable.getBlockLen());
+    }
+
+    @Test
+    public void testGetCompressedLen() {
+        assertEquals(
+                TestParameters.VP_RES_TBL_COMPR_LENGTH,
+                chmLzxcResetTable.getCompressedLen());
+    }
+
+    @Test
+    public void testGetTableOffset() {
+        assertEquals(
+                TestParameters.VP_TBL_OFFSET,
+                chmLzxcResetTable.getTableOffset());
+    }
+
+    @Test
+    public void testGetUncompressedLen() {
+        assertEquals(
+                TestParameters.VP_RES_TBL_UNCOMP_LENGTH,
+                chmLzxcResetTable.getUncompressedLen());
+    }
+
+    @Test
+    public void testGetUnknown() {
+        assertEquals(
+                TestParameters.VP_RES_TBL_UNKNOWN,
+                chmLzxcResetTable.getUnknown());
+    }
+
+    @Test
+    public void testGetVersion() {
+        assertEquals(
+                TestParameters.VP_RES_TBL_VERSION,
+                chmLzxcResetTable.getVersion());
+    }
+
+    @Test
+    public void testToString() {
+        final String rendered = chmLzxcResetTable.toString();
+        assertTrue(rendered.length() > 0);
+    }
+
+    // TODO: add setters to be tested
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests public methods of the DirectoryListingEntry class. Every test works
+ * on an entry built from the sample values in {@link TestParameters}.
+ *
+ * @author olegt
+ *
+ */
+public class TestDirectoryListingEntry {
+    private DirectoryListingEntry dle = null;
+
+    /** Builds the entry under test through the five-argument constructor. */
+    @Before
+    public void setUp() throws Exception {
+        dle = new DirectoryListingEntry(TestParameters.nameLength,
+                TestParameters.entryName, TestParameters.entryType,
+                TestParameters.offset, TestParameters.length);
+    }
+
+    // Formerly "testDefaultConstructor"; no default constructor is exercised
+    // here, the fixture is created with the parameterised one in setUp().
+    @Test
+    public void testConstructorNotNull() {
+        assertNotNull(dle);
+    }
+
+    /** Every constructor argument must be readable back through its getter. */
+    @Test
+    public void testParamConstructor() {
+        assertEquals(TestParameters.nameLength, dle.getNameLength());
+        assertEquals(TestParameters.entryName, dle.getName());
+        assertEquals(TestParameters.entryType, dle.getEntryType());
+        assertEquals(TestParameters.offset, dle.getOffset());
+        assertEquals(TestParameters.length, dle.getLength());
+    }
+
+    @Test
+    public void testToString() {
+        assertNotNull(dle.toString());
+    }
+
+    @Test
+    public void testGetNameLength() {
+        assertEquals(TestParameters.nameLength, dle.getNameLength());
+    }
+
+    @Test
+    public void testGetName() {
+        assertEquals(TestParameters.entryName, dle.getName());
+    }
+
+    @Test
+    public void testGetEntryType() {
+        assertEquals(TestParameters.entryType, dle.getEntryType());
+    }
+
+    @Test
+    public void testGetOffset() {
+        assertEquals(TestParameters.offset, dle.getOffset());
+    }
+
+    @Test
+    public void testGetLength() {
+        assertEquals(TestParameters.length, dle.getLength());
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
+
+/**
+ * Holds test parameters such as verification points. All members are
+ * constants shared by the CHM parser tests in this package.
+ */
+public final class TestParameters {
+    /* Prevents initialization */
+    private TestParameters() {
+    }
+
+    /* Tests values */
+    static final int nameLength = 5;
+    static final String entryName = TestParameters.class.getName();
+    // final like its neighbours, so no test can change it for the others
+    static final EntryType entryType = EntryType.COMPRESSED;
+    static final int offset = 3;
+    static final int length = 20;
+    static final int NTHREADS = 2;
+
+    static final int BUFFER_SIZE = 16384;
+
+    /* Raw bytes of the sample CHM file; loaded once when the class initializes */
+    static final byte[] chmData = readResource("/test-documents/testChm.chm");
+
+    /**
+     * Reads a classpath resource fully into memory.
+     *
+     * @param name resource name passed to {@link Class#getResourceAsStream}
+     * @return the complete content of the resource
+     * @throws IllegalStateException if the resource cannot be found
+     * @throws RuntimeException wrapping the IOException if reading fails
+     */
+    private static byte[] readResource(String name) {
+        try (InputStream stream = TestParameters.class.getResourceAsStream(name)) {
+            // getResourceAsStream signals "not found" with null; report the
+            // missing name instead of failing later with a bare NPE.
+            if (stream == null) {
+                throw new IllegalStateException(
+                        "Test resource not found on classpath: " + name);
+            }
+            return IOUtils.toByteArray(stream);
+        } catch (IOException e) {
+            throw new RuntimeException("Unable to read test resource: " + name, e);
+        }
+    }
+
+    /* Verification points */
+    static final String VP_CHM_MIME_TYPE = "Content-Type=application/x-chm";
+    static final String VP_EXTRACTED_TEXT = "The TCard method accepts only numeric arguments";
+    static final String VP_ISTF_SIGNATURE = "ITSF";
+    static final String VP_ISTP_SIGNATURE = "ITSP";
+    static final String VP_PMGL_SIGNATURE = "PMGL";
+    static final String VP_CONTROL_DATA_SIGNATURE = "LZXC";
+
+    static final int VP_DIRECTORY_LENGTH = 4180;
+    static final int VP_DATA_OFFSET_LENGTH = 4300;
+    static final int VP_DIRECTORY_OFFSET = 120;
+    static final int VP_ITSF_HEADER_LENGTH = 96;
+    static final int VP_LANGUAGE_ID = 1033;
+    static final int VP_LAST_MODIFIED = 1042357880;
+    static final int VP_UNKNOWN_000C = 1;
+    static final int VP_UNKNOWN_LEN = 24;
+    static final int VP_UNKNOWN_OFFSET = 96;
+    static final int VP_VERSION = 3;
+    static final int VP_BLOCK_LENGTH = 4096;
+    static final int VP_BLOCK_INDEX_INTERVAL = 2;
+    static final int VP_ITSP_HEADER_LENGTH = 84;
+    static final int VP_INDEX_DEPTH = 1;
+    static final int VP_INDEX_HEAD = 0;
+    static final int VP_INDEX_ROOT = -1;
+    static final int VP_UNKNOWN_NUM_BLOCKS = -1;
+    static final int VP_ITSP_UNKNOWN_000C = 10;
+    static final int VP_ITSP_UNKNOWN_0024 = 0;
+    static final int VP_ITSP_UNKNOWN_002C = 1;
+    static final int VP_ITSP_BYTEARR_LEN = 16;
+    static final int VP_ITSP_VERSION = 1;
+    static final int VP_RESET_INTERVAL = 2;
+    static final int VP_CONTROL_DATA_SIZE = 6;
+    static final int VP_UNKNOWN_18 = 0;
+    static final int VP_CONTROL_DATA_VERSION = 2;
+    static final int VP_WINDOW_SIZE = 65536;
+    static final int VP_WINDOWS_PER_RESET = 1;
+    static final int VP_CHM_ENTITIES_NUMBER = 100; //updated by Hawking
+    static final int VP_PMGI_FREE_SPACE = 3;
+    static final int VP_PMGL_BLOCK_NEXT = -1;
+    static final int VP_PMGL_BLOCK_PREV = -1;
+    static final int VP_PMGL_FREE_SPACE = 1644;
+    static final int VP_PMGL_UNKNOWN_008 = 0;
+    static final int VP_RESET_TABLE_BA = 12;
+    static final int VP_RES_TBL_BLOCK_LENGTH = 32768;
+    static final int VP_RES_TBL_COMPR_LENGTH = 177408;
+    static final int VP_RES_TBL_UNCOMP_LENGTH = 383786;
+    static final int VP_TBL_OFFSET = 40;
+    static final int VP_RES_TBL_UNKNOWN = 8;
+    static final int VP_RES_TBL_VERSION = 2;
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.tika.parser.chm.accessor.ChmPmgiHeader; +import org.junit.Before; +import org.junit.Test; + +public class TestPmgiHeader { + ChmPmgiHeader chmPmgiHeader = null; + + @Before + public void setUp() throws Exception { + byte[] data = TestParameters.chmData; + chmPmgiHeader = new ChmPmgiHeader(); + chmPmgiHeader.parse(data, chmPmgiHeader); + } + + @Test + public void testToString() { + assertTrue((chmPmgiHeader != null) && (chmPmgiHeader.toString().length() > 0)); + } + + @Test + public void testGetFreeSpace() { + assertEquals(TestParameters.VP_PMGI_FREE_SPACE, chmPmgiHeader.getFreeSpace()); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java?rev=1725014&view=auto ============================================================================== --- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.chm; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.tika.parser.chm.accessor.ChmPmglHeader; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.junit.Before; +import org.junit.Test; + +public class TestPmglHeader { + ChmPmglHeader chmPmglHeader = null; + + @Before + public void setUp() throws Exception { + byte[] data = TestParameters.chmData; + chmPmglHeader = new ChmPmglHeader(); + chmPmglHeader.parse(ChmCommons.copyOfRange(data, + ChmConstants.START_PMGL, ChmConstants.START_PMGL + + ChmConstants.CHM_PMGL_LEN + 10), chmPmglHeader); + } + + @Test + public void testToString() { + assertTrue((chmPmglHeader != null) + && chmPmglHeader.toString().length() > 0); + } + + @Test + public void testChmPmglHeaderGet() { + assertEquals(TestParameters.VP_PMGL_SIGNATURE, new String( + chmPmglHeader.getSignature(), UTF_8)); + } + + @Test + public void testGetBlockNext() { + assertEquals(TestParameters.VP_PMGL_BLOCK_NEXT, + chmPmglHeader.getBlockNext()); + } + + @Test + public void testGetBlockPrev() { + assertEquals(TestParameters.VP_PMGL_BLOCK_PREV, + chmPmglHeader.getBlockPrev()); + } + + @Test + public void testGetFreeSpace() { + assertEquals(TestParameters.VP_PMGL_FREE_SPACE, + chmPmglHeader.getFreeSpace()); + } + + @Test + public void testGetUnknown0008() { + assertEquals(TestParameters.VP_PMGL_UNKNOWN_008, + chmPmglHeader.getUnknown0008()); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=1725014&view=auto ============================================================================== --- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.mbox; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; +import java.util.Map; + +import org.apache.tika.detect.TypeDetector; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Before; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class MboxParserTest { + + protected ParseContext recursingContext; + private Parser autoDetectParser; + private TypeDetector typeDetector; + private MboxParser mboxParser; + + private static InputStream getStream(String name) { + return MboxParserTest.class.getClass().getResourceAsStream(name); + } + + @Before + public void setUp() throws Exception { + typeDetector = new TypeDetector(); + autoDetectParser = new AutoDetectParser(typeDetector); + recursingContext = new ParseContext(); + recursingContext.set(Parser.class, autoDetectParser); + + mboxParser = new MboxParser(); + mboxParser.setTracking(true); + } + + @Test + public void testSimple() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/simple.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + String content = handler.toString(); + assertContains("Test content 1", content); + assertContains("Test content 2", content); + assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE)); + + Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata(); + assertEquals("Nb. 
Of mails", 2, mailsMetadata.size()); + + Metadata mail1 = mailsMetadata.get(0); + assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE)); + assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from")); + + Metadata mail2 = mailsMetadata.get(1); + assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE)); + assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from")); + } + + @Test + public void testHeaders() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/headers.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + assertContains("Test content", handler.toString()); + assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size()); + + Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); + + assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED)); + assertEquals("<[email protected]>", mailMetadata.get(TikaCoreProperties.CREATOR)); + assertEquals("subject", mailMetadata.get(Metadata.SUBJECT)); + assertEquals("<[email protected]>", mailMetadata.get(Metadata.AUTHOR)); + assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE)); + assertEquals("[email protected]", mailMetadata.get("Message-From")); + assertEquals("<[email protected]>", mailMetadata.get("MboxParser-return-path")); + } + + @Test + public void testMultilineHeader() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/multiline.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("Nb. 
Of mails", 1, mboxParser.getTrackingMetadata().size()); + + Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); + assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received")); + } + + @Test + public void testQuoted() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/quoted.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + assertContains("Test content", handler.toString()); + assertContains("> quoted stuff", handler.toString()); + } + + @Test + public void testComplex() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/complex.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size()); + + Metadata firstMail = mboxParser.getTrackingMetadata().get(0); + assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT)); + assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE)); + assertEquals("Jothi Padmanabhan <[email protected]>", firstMail.get(Metadata.AUTHOR)); + assertEquals("Jothi Padmanabhan <[email protected]>", firstMail.get(TikaCoreProperties.CREATOR)); + assertEquals("[email protected]", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS)); + + assertContains("When a Mapper completes", handler.toString()); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java?rev=1725014&view=auto 
============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.mbox; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import org.apache.tika.TikaTest; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.ToHTMLContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class OutlookPSTParserTest extends TikaTest { + + private Parser parser = new OutlookPSTParser(); + + @Test + public void testAccept() throws Exception { + assertTrue((parser.getSupportedTypes(null).contains(MediaType.application("vnd.ms-outlook-pst")))); + } + + @Test + public void testParse() throws Exception { + Parser pstParser = new AutoDetectParser(); + Metadata metadata = new Metadata(); + ContentHandler handler = new ToHTMLContentHandler(); + + ParseContext context = new ParseContext(); + EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context); + context.set(EmbeddedDocumentExtractor.class, trackingExtrator); + context.set(Parser.class, new AutoDetectParser()); + + pstParser.parse(getResourceAsStream("/test-documents/testPST.pst"), handler, metadata, context); + + String output = handler.toString(); + + assertFalse(output.isEmpty()); + assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">")); + assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">")); + + assertTrue(output.contains("<body><div 
class=\"email-folder\"><h1>")); + assertTrue(output.contains("<div class=\"embedded\" id=\"<[email protected]>\"><h1>Re: Feature Generators</h1>")); + assertTrue(output.contains("<div class=\"embedded\" id=\"<[email protected]>\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>")); + assertTrue(output.contains("Gary Murphy commented on TIKA-1250:")); + + assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>")); + + + List<Metadata> metaList = trackingExtrator.trackingMetadata; + assertEquals(6, metaList.size()); + + Metadata firstMail = metaList.get(0); + assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR)); + assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE)); + assertEquals("[email protected]", firstMail.get("senderEmailAddress")); + assertEquals("[email protected]", firstMail.get("displayTo")); + assertEquals("", firstMail.get("displayCC")); + assertEquals("", firstMail.get("displayBCC")); + } + + + private class EmbeddedTrackingExtrator extends ParsingEmbeddedDocumentExtractor { + List<Metadata> trackingMetadata = new ArrayList<Metadata>(); + + public EmbeddedTrackingExtrator(ParseContext context) { + super(context); + } + + @Override + public boolean shouldParseEmbedded(Metadata metadata) { + return true; + } + + @Override + public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { + this.trackingMetadata.add(metadata); + super.parseEmbedded(stream, handler, metadata, outputHtml); + } + + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java?rev=1725014&view=auto 
============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import java.net.URL; + +import org.apache.tika.TikaTest.TrackingHandler; +import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.mime.MediaType; + +/** + * Parent class of tests that the various POI powered parsers are + * able to extract their embedded contents. 
+ */ +public abstract class AbstractPOIContainerExtractionTest { + public static final MediaType TYPE_DOC = MediaType.application("msword"); + public static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint"); + public static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel"); + public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"); + public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"); + public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + public static final MediaType TYPE_MSG = MediaType.application("vnd.ms-outlook"); + + public static final MediaType TYPE_TXT = MediaType.text("plain"); + public static final MediaType TYPE_PDF = MediaType.application("pdf"); + + public static final MediaType TYPE_JPG = MediaType.image("jpeg"); + public static final MediaType TYPE_GIF = MediaType.image("gif"); + public static final MediaType TYPE_PNG = MediaType.image("png"); + public static final MediaType TYPE_EMF = MediaType.application("x-emf"); + public static final MediaType TYPE_WMF = MediaType.application("x-msmetafile"); + + protected static TikaInputStream getTestFile(String filename) throws Exception { + URL input = AbstractPOIContainerExtractionTest.class.getResource( + "/test-documents/" + filename); + assertNotNull(filename + " not found", input); + + return TikaInputStream.get(input); + } + + protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception { + try (TikaInputStream stream = getTestFile(filename)) { + assertEquals(true, extractor.isSupported(stream)); + + // Process it + TrackingHandler handler = new TrackingHandler(); + if (recurse) { + extractor.extract(stream, extractor, handler); + } else { + extractor.extract(stream, null, handler); + } + + // So 
they can check what happened + return handler; + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,443 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft; + +import static org.apache.tika.TikaTest.assertContains; +import static org.apache.tika.TikaTest.assertNotContained; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.InputStream; +import java.util.Locale; + +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class ExcelParserTest { + @Test + @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys + public void testExcelParser() throws Exception { + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL.xls")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + + assertEquals( + "application/vnd.ms-excel", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); + + // Mon Oct 01 17:13:56 BST 2007 + assertEquals("2007-10-01T16:13:56Z", metadata.get(TikaCoreProperties.CREATED)); + 
assertEquals("2007-10-01T16:13:56Z", metadata.get(Metadata.CREATION_DATE)); + + // Mon Oct 01 17:31:43 BST 2007 + assertEquals("2007-10-01T16:31:43Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2007-10-01T16:31:43Z", metadata.get(Metadata.DATE)); + + String content = handler.toString(); + assertContains("Sample Excel Worksheet", content); + assertContains("Numbers and their Squares", content); + assertContains("\t\tNumber\tSquare", content); + assertContains("9", content); + assertNotContained("9.0", content); + assertContains("196", content); + assertNotContained("196.0", content); + } + } + + @Test + public void testExcelParserFormatting() throws Exception { + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL-formats.xls")) { + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + ContentHandler handler = new BodyContentHandler(); + new OfficeParser().parse(input, handler, metadata, context); + + assertEquals( + "application/vnd.ms-excel", + metadata.get(Metadata.CONTENT_TYPE)); + + String content = handler.toString(); + + // Number #,##0.00 + assertContains("1,599.99", content); + assertContains("-1,599.99", content); + + // Currency $#,##0.00;[Red]($#,##0.00) + assertContains("$1,599.99", content); + assertContains("($1,599.99)", content); + + // Scientific 0.00E+00 + // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08 + assertTrue(content.contains("1.98E08") || content.contains("1.98E+08")); + assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08")); + + // Percentage. 
+ assertContains("2.50%", content); + // Excel rounds up to 3%, but that requires Java 1.6 or later + if (System.getProperty("java.version").startsWith("1.5")) { + assertContains("2%", content); + } else { + assertContains("3%", content); + } + + // Time Format: h:mm + assertContains("6:15", content); + assertContains("18:15", content); + + // Date Format: d-mmm-yy + assertContains("17-May-07", content); + + // Date Format: m/d/yy + assertContains("10/3/09", content); + + // Date/Time Format: m/d/yy h:mm + assertContains("1/19/08 4:35", content); + + // Fraction (2.5): # ?/? + assertContains("2 1/2", content); + + + // Below assertions represent outstanding formatting issues to be addressed + // they are included to allow the issues to be progressed with the Apache POI + // team - See TIKA-103. + + /************************************************************************* + // Custom Number (0 "dollars and" .00 "cents") + assertContains("19 dollars and .99 cents", content); + + // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy) + assertContains("At 4:20 AM on Thursday May 17, 2007", content); + **************************************************************************/ + + } + } + + @Test + public void testExcelParserPassword() throws Exception { + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL_protected_passtika.xls")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + fail("Document is encrypted, shouldn't parse"); + } catch (EncryptedDocumentException e) { + // Good + } + + // Try again, this time with the password + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL_protected_passtika.xls")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new 
BodyContentHandler(); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + context.set(PasswordProvider.class, new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "tika"; + } + }); + new OfficeParser().parse(input, handler, metadata, context); + + assertEquals( + "application/vnd.ms-excel", + metadata.get(Metadata.CONTENT_TYPE)); + + assertEquals(null, metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Antoni", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("2011-11-25T09:52:48Z", metadata.get(TikaCoreProperties.CREATED)); + + String content = handler.toString(); + assertContains("This is an Encrypted Excel spreadsheet", content); + assertNotContained("9.0", content); + } + } + + /** + * TIKA-214 - Ensure we extract labels etc from Charts + */ + @Test + public void testExcelParserCharts() throws Exception { + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL-charts.xls")) { + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + ContentHandler handler = new BodyContentHandler(); + new OfficeParser().parse(input, handler, metadata, context); + + assertEquals( + "application/vnd.ms-excel", + metadata.get(Metadata.CONTENT_TYPE)); + + String content = handler.toString(); + + // The first sheet has a pie chart + assertContains("charttabyodawg", content); + assertContains("WhamPuff", content); + + // The second sheet has a bar chart and some text + assertContains("Sheet1", content); + assertContains("Test Excel Spreasheet", content); + assertContains("foo", content); + assertContains("bar", content); + assertContains("fizzlepuff", content); + assertContains("whyaxis", content); + assertContains("eksaxis", content); + + // The third sheet has some text + assertContains("Sheet2", content); + assertContains("dingdong", content); + } + } + + /** + * Parses jxl.xls with the OfficeParser (US locale, unlimited + * body handler) and checks that it is reported as + * application/vnd.ms-excel and that the extracted text + * contains "Number Formats". + */ + @Test + public void
testJXL() throws Exception { + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/jxl.xls")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + + assertEquals( + "application/vnd.ms-excel", + metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("Number Formats", content); + } + } + + /** + * Parses a Works 7.0 spreadsheet (.xlr) with the OfficeParser + * and checks that the extracted text contains "Microsoft Works". + * Only the text is checked here, not the content type. + */ + @Test + public void testWorksSpreadsheet70() throws Exception { + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testWORKSSpreadsheet7.0.xlr")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + + String content = handler.toString(); + assertContains("Microsoft Works", content); + } + } + + /** + * We don't currently support the .xlsb file format + * (an OOXML container with binary blobs), but we + * shouldn't break on these files either (TIKA-826) + */ + @Test + public void testExcelXLSB() throws Exception { + Detector detector = new DefaultDetector(); + AutoDetectParser parser = new AutoDetectParser(); + + Metadata m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb"); + + // Should be detected correctly + MediaType type; + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL.xlsb")) { + type = detector.detect(input, m); + assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString()); + } + + // OfficeParser won't handle it + assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); + + // OOXMLParser won't handle it + assertEquals(false, (new
OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); + + // AutoDetectParser doesn't break on it + try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + parser.parse(input, handler, m, context); + + String content = handler.toString(); + assertEquals("", content); + } + } + + /** + * Excel 5 and 95 are older formats, and only get basic support + */ + @Test + public void testExcel95() throws Exception { + Detector detector = new DefaultDetector(); + AutoDetectParser parser = new AutoDetectParser(); + MediaType type; + Metadata m; + + // First try detection of Excel 5 + m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls"); + try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) { + type = detector.detect(input, m); + assertEquals("application/vnd.ms-excel", type.toString()); + } + + // Now Excel 95 + m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls"); + try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) { + type = detector.detect(input, m); + assertEquals("application/vnd.ms-excel", type.toString()); + } + + // OfficeParser can handle it + assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); + + // OOXMLParser won't handle it + assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); + + + // Parse the Excel 5 file + m = new Metadata(); + try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + parser.parse(input, handler, m, context); + + String content =
handler.toString(); + + // Sheet names + assertContains("Feuil1", content); + assertContains("Feuil3", content); + + // Text + assertContains("Sample Excel", content); + assertContains("Number", content); + + // Numbers + assertContains("15", content); + assertContains("225", content); + + // Metadata was also fetched + assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE)); + assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR)); + } + + // Parse the Excel 95 file + m = new Metadata(); + try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + parser.parse(input, handler, m, context); + + String content = handler.toString(); + + // Sheet name + assertContains("Foglio1", content); + + // Very boring file, no actual text or numbers! + + // Metadata was also fetched + assertEquals(null, m.get(TikaCoreProperties.TITLE)); + assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR)); + } + } + + /** + * Ensures that custom OLE2 (HPSF) properties are extracted + */ + @Test + public void testCustomProperties() throws Exception { + Metadata metadata = new Metadata(); + + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL_custom_props.xls")) { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + } + + assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED)); +
assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("true", metadata.get("custom:myCustomBoolean")); + assertEquals("3", metadata.get("custom:myCustomNumber")); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); + assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); + } + + /** + * Parses testEXCEL_headers_footers.xls with the OfficeParser + * (note: UK locale here, unlike the other tests) and checks the + * content type, the title/creator/author metadata, a few cell + * values, and that each of the "Header - ..." and "Footer - ..." + * strings appears in the extracted text. + */ + @Test + public void testHeaderAndFooterExtraction() throws Exception { + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL_headers_footers.xls")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.UK); + new OfficeParser().parse(input, handler, metadata, context); + + assertEquals( + "application/vnd.ms-excel", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR)); + + String content = handler.toString(); + assertContains("John Smith1", content); + assertContains("John Smith50", content); + assertContains("1 Corporate HQ", content); + assertContains("Header - Corporate Spreadsheet", content); + assertContains("Header - For Internal Use Only", content); + assertContains("Header - Author: John Smith", content); + assertContains("Footer - Corporate Spreadsheet", content); + assertContains("Footer - For Internal Use Only", content); + assertContains("Footer - Author: John Smith", content); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.microsoft; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.TikaTest; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.junit.Test; +import org.xml.sax.helpers.DefaultHandler; + +public class JackcessParserTest extends TikaTest { + + @Test + public void testBasic() throws Exception { + + Parser p = new AutoDetectParser(); + + RecursiveParserWrapper w = new RecursiveParserWrapper(p, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + + for (String fName : new String[]{"testAccess2.accdb", "testAccess2_2000.mdb", + "testAccess2_2002-2003.mdb"}) { + InputStream is = null; + try { + is = this.getResourceAsStream("/test-documents/" + fName); + + Metadata meta = new Metadata(); + ParseContext c = new ParseContext(); + w.parse(is, new DefaultHandler(), meta, c); + } finally { + IOUtils.closeQuietly(is); + } + List<Metadata> list = w.getMetadata(); + assertEquals(4, list.size()); + String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + + //make sure there's a thead and tbody + assertContains("</thead><tbody>", mainContent); + + //assert table header + assertContains("<th>ShortTextField</th>", mainContent); + + //test date format + assertContains("6/24/15", mainContent); + + //test that markup is stripped + 
assertContains("over the bold italic dog", mainContent); + + //test unicode + assertContains("\u666E\u6797\u65AF\u987F\u5927\u5B66", mainContent); + + //test embedded document handling + assertContains("Test Document with embedded pdf", + list.get(3).get(RecursiveParserWrapper.TIKA_CONTENT)); + + w.reset(); + } + } + + @Test + public void testPassword() throws Exception { + ParseContext c = new ParseContext(); + c.set(PasswordProvider.class, new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "tika"; + } + }); + Parser p = new AutoDetectParser(); + String content = null; + try (InputStream is = + this.getResourceAsStream( + "/test-documents/testAccess2_encrypted.accdb")){ + content = getText(is, p, c); + } + assertContains("red and brown", content); + + //now try wrong password + c.set(PasswordProvider.class, new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "WRONG"; + } + }); + + boolean ex = false; + try (InputStream is = + this.getResourceAsStream( + "/test-documents/testAccess2_encrypted.accdb")){ + getText(is, p, c); + } catch (EncryptedDocumentException e) { + ex = true; + } + assertTrue("failed to throw encrypted document exception for wrong password", ex); + + //now try null + c.set(PasswordProvider.class, new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return null; + } + }); + + ex = false; + try (InputStream is = + this.getResourceAsStream( + "/test-documents/testAccess2_encrypted.accdb")){ + getText(is, p, c); + } catch (EncryptedDocumentException e) { + ex = true; + } + assertTrue("failed to throw encrypted document exception for null password", ex); + + + //now try missing password provider + c = new ParseContext(); + ex = false; + try (InputStream is = + this.getResourceAsStream( + "/test-documents/testAccess2_encrypted.accdb")){ + getText(is, p, c); + } catch (EncryptedDocumentException e) { + ex = true; + } + 
assertTrue("failed to throw encrypted document exception for missing password provider", ex); + + //now try password on file that doesn't need a password + c = new ParseContext(); + c.set(PasswordProvider.class, new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "tika"; + } + }); + ex = false; + try (InputStream is = + this.getResourceAsStream( + "/test-documents/testAccess2.accdb")){ + content = getText(is, p, c); + } catch (EncryptedDocumentException e) { + ex = true; + } + assertFalse("shouldn't have thrown encrypted document exception for "+ + "opening unencrypted file that doesn't need passowrd", ex); + assertContains("red and brown", content); + } + + @Test + public void testReadOnly() throws Exception { + //TIKA-1681: just make sure an exception is not thrown + XMLResult r = getXML("testAccess_V1997.mdb"); + assertContains("hijklmnop", r.xml); + } + + @Test + public void testMetadata() throws Exception { + //basic tests for normalized metadata + XMLResult r = getXML("testAccess_V1997.mdb"); + assertEquals("tmccune", r.metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Health Market Science", r.metadata.get(OfficeOpenXMLExtended.COMPANY)); + assertEquals("test", r.metadata.get(TikaCoreProperties.TITLE)); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java Sat Jan 16 18:23:01 
2016 @@ -0,0 +1,46 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import static org.junit.Assert.assertTrue;

import java.io.InputStream;

import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
import org.junit.Test;


/**
 * Smoke test for {@link OfficeParser} on a legacy Word document.
 */
public class OfficeParserTest extends TikaTest {

    /**
     * Parses test.doc directly with the {@link OfficeParser} and checks
     * that the XML output contains the word "test".
     */
    @Test
    public void parseOfficeWord() throws Exception {
        Metadata metadata = new Metadata();
        Parser parser = new OfficeParser();

        // Scope the stream here so it is released even if parsing throws.
        try (InputStream input = getTestDocument("test.doc")) {
            String xml = getXML(input, parser, metadata).xml;

            // assertContains reports the haystack when the needle is missing
            assertContains("test", xml);
        }
    }

    /**
     * Opens {@code /test-documents/<name>} from the classpath, wrapped in
     * a {@link TikaInputStream}. The caller is responsible for closing it.
     */
    private InputStream getTestDocument(String name) {
        return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
    }
}
