TIKA-2020, remove 3 parameter parse() and simplify CAD tests
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0c71b2ff Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0c71b2ff Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0c71b2ff Branch: refs/heads/2.x Commit: 0c71b2ffc97a3907a541fdd164ba79302f5c0637 Parents: 6bb6827 Author: tballison <[email protected]> Authored: Fri Jun 24 11:13:54 2016 -0400 Committer: tballison <[email protected]> Committed: Fri Jun 24 11:13:54 2016 -0400 ---------------------------------------------------------------------- .../apache/tika/parser/dwg/DWGParserTest.java | 372 +++++++++---------- .../apache/tika/parser/prt/PRTParserTest.java | 214 +++++------ 2 files changed, 271 insertions(+), 315 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/0c71b2ff/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java index 321d715..ee3e767 100644 --- a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java +++ b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java @@ -1,202 +1,170 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.dwg; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; -import static org.apache.tika.TikaTest.assertContains; - -import java.io.InputStream; - -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.sax.BodyContentHandler; -import org.junit.Test; -import org.xml.sax.ContentHandler; - -public class DWGParserTest { - - @Test - public void testDWG2000Parser() throws Exception { - InputStream input = DWGParserTest.class.getResourceAsStream( - "/test-documents/testDWG2000.dwg"); - testParserAlt(input); - } - - @Test - public void testDWG2004Parser() throws Exception { - InputStream input = DWGParserTest.class.getResourceAsStream( - "/test-documents/testDWG2004.dwg"); - testParser(input); - } - - @Test - public void testDWG2004ParserNoHeaderAddress() throws Exception { - InputStream input = DWGParserTest.class.getResourceAsStream( - "/test-documents/testDWG2004_no_header.dwg"); - testParserNoHeader(input); - } - - @Test - public void testDWG2007Parser() throws Exception { - InputStream input = DWGParserTest.class.getResourceAsStream( - "/test-documents/testDWG2007.dwg"); - testParser(input); - } - - @Test - public void testDWG2010Parser() throws Exception { - InputStream input = DWGParserTest.class.getResourceAsStream( - "/test-documents/testDWG2010.dwg"); - testParser(input); - } - - @Test - public void testDWG2010CustomPropertiesParser() throws Exception { - // Check that standard parsing works - InputStream testInput = DWGParserTest.class.getResourceAsStream( - "/test-documents/testDWG2010_custom_props.dwg"); - testParser(testInput); - - // Check that custom properties with alternate padding work - try (InputStream input = DWGParserTest.class.getResourceAsStream( - "/test-documents/testDWG2010_custom_props.dwg")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new DWGParser().parse(input, handler, metadata, null); - - assertEquals("valueforcustomprop1", - metadata.get("customprop1")); - assertEquals("valueforcustomprop2", - metadata.get("customprop2")); - } - } - - @Test - public void testDWGMechParser() throws Exception { - String[] types = new String[] { - "6", "2004", "2004DX", "2005", "2006", - "2007", "2008", "2009", "2010", "2011" - }; - for (String type : types) { - InputStream input = DWGParserTest.class.getResourceAsStream( - "/test-documents/testDWGmech"+type+".dwg"); - testParserAlt(input); - } - } - - @SuppressWarnings("deprecation") - private void testParser(InputStream input) throws Exception { - try { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new DWGParser().parse(input, handler, metadata); - - assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE)); - - assertEquals("The quick brown fox jumps over the lazy dog", - metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Gym class featuring a brown fox and lazy dog", - metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("Gym class featuring a brown fox and lazy dog", - metadata.get(Metadata.SUBJECT)); - assertEquals("Nevin Nollop", - metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Pangram, fox, dog", - metadata.get(TikaCoreProperties.KEYWORDS)); - assertEquals("Lorem ipsum", - metadata.get(TikaCoreProperties.COMMENTS).substring(0,11)); - assertEquals("http://www.alfresco.com", - metadata.get(TikaCoreProperties.RELATION)); - - // Check some of the old style metadata too - assertEquals("The quick brown fox jumps over the lazy dog", - metadata.get(Metadata.TITLE)); - assertEquals("Gym class featuring a brown fox and lazy dog", - metadata.get(Metadata.SUBJECT)); - - String content = handler.toString(); - assertContains("The quick brown fox jumps over the lazy dog", content); - assertContains("Gym class", content); - assertContains("www.alfresco.com", content); - } finally { - input.close(); - } - } - - @SuppressWarnings("deprecation") - private void testParserNoHeader(InputStream input) throws Exception { - try { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new DWGParser().parse(input, handler, metadata); - - assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE)); - - assertNull(metadata.get(TikaCoreProperties.TITLE)); - assertNull(metadata.get(TikaCoreProperties.DESCRIPTION)); - assertNull(metadata.get(Metadata.SUBJECT)); - assertNull(metadata.get(TikaCoreProperties.CREATOR)); - assertNull(metadata.get(TikaCoreProperties.KEYWORDS)); - assertNull(metadata.get(TikaCoreProperties.COMMENTS)); - assertNull(metadata.get(TikaCoreProperties.RELATION)); - - String content = handler.toString(); - assertEquals("", content); - } finally { - input.close(); - } - } - - @SuppressWarnings("deprecation") - private void testParserAlt(InputStream input) throws Exception { - try { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new DWGParser().parse(input, handler, metadata); - - assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE)); - - assertEquals("Test Title", - metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Test Subject", - metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("Test Subject", - metadata.get(Metadata.SUBJECT)); - assertEquals("My Author", - metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("My keyword1, MyKeyword2", - metadata.get(TikaCoreProperties.KEYWORDS)); - assertEquals("This is a comment", - metadata.get(TikaCoreProperties.COMMENTS)); - assertEquals("bejanpol", - metadata.get(TikaCoreProperties.MODIFIER)); - assertEquals("bejanpol", - metadata.get(Metadata.LAST_AUTHOR)); - assertEquals("http://mycompany/drawings", - metadata.get(TikaCoreProperties.RELATION)); - assertEquals("MyCustomPropertyValue", - metadata.get("MyCustomProperty")); - - String content = handler.toString(); - assertContains("This is a comment", content); - assertContains("mycompany", content); - } finally { - input.close(); - } - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dwg; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +import java.io.InputStream; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class DWGParserTest extends TikaTest { + + @Test + public void testDWG2000Parser() throws Exception { + testParserAlt("testDWG2000.dwg"); + } + + @Test + public void testDWG2004Parser() throws Exception { + testParser("testDWG2004.dwg"); + } + + @Test + public void testDWG2004ParserNoHeaderAddress() throws Exception { + testParserNoHeader("testDWG2004_no_header.dwg"); + } + + @Test + public void testDWG2007Parser() throws Exception { + testParser("testDWG2007.dwg"); + } + + @Test + public void testDWG2010Parser() throws Exception { + testParser("testDWG2010.dwg"); + } + + @Test + public void testDWG2010CustomPropertiesParser() throws Exception { + // Check that standard parsing works + testParser("testDWG2010_custom_props.dwg"); + + // Check that custom properties with alternate padding work + + XMLResult r = getXML("testDWG2010_custom_props.dwg"); + assertEquals("valueforcustomprop1", + r.metadata.get("customprop1")); + assertEquals("valueforcustomprop2", + r.metadata.get("customprop2")); + } + + @Test + public void testDWGMechParser() throws Exception { + String[] types = new String[]{ + "6", "2004", "2004DX", "2005", "2006", + "2007", "2008", "2009", "2010", "2011" + }; + for (String type : types) { + testParserAlt("testDWGmech" + type + ".dwg"); + } + } + + private void testParser(String testFileName) throws Exception { + XMLResult r = getXML(testFileName, new DWGParser()); + Metadata metadata = r.metadata; + String content = r.xml; + + + assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE)); + + assertEquals("The quick brown fox jumps over the lazy dog", + metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Gym class featuring a brown fox and lazy dog", + metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals("Gym class featuring a brown fox and lazy dog", + metadata.get(Metadata.SUBJECT)); + assertEquals("Nevin Nollop", + metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Pangram, fox, dog", + metadata.get(TikaCoreProperties.KEYWORDS)); + assertEquals("Lorem ipsum", + metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11)); + assertEquals("http://www.alfresco.com", + metadata.get(TikaCoreProperties.RELATION)); + + // Check some of the old style metadata too + assertEquals("The quick brown fox jumps over the lazy dog", + metadata.get(Metadata.TITLE)); + assertEquals("Gym class featuring a brown fox and lazy dog", + metadata.get(Metadata.SUBJECT)); + + assertContains("The quick brown fox jumps over the lazy dog", content); + assertContains("Gym class", content); + assertContains("www.alfresco.com", content); + } + + @SuppressWarnings("deprecation") + private void testParserNoHeader(String testFileName) throws Exception { + try (InputStream input = getResourceAsStream("/test-documents/" + testFileName)) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new DWGParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE)); + + assertNull(metadata.get(TikaCoreProperties.TITLE)); + assertNull(metadata.get(TikaCoreProperties.DESCRIPTION)); + assertNull(metadata.get(Metadata.SUBJECT)); + assertNull(metadata.get(TikaCoreProperties.CREATOR)); + assertNull(metadata.get(TikaCoreProperties.KEYWORDS)); + assertNull(metadata.get(TikaCoreProperties.COMMENTS)); + assertNull(metadata.get(TikaCoreProperties.RELATION)); + + String content = handler.toString(); + assertEquals("", content); + } + } + + private void testParserAlt(String testFileName) throws Exception { + XMLResult r = getXML(testFileName, new DWGParser()); + Metadata metadata = r.metadata; + String content = r.xml; + + assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE)); + + assertEquals("Test Title", + metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Test Subject", + metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals("Test Subject", + metadata.get(Metadata.SUBJECT)); + assertEquals("My Author", + metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("My keyword1, MyKeyword2", + metadata.get(TikaCoreProperties.KEYWORDS)); + assertEquals("This is a comment", + metadata.get(TikaCoreProperties.COMMENTS)); + assertEquals("bejanpol", + metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("bejanpol", + metadata.get(Metadata.LAST_AUTHOR)); + assertEquals("http://mycompany/drawings", + metadata.get(TikaCoreProperties.RELATION)); + assertEquals("MyCustomPropertyValue", + metadata.get("MyCustomProperty")); + assertContains("This is a comment", content); + assertContains("mycompany", content); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/0c71b2ff/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java index 155512c..de870ed 100644 --- a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java +++ b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java @@ -1,113 +1,101 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.prt; - -import static org.junit.Assert.assertEquals; - -import java.io.InputStream; - -import org.apache.tika.TikaTest; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.sax.BodyContentHandler; -import org.junit.Test; -import org.xml.sax.ContentHandler; - -public class PRTParserTest extends TikaTest { - /** - * Try with a simple file - */ - @Test - public void testPRTParserBasics() throws Exception { - try (InputStream input = getResourceAsStream("/test-documents/testCADKEY.prt")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new PRTParser().parse(input, handler, metadata); - - assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE)); - - // This file has a date - assertEquals("2011-06-20T16:54:00", - metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2011-06-20T16:54:00", - metadata.get(Metadata.CREATION_DATE)); - // But no description - assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION)); - - String contents = handler.toString(); - - assertContains("Front View", contents); - assertContains("Back View", contents); - assertContains("Bottom View", contents); - assertContains("Right View", contents); - assertContains("Left View", contents); - //assertContains("Isometric View", contents); // Can't detect yet - assertContains("Axonometric View", contents); - - assertContains("You've managed to extract all the text!", contents); - assertContains("This is more text", contents); - assertContains("Text Inside a PRT file", contents); - } - } - - /** - * Now a more complex one - */ - @Test - public void testPRTParserComplex() throws Exception { - try (InputStream input = getResourceAsStream("/test-documents/testCADKEY2.prt")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new PRTParser().parse(input, handler, metadata); - - assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE)); - - // File has both a date and a description - assertEquals("1997-04-01T08:59:00", - metadata.get(Metadata.DATE)); - assertEquals("1997-04-01T08:59:00", - metadata.get(Metadata.CREATION_DATE)); - assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n", - metadata.get(TikaCoreProperties.DESCRIPTION)); - - String contents = handler.toString(); - - assertContains("ITEM", contents); - assertContains("REQ.", contents); - assertContains("DESCRIPTION", contents); - assertContains("MAT'L", contents); - assertContains("TOLERANCES UNLESS", contents); - assertContains("FRACTIONS", contents); - assertContains("ANGLES", contents); - assertContains("Acme Corporation", contents); - - assertContains("DATE", contents); - assertContains("CHANGE", contents); - assertContains("DRAWN BY", contents); - assertContains("SCALE", contents); - assertContains("TIKA TEST DRAWING", contents); - assertContains("TIKA LETTERS", contents); - assertContains("5.82", contents); - assertContains("112" + '\u00b0', contents); // Degrees - assertContains("TIKA TEST LETTER", contents); - assertContains("17.11", contents); - assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter - assertContains("Diameter", contents); - assertContains("The Apache Tika toolkit", contents); - } - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.prt; + +import static org.junit.Assert.assertEquals; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.Test; + +public class PRTParserTest extends TikaTest { + /** + * Try with a simple file + */ + @Test + public void testPRTParserBasics() throws Exception { + XMLResult r = getXML("testCADKey.prt", new PRTParser()); + Metadata metadata = r.metadata; + String contents = r.xml; + assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE)); + + // This file has a date + assertEquals("2011-06-20T16:54:00", + metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2011-06-20T16:54:00", + metadata.get(Metadata.CREATION_DATE)); + // But no description + assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION)); + + assertContains("Front View", contents); + assertContains("Back View", contents); + assertContains("Bottom View", contents); + assertContains("Right View", contents); + assertContains("Left View", contents); + //assertContains("Isometric View", contents); // Can't detect yet + assertContains("Axonometric View", contents); + + assertContains("You've managed to extract all the text!", contents); + assertContains("This is more text", contents); + assertContains("Text Inside a PRT file", contents); + + } + + /** + * Now a more complex one + */ + @Test + public void testPRTParserComplex() throws Exception { + + XMLResult r = getXML("testCADKEY2.prt", new PRTParser()); + Metadata metadata = r.metadata; + String contents = r.xml; + assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE)); + + // File has both a date and a description + assertEquals("1997-04-01T08:59:00", + metadata.get(Metadata.DATE)); + assertEquals("1997-04-01T08:59:00", + metadata.get(Metadata.CREATION_DATE)); + assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n", + metadata.get(TikaCoreProperties.DESCRIPTION)); + + assertContains("ITEM", contents); + assertContains("REQ.", contents); + assertContains("DESCRIPTION", contents); + assertContains("MAT'L", contents); + assertContains("TOLERANCES UNLESS", contents); + assertContains("FRACTIONS", contents); + assertContains("ANGLES", contents); + assertContains("Acme Corporation", contents); + + assertContains("DATE", contents); + assertContains("CHANGE", contents); + assertContains("DRAWN BY", contents); + assertContains("SCALE", contents); + assertContains("TIKA TEST DRAWING", contents); + assertContains("TIKA LETTERS", contents); + assertContains("5.82", contents); + assertContains("112" + '\u00b0', contents); // Degrees + assertContains("TIKA TEST LETTER", contents); + assertContains("17.11", contents); + assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter + assertContains("Diameter", contents); + assertContains("The Apache Tika toolkit", contents); + } +}
