Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.xml; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + /** Tests for DcXMLParser: metadata and body text extracted from the testXML.xml fixture, plus a TIKA-1048 regression check that goes through the inherited getXML helper. */ +public class DcXMLParserTest extends TikaTest { + /** Parses testXML.xml with a BodyContentHandler and checks the content type, the title, creator, keyword and subject values, the remaining descriptive properties, the creation date, and that the title text appears in the body. NOTE(review): the assertEquals(true, ...) checks could use the statically imported assertTrue instead. */ + @Test + public void testXMLParserAsciiChars() throws Exception { + try (InputStream input = DcXMLParserTest.class.getResourceAsStream( + "/test-documents/testXML.xml")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new DcXMLParser().parse(input, handler, metadata); + + assertEquals( + "application/xml", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR)); + + // The file contains 5 dc:subject tags, which come through as + // a multi-valued Tika Metadata entry in file order + assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS)); + assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length); + assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]); + assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]); + assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]); + assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]); + assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]); + assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT)); + assertEquals(5, metadata.getValues(Metadata.SUBJECT).length); + assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]); + assertEquals("XML", 
metadata.getValues(Metadata.SUBJECT)[1]); + assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]); + assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]); + assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]); + + assertEquals( + "Framework d\'indexation des documents XML, HTML, PDF etc..", + metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals( + "http://www.apache.org", + metadata.get(TikaCoreProperties.IDENTIFIER)); + assertEquals("test", metadata.get(TikaCoreProperties.TYPE)); + assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT)); + assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE)); + assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars")); + + String content = handler.toString(); + assertContains("Tika test document", content); + + assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED)); + } + } + /** Parses testXML.xml again, this time with a plain SAX DefaultHandler so only metadata is inspected, and checks that the rights value keeps its accented characters exactly. */ + @Test + public void testXMLParserNonAsciiChars() throws Exception { + try (InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) { + Metadata metadata = new Metadata(); + new DcXMLParser().parse(input, new DefaultHandler(), metadata); + + final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9"; + assertEquals(expected, metadata.get(TikaCoreProperties.RIGHTS)); + } + } + + // TIKA-1048: the XML returned by getXML for testXML2.xml must not contain the text testSubject + @Test + public void testNoSpaces() throws Exception { + String text = getXML("testXML2.xml").xml; + assertFalse(text.contains("testSubject")); + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,126 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.xml; + +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.TeeContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + /** Tests how XMLParser subclasses built on ElementMetadataHandler record empty and repeated element values, using the testXML3.xml fixture and two custom FirstName and LastName properties. */ +public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest { + /** Text-bag properties that the test parsers fill from the FirstName and LastName elements. NOTE(review): these are named like constants but declared as non-final instance fields, and the inner parser classes read them through the enclosing instance; confirm before making them static final. */ + private Property FIRST_NAME = Property.internalTextBag( + "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName"); + private Property LAST_NAME = Property.internalTextBag( + "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName"); + /** Parses testXML3.xml with DefaultCustomXMLTestParser and expects four first names but only two last names, with values in document order. */ + @Test + public void testDefaultBehavior() throws Exception { + try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream( + "/test-documents/testXML3.xml")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals(4, metadata.getValues(FIRST_NAME).length); + assertEquals(2, metadata.getValues(LAST_NAME).length); + + assertEquals("John", metadata.getValues(FIRST_NAME)[0]); + assertEquals("Smith", metadata.getValues(LAST_NAME)[0]); + + assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]); + assertEquals("Doe", metadata.getValues(LAST_NAME)[1]); + + // We didn't know Bob's last name, but now we don't know an entry existed + assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]); + + // We don't know Kate's last name because it was a duplicate + assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]); + } + } + /** Parses testXML3.xml with AllowEmptiesAndDuplicatesCustomXMLTestParser and expects four first names and four last names, including an empty string and a second Smith. */ + @Test + public void testEmptiesAndRepeats() throws Exception { + try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream( + "/test-documents/testXML3.xml")) { + Metadata metadata = new Metadata(); + 
ContentHandler handler = new BodyContentHandler(); + new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals(4, metadata.getValues(FIRST_NAME).length); + assertEquals(4, metadata.getValues(LAST_NAME).length); + + assertEquals("John", metadata.getValues(FIRST_NAME)[0]); + assertEquals("Smith", metadata.getValues(LAST_NAME)[0]); + + assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]); + assertEquals("Doe", metadata.getValues(LAST_NAME)[1]); + + assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]); + assertEquals("", metadata.getValues(LAST_NAME)[2]); + + assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]); + assertEquals("Smith", metadata.getValues(LAST_NAME)[3]); + } + } + /** XMLParser subclass whose content handler tees the superclass handler with two ElementMetadataHandler instances, one for FirstName and one for LastName in the custom namespace, built with the four-argument constructor. NOTE(review): getContentHandler calls super.getContentHandler, so it overrides a superclass method but carries no Override annotation. */ + private class DefaultCustomXMLTestParser extends XMLParser { + + private static final long serialVersionUID = 2458579047014545931L; + + protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) { + return new ElementMetadataHandler( + "http://custom", + localPart, + metadata, + tikaProperty); + } + + protected ContentHandler getContentHandler( + ContentHandler handler, Metadata metadata, ParseContext context) { + return new TeeContentHandler( + super.getContentHandler(handler, metadata, context), + getCustomElementHandler(metadata, FIRST_NAME, "FirstName"), + getCustomElementHandler(metadata, LAST_NAME, "LastName")); + } + } + /** Variant of DefaultCustomXMLTestParser that builds its ElementMetadataHandler instances with two extra true arguments. NOTE(review): presumably these flags allow empty and duplicate values, as the class name suggests; confirm their order and meaning against ElementMetadataHandler. The overriding getCustomElementHandler also lacks an Override annotation. */ + private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser { + + private static final long serialVersionUID = 3735646809954466229L; + + protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) { + return new ElementMetadataHandler( + "http://custom", + localPart, + metadata, + tikaProperty, + true, + true); + } + } + + +} Added: 
tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.xml; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.TikaTest.TrackingHandler; +import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + /** Tests FictionBookParser text extraction and container extraction of embedded resources, both against the test.fb2 fixture. */ +public class FictionBookParserTest { + /** Parses test.fb2 with a BodyContentHandler and checks that the extracted text contains 1812. */ + @Test + public void testFB2() throws Exception { + try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new FictionBookParser().parse(input, handler, metadata, new ParseContext()); + String content = handler.toString(); + + assertContains("1812", content); + } + } + /** Runs a ParserContainerExtractor over test.fb2 and expects the TrackingHandler to have recorded two filenames. NOTE(review): only the raw InputStream is managed by try-with-resources; the TikaInputStream wrapper is never closed itself, so consider declaring it in the try header. The assertEquals(true, ...) check could also be an assertTrue. */ + @Test + public void testEmbedded() throws Exception { + try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) { + ContainerExtractor extractor = new ParserContainerExtractor(); + TikaInputStream stream = TikaInputStream.get(input); + + assertEquals(true, extractor.isSupported(stream)); + + // Process it + TrackingHandler handler = new TrackingHandler(); + extractor.extract(stream, null, handler); + + assertEquals(2, handler.filenames.size()); + } + } +} Modified: tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml?rev=1725014&r1=1725011&r2=1725014&view=diff ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml (original) +++ 
tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml Sat Jan 16 18:23:01 2016 @@ -19,8 +19,8 @@ <version>2.0-SNAPSHOT</version> </parent> - <artifactId>tika-web-module</artifactId> - <name>Apache Tika Web Module</name> + <artifactId>tika-web-parser-module</artifactId> + <name>Apache Tika Web Parser Module</name> <url>http://tika.apache.org/</url> <properties> @@ -73,13 +73,13 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-text-module</artifactId> + <artifactId>tika-text-parser-module</artifactId> <version>${project.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-package-module</artifactId> + <artifactId>tika-package-parser-module</artifactId> <version>${project.version}</version> <scope>test</scope> </dependency> Modified: tika/branches/2.x/tika-parsers/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parsers/pom.xml?rev=1725014&r1=1725013&r2=1725014&view=diff ============================================================================== --- tika/branches/2.x/tika-parsers/pom.xml (original) +++ tika/branches/2.x/tika-parsers/pom.xml Sat Jan 16 18:23:01 2016 @@ -147,67 +147,67 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-multimedia-module</artifactId> + <artifactId>tika-multimedia-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-advanced-module</artifactId> + <artifactId>tika-advanced-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-cad-module</artifactId> + <artifactId>tika-cad-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-code-module</artifactId> + 
<artifactId>tika-code-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-database-module</artifactId> + <artifactId>tika-database-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-ebook-module</artifactId> + <artifactId>tika-ebook-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-journal-module</artifactId> + <artifactId>tika-journal-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-office-module</artifactId> + <artifactId>tika-office-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-package-module</artifactId> + <artifactId>tika-package-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-pdf-module</artifactId> + <artifactId>tika-pdf-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-scientific-module</artifactId> + <artifactId>tika-scientific-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-text-module</artifactId> + <artifactId>tika-text-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-web-module</artifactId> + <artifactId>tika-web-parser-module</artifactId> <version>${project.version}</version> </dependency> <dependency> @@ -322,19 +322,19 @@ </createDependencyReducedPom> <artifactSet> 
<includes> - <include>org.apache.tika:tika-multimedia-module</include> - <include>org.apache.tika:tika-advanced-module</include> - <include>org.apache.tika:tika-cad-module</include> - <include>org.apache.tika:tika-code-module</include> - <include>org.apache.tika:tika-database-module</include> - <include>org.apache.tika:tika-ebook-module</include> - <include>org.apache.tika:tika-journal-module</include> - <include>org.apache.tika:tika-office-module</include> - <include>org.apache.tika:tika-package-module</include> - <include>org.apache.tika:tika-pdf-module</include> - <include>org.apache.tika:tika-scientific-module</include> - <include>org.apache.tika:tika-text-module</include> - <include>org.apache.tika:tika-web-module</include> + <include>org.apache.tika:tika-multimedia-parser-module</include> + <include>org.apache.tika:tika-advanced-parser-module</include> + <include>org.apache.tika:tika-cad-parser-module</include> + <include>org.apache.tika:tika-code-parser-module</include> + <include>org.apache.tika:tika-database-parser-module</include> + <include>org.apache.tika:tika-ebook-parser-module</include> + <include>org.apache.tika:tika-journal-parser-module</include> + <include>org.apache.tika:tika-office-parser-module</include> + <include>org.apache.tika:tika-package-parser-module</include> + <include>org.apache.tika:tika-pdf-parser-module</include> + <include>org.apache.tika:tika-scientific-parser-module</include> + <include>org.apache.tika:tika-text-parser-module</include> + <include>org.apache.tika:tika-web-parser-module</include> </includes> </artifactSet> <transformers>
