Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,109 @@ +package org.apache.tika.parser.jdbc; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.InputStream; +import java.sql.Blob; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Locale; + +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + + +/** + * Concrete class for SQLLite table parsing. This overrides + * column type handling from JDBCRowHandler. + * <p/> + * This class is not designed to be thread safe (because of DateFormat)! + * Need to call a new instance for each parse, as AbstractDBParser does. + * <p/> + * For now, this silently skips cells of type CLOB, because xerial's jdbc connector + * does not currently support them. + */ +class SQLite3TableReader extends JDBCTableReader { + + + DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd", Locale.ROOT); + + public SQLite3TableReader(Connection connection, String tableName, ParseContext context) { + super(connection, tableName, context); + } + + + /** + * No-op for now in {@link SQLite3TableReader}. + * + * @param tableName + * @param fieldName + * @param rowNum + * @param resultSet + * @param columnIndex + * @param handler + * @param context + * @throws java.sql.SQLException + * @throws java.io.IOException + * @throws org.xml.sax.SAXException + */ + @Override + protected void handleClob(String tableName, String fieldName, int rowNum, + ResultSet resultSet, int columnIndex, + ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException { + //no-op for now. + } + + /** + * The jdbc connection to Sqlite does not yet implement blob, have to getBytes(). 
+ * + * @param resultSet resultSet + * @param columnIndex columnIndex for blob + * @return + * @throws java.sql.SQLException + */ + @Override + protected InputStream getInputStreamFromBlob(ResultSet resultSet, int columnIndex, Blob blob, Metadata m) throws SQLException { + return TikaInputStream.get(resultSet.getBytes(columnIndex), m); + } + + @Override + protected void handleInteger(String columnTypeName, ResultSet rs, int columnIndex, + ContentHandler handler) throws SQLException, SAXException { + //As of this writing, with xerial's sqlite jdbc connector, a timestamp is + //stored as a column of type Integer, but the columnTypeName is TIMESTAMP, and the + //value is a string representing a Long. + if (columnTypeName.equals("TIMESTAMP")) { + addAllCharacters(parseDateFromLongString(rs.getString(columnIndex)), handler); + } else { + addAllCharacters(Integer.toString(rs.getInt(columnIndex)), handler); + } + + } + + private String parseDateFromLongString(String longString) throws SAXException { + java.sql.Date d = new java.sql.Date(Long.parseLong(longString)); + return dateFormat.format(d); + + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016 @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +#org.apache.tika.parser.jdbc.SQLite3DBParser +org.apache.tika.parser.jdbc.SQLite3Parser Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,356 @@ +package org.apache.tika.parser.jdbc; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.TikaTest; +import org.apache.tika.extractor.EmbeddedResourceHandler; +import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Database; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class SQLite3ParserTest extends TikaTest { + private final static String TEST_FILE_NAME = "testSqlite3b.db"; + private final static String TEST_FILE1 = "/test-documents/" + TEST_FILE_NAME; + + @Test + public void testBasic() throws Exception { + Parser p = new AutoDetectParser(); + + //test different types of input streams + //actual inputstream, memory buffered bytearray and literal file + InputStream[] streams = new InputStream[3]; + streams[0] = getResourceAsStream(TEST_FILE1); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + IOUtils.copy(getResourceAsStream(TEST_FILE1), bos); + streams[1] = new ByteArrayInputStream(bos.toByteArray()); + streams[2] = TikaInputStream.get(getResourceAsFile(TEST_FILE1)); + int tests = 0; + for (InputStream stream : streams) { + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); + //1) getXML closes the stream + //2) getXML 
runs recursively on the contents, so the embedded docs should show up + XMLResult result = getXML(stream, p, metadata); + String x = result.xml; + //first table name + assertContains("<table name=\"my_table1\"><thead><tr>\t<th>INT_COL</th>", x); + //non-ascii + assertContains("<td>æ®ææ¯é¡¿å¤§å¦</td>", x); + //boolean + assertContains("<td>true</td>\t<td>2015-01-02</td>", x); + //date test + assertContains("2015-01-04", x); + //timestamp test + assertContains("2015-01-03 15:17:03", x); + //first embedded doc's image tag + assertContains("alt=\"image1.png\"", x); + //second embedded doc's image tag + assertContains("alt=\"A description...\"", x); + //second table name + assertContains("<table name=\"my_table2\"><thead><tr>\t<th>INT_COL2</th>", x); + + Metadata post = result.metadata; + String[] tableNames = post.getValues(Database.TABLE_NAME); + assertEquals(2, tableNames.length); + assertEquals("my_table1", tableNames[0]); + assertEquals("my_table2", tableNames[1]); + tests++; + } + assertEquals(3, tests); + } + + //make sure that table cells and rows are properly marked to + //yield \t and \n at the appropriate places + @Test + public void testSpacesInBodyContentHandler() throws Exception { + Parser p = new AutoDetectParser(); + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); + ContentHandler handler = new BodyContentHandler(-1); + ParseContext ctx = new ParseContext(); + ctx.set(Parser.class, p); + try (InputStream stream = getResourceAsStream(TEST_FILE1)) { + p.parse(stream, handler, metadata, ctx); + } + String s = handler.toString(); + assertContains("0\t2.3\t2.4\tlorem", s); + assertContains("tempor\n", s); + } + + //test what happens if the user forgets to pass in a parser via context + //to handle embedded documents + @Test + public void testNotAddingEmbeddedParserToParseContext() throws Exception { + Parser p = new AutoDetectParser(); + + InputStream is = getResourceAsStream(TEST_FILE1); + Metadata metadata = 
new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); + ContentHandler handler = new ToXMLContentHandler(); + p.parse(is, handler, metadata, new ParseContext()); + String xml = handler.toString(); + //just includes headers for embedded documents + assertContains("<table name=\"my_table1\"><thead><tr>", xml); + assertContains("<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>", xml); + //but no other content + assertNotContained("dog", xml); + assertNotContained("alt=\"image1.png\"", xml); + //second embedded doc's image tag + assertNotContained("alt=\"A description...\"", xml); + } + + @Test + public void testRecursiveParserWrapper() throws Exception { + Parser p = new AutoDetectParser(); + + RecursiveParserWrapper wrapper = + new RecursiveParserWrapper(p, new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1)); + InputStream is = getResourceAsStream(TEST_FILE1); + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); + wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext()); + List<Metadata> metadataList = wrapper.getMetadata(); + int i = 0; + assertEquals(5, metadataList.size()); + //make sure the \t are inserted in a body handler + + String table = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + assertContains("0\t2.3\t2.4\tlorem", table); + assertContains("æ®ææ¯é¡¿å¤§å¦", table); + + //make sure the \n is inserted + String table2 = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + assertContains("do eiusmod tempor\n", table2); + + assertContains("The quick brown fox", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("The quick brown fox", metadataList.get(4).get(RecursiveParserWrapper.TIKA_CONTENT)); + + //confirm .doc was added to blob + assertEquals("/BYTES_COL_0.doc/image1.png", 
metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + } + + @Test + public void testParserContainerExtractor() throws Exception { + //There should be 6 embedded documents: + //2x tables -- UTF-8 csv representations of the tables + //2x word files, one doc and one docx + //2x png files, the same image embedded in each of the doc and docx + + ParserContainerExtractor ex = new ParserContainerExtractor(); + ByteCopyingHandler byteCopier = new ByteCopyingHandler(); + InputStream is = getResourceAsStream(TEST_FILE1); + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); + ex.extract(TikaInputStream.get(is), ex, byteCopier); + + assertEquals(4, byteCopier.bytes.size()); + String[] strings = new String[4]; + for (int i = 1; i < byteCopier.bytes.size(); i++) { + byte[] byteArr = byteCopier.bytes.get(i); + String s = new String(byteArr, 0, Math.min(byteArr.length, 1000), UTF_8); + strings[i] = s; + } + byte[] oleBytes = new byte[]{ + (byte) -48, + (byte) -49, + (byte) 17, + (byte) -32, + (byte) -95, + (byte) -79, + (byte) 26, + (byte) -31, + (byte) 0, + (byte) 0, + }; + //test OLE + for (int i = 0; i < 10; i++) { + assertEquals(oleBytes[i], byteCopier.bytes.get(0)[i]); + } + assertContains("PNG", strings[1]); + assertContains("PK", strings[2]); + assertContains("PNG", strings[3]); + } + + //This confirms that reading the stream twice is not + //quadrupling the number of attachments. 
+ @Test + public void testInputStreamReset() throws Exception { + //There should be 8 embedded documents: + //4x word files, two docs and two docxs + //4x png files, the same image embedded in each of the doc and docx + + ParserContainerExtractor ex = new ParserContainerExtractor(); + InputStreamResettingHandler byteCopier = new InputStreamResettingHandler(); + InputStream is = getResourceAsStream(TEST_FILE1); + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); + ex.extract(TikaInputStream.get(is), ex, byteCopier); + is.reset(); + assertEquals(8, byteCopier.bytes.size()); + } + + + public static class InputStreamResettingHandler implements EmbeddedResourceHandler { + + public List<byte[]> bytes = new ArrayList<byte[]>(); + + @Override + public void handle(String filename, MediaType mediaType, + InputStream stream) { + ByteArrayOutputStream os = new ByteArrayOutputStream(); + if (!stream.markSupported()) { + stream = TikaInputStream.get(stream); + } + stream.mark(1000000); + try { + IOUtils.copy(stream, os); + bytes.add(os.toByteArray()); + stream.reset(); + //now try again + os.reset(); + IOUtils.copy(stream, os); + bytes.add(os.toByteArray()); + stream.reset(); + } catch (IOException e) { + //swallow + } + } + } + + //code used for creating the test file +/* + private Connection getConnection(String dbFileName) throws Exception { + File testDirectory = new File(this.getClass().getResource("/test-documents").toURI()); + System.out.println("Writing to: " + testDirectory.getAbsolutePath()); + File testDB = new File(testDirectory, dbFileName); + Connection c = null; + try { + Class.forName("org.sqlite.JDBC"); + c = DriverManager.getConnection("jdbc:sqlite:" + testDB.getAbsolutePath()); + } catch ( Exception e ) { + System.err.println( e.getClass().getName() + ": " + e.getMessage() ); + System.exit(0); + } + return c; + } + + @Test + public void testCreateDB() throws Exception { + Connection c = 
getConnection("testSQLLite3b.db"); + Statement st = c.createStatement(); + String sql = "DROP TABLE if exists my_table1"; + st.execute(sql); + sql = "CREATE TABLE my_table1 (" + + "INT_COL INT PRIMARY KEY, "+ + "FLOAT_COL FLOAT, " + + "DOUBLE_COL DOUBLE, " + + "CHAR_COL CHAR(30), "+ + "VARCHAR_COL VARCHAR(30), "+ + "BOOLEAN_COL BOOLEAN,"+ + "DATE_COL DATE,"+ + "TIME_STAMP_COL TIMESTAMP,"+ + "BYTES_COL BYTES" + + ")"; + st.execute(sql); + sql = "insert into my_table1 (INT_COL, FLOAT_COL, DOUBLE_COL, CHAR_COL, " + + "VARCHAR_COL, BOOLEAN_COL, DATE_COL, TIME_STAMP_COL, BYTES_COL) " + + "values (?,?,?,?,?,?,?,?,?)"; + SimpleDateFormat f = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + java.util.Date d = f.parse("2015-01-03 15:17:03"); + System.out.println(d.getTime()); + long d1Long = 1420229823000L;// 2015-01-02 15:17:03 + long d2Long = 1420316223000L;// 2015-01-03 15:17:03 + PreparedStatement ps = c.prepareStatement(sql); + ps.setInt(1, 0); + ps.setFloat(2, 2.3f); + ps.setDouble(3, 2.4d); + ps.setString(4, "lorem"); + ps.setString(5, "æ®ææ¯é¡¿å¤§å¦"); + ps.setBoolean(6, true); + ps.setString(7, "2015-01-02"); + ps.setString(8, "2015-01-03 15:17:03"); +// ps.setClob(9, new StringReader(clobString)); + ps.setBytes(9, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.doc")));//contains "quick brown fox" + ps.executeUpdate(); + ps.clearParameters(); + + ps.setInt(1, 1); + ps.setFloat(2, 4.6f); + ps.setDouble(3, 4.8d); + ps.setString(4, "dolor"); + ps.setString(5, "sit"); + ps.setBoolean(6, false); + ps.setString(7, "2015-01-04"); + ps.setString(8, "2015-01-03 15:17:03"); + //ps.setClob(9, new StringReader("consectetur adipiscing elit")); + ps.setBytes(9, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.docx")));//contains "The end!" 
+ + ps.executeUpdate(); + + //build table2 + sql = "DROP TABLE if exists my_table2"; + st.execute(sql); + + sql = "CREATE TABLE my_table2 (" + + "INT_COL2 INT PRIMARY KEY, "+ + "VARCHAR_COL2 VARCHAR(64))"; + st.execute(sql); + sql = "INSERT INTO my_table2 values(0,'sed, do eiusmod tempor')"; + st.execute(sql); + sql = "INSERT INTO my_table2 values(1,'incididunt \nut labore')"; + st.execute(sql); + + c.close(); + } + + private byte[] getByteArray(InputStream is) throws IOException { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + byte[] buff = new byte[1024]; + for (int bytesRead; (bytesRead = is.read(buff)) != -1;) { + bos.write(buff, 0, bytesRead); + } + return bos.toByteArray(); + } + +*/ + + +} Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/pom.xml?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/pom.xml Sat Jan 16 18:23:01 2016 @@ -0,0 +1,56 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. 
See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-ebook-parser-module</artifactId> + <name>Apache Tika e-Book Parser Module</name> + <url>http://tika.apache.org/</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-parser-module</artifactId> + <version>${project.version}</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.epub; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import javax.xml.XMLConstants; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.SAXNotRecognizedException; + +/** + * Parser for EPUB OPS <code>*.html</code> files. 
+ * + * For the time being, assume XHTML (TODO: DTBook) + */ +public class EpubContentParser extends AbstractParser { + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return Collections.emptySet(); // not a top-level parser + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + final XHTMLContentHandler xhtml = + new XHTMLContentHandler(handler,metadata); + + try { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setValidating(false); + factory.setNamespaceAware(true); + try { + factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); + } catch (SAXNotRecognizedException e) { + // TIKA-329: Some XML parsers do not support the secure-processing + // feature, even though it's required by JAXP in Java 5. Ignoring + // the exception is fine here, deployments without this feature + // are inherently vulnerable to XML denial-of-service attacks. 
+ } + SAXParser parser = factory.newSAXParser(); + parser.parse( + new CloseShieldInputStream(stream), + new OfflineContentHandler(xhtml)); + } catch (ParserConfigurationException e) { + throw new TikaException("XML parser configuration error", e); + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.epub; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.xml.DcXMLParser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Epub parser + */ +public class EpubParser extends AbstractParser { + + /** Serial version UID */ + private static final long serialVersionUID = 215176772484050550L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("epub+zip"), + MediaType.application("x-ibooks+zip") + ))); + + private Parser meta = new DcXMLParser(); + + private Parser content = new EpubContentParser(); + + public Parser getMetaParser() { + return meta; + } + + public void setMetaParser(Parser meta) { + this.meta = meta; + } + + public Parser getContentParser() { + return content; + } + + public void setContentParser(Parser content) { + this.content = content; + } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + // Because an EPub file is 
often made up of multiple XHTML files, + // we need explicit control over the start and end of the document + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + ContentHandler childHandler = new EmbeddedContentHandler( + new BodyContentHandler(xhtml)); + + ZipInputStream zip = new ZipInputStream(stream); + ZipEntry entry = zip.getNextEntry(); + while (entry != null) { + if (entry.getName().equals("mimetype")) { + String type = IOUtils.toString(zip, UTF_8); + metadata.set(Metadata.CONTENT_TYPE, type); + } else if (entry.getName().equals("metadata.xml")) { + meta.parse(zip, new DefaultHandler(), metadata, context); + } else if (entry.getName().endsWith(".opf")) { + meta.parse(zip, new DefaultHandler(), metadata, context); + } else if (entry.getName().endsWith(".html") || + entry.getName().endsWith(".xhtml")) { + content.parse(zip, childHandler, metadata, context); + } + entry = zip.getNextEntry(); + } + + // Finish everything + xhtml.endDocument(); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016 @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +org.apache.tika.parser.epub.EpubParser Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.epub;

import static org.junit.Assert.assertEquals;
import static org.apache.tika.TikaTest.assertContains;

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Exercises {@link EpubParser} against the bundled sample publication.
 */
public class EpubParserTest {

    /**
     * Parses {@code testEPUB.epub} and checks the extracted metadata
     * (content type, language, description, publisher) as well as a handful
     * of text fragments that must appear in the extracted body text.
     */
    @Test
    public void testXMLParser() throws Exception {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        EpubParser parser = new EpubParser();

        // Text fragments that must survive extraction, checked in this order.
        String[] expectedFragments = {
            "Plus a simple div",
            "First item",
            "The previous headings were subchapters",
            "Table data"
        };

        try (InputStream epub = EpubParserTest.class.getResourceAsStream(
                "/test-documents/testEPUB.epub")) {
            parser.parse(epub, bodyHandler, meta, new ParseContext());

            assertEquals("application/epub+zip", meta.get(Metadata.CONTENT_TYPE));
            assertEquals("en", meta.get(TikaCoreProperties.LANGUAGE));
            assertEquals("This is an ePub test publication for Tika.",
                    meta.get(TikaCoreProperties.DESCRIPTION));
            assertEquals("Apache", meta.get(TikaCoreProperties.PUBLISHER));

            String text = bodyHandler.toString();
            for (String fragment : expectedFragments) {
                assertContains(fragment, text);
            }
        }
    }

}
Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.ibooks; + +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.epub.EpubParser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class iBooksParserTest { + + @Test + public void testiBooksParser() throws Exception { + try (InputStream input = iBooksParserTest.class.getResourceAsStream( + "/test-documents/testiBooks.ibooks")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new EpubParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals("application/x-ibooks+zip", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("en-GB", + metadata.get(TikaCoreProperties.LANGUAGE)); + assertEquals("iBooks Author v1.0", + metadata.get(TikaCoreProperties.CONTRIBUTOR)); + assertEquals("Apache", + metadata.get(TikaCoreProperties.CREATOR)); + + /* TODO For some reason, the xhtml files in iBooks-style ePub are not parsed properly, and the content comes back empty.git che + String content = handler.toString(); + System.out.println("content="+content); + assertContains("Plus a simple div", content); + assertContains("First item", content); + assertContains("The previous headings were subchapters", content); + assertContains("Table data", content); + assertContains("Lorem ipsum dolor rutur amet", content); + */ + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/pom.xml?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/pom.xml (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/pom.xml Sat Jan 16 18:23:01 2016 @@ -0,0 +1,76 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-journal-parser-module</artifactId> + <name>Apache Tika Journal Parser Module</name> + <url>http://tika.apache.org/</url> + + <properties> + <cxf.version>3.0.3</cxf.version> + </properties> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.cxf</groupId> + <artifactId>cxf-rt-rs-client</artifactId> + 
<version>${cxf.version}</version> + </dependency> + <dependency> + <groupId>org.json</groupId> + <artifactId>json</artifactId> + <version>20140107</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-pdf-parser-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-parser-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,112 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.journal; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Properties; + +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; + +import org.apache.cxf.jaxrs.client.WebClient; +import org.apache.cxf.jaxrs.ext.multipart.Attachment; +import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition; +import org.apache.cxf.jaxrs.ext.multipart.MultipartBody; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; + +public class GrobidRESTParser { + + private static final String GROBID_REST_HOST = "http://localhost:8080"; + + private static final String GROBID_ISALIVE_PATH = "/grobid"; // isalive + // doesn't work + // nfc why + + private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument"; + + private String restHostUrlStr; + + public GrobidRESTParser() { + String restHostUrlStr = null; + try { + restHostUrlStr = readRestUrl(); + } catch (IOException e) { + e.printStackTrace(); + } + + if (restHostUrlStr == null + || (restHostUrlStr != null && restHostUrlStr.equals(""))) { + this.restHostUrlStr = GROBID_REST_HOST; + } else { + this.restHostUrlStr = restHostUrlStr; + } + } + + public void parse(String filePath, ContentHandler handler, Metadata metadata, + ParseContext context) throws FileNotFoundException { + + File pdfFile = new File(filePath); + ContentDisposition cd = new ContentDisposition( + "form-data; name=\"input\"; 
filename=\"" + pdfFile.getName() + "\""); + Attachment att = new Attachment("input", new FileInputStream(pdfFile), cd); + MultipartBody body = new MultipartBody(att); + + Response response = WebClient + .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH) + .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA) + .post(body); + + try { + String resp = response.readEntity(String.class); + Metadata teiMet = new TEIParser().parse(resp); + for (String key : teiMet.names()) { + metadata.add("grobid:header_" + key, teiMet.get(key)); + } + } catch (Exception e) { + e.printStackTrace(); + } + } + + private static String readRestUrl() throws IOException { + Properties grobidProperties = new Properties(); + grobidProperties.load(GrobidRESTParser.class + .getResourceAsStream("GrobidExtractor.properties")); + + return grobidProperties.getProperty("grobid.server.url"); + } + + protected static boolean canRun() { + Response response = null; + + try { + response = WebClient.create(readRestUrl() + GROBID_ISALIVE_PATH) + .accept(MediaType.TEXT_HTML).get(); + String resp = response.readEntity(String.class); + return resp != null && !resp.equals("") && resp.startsWith("<h4>"); + } catch (Exception e) { + e.printStackTrace(); + return false; + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,65 @@ +/** + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.journal; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.pdf.PDFParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class JournalParser extends AbstractParser { + + /** + * Generated serial ID + */ + private static final long serialVersionUID = 4664255544154296438L; + + private static final MediaType TYPE = MediaType.application("pdf"); + + private static final Set<MediaType> SUPPORTED_TYPES = Collections + .singleton(TYPE); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + 
SAXException, TikaException { + TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources()); + File tmpFile = tis.getFile(); + + GrobidRESTParser grobidParser = new GrobidRESTParser(); + grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context); + + PDFParser parser = new PDFParser(); + parser.parse(new FileInputStream(tmpFile), handler, metadata, context); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,893 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.journal; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.tika.metadata.Metadata; +import org.json.JSONArray; +import org.json.JSONObject; +import org.json.XML; + +public class TEIParser { + + public TEIParser() { + } + + public Metadata parse(String source) { + JSONObject obj = XML.toJSONObject(source); + Metadata metadata = new Metadata(); + createGrobidMetadata(source, obj, metadata); + return metadata; + } + + private void createGrobidMetadata(String source, JSONObject obj, + Metadata metadata) { + if (obj != null) { + JSONObject teiHeader = obj.getJSONObject("TEI") + .getJSONObject("teiHeader"); + if (teiHeader.has("text")) { + parseText(teiHeader.getJSONObject("text"), metadata); + } + + if (teiHeader.has("fileDesc")) { + parseFileDesc(teiHeader.getJSONObject("fileDesc"), metadata); + + } + if (teiHeader.has("profileDesc")) { + parseProfileDesc(teiHeader.getJSONObject("profileDesc"), metadata); + } + } + + addStaticMet(source, obj, metadata); + } + + private void addStaticMet(String source, JSONObject obj, Metadata metadata) { + metadata.add("Class", Metadata.class.getName()); + metadata.add("TEIJSONSource", obj.toString()); + metadata.add("TEIXMLSource", source); + } + + private void parseText(JSONObject text, Metadata metadata) { + if (text.has("xml:lang")) { + metadata.add("Language", text.getString("xml:lang")); + } + } + + private void parseFileDesc(JSONObject fileDesc, Metadata metadata) { + if (fileDesc.has("titleStmt")) { + parseTitleStmt(fileDesc.getJSONObject("titleStmt"), metadata); + } + + if (fileDesc.has("sourceDesc")) { + parseSourceDesc(fileDesc.getJSONObject("sourceDesc"), metadata); + } + } + + private void parseTitleStmt(JSONObject titleStmt, Metadata metadata) { + if (titleStmt.has("title")) { + JSONObject title = titleStmt.getJSONObject("title"); + if (title.has("content")) { + metadata.add("Title", title.getString("content")); + } + } + } + + private void 
parseSourceDesc(JSONObject sourceDesc, Metadata metadata) { + if (sourceDesc.has("biblStruct")) { + parseBiblStruct(sourceDesc.getJSONObject("biblStruct"), metadata); + } + } + + private void parseBiblStruct(JSONObject biblStruct, Metadata metadata) { + if (biblStruct.has("analytic") + && biblStruct.get("analytic") instanceof JSONObject) { + JSONObject analytic = biblStruct.getJSONObject("analytic"); + if (analytic.has("author")) { + Object authorObj = analytic.get("author"); + + List<Author> authorList = new ArrayList<Author>(); + if (authorObj instanceof JSONObject) { + parseAuthor((JSONObject) authorObj, authorList); + } else if (authorObj instanceof JSONArray) { + JSONArray authors = (JSONArray) authorObj; + if (authors.length() > 0) { + for (int i = 0; i < authors.length(); i++) { + JSONObject author = authors.getJSONObject(i); + parseAuthor(author, authorList); + } + } + + metadata.add("Address", getMetadataAddresses(authorList)); + metadata.add("Affiliation", getMetadataAffiliations(authorList)); + metadata.add("Authors", getMetadataAuthors(authorList)); + metadata.add("FullAffiliations", + getMetadataFullAffiliations(authorList)); + } + + } + } else { + metadata.add("Error", "Unable to parse: no analytic section in JSON"); + } + + } + + private String getMetadataFullAffiliations(List<Author> authorList) { + List<Affiliation> unique = new ArrayList<Affiliation>(); + StringBuilder metAffils = new StringBuilder(); + + for (Author a : authorList) { + for (Affiliation af : a.getAffiliations()) { + if (!unique.contains(af)) { + unique.add(af); + } + } + } + metAffils.append("["); + for (Affiliation af : unique) { + metAffils.append(af.toString()); + metAffils.append(","); + } + metAffils.append(metAffils.deleteCharAt(metAffils.length() - 1)); + metAffils.append("]"); + return metAffils.toString(); + } + + private String getMetadataAuthors(List<Author> authorList) { + // generates Chris A. Mattmann 1, 2 Daniel J. 
Crichton 1 Nenad Medvidovic 2 + // Steve Hughes 1 + List<Affiliation> unique = new ArrayList<Affiliation>(); + StringBuilder metAuthors = new StringBuilder(); + + for (Author a : authorList) { + for (Affiliation af : a.getAffiliations()) { + if (!unique.contains(af)) { + unique.add(af); + } + } + } + + for (Author a : authorList) { + metAuthors.append(printOrBlank(a.getFirstName())); + metAuthors.append(printOrBlank(a.getMiddleName())); + metAuthors.append(printOrBlank(a.getSurName())); + + StringBuilder affilBuilder = new StringBuilder(); + for (int idx = 0; idx < unique.size(); idx++) { + Affiliation af = unique.get(idx); + if (a.getAffiliations().contains(af)) { + affilBuilder.append((idx + 1)); + affilBuilder.append(","); + } + } + + if (affilBuilder.length() > 0) + affilBuilder.deleteCharAt(affilBuilder.length() - 1); + + metAuthors.append(affilBuilder.toString()); + metAuthors.append(" "); + } + + return metAuthors.toString(); + } + + private String getMetadataAffiliations(List<Author> authorList) { + // generates 1 Jet Propulsion Laboratory California Institute of Technology + // ; 2 Computer Science Department University of Southern California + List<Affiliation> unique = new ArrayList<Affiliation>(); + StringBuilder metAffil = new StringBuilder(); + + for (Author a : authorList) { + for (Affiliation af : a.getAffiliations()) { + if (!unique.contains(af)) { + unique.add(af); + } + } + } + + int count = 1; + for (Affiliation a : unique) { + metAffil.append(count); + metAffil.append(" "); + metAffil.append(a.getOrgName().toString()); + metAffil.deleteCharAt(metAffil.length() - 1); + metAffil.append("; "); + count++; + } + + if (count > 1) { + metAffil.deleteCharAt(metAffil.length() - 1); + metAffil.deleteCharAt(metAffil.length() - 1); + } + + return metAffil.toString(); + } + + private String getMetadataAddresses(List<Author> authorList) { + // generates: "Pasadena, CA 91109, USA Los Angeles, CA 90089, USA", + List<Address> unique = new ArrayList<Address>(); 
+ StringBuilder metAddress = new StringBuilder(); + + for (Author a : authorList) { + for (Affiliation af : a.getAffiliations()) { + if (!unique.contains(af.getAddress())) { + unique.add(af.getAddress()); + } + } + } + + for (Address ad : unique) { + metAddress.append(ad.toString()); + metAddress.append(" "); + } + + return metAddress.toString(); + } + + private void parseAuthor(JSONObject authorObj, List<Author> authorList) { + Author author = new Author(); + + if (authorObj.has("persName")) { + JSONObject persName = authorObj.getJSONObject("persName"); + + if (persName.has("forename")) { + + Object foreNameObj = persName.get("forename"); + + if (foreNameObj instanceof JSONObject) { + parseNamePart((JSONObject) foreNameObj, author); + } else if (foreNameObj instanceof JSONArray) { + JSONArray foreName = persName.getJSONArray("forename"); + + if (foreName.length() > 0) { + for (int i = 0; i < foreName.length(); i++) { + JSONObject namePart = foreName.getJSONObject(i); + parseNamePart(namePart, author); + } + } + } + } + + if (persName.has("surname")) { + author.setSurName(persName.getString("surname")); + } + + if (authorObj.has("affiliation")) { + parseAffiliation(authorObj.get("affiliation"), author); + } + + } + + authorList.add(author); + } + + private void parseNamePart(JSONObject namePart, Author author) { + if (namePart.has("type") && namePart.has("content")) { + String type = namePart.getString("type"); + String content = namePart.getString("content"); + + if (type.equals("first")) { + author.setFirstName(content); + } + + if (type.equals("middle")) { + author.setMiddleName(content); + } + } + } + + private void parseAffiliation(Object affiliationJSON, Author author) { + if (affiliationJSON instanceof JSONObject) { + parseOneAffiliation((JSONObject) affiliationJSON, author); + } else if (affiliationJSON instanceof JSONArray) { + JSONArray affiliationArray = (JSONArray) affiliationJSON; + if (affiliationArray != null && affiliationArray.length() > 0) { + for 
(int i = 0; i < affiliationArray.length(); i++) { + JSONObject affiliationObj = affiliationArray.getJSONObject(i); + parseOneAffiliation(affiliationObj, author); + } + } + } + } + + private void parseOneAffiliation(JSONObject affiliationObj, Author author) { + + Affiliation affiliation = new Affiliation(); + if (affiliationObj.has("address")) { + parseAddress(affiliationObj.getJSONObject("address"), affiliation); + } + + if (affiliationObj.has("orgName")) { + OrgName orgName = new OrgName(); + Object orgObject = affiliationObj.get("orgName"); + if (orgObject instanceof JSONObject) { + parseOrgName((JSONObject) orgObject, orgName); + } else if (orgObject instanceof JSONArray) { + JSONArray orgNames = (JSONArray) orgObject; + if (orgNames != null && orgNames.length() > 0) { + for (int i = 0; i < orgNames.length(); i++) { + parseOrgName(orgNames.getJSONObject(i), orgName); + } + } + + affiliation.setOrgName(orgName); + } + + } + + author.getAffiliations().add(affiliation); + } + + private void parseAddress(JSONObject addressObj, Affiliation affiliation) { + Address address = new Address(); + + if (addressObj.has("region")) { + address.setRegion(addressObj.getString("region")); + } + + if (addressObj.has("postCode")) { + address.setPostCode(JSONObject.valueToString(addressObj.get("postCode"))); + } + + if (addressObj.has("settlement")) { + address.setSettlment(addressObj.getString("settlement")); + } + + if (addressObj.has("country")) { + Country country = new Country(); + Object countryObj = addressObj.get("country"); + + if (countryObj instanceof JSONObject) { + JSONObject countryJson = addressObj.getJSONObject("country"); + + if (countryJson.has("content")) { + country.setContent(countryJson.getString("content")); + } + + if (countryJson.has("key")) { + country.setKey(countryJson.getString("key")); + } + } else if (countryObj instanceof String) { + country.setContent((String) countryObj); + } + address.setCountry(country); + } + + affiliation.setAddress(address); + 
} + + private void parseOrgName(JSONObject orgObj, OrgName orgName) { + OrgTypeName typeName = new OrgTypeName(); + if (orgObj.has("content")) { + typeName.setName(orgObj.getString("content")); + } + + if (orgObj.has("type")) { + typeName.setType(orgObj.getString("type")); + } + + orgName.getTypeNames().add(typeName); + } + + private void parseProfileDesc(JSONObject profileDesc, Metadata metadata) { + if (profileDesc.has("abstract")) { + if (profileDesc.has("p")) { + metadata.add("Abstract", profileDesc.getString("p")); + } + } + + if (profileDesc.has("textClass")) { + JSONObject textClass = profileDesc.getJSONObject("textClass"); + + if (textClass.has("keywords")) { + Object keywordsObj = textClass.get("keywords"); + // test AJ15.pdf + if (keywordsObj instanceof String) { + metadata.add("Keyword", (String) keywordsObj); + } else if (keywordsObj instanceof JSONObject) { + JSONObject keywords = textClass.getJSONObject("keywords"); + if (keywords.has("term")) { + JSONArray termArr = keywords.getJSONArray("term"); + for (int i = 0; i < termArr.length(); i++) { + metadata.add("Keyword", JSONObject.valueToString(termArr.get(i))); + } + } + } + + } + } + + } + + private String printOrBlank(String val) { + if (val != null && !val.equals("")) { + return val + " "; + } else + return " "; + } + + class Author { + + private String surName; + + private String middleName; + + private String firstName; + + private List<Affiliation> affiliations; + + public Author() { + this.surName = null; + this.middleName = null; + this.firstName = null; + this.affiliations = new ArrayList<Affiliation>(); + } + + /** + * @return the surName + */ + public String getSurName() { + return surName; + } + + /** + * @param surName + * the surName to set + */ + public void setSurName(String surName) { + this.surName = surName; + } + + /** + * @return the middleName + */ + public String getMiddleName() { + return middleName; + } + + /** + * @param middleName + * the middleName to set + */ + public void 
setMiddleName(String middleName) { + this.middleName = middleName; + } + + /** + * @return the firstName + */ + public String getFirstName() { + return firstName; + } + + /** + * @param firstName + * the firstName to set + */ + public void setFirstName(String firstName) { + this.firstName = firstName; + } + + /** + * @return the affiliations + */ + public List<Affiliation> getAffiliations() { + return affiliations; + } + + /** + * @param affiliations + * the affiliations to set + */ + public void setAffiliations(List<Affiliation> affiliations) { + this.affiliations = affiliations; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return "Author [surName=" + surName + ", middleName=" + middleName != null ? middleName + : "" + ", firstName=" + firstName + ", affiliations=" + affiliations + + "]"; + } + + } + + class Affiliation { + + private OrgName orgName; + + private Address address; + + public Affiliation() { + this.orgName = new OrgName(); + this.address = new Address(); + } + + /** + * @return the orgName + */ + public OrgName getOrgName() { + return orgName; + } + + /** + * @param orgName + * the orgName to set + */ + public void setOrgName(OrgName orgName) { + this.orgName = orgName; + } + + /** + * @return the address + */ + public Address getAddress() { + return address; + } + + /** + * @param address + * the address to set + */ + public void setAddress(Address address) { + this.address = address; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + Affiliation otherA = (Affiliation) obj; + return this.getAddress().equals(otherA.getAddress()) + && this.getOrgName().equals(otherA.getOrgName()); + + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return "Affiliation {orgName=" + orgName + ", address=" + address + "}"; + } + + } + + class 
OrgName { + private List<OrgTypeName> typeNames; + + public OrgName() { + this.typeNames = new ArrayList<OrgTypeName>(); + } + + /** + * @return the typeNames + */ + public List<OrgTypeName> getTypeNames() { + return typeNames; + } + + /** + * @param typeNames + * the typeNames to set + */ + public void setTypeNames(List<OrgTypeName> typeNames) { + this.typeNames = typeNames; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + for (OrgTypeName on : this.typeNames) { + builder.append(on.getName()); + builder.append(" "); + } + return builder.toString(); + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + OrgName otherA = (OrgName) obj; + + if (otherA.getTypeNames() != null) { + if (this.typeNames == null) { + return false; + } else { + return this.typeNames.size() == otherA.getTypeNames().size(); + } + } else { + if (this.typeNames == null) { + return true; + } else + return false; + } + + } + + } + + class OrgTypeName { + private String name; + private String type; + + public OrgTypeName() { + this.name = null; + this.type = null; + } + + /** + * @return the name + */ + public String getName() { + return name; + } + + /** + * @param name + * the name to set + */ + public void setName(String name) { + this.name = name; + } + + /** + * @return the type + */ + public String getType() { + return type; + } + + /** + * @param type + * the type to set + */ + public void setType(String type) { + this.type = type; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + OrgTypeName otherOrgName = (OrgTypeName) obj; + return this.type.equals(otherOrgName.getType()) + && this.name.equals(otherOrgName.getName()); + } + + } + + private class Address { + + private String region; + private String 
postCode; + private String settlment; + private Country country; + + public Address() { + this.region = null; + this.postCode = null; + this.settlment = null; + this.country = new Country(); + } + + /** + * @return the region + */ + public String getRegion() { + return region; + } + + /** + * @param region + * the region to set + */ + public void setRegion(String region) { + this.region = region; + } + + /** + * @return the postCode + */ + public String getPostCode() { + return postCode; + } + + /** + * @param postCode + * the postCode to set + */ + public void setPostCode(String postCode) { + this.postCode = postCode; + } + + /** + * @return the settlment + */ + public String getSettlment() { + return settlment; + } + + /** + * @param settlment + * the settlment to set + */ + public void setSettlment(String settlment) { + this.settlment = settlment; + } + + /** + * @return the country + */ + public Country getCountry() { + return country; + } + + /** + * @param country + * the country to set + */ + public void setCountry(Country country) { + this.country = country; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + Address otherA = (Address) obj; + if (this.settlment == null) { + return otherA.getSettlment() == null; + } else if (this.country == null) { + return otherA.getCountry() == null; + } else if (this.postCode == null) { + return otherA.getPostCode() == null; + } else if (this.region == null) { + return otherA.getRegion() == null; + } + + return this.settlment.equals(otherA.getSettlment()) + && this.country.equals(otherA.getCountry()) + && this.postCode.equals(otherA.getPostCode()) + && this.region.equals(otherA.getRegion()); + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append(settlment); + builder.append(", "); + builder.append(region); + 
builder.append(" "); + builder.append(postCode); + builder.append(" "); + builder.append(country.getContent()); + return builder.toString(); + } + } + + private class Country { + private String key; + private String content; + + public Country() { + this.key = null; + this.content = null; + } + + /** + * @return the key + */ + public String getKey() { + return key; + } + + /** + * @param key + * the key to set + */ + public void setKey(String key) { + this.key = key; + } + + /** + * @return the content + */ + public String getContent() { + return content; + } + + /** + * @param content + * the content to set + */ + public void setContent(String content) { + this.content = content; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + Country otherC = (Country) obj; + + if (this.key == null) { + if (otherC.getKey() != null) { + return false; + } else { + if (this.content == null) { + if (otherC.getContent() != null) { + return false; + } else { + return true; + } + } else { + return content.equals(otherC.getContent()); + } + } + } else { + if (this.content == null) { + if (otherC.getContent() != null) { + return false; + } else { + return this.key.equals(otherC.getKey()); + } + } else { + return this.key.equals(otherC.getKey()) + && this.content.equals(otherC.getContent()); + } + } + } + + } +} Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016 @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#org.apache.tika.parser.journal.GrobidRESTParser +org.apache.tika.parser.journal.JournalParser +#org.apache.tika.parser.journal.TEIParser Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties (added) +++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties Sat Jan 16 18:23:01 2016 @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +grobid.server.url=http://localhost:8080 Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.journal; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.fail; +import static org.junit.Assume.assumeTrue; +import static org.apache.tika.parser.journal.GrobidRESTParser.canRun; +import java.io.InputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class JournalParserTest { + + @Test + public void testJournalParser() { + String path = "/test-documents/testJournalParser.pdf"; + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + assumeTrue(canRun()); + + InputStream stream = JournalParserTest.class.getResourceAsStream(path); + JournalParser jParser = new JournalParser(); + try { + jParser.parse(stream, handler, metadata, new ParseContext()); + } catch (Exception e){ + e.printStackTrace(); + fail(e.getMessage()); + } + + assertNotNull(metadata.get("grobid:header_Title")); + } +} Modified: tika/branches/2.x/tika-parser-modules/tika-multimedia-parser-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-parser-module/pom.xml?rev=1725014&r1=1725011&r2=1725014&view=diff ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-parser-module/pom.xml (original) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-parser-module/pom.xml Sat Jan 16 18:23:01 
2016 @@ -19,8 +19,8 @@ <version>2.0-SNAPSHOT</version> </parent> - <artifactId>tika-multimedia-module</artifactId> - <name>Apache Tika Multimedia Module</name> + <artifactId>tika-multimedia-parser-module</artifactId> + <name>Apache Tika Multimedia Parser Module</name> <url>http://tika.apache.org/</url> <properties> @@ -105,19 +105,19 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-web-module</artifactId> + <artifactId>tika-web-parser-module</artifactId> <version>${project.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-pdf-module</artifactId> + <artifactId>tika-pdf-parser-module</artifactId> <version>${project.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-office-module</artifactId> + <artifactId>tika-office-parser-module</artifactId> <version>${project.version}</version> <scope>test</scope> </dependency>
