This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new 7eb1a4225 TIKA-3930 -- Add multivalued field strategy option in jdbc-emitter 7eb1a4225 is described below commit 7eb1a422597ebe7089dcbd6c55719390e8beca8c Author: tallison <talli...@apache.org> AuthorDate: Thu Nov 17 10:04:26 2022 -0500 TIKA-3930 -- Add multivalued field strategy option in jdbc-emitter --- CHANGES.txt | 5 +- tika-pipes/pom.xml | 15 +++- .../tika/pipes/emitter/jdbc/JDBCEmitter.java | 94 +++++++++++++++++++--- .../tika/pipes/emitter/jdbc/JDBCEmitterTest.java | 38 +++++++++ .../tika-config-jdbc-emitter-multivalued.xml | 47 +++++++++++ tika-pipes/tika-fetchers/pom.xml | 3 + 6 files changed, 188 insertions(+), 14 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index c7995493b..e9b58ee10 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,6 +1,9 @@ Release 2.6.1 - ??? - * Downgraded logging in PipesClient for each parse from info to debug. + * Add multivalued field strategy option in jdbc-emitter (TIKA-3930). + Default is now 'concatenate' with ', ' as the delimiter. + + * Downgrade logging in PipesClient for each parse from info to debug. 
Release 2.6.0 - 11/3/2022 diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml index 1b4eb9f66..2f0f833cf 100644 --- a/tika-pipes/pom.xml +++ b/tika-pipes/pom.xml @@ -37,7 +37,20 @@ <module>tika-pipes-reporters</module> <module>tika-async-cli</module> </modules> - + <dependencies> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <version>${log4j2.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j2-impl</artifactId> + <version>${log4j2.version}</version> + <scope>test</scope> + </dependency> + </dependencies> <build> <plugins> <plugin> diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java index ccd66445a..ce6fc79c8 100644 --- a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java +++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java @@ -66,6 +66,15 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close private static final Logger LOGGER = LoggerFactory.getLogger(JDBCEmitter.class); + public enum AttachmentStrategy { + FIRST_ONLY, ALL + //anything else? + } + + public enum MultivaluedFieldStrategy { + FIRST_ONLY, CONCATENATE + //anything else? + } //some file formats do not have time zones... 
//try both private static final String[] TIKA_DATE_PATTERNS = new String[] { @@ -87,6 +96,11 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close private PreparedStatement insertStatement; private AttachmentStrategy attachmentStrategy = AttachmentStrategy.FIRST_ONLY; + private MultivaluedFieldStrategy multivaluedFieldStrategy = + MultivaluedFieldStrategy.CONCATENATE; + + private String multivaluedFieldDelimiter = ", "; + //emitters are run in a single thread. If we ever start running them //multithreaded, this will be a big problem. private final DateFormat[] dateFormats; @@ -124,6 +138,44 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close this.connectionString = connectionString; } + /** + * This applies to fields of type 'string' or 'varchar'. If there's + * a multivalued field in a metadata object, do you want the first value only + * or should we concatenate these with the + * {@link JDBCEmitter#setMultivaluedFieldDelimiter(String)}. + * + * The default values as of 2.6.1 are {@link MultivaluedFieldStrategy#CONCATENATE} + * and the default delimiter is ", " + * + * @param strategy + * @throws TikaConfigException + */ + @Field + public void setMultivaluedFieldStrategy(String strategy) throws TikaConfigException { + String lc = strategy.toLowerCase(Locale.US); + if (lc.equals("first_only")) { + setMultivaluedFieldStrategy(MultivaluedFieldStrategy.FIRST_ONLY); + } else if (lc.equals("concatenate")) { + setMultivaluedFieldStrategy(MultivaluedFieldStrategy.CONCATENATE); + } else { + throw new TikaConfigException("I'm sorry, I only recogize 'first_only' and " + + "'concatenate'. 
I don't mind '" + strategy + "'"); + } + } + + public void setMultivaluedFieldStrategy(MultivaluedFieldStrategy multivaluedFieldStrategy) { + this.multivaluedFieldStrategy = multivaluedFieldStrategy; + } + + /** + * See {@link JDBCEmitter#setMultivaluedFieldStrategy(String)} + * @param delimiter + */ + @Field + public void setMultivaluedFieldDelimiter(String delimiter) { + this.multivaluedFieldDelimiter = delimiter; + } + /** * The implementation of keys should be a LinkedHashMap because * order matters! @@ -261,10 +313,8 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close private void updateValue(PreparedStatement insertStatement, int i, String key, String type, int metadataListIndex, List<Metadata> metadataList) throws SQLException { - //for now we're only taking the info from the container document. Metadata metadata = metadataList.get(metadataListIndex); - String val = metadata.get(key); - + String val = getVal(metadata, key, type); String lcType = type.toLowerCase(Locale.US); if (lcType.startsWith("varchar")) { updateVarchar(lcType, insertStatement, i, val); @@ -301,6 +351,35 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close } } + private String getVal(Metadata metadata, String key, String type) { + if (! type.equals("string") && ! 
type.startsWith("varchar")) { + return metadata.get(key); + } + if (multivaluedFieldStrategy == MultivaluedFieldStrategy.FIRST_ONLY) { + return metadata.get(key); + } + String[] vals = metadata.getValues(key); + if (vals.length == 0) { + return null; + } else if (vals.length == 1) { + return vals[0]; + } + + int i = 0; + StringBuilder sb = new StringBuilder(); + for (String val : metadata.getValues(key)) { + if (StringUtils.isBlank(val)) { + continue; + } + if (i > 0) { + sb.append(multivaluedFieldDelimiter); + } + sb.append(val); + i++; + } + return sb.toString(); + } + private void updateDouble(PreparedStatement insertStatement, int i, String val) throws SQLException { if (StringUtils.isBlank(val)) { insertStatement.setNull(i, Types.DOUBLE); @@ -451,13 +530,4 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close } } - /* - TODO: This is currently not ever called. We need rework the PipesParser - to ensure that emitters are closed cleanly. - */ - - public enum AttachmentStrategy { - FIRST_ONLY, ALL - //anything else? 
- } } diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java index 2fd5148f7..873c885fd 100644 --- a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java +++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java @@ -178,6 +178,44 @@ public class JDBCEmitterTest { } } + @Test + public void testMultiValuedFields(@TempDir Path tmpDir) throws Exception { + Files.createDirectories(tmpDir.resolve("db")); + Path dbDir = tmpDir.resolve("db/h2"); + Path config = tmpDir.resolve("tika-config.xml"); + String connectionString = "jdbc:h2:file:" + dbDir.toAbsolutePath(); + + writeConfig("/configs/tika-config-jdbc-emitter-multivalued.xml", + connectionString, config); + + EmitterManager emitterManager = EmitterManager.load(config); + Emitter emitter = emitterManager.getEmitter(); + List<Metadata> data = new ArrayList<>(); + Metadata m = new Metadata(); + m.add("k1", "first"); + m.add("k1", "second"); + m.add("k1", "third"); + m.add("k1", "fourth"); + data.add(m); + emitter.emit("id0", data); + + String expected = "first, second, third, fourth"; + int rows = 0; + try (Connection connection = DriverManager.getConnection(connectionString)) { + try (Statement st = connection.createStatement()) { + try (ResultSet rs = st.executeQuery("select * from test")) { + assertEquals("path", rs.getMetaData().getColumnName(1).toLowerCase(Locale.US)); + while (rs.next()) { + assertEquals("id0", rs.getString(1)); + assertEquals(expected, rs.getString(2)); + rows++; + } + } + } + } + assertEquals(1, rows); + } + private void writeConfig(String srcConfig, String dbDir, Path config) throws IOException { String xml = IOUtils.resourceToString(srcConfig, StandardCharsets.UTF_8); xml = 
xml.replace("CONNECTION_STRING", dbDir); diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml new file mode 100644 index 000000000..a46e145f0 --- /dev/null +++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<properties> + <emitters> + <emitter class="org.apache.tika.pipes.emitter.jdbc.JDBCEmitter"> + <params> + <name>jdbc</name> + <connection>CONNECTION_STRING</connection> + <createTable>create table test + (path varchar(512) primary key, + k1 varchar(512)); + </createTable> + <!-- the jdbc emitter always puts the emitKey value as the first + item --> + <insert>insert into test (path, k1) values (?,?); + </insert> + <!-- these are the keys in the metadata object. + The emitKey is added as the first element in the insert statement. + Then these values are added in order. + They must be in the order of the insert statement. 
+ --> + <keys> + <key k="k1" v="varchar(512)"/> + </keys> + <multivaluedFieldStrategy>concatenate</multivaluedFieldStrategy> + <multivaluedFieldDelimiter>, </multivaluedFieldDelimiter> + </params> + </emitter> + </emitters> +</properties> \ No newline at end of file diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml index 958f84b79..bccb2f35a 100644 --- a/tika-pipes/tika-fetchers/pom.xml +++ b/tika-pipes/tika-fetchers/pom.xml @@ -38,6 +38,9 @@ <module>tika-fetcher-az-blob</module> </modules> + <dependencies> + + </dependencies> <scm> <tag>2.2.1-rc2</tag> </scm>