This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4198 in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4198 by this push: new 7fad80367 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns 7fad80367 is described below commit 7fad803673b1ae82ba4ff74aad1a9d12e356224d Author: tallison <talli...@apache.org> AuthorDate: Tue Feb 20 06:49:29 2024 -0500 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns --- .../apache/tika/parser/geopkg/GeoPkgDBParser.java | 30 ++++++++-------------- .../apache/tika/parser/geopkg/GeoPkgParser.java | 16 +++++++++--- .../tika/parser/geopkg/GeoPkgTableReader.java | 14 +++++----- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java index 5dc0f9ff2..d4b56127d 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java @@ -16,29 +16,11 @@ */ package org.apache.tika.parser.geopkg; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; import java.sql.Connection; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; import java.util.Set; -import org.sqlite.SQLiteConfig; - import org.apache.tika.extractor.EmbeddedDocumentUtil; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Property; -import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.jdbc.AbstractDBParser; import org.apache.tika.parser.jdbc.JDBCTableReader; import org.apache.tika.parser.sqlite3.SQLite3DBParser; @@ -50,15 +32,23 @@ import org.apache.tika.parser.sqlite3.SQLite3DBParser; */ class GeoPkgDBParser extends SQLite3DBParser { + private final Set<String> ignoreBlobColumns; + + GeoPkgDBParser(Set<String> ignoreBlobColumns) { + this.ignoreBlobColumns = ignoreBlobColumns; + } + @Override public JDBCTableReader getTableReader(Connection connection, String tableName, ParseContext context) { - return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context)); + return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context), + ignoreBlobColumns); } @Override protected JDBCTableReader getTableReader(Connection connection, String tableName, EmbeddedDocumentUtil embeddedDocumentUtil) { - return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil); + return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil, + ignoreBlobColumns); } } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java index 6aae7cb04..907e6de39 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java @@ -20,22 +20,22 @@ package org.apache.tika.parser.geopkg; import java.io.IOException; import java.io.InputStream; import java.util.Collections; +import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.config.Initializable; +import org.apache.tika.config.Field; import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; import org.apache.tika.parser.sqlite3.SQLite3Parser; /** @@ -52,10 +52,13 @@ public class GeoPkgParser extends SQLite3Parser { private static final Set<MediaType> SUPPORTED_TYPES; + static { SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE); } + private static final Set<String> DEFAULT_IGNORE_BLOB_COLUMNS = Set.of("geom", "data"); + private Set<String> ignoreBlobColumns = new HashSet<>(DEFAULT_IGNORE_BLOB_COLUMNS); /** * Checks to see if class is available for org.sqlite.JDBC. * <p/> @@ -73,10 +76,15 @@ public class GeoPkgParser extends SQLite3Parser { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - GeoPkgDBParser p = new GeoPkgDBParser(); + GeoPkgDBParser p = new GeoPkgDBParser(ignoreBlobColumns); p.parse(stream, handler, metadata, context); } + @Field + public void setIgnoreBlobColumns(List<String> ignoreBlobColumns) { + this.ignoreBlobColumns.clear(); + this.ignoreBlobColumns.addAll(ignoreBlobColumns); + } /** * No-op * diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java index e9b093565..48256c2a5 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java @@ -18,11 +18,10 @@ package org.apache.tika.parser.geopkg; import java.io.IOException; -import java.sql.Blob; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; -import javax.sql.rowset.serial.SerialBlob; +import java.util.Set; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; @@ -30,9 +29,7 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.extractor.EmbeddedDocumentUtil; -import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.jdbc.JDBCTableReader; import org.apache.tika.parser.sqlite3.SQLite3TableReader; @@ -45,11 +42,12 @@ import org.apache.tika.parser.sqlite3.SQLite3TableReader; */ class GeoPkgTableReader extends SQLite3TableReader { - private static final String GEOM = "geom"; - private static final String DATA = "data"; + private final Set<String> ignoreBlobColumns; + public GeoPkgTableReader(Connection connection, String tableName, - EmbeddedDocumentUtil embeddedDocumentUtil) { + EmbeddedDocumentUtil embeddedDocumentUtil, Set<String> ignoreBlobColumns) { super(connection, tableName, embeddedDocumentUtil); + this.ignoreBlobColumns = ignoreBlobColumns; } @@ -58,7 +56,7 @@ class GeoPkgTableReader extends SQLite3TableReader { protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet, int columnIndex, ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException { - if (GEOM.equals(columnName) || DATA.equals(columnName)) { + if (ignoreBlobColumns.contains(columnName)) { Attributes attrs = new AttributesImpl(); ((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob"); ((AttributesImpl) attrs)