This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4187
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 73694d21ab19e7e1134ee4f2bf8b76e8c35387bf
Author: tallison
AuthorDate: Tue Jan 30 12:22:59 2024 -0500
TIKA-4187 -- improve detection of sqlite3 based files and add metadata
extraction
---
CHANGES.txt| 2 +
.../org/apache/tika/mime/tika-mimetypes.xml| 105 -
.../tika/parser/sqlite3/SQLite3DBParser.java | 32 +++
.../apache/tika/parser/sqlite3/SQLite3Parser.java | 15 +++
.../tika/parser/sqlite3/SQLite3ParserTest.java | 4 +
.../apache/tika/parser/jdbc/AbstractDBParser.java | 14 +++
6 files changed, 171 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index f9ac540e6..163753e9b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -21,6 +21,8 @@ Release 3.0.0-BETA - 12/01/2023
Other Changes/Updates
+ * Improve detection of sqlite3-based file formats (TIKA-4187).
+
* Upgrade PDFBox to 3.0.1 (TIKA-3347)
* Deprecated AbstractParser for removal in 4.x (TIKA-4132).
diff --git
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 54f7cc6f6..2930fa720 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4858,11 +4858,114 @@
+
-
+
+
+https://www.geopackage.org/
+
+
+
+
+
+
+
+
+
+
+
+https://www.geopackage.org/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
<_comment>Stata DTA Script
DO
diff --git
a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
index f4c9d745e..947272a0a 100644
---
a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
+++
b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
@@ -27,6 +27,7 @@ import java.sql.SQLException;
import java.sql.Statement;
import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import org.sqlite.SQLiteConfig;
@@ -34,6 +35,7 @@ import org.sqlite.SQLiteConfig;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.jdbc.AbstractDBParser;
@@ -48,6 +50,12 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
class SQLite3DBParser extends AbstractDBParser {
protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
+
+protected static final Map METADATA_KEYS = Map.of(
+SQLite3Parser.SQLITE_APPLICATION_ID, "select application_id from
pragma_application_id",
+SQLite3Parser.SQLITE_USER_VERSION, "select user_version from
pragma_user_version"
+);
+
//If the InputStream wasn't a TikaInputStream, copy to this tmp file
Path tmpFile = null;
@@ -144,4 +152,28 @@ class SQLite3DBParser extends AbstractDBParser {
EmbeddedDocumentUtil
embeddedDocumentUtil) {
return new SQLite3TableReader(connection, tableName,
embeddedDocumentUtil);
}
+
+@Override
+protected void extractMetadata(Connection connection, Metadata metadata) {
+//TODO -- figure out how to get the version of sqlite3 that last
modified this file and
+// version-valid-for.
+// version-valid-for is at offset 92, last modified by app version
is at offset 96 --
+// not clear how to get this info via sql
+//'file' extracts this info; we should too :\
+//See: https://www.sqlite.org/fileformat.html
+for (Map.Entry e : METADATA_KEYS.entrySet()) {
+try (Statement st = connection.createStatement()) {
+try (ResultSet rs =