(tika) branch TIKA-4187 created (now 73694d21a)

2024-01-30 Thread tallison
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4187
in repository https://gitbox.apache.org/repos/asf/tika.git


  at 73694d21a TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction

This branch includes the following new commits:

 new 73694d21a TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Revisions
listed as "add" were already present in the repository and have only
been added to this reference.




(tika) 01/01: TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction

2024-01-30 Thread tallison
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4187
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 73694d21ab19e7e1134ee4f2bf8b76e8c35387bf
Author: tallison 
AuthorDate: Tue Jan 30 12:22:59 2024 -0500

TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction
---
 CHANGES.txt|   2 +
 .../org/apache/tika/mime/tika-mimetypes.xml| 105 -
 .../tika/parser/sqlite3/SQLite3DBParser.java   |  32 +++
 .../apache/tika/parser/sqlite3/SQLite3Parser.java  |  15 +++
 .../tika/parser/sqlite3/SQLite3ParserTest.java |   4 +
 .../apache/tika/parser/jdbc/AbstractDBParser.java  |  14 +++
 6 files changed, 171 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index f9ac540e6..163753e9b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -21,6 +21,8 @@ Release 3.0.0-BETA - 12/01/2023
 
Other Changes/Updates
 
+   * Improve detection of sqlite3-based file formats (TIKA-4187).
+
* Upgrade PDFBox to 3.0.1 (TIKA-3347)

* Deprecated AbstractParser for removal in 4.x (TIKA-4132).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 54f7cc6f6..2930fa720 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4858,11 +4858,114 @@
   
 
   
+
 
   
 
   
-
+  
+  
+https://www.geopackage.org/
+
+
+  
+
+  
+
+
+
+  
+  
+  
+https://www.geopackage.org/
+
+  
+
+  
+
+
+
+  
+  
+
+  
+
+  
+
+
+  
+  
+
+  
+
+  
+
+
+  
+  
+
+  
+
+  
+
+
+  
+  
+
+  
+
+  
+
+
+  
+  
+
+  
+
+  
+
+
+  
+  
+
+  
+
+  
+
+
+  
+  
+
+  
+
+  
+
+
+  
+  
+
+  
+
+  
+
+
+  
+  
+
+  
+
+  
+
+
+  
+  
+
+  
+
+  
+
+
+  
   
 <_comment>Stata DTA Script
 DO
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
index f4c9d745e..947272a0a 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
@@ -27,6 +27,7 @@ import java.sql.SQLException;
 import java.sql.Statement;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.sqlite.SQLiteConfig;
@@ -34,6 +35,7 @@ import org.sqlite.SQLiteConfig;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.jdbc.AbstractDBParser;
@@ -48,6 +50,12 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
 class SQLite3DBParser extends AbstractDBParser {
 
 protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
+
+protected static final Map METADATA_KEYS = Map.of(
+SQLite3Parser.SQLITE_APPLICATION_ID, "select application_id from pragma_application_id",
+SQLite3Parser.SQLITE_USER_VERSION, "select user_version from pragma_user_version"
+);
+
 //If the InputStream wasn't a TikaInputStream, copy to this tmp file
 Path tmpFile = null;
 
@@ -144,4 +152,28 @@ class SQLite3DBParser extends AbstractDBParser {
  EmbeddedDocumentUtil 
embeddedDocumentUtil) {
 return new SQLite3TableReader(connection, tableName, 
embeddedDocumentUtil);
 }
+
+@Override
+protected void extractMetadata(Connection connection, Metadata metadata) {
+//TODO -- figure out how to get the version of sqlite3 that last modified this file and
+// version-valid-for.
+// version-valid-for is at offset 92, last modified by app version is at offset 96 --
+// not clear how to get this info via sql
+//'file' extracts this info; we should too :\
+//See: https://www.sqlite.org/fileformat.html
+for (Map.Entry e : METADATA_KEYS.entrySet()) {
+try (Statement st = connection.createStatement()) {
+try (ResultSet rs =