(tika) branch main updated: TIKA-4215 -- avoid loading all the tika resources just to get the version (#1672)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 85d713a9a TIKA-4215 -- avoid loading all the tika resources just to get the version (#1672) 85d713a9a is described below commit 85d713a9a671d1e8c31bb4a78c830616c0b3eab5 Author: Tim Allison AuthorDate: Thu Mar 21 10:06:57 2024 -0400 TIKA-4215 -- avoid loading all the tika resources just to get the version (#1672) --- tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +- tika-core/src/main/java/org/apache/tika/Tika.java | 4 .../src/main/java/org/apache/tika/server/core/TikaServerProcess.java | 2 +- .../main/java/org/apache/tika/server/core/resource/TikaResource.java | 2 +- .../src/test/java/org/apache/tika/server/core/TikaVersionTest.java| 2 +- .../src/test/java/org/apache/tika/server/core/TikaWelcomeTest.java| 4 ++-- 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 6ae0f8ca7..bd78d4338 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -657,7 +657,7 @@ public class TikaCLI { } private void version() { -System.out.println(new Tika().toString()); +System.out.println(Tika.getString()); } private boolean testForHelp(String[] args) { diff --git a/tika-core/src/main/java/org/apache/tika/Tika.java b/tika-core/src/main/java/org/apache/tika/Tika.java index 601703e43..22811f9c0 100644 --- a/tika-core/src/main/java/org/apache/tika/Tika.java +++ b/tika-core/src/main/java/org/apache/tika/Tika.java @@ -672,6 +672,10 @@ public class Tika { //--< Object > public String toString() { +return getString(); +} + +public static String getString() { String version = null; try (InputStream stream = Tika.class diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java index f5c3cca3a..10fb951e0 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java @@ -120,7 +120,7 @@ public class TikaServerProcess { } public static void main(String[] args) throws Exception { -LOG.info("Starting {} server", new Tika()); +LOG.info("Starting {} server", Tika.getString()); try { Options options = getOptions(); CommandLineParser cliParser = new DefaultParser(); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 857692750..5f0e76ec8 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -91,7 +91,7 @@ import org.apache.tika.utils.ExceptionUtils; public class TikaResource { public static final String GREETING = -"This is Tika Server (" + new Tika().toString() + "). Please PUT\n"; +"This is Tika Server (" + Tika.getString() + "). 
Please PUT\n"; private static final String META_PREFIX = "meta_"; private static final Logger LOG = LoggerFactory.getLogger(TikaResource.class); private static Pattern ALLOWABLE_HEADER_CHARS = Pattern.compile("(?i)^[-/_+\\.A-Z0-9 ]+$"); diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaVersionTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaVersionTest.java index b1a81f230..ed7471f50 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaVersionTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaVersionTest.java @@ -49,7 +49,7 @@ public class TikaVersionTest extends CXFTestBase { WebClient.create(endPoint + VERSION_PATH).type("text/plain").accept("text/plain") .get(); -assertEquals(new Tika().toString(), +assertEquals(Tika.getString(), getStringFromInputStream((InputStream) response.getEntity())); } } diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/s
(tika) branch TIKA-4216 updated (ba2d729af -> 5a4eba49e)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4216 in repository https://gitbox.apache.org/repos/asf/tika.git from ba2d729af TIKA-4216 -- Avoid checking for imagemagick if image processing is disabled add 5a4eba49e TIKA-4216 -- Avoid checking for imagemagick if image processing is disabled -- allow for user specified calls to use imagemagick No new revisions were added by this update. Summary of changes: .../org/apache/tika/parser/ocr/TesseractOCRParser.java | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-)
(tika) branch TIKA-4216 created (now ba2d729af)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4216 in repository https://gitbox.apache.org/repos/asf/tika.git at ba2d729af TIKA-4216 -- Avoid checking for imagemagick if image processing is disabled This branch includes the following new commits: new ba2d729af TIKA-4216 -- Avoid checking for imagemagick if image processing is disabled The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4216 -- Avoid checking for imagemagick if image processing is disabled
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4216 in repository https://gitbox.apache.org/repos/asf/tika.git commit ba2d729af6b3194b9ba81d4041016f6d8e870e99 Author: tallison AuthorDate: Thu Mar 21 09:30:24 2024 -0400 TIKA-4216 -- Avoid checking for imagemagick if image processing is disabled --- .../main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java| 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index a79e05b1d..aa26f4688 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -528,7 +528,11 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements @Override public void initialize(Map params) throws TikaConfigException { hasTesseract = hasTesseract(); -hasImageMagick = hasImageMagick(); +if (isEnableImagePreprocessing()) { +hasImageMagick = hasImageMagick(); +} else { +hasImageMagick = false; +} if (preloadLangs) { preloadLangs(); if (!StringUtils.isBlank(defaultConfig.getLanguage())) {
(tika) branch TIKA-4215 created (now f819cbb43)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4215 in repository https://gitbox.apache.org/repos/asf/tika.git at f819cbb43 TIKA-4215 -- avoid loading all the tika resources just to get the version This branch includes the following new commits: new f819cbb43 TIKA-4215 -- avoid loading all the tika resources just to get the version The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4215 -- avoid loading all the tika resources just to get the version
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4215 in repository https://gitbox.apache.org/repos/asf/tika.git commit f819cbb431646baebc68e07a1771e768ca54a04a Author: tallison AuthorDate: Thu Mar 21 09:25:41 2024 -0400 TIKA-4215 -- avoid loading all the tika resources just to get the version --- tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +- tika-core/src/main/java/org/apache/tika/Tika.java | 4 .../src/main/java/org/apache/tika/server/core/TikaServerProcess.java | 2 +- .../main/java/org/apache/tika/server/core/resource/TikaResource.java | 2 +- .../src/test/java/org/apache/tika/server/core/TikaVersionTest.java| 2 +- .../src/test/java/org/apache/tika/server/core/TikaWelcomeTest.java| 4 ++-- 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 6ae0f8ca7..bd78d4338 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -657,7 +657,7 @@ public class TikaCLI { } private void version() { -System.out.println(new Tika().toString()); +System.out.println(Tika.getString()); } private boolean testForHelp(String[] args) { diff --git a/tika-core/src/main/java/org/apache/tika/Tika.java b/tika-core/src/main/java/org/apache/tika/Tika.java index 601703e43..22811f9c0 100644 --- a/tika-core/src/main/java/org/apache/tika/Tika.java +++ b/tika-core/src/main/java/org/apache/tika/Tika.java @@ -672,6 +672,10 @@ public class Tika { //--< Object > public String toString() { +return getString(); +} + +public static String getString() { String version = null; try (InputStream stream = Tika.class diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java index 
f5c3cca3a..10fb951e0 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java @@ -120,7 +120,7 @@ public class TikaServerProcess { } public static void main(String[] args) throws Exception { -LOG.info("Starting {} server", new Tika()); +LOG.info("Starting {} server", Tika.getString()); try { Options options = getOptions(); CommandLineParser cliParser = new DefaultParser(); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 857692750..5f0e76ec8 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -91,7 +91,7 @@ import org.apache.tika.utils.ExceptionUtils; public class TikaResource { public static final String GREETING = -"This is Tika Server (" + new Tika().toString() + "). Please PUT\n"; +"This is Tika Server (" + Tika.getString() + "). 
Please PUT\n"; private static final String META_PREFIX = "meta_"; private static final Logger LOG = LoggerFactory.getLogger(TikaResource.class); private static Pattern ALLOWABLE_HEADER_CHARS = Pattern.compile("(?i)^[-/_+\\.A-Z0-9 ]+$"); diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaVersionTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaVersionTest.java index b1a81f230..ed7471f50 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaVersionTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaVersionTest.java @@ -49,7 +49,7 @@ public class TikaVersionTest extends CXFTestBase { WebClient.create(endPoint + VERSION_PATH).type("text/plain").accept("text/plain") .get(); -assertEquals(new Tika().toString(), +assertEquals(Tika.getString(), getStringFromInputStream((InputStream) response.getEntity())); } } diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaWelcomeTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaWelcomeTest.java index 3c97d329c..428ec71f0 100644 --- a/tika-server/tika-server-core/src/test/java/org
(tika) branch TIKA-4211 deleted (was 2239ea9a8)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4211 in repository https://gitbox.apache.org/repos/asf/tika.git was 2239ea9a8 TIKA-4211 -- first attempt The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4211 -- first attempt (#1670)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 7dc3d28a5 TIKA-4211 -- first attempt (#1670) 7dc3d28a5 is described below commit 7dc3d28a5574f6e40981dca7666ccb97b9ebe467 Author: Tim Allison AuthorDate: Thu Mar 21 08:42:37 2024 -0400 TIKA-4211 -- first attempt (#1670) --- .../tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java index df566a284..38e9c8aac 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java @@ -344,7 +344,8 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { for (String relation : new String[]{XSLFRelation.VML_DRAWING.getRelation(), XSLFRelation.SLIDE_LAYOUT.getRelation(), XSLFRelation.NOTES_MASTER.getRelation(), -XSLFRelation.NOTES.getRelation()}) { +XSLFRelation.NOTES.getRelation(), XSLFRelation.CHART.getRelation(), +XSLFRelation.DIAGRAM_DRAWING.getRelation()}) { try { for (PackageRelationship packageRelationship : slidePart .getRelationshipsByType(relation)) {
(tika) branch TIKA-4213 deleted (was bfaecac53)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4213 in repository https://gitbox.apache.org/repos/asf/tika.git was bfaecac53 TIKA-4213 -- use more standard sql -- timestamp with time zone The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4213 -- improve jdbc pipes reporter (#1669)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new e63730e12 TIKA-4213 -- improve jdbc pipes reporter (#1669) e63730e12 is described below commit e63730e126e74b4ac36e5f2b8c6790963eb41c14 Author: Tim Allison AuthorDate: Thu Mar 21 08:42:25 2024 -0400 TIKA-4213 -- improve jdbc pipes reporter (#1669) * TIKA-4213 -- improve jdbc pipes reporter --- .../pipes/reporters/jdbc/JDBCPipesReporter.java| 52 -- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java index 0082eb9de..ee52bf80f 100644 --- a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java +++ b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java @@ -22,6 +22,8 @@ import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.SQLException; import java.sql.Statement; +import java.sql.Timestamp; +import java.time.Instant; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -68,7 +70,7 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl private String connectionString; private Optional postConnectionString = Optional.empty(); -private final ArrayBlockingQueue queue = +private final ArrayBlockingQueue queue = new ArrayBlockingQueue(ARRAY_BLOCKING_QUEUE_SIZE); CompletableFuture reportWorkerFuture; @@ -146,7 +148,7 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl return; } try { -queue.offer(new 
KeyStatusPair(t.getEmitKey().getEmitKey(), result.getStatus()), +queue.offer(new IdStatusPair(t.getId(), result.getStatus()), MAX_WAIT_MILLIS, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { //swallow @@ -167,7 +169,7 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl @Override public void close() throws IOException { try { -queue.offer(KeyStatusPair.END_SEMAPHORE, 60, TimeUnit.SECONDS); +queue.offer(IdStatusPair.END_SEMAPHORE, 60, TimeUnit.SECONDS); } catch (InterruptedException e) { return; } @@ -186,20 +188,20 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl } } -private static class KeyStatusPair { +private static class IdStatusPair { -static KeyStatusPair END_SEMAPHORE = new KeyStatusPair(null, null); -private final String emitKey; +static IdStatusPair END_SEMAPHORE = new IdStatusPair(null, null); +private final String id; private final PipesResult.STATUS status; -public KeyStatusPair(String emitKey, PipesResult.STATUS status) { -this.emitKey = emitKey; +public IdStatusPair(String id, PipesResult.STATUS status) { +this.id = id; this.status = status; } @Override public String toString() { -return "KeyStatusPair{" + "emitKey='" + emitKey + '\'' + ", status=" + status + '}'; +return "KeyStatusPair{" + "id='" + id + '\'' + ", status=" + status + '}'; } } @@ -208,18 +210,18 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl private static final int MAX_TRIES = 3; private final String connectionString; private final Optional postConnectionString; -private final ArrayBlockingQueue queue; +private final ArrayBlockingQueue queue; private final int cacheSize; private final long reportWithinMs; -List cache = new ArrayList<>(); +List cache = new ArrayList<>(); private Connection connection; private PreparedStatement insert; public ReportWorker(String connectionString, Optional postConnectionString, -ArrayBlockingQueue queue, int cacheSize, +ArrayBlockingQueue 
queue, int cacheSize, long reportWithinMs) { this.connectionString = connectionString; this.postConnectionString = postConnectionString; @@ -242,18 +244,19 @@ public class JDBCPipesReporter ext
(tika) branch TIKA-4213 updated (51477fff2 -> bfaecac53)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4213 in repository https://gitbox.apache.org/repos/asf/tika.git from 51477fff2 TIKA-4213 add bfaecac53 TIKA-4213 -- use more standard sql -- timestamp with time zone No new revisions were added by this update. Summary of changes: .../java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
(tika) 01/01: TIKA-4211 -- first attempt
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4211 in repository https://gitbox.apache.org/repos/asf/tika.git commit 2239ea9a8487db630519a35f117f8528daebd3eb Author: tallison AuthorDate: Wed Mar 20 15:16:53 2024 -0400 TIKA-4211 -- first attempt --- .../tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java index df566a284..38e9c8aac 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java @@ -344,7 +344,8 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { for (String relation : new String[]{XSLFRelation.VML_DRAWING.getRelation(), XSLFRelation.SLIDE_LAYOUT.getRelation(), XSLFRelation.NOTES_MASTER.getRelation(), -XSLFRelation.NOTES.getRelation()}) { +XSLFRelation.NOTES.getRelation(), XSLFRelation.CHART.getRelation(), +XSLFRelation.DIAGRAM_DRAWING.getRelation()}) { try { for (PackageRelationship packageRelationship : slidePart .getRelationshipsByType(relation)) {
(tika) branch TIKA-4211 created (now 2239ea9a8)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4211 in repository https://gitbox.apache.org/repos/asf/tika.git at 2239ea9a8 TIKA-4211 -- first attempt This branch includes the following new commits: new 2239ea9a8 TIKA-4211 -- first attempt The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4213
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4213 in repository https://gitbox.apache.org/repos/asf/tika.git commit 51477fff2e8979d087a11de7d23a21ead7bbd1de Author: tallison AuthorDate: Wed Mar 20 14:31:34 2024 -0400 TIKA-4213 --- .../pipes/reporters/jdbc/JDBCPipesReporter.java| 52 -- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java index 0082eb9de..0d6165140 100644 --- a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java +++ b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java @@ -22,6 +22,8 @@ import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.SQLException; import java.sql.Statement; +import java.sql.Timestamp; +import java.time.Instant; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -68,7 +70,7 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl private String connectionString; private Optional postConnectionString = Optional.empty(); -private final ArrayBlockingQueue queue = +private final ArrayBlockingQueue queue = new ArrayBlockingQueue(ARRAY_BLOCKING_QUEUE_SIZE); CompletableFuture reportWorkerFuture; @@ -146,7 +148,7 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl return; } try { -queue.offer(new KeyStatusPair(t.getEmitKey().getEmitKey(), result.getStatus()), +queue.offer(new IdStatusPair(t.getId(), result.getStatus()), MAX_WAIT_MILLIS, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { //swallow @@ -167,7 +169,7 @@ public class 
JDBCPipesReporter extends PipesReporterBase implements Initializabl @Override public void close() throws IOException { try { -queue.offer(KeyStatusPair.END_SEMAPHORE, 60, TimeUnit.SECONDS); +queue.offer(IdStatusPair.END_SEMAPHORE, 60, TimeUnit.SECONDS); } catch (InterruptedException e) { return; } @@ -186,20 +188,20 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl } } -private static class KeyStatusPair { +private static class IdStatusPair { -static KeyStatusPair END_SEMAPHORE = new KeyStatusPair(null, null); -private final String emitKey; +static IdStatusPair END_SEMAPHORE = new IdStatusPair(null, null); +private final String id; private final PipesResult.STATUS status; -public KeyStatusPair(String emitKey, PipesResult.STATUS status) { -this.emitKey = emitKey; +public IdStatusPair(String id, PipesResult.STATUS status) { +this.id = id; this.status = status; } @Override public String toString() { -return "KeyStatusPair{" + "emitKey='" + emitKey + '\'' + ", status=" + status + '}'; +return "KeyStatusPair{" + "id='" + id + '\'' + ", status=" + status + '}'; } } @@ -208,18 +210,18 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl private static final int MAX_TRIES = 3; private final String connectionString; private final Optional postConnectionString; -private final ArrayBlockingQueue queue; +private final ArrayBlockingQueue queue; private final int cacheSize; private final long reportWithinMs; -List cache = new ArrayList<>(); +List cache = new ArrayList<>(); private Connection connection; private PreparedStatement insert; public ReportWorker(String connectionString, Optional postConnectionString, -ArrayBlockingQueue queue, int cacheSize, +ArrayBlockingQueue queue, int cacheSize, long reportWithinMs) { this.connectionString = connectionString; this.postConnectionString = postConnectionString; @@ -242,18 +244,19 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl public 
void run() { long lastReported = System.currentTimeMillis(); while (true) { -//blocking -KeyStatusPair p = null; +IdStatusP
(tika) branch TIKA-4213 created (now 51477fff2)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4213 in repository https://gitbox.apache.org/repos/asf/tika.git at 51477fff2 TIKA-4213 This branch includes the following new commits: new 51477fff2 TIKA-4213 The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) branch TIKA-4207 updated: TIKA-4207 -- small improvements to AsyncResource and WMFParser
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4207 by this push: new 7ca6d1759 TIKA-4207 -- small improvements to AsyncResource and WMFParser 7ca6d1759 is described below commit 7ca6d17599e60f93a653eff727d2a014f36aa471 Author: tallison AuthorDate: Wed Mar 20 14:29:40 2024 -0400 TIKA-4207 -- small improvements to AsyncResource and WMFParser --- .../java/org/apache/tika/parser/microsoft/WMFParser.java| 3 ++- .../org/apache/tika/server/core/resource/AsyncResource.java | 13 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java index 73b95b58c..3c55a14b0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java @@ -23,6 +23,7 @@ import java.nio.charset.Charset; import java.util.Collections; import java.util.Set; +import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.poi.hwmf.record.HwmfFont; import org.apache.poi.hwmf.record.HwmfRecord; import org.apache.poi.hwmf.record.HwmfRecordType; @@ -63,7 +64,7 @@ public class WMFParser implements Parser { try { HwmfPicture picture = null; try { -picture = new HwmfPicture(stream); +picture = new HwmfPicture(CloseShieldInputStream.wrap(stream)); } catch (ArrayIndexOutOfBoundsException e) { //POI can throw this on corrupt files throw new 
TikaException(e.getClass().getSimpleName() + ": " + e.getMessage(), e); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java index a23162065..a4d4ed489 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java @@ -45,6 +45,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.serialization.JsonFetchEmitTupleList; import org.apache.tika.pipes.FetchEmitTuple; import org.apache.tika.pipes.async.AsyncProcessor; +import org.apache.tika.pipes.async.OfferLargerThanQueueSize; import org.apache.tika.pipes.emitter.EmitData; import org.apache.tika.pipes.emitter.EmitterManager; import org.apache.tika.pipes.fetcher.FetchKey; @@ -117,10 +118,14 @@ public class AsyncResource { } } //Instant start = Instant.now(); -boolean offered = asyncProcessor.offer(request.getTuples(), maxQueuePauseMs); -if (offered) { -return ok(request.getTuples().size()); -} else { +try { +boolean offered = asyncProcessor.offer(request.getTuples(), maxQueuePauseMs); +if (offered) { +return ok(request.getTuples().size()); +} else { +return throttle(request.getTuples().size()); +} +} catch (OfferLargerThanQueueSize e) { return throttle(request.getTuples().size()); } }
(tika) branch TIKA-4207 updated: TIKA-4207 -- small improvements to AsyncResource
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4207 by this push: new 223e2d086 TIKA-4207 -- small improvements to AsyncResource 223e2d086 is described below commit 223e2d0863bab70061961c8ba2e9c4a5dc4f1b4e Author: tallison AuthorDate: Wed Mar 13 09:47:01 2024 -0400 TIKA-4207 -- small improvements to AsyncResource --- .../tika/server/core/resource/AsyncResource.java | 19 --- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java index 78d29abfa..a23162065 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java @@ -22,7 +22,6 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.StandardCharsets; -import java.time.Instant; import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -35,6 +34,7 @@ import jakarta.ws.rs.Produces; import jakarta.ws.rs.core.Context; import jakarta.ws.rs.core.HttpHeaders; import jakarta.ws.rs.core.UriInfo; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -46,7 +46,6 @@ import org.apache.tika.metadata.serialization.JsonFetchEmitTupleList; import org.apache.tika.pipes.FetchEmitTuple; import org.apache.tika.pipes.async.AsyncProcessor; import org.apache.tika.pipes.emitter.EmitData; -import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.pipes.emitter.EmitterManager; import org.apache.tika.pipes.fetcher.FetchKey; @@ -107,12 +106,18 @@ 
public class AsyncResource { return badFetcher(t.getFetchKey()); } if (!emitterManager.getSupported().contains(t.getEmitKey().getEmitterName())) { -return badEmitter(t.getEmitKey()); +return badEmitter(t.getEmitKey().getEmitterName()); +} +if (t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes() && + !StringUtils.isAllBlank(t.getEmbeddedDocumentBytesConfig().getEmitter())) { +String bytesEmitter = t.getEmbeddedDocumentBytesConfig().getEmitter(); +if (!emitterManager.getSupported().contains(bytesEmitter)) { +return badEmitter(bytesEmitter); +} } } -Instant start = Instant.now(); +//Instant start = Instant.now(); boolean offered = asyncProcessor.offer(request.getTuples(), maxQueuePauseMs); -asyncProcessor.getCapacity(); if (offered) { return ok(request.getTuples().size()); } else { @@ -135,8 +140,8 @@ public class AsyncResource { return map; } -private Map badEmitter(EmitKey emitKey) { -throw new BadRequestException("can't find emitter for " + emitKey.getEmitterName()); +private Map badEmitter(String emitterName) { +throw new BadRequestException("can't find emitter for " + emitterName); } private Map badFetcher(FetchKey fetchKey) {
(tika) branch TIKA-4207 updated: TIKA-4207 -- add capacity to async throttled response
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4207 by this push: new 069b05bce TIKA-4207 -- add capacity to async throttled response 069b05bce is described below commit 069b05bcebe991af10c924c836f8b5cb9992126a Author: tallison AuthorDate: Wed Mar 13 09:40:22 2024 -0400 TIKA-4207 -- add capacity to async throttled response --- .../main/java/org/apache/tika/server/core/resource/AsyncResource.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java index 2cc7b1294..78d29abfa 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java @@ -112,6 +112,7 @@ public class AsyncResource { } Instant start = Instant.now(); boolean offered = asyncProcessor.offer(request.getTuples(), maxQueuePauseMs); +asyncProcessor.getCapacity(); if (offered) { return ok(request.getTuples().size()); } else { @@ -130,6 +131,7 @@ public class AsyncResource { Map map = new HashMap<>(); map.put("status", "throttled"); map.put("msg", "not able to receive request of size " + requestSize + " at this time"); +map.put("capacity", asyncProcessor.getCapacity()); return map; }
(tika) branch TIKA-4207 updated: TIKA-4207 -- trivial binary file name refinement
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4207 by this push: new e2b952c80 TIKA-4207 -- trivial binary file name refinement e2b952c80 is described below commit e2b952c80c699a82ec9d735dc299dd3d392c10f2 Author: tallison AuthorDate: Wed Mar 13 09:03:48 2024 -0400 TIKA-4207 -- trivial binary file name refinement --- .../apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java| 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java index cbc1f3411..214c2ab4e 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java @@ -19,6 +19,7 @@ package org.apache.tika.extractor; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import org.apache.tika.io.FilenameUtils; import org.apache.tika.metadata.Metadata; @@ -40,13 +41,16 @@ public abstract class AbstractEmbeddedDocumentByteStore implements EmbeddedDocum StringBuilder emitKey = new StringBuilder(containerEmitKey) - .append("/").append(containerEmitKey).append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix()) +.append("/") +.append(FilenameUtils.getName(containerEmitKey)) +.append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix()) .append(embeddedIdString); if (embeddedDocumentBytesConfig.getSuffixStrategy().equals( EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING)) { String fName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); String suffix = FilenameUtils.getSuffixFromPath(fName); +suffix = suffix.toLowerCase(Locale.US); emitKey.append(suffix); } return 
emitKey.toString();
(tika) branch TIKA-4207 updated: TIKA-4207 basically working
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4207 by this push: new 3e18b889d TIKA-4207 basically working 3e18b889d is described below commit 3e18b889ded54e746c4dbd25580d9ac8f73720cf Author: tallison AuthorDate: Tue Mar 12 19:52:26 2024 -0400 TIKA-4207 basically working --- .../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 16 ++ .../test/java/org/apache/tika/cli/TikaCLITest.java | 2 - .../AbstractEmbeddedDocumentByteStore.java | 17 ++- .../extractor/ParsingAndEmbeddedDocExtractor.java | 162 - .../ParsingAndEmbeddedDocExtractorFactory.java | 40 - .../ParsingEmbeddedDocumentExtractor.java | 41 +- .../apache/tika/parser/RecursiveParserWrapper.java | 2 + .../java/org/apache/tika/pipes/FetchEmitTuple.java | 2 +- .../java/org/apache/tika/pipes/PipesServer.java| 144 +- .../extractor/EmbeddedDocumentBytesConfig.java | 73 +- .../extractor/EmbeddedDocumentEmitterStore.java| 18 ++- .../org/apache/tika/parser/mock/MockParser.java| 26 +--- .../org/apache/tika/pipes/PipesServerTest.java | 60 +++- ...rocessorTest.java => AsyncChaosMonkeyTest.java} | 2 +- .../resources/org/apache/tika/pipes/TIKA-4207.xml | 30 tika-pipes/tika-async-cli/pom.xml | 7 + .../apache/tika/async/cli/AsyncProcessorTest.java | 138 ++ .../apache/tika/async/cli/TikaAsyncCLITest.java| 2 +- .../test/resources/configs/TIKA-4207-emitter.xml | 35 + .../resources/{ => configs}/tika-config-broken.xml | 0 .../basic_embedded.xml}| 29 ++-- tika-pipes/tika-pipes-iterators/pom.xml| 1 + .../tika-pipes-iterator-json}/pom.xml | 43 +++--- .../pipesiterator/json/JsonPipesIterator.java | 65 + .../pipesiterator/json/TestJsonPipesIterator.java | 85 +++ .../test-documents/test-with-embedded-bytes.json | 100 + .../src/test/resources/test-documents/test.json| 100 + .../metadata/serialization/JsonFetchEmitTuple.java | 42 +- 
.../serialization/JsonFetchEmitTupleTest.java | 20 +++ 29 files changed, 940 insertions(+), 362 deletions(-) diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java index 1f6c8fc2c..d9f6d053f 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.tika.cli; import static java.nio.charset.StandardCharsets.UTF_8; diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index c160db396..fa16e124a 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -30,9 +30,7 @@ import java.net.URI; import java.nio.file.Files; import java.nio.file.Path; -import org.apache.commons.io.FileUtils; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java index c435a3e6e..cbc1f3411 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java @@ -30,30 +30,31 @@ public abstract class AbstractEmbeddedDocumentByteStore implements EmbeddedDocum List ids = new ArrayList<>(); -public String getFetchKey(String c
(tika) branch TIKA-4207 created (now 0674ea469)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git at 0674ea469 TIKA-4207 -- WIP, checkpoint commit. Doesn't compile...:D This branch includes the following new commits: new 0674ea469 TIKA-4207 -- WIP, checkpoint commit. Doesn't compile...:D The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4207 -- WIP, checkpoint commit. Doesn't compile...:D
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git commit 0674ea4693d57cbc30f8a59417f469a48ce53c2f Author: tallison AuthorDate: Fri Mar 8 20:13:49 2024 -0500 TIKA-4207 -- WIP, checkpoint commit. Doesn't compile...:D --- .../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 73 .../test/java/org/apache/tika/cli/TikaCLITest.java | 57 +-- .../AbstractEmbeddedDocumentByteStore.java | 63 +++ .../extractor/BasicEmbeddedDocumentByteStore.java | 46 + .../tika/extractor/EmbeddedDocumentByteStore.java | 32 .../extractor/ParsingAndEmbeddedDocExtractor.java | 162 ++ .../ParsingAndEmbeddedDocExtractorFactory.java | 40 + .../java/org/apache/tika/pipes/FetchEmitTuple.java | 52 -- .../java/org/apache/tika/pipes/PipesServer.java| 188 + .../extractor/EmbeddedDocumentBytesConfig.java | 93 ++ .../extractor/EmbeddedDocumentEmitterStore.java| 63 +++ .../org/apache/tika/pipes/PipesServerTest.java | 2 +- .../metadata/serialization/JsonFetchEmitTuple.java | 41 - 13 files changed, 811 insertions(+), 101 deletions(-) diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java new file mode 100644 index 0..1f6c8fc2c --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java @@ -0,0 +1,73 @@ +package org.apache.tika.cli; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.FileUtils; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TikaCLIAsyncTest extends TikaCLITest { + +private static Path ASYNC_CONFIG; +@TempDir 
+private static Path ASYNC_OUTPUT_DIR; + +@BeforeAll +public static void setUpClass() throws Exception { +ASYNC_CONFIG = Files.createTempFile(ASYNC_OUTPUT_DIR, "async-config-", ".xml"); +String xml = "" + "" + "3" + +"" + ASYNC_CONFIG.toAbsolutePath() + "" + +"" + "" + +"" + +"fsf" + "" + TEST_DATA_FILE.getAbsolutePath() + +"" + "" + "" + "" + +"" + +"fse" + "" + ASYNC_OUTPUT_DIR.toAbsolutePath() + +"" + "true" + "" + "" + +"" + +"" + TEST_DATA_FILE.getAbsolutePath() + "" + +"fsf" + "fse" + +"" + ""; +Files.write(ASYNC_CONFIG, xml.getBytes(UTF_8)); +} + +@Test +public void testAsync() throws Exception { +String content = getParamOutContent("-a", "--config=" + ASYNC_CONFIG.toAbsolutePath()); + +int json = 0; +for (File f : ASYNC_OUTPUT_DIR.toFile().listFiles()) { +if (f.getName().endsWith(".json")) { +//check one file for pretty print +if (f.getName().equals("coffee.xls.json")) { +checkForPrettyPrint(f); +} +json++; +} +} +assertEquals(17, json); +} + +private void checkForPrettyPrint(File f) throws IOException { +String json = FileUtils.readFileToString(f, UTF_8); +int previous = json.indexOf("Content-Length"); +assertTrue(previous > -1); +for (String k : new String[]{"Content-Type", "dc:creator", +"dcterms:created", "dcterms:modified", "X-TIKA:content\""}) { +int i = json.indexOf(k); +assertTrue( i > -1, "should have found " + k); +assertTrue(i > previous, "bad order: " + k + " at " + i + " not less than " + previous); +previous = i; +} +} + + +} diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index ebd1d90b9..c160db396 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -45,11 +45,8 @@ import org.apache.tika.utils.ProcessUtils; */ public class
(tika) 02/02: TIKA-4204 -- improve lookup of dataspace/storage items -- fix checkstyle
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git commit 7302ccd17cf07506bc7f774b7aeb9cfd2408c05b Author: tallison AuthorDate: Wed Feb 28 09:50:30 2024 -0500 TIKA-4204 -- improve lookup of dataspace/storage items -- fix checkstyle Apologies for initially pushing this commit to main instead of an issue branch. :/ (cherry picked from commit 1c1018950c88454ee9a91456931f9d18dde13124) --- .../main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java | 3 ++- .../java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java index 7081f6bc7..abe44405a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java @@ -119,7 +119,8 @@ public class ChmExtractor { getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable()); setIndexOfContent(ChmCommons - .indexOfDataSpaceStorageElement(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT)); + .indexOfDataSpaceStorageElement(getChmDirList().getDirectoryListingEntryList(), +ChmConstants.CONTENT)); setLzxBlockOffset( (getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()) .getOffset() + getChmItsfHeader().getDataOffset())); diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java index e95047a43..815820346 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java @@ -53,7 +53,8 @@ public class TestChmLzxState { ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(data, chmItsHeader, chmItspHeader); int indexOfControlData = ChmCommons - .indexOfDataSpaceStorageElement(chmDirListCont.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA); + .indexOfDataSpaceStorageElement(chmDirListCont.getDirectoryListingEntryList(), +ChmConstants.CONTROL_DATA); int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data, ChmConstants.LZXC.getBytes(UTF_8));
(tika) branch branch_2x updated (0ff5834e3 -> 7302ccd17)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git from 0ff5834e3 TIKA-4162: update aws, miredot-plugin new c94093684 TIKA-4204 -- improve lookup of dataspace/storage items new 7302ccd17 TIKA-4204 -- improve lookup of dataspace/storage items -- fix checkstyle The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: .../java/org/apache/tika/parser/microsoft/chm/ChmCommons.java | 11 +++ .../org/apache/tika/parser/microsoft/chm/ChmExtractor.java| 3 ++- .../org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java | 2 +- .../org/apache/tika/parser/microsoft/chm/TestChmLzxState.java | 3 ++- 4 files changed, 12 insertions(+), 7 deletions(-)
(tika) 01/02: TIKA-4204 -- improve lookup of dataspace/storage items
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git commit c94093684622a2a05d06e9225fd5cc1166814038 Author: tallison AuthorDate: Wed Feb 28 09:41:08 2024 -0500 TIKA-4204 -- improve lookup of dataspace/storage items (cherry picked from commit eefe884c81a2a94c212e5ed9aa5bbb659e653782) --- .../java/org/apache/tika/parser/microsoft/chm/ChmCommons.java | 11 +++ .../org/apache/tika/parser/microsoft/chm/ChmExtractor.java| 2 +- .../org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java | 2 +- .../org/apache/tika/parser/microsoft/chm/TestChmLzxState.java | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java index 4af06e446..bb0de014c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java @@ -222,7 +222,7 @@ public class ChmCommons { */ public static final int indexOfResetTableBlock(byte[] text, byte[] pattern) throws ChmParsingException { -return (indexOf(text, pattern)) - 4; +return (indexOfDataSpaceStorageElement(text, pattern)) - 4; } /** @@ -233,7 +233,7 @@ public class ChmCommons { * @return an index, if nothing found returns -1 * @throws ChmParsingException */ -public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException { +public static int indexOfDataSpaceStorageElement(byte[] text, byte[] pattern) throws 
ChmParsingException { int[] next = null; int i = 0, j = -1; @@ -281,15 +281,18 @@ public class ChmCommons { /** * Searches for some pattern in the directory listing entry list + * This requires that the entry name start with "::DataSpaceStorage" + * See TIKA-4204 * * @param list * @param pattern * @return an index, if nothing found returns -1 */ -public static int indexOf(List list, String pattern) { +public static int indexOfDataSpaceStorageElement(List list, String pattern) { int place = 0; for (DirectoryListingEntry directoryListingEntry : list) { -if (directoryListingEntry.toString().contains(pattern)) { +if (directoryListingEntry.getName().startsWith("::DataSpace/Storage") && +directoryListingEntry.getName().contains(pattern)) { return place; } ++place; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java index ba1738b65..7081f6bc7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java @@ -119,7 +119,7 @@ public class ChmExtractor { getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable()); setIndexOfContent(ChmCommons -.indexOf(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT)); + .indexOfDataSpaceStorageElement(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT)); setLzxBlockOffset( (getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()) .getOffset() + getChmItsfHeader().getDataOffset())); diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java index ccd59bbee..a57e03d5b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java +++ b/tika-p
(tika) branch TIKA-4202b deleted (was f8a40d9a5)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4202b in repository https://gitbox.apache.org/repos/asf/tika.git was f8a40d9a5 TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 2bc0f9bdc TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630) 2bc0f9bdc is described below commit 2bc0f9bdce21559f592ef71919d242974be027fb Author: Tim Allison AuthorDate: Wed Feb 28 10:04:14 2024 -0500 TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630) --- tika-core/src/main/java/org/apache/tika/metadata/PDF.java| 4 .../main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 ++ .../src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java | 4 .../src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 8 .../java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java | 9 + 5 files changed, 31 insertions(+) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index a6c753fcd..b15c10383 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -209,6 +209,10 @@ public interface PDF { Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), new Property[]{ TikaCoreProperties.VERSION_COUNT }); +/** + * This counts the number of pages that would have been OCR'd or were OCR'd depending + * on the OCR settings. 
If NO_OCR is selected, this will + */ Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index e03e14a4f..4d0a08226 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -533,8 +533,14 @@ class AbstractPDF2XHTML extends PDFTextStripper { void doOCROnCurrentPage(PDPage pdPage, PDFParserConfig.OCR_STRATEGY ocrStrategy) throws IOException, TikaException, SAXException { if (ocrStrategy.equals(NO_OCR)) { +//I don't think this is reachable? 
return; } +//count the number of times that OCR would have been called +OCRPageCounter c = context.get(OCRPageCounter.class); +if (c != null) { +c.increment(); +} MediaType ocrImageMediaType = MediaType.image("ocr-" + config.getOcrImageFormatName()); if (!ocrParser.getSupportedTypes(context).contains(ocrImageMediaType)) { if (ocrStrategy == OCR_ONLY || ocrStrategy == OCR_AND_TEXT_EXTRACTION) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java index 3b382099b..418419eee 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java @@ -16,6 +16,10 @@ */ package org.apache.tika.parser.pdf; +/** + * This counts the number of pages that OCR would have been + * run or was run depending on the settings. + */ public class OCRPageCounter { private int count; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 0269a58ef..9406cac53 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1432,6 +1432,14 @@ public class
(tika) branch TIKA-4205 deleted (was 3d4c94308)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4205 in repository https://gitbox.apache.org/repos/asf/tika.git was 3d4c94308 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler (#1629)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 36a0dca43 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler (#1629) 36a0dca43 is described below commit 36a0dca435d38b123bc62567b328fc9e522ac956 Author: Tim Allison AuthorDate: Wed Feb 28 10:04:00 2024 -0500 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler (#1629) --- tika-eval/tika-eval-app/pom.xml | 2 -- .../java/org/apache/tika/eval/app/AbstractProfiler.java | 17 - .../java/org/apache/tika/eval/app/ExtractProfiler.java | 4 .../src/main/java/org/apache/tika/eval/app/db/Cols.java | 3 +++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml index 18671052c..b93783f75 100644 --- a/tika-eval/tika-eval-app/pom.xml +++ b/tika-eval/tika-eval-app/pom.xml @@ -93,10 +93,8 @@ - org.apache.tika:tika-core:jar: org.apache.tika:tika-serialization:jar: org.apache.tika:tika-langdetect-opennlp:jar: - commons-io:commons-io:jar: commons-codec:commons-codec:jar: org.apache.commons:commons-lang3:jar: org.apache.commons:commons-math3:jar: diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java index 2397bbcab..0cd609d3b 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java @@ -71,6 +71,7 @@ import org.apache.tika.eval.core.util.EvalExceptionUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; +import 
org.apache.tika.metadata.PDF; import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; @@ -387,6 +388,10 @@ public abstract class AbstractProfiler extends FileResourceConsumer { if (nPages != null) { data.put(Cols.NUM_PAGES, Integer.toString(nPages)); } +Integer nOCRPages = m.getInt(PDF.OCR_PAGE_COUNT); +if (nOCRPages != null) { +data.put(Cols.NUM_OCR_PAGES, Integer.toString(nOCRPages)); +} //if the outer wrapper document if (i == 0) { @@ -395,10 +400,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer { data.put(Cols.EMBEDDED_DEPTH, "0"); } else { data.put(Cols.IS_EMBEDDED, TRUE); -data.put(Cols.FILE_NAME, getFileName(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))); +String embeddedFilePath = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); +if (! StringUtils.isBlank(embeddedFilePath)) { +data.put(Cols.FILE_NAME, getFileName(m.get(embeddedFilePath))); +data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath); +} if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) { data.put(Cols.EMBEDDED_DEPTH, m.get(TikaCoreProperties.EMBEDDED_DEPTH)); } +if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) { +data.put(Cols.ATTACHMENT_TYPE, m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); +} } String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME)); ext = (ext == null) ? "" : ext.toLowerCase(Locale.US); @@ -486,6 +498,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer { Integer.toString(commonTokenResult.getUniqueAlphabeticTokens())); data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens())); +double oov = commonTokenResult.getAlphabeticTokens() > 0 ? 
commonTokenResult.getOOV() : -1.0; +data.put(Cols.OOV, Double.toString(oov)); } TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class); if (tokenCounts != null) { @@ -498,6 +512,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { Double.toString((Double) textStats.get(TokenEntropy.class))); } + SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class); if (summStats != null)
(tika) branch main updated (eefe884c8 -> 1c1018950)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/tika.git from eefe884c8 TIKA-4204 -- improve lookup of dataspace/storage items add 1c1018950 TIKA-4204 -- improve lookup of dataspace/storage items -- fix checkstyle No new revisions were added by this update. Summary of changes: .../main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java | 3 ++- .../java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-)
(tika) branch main updated (45bcf2b5c -> eefe884c8)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/tika.git from 45bcf2b5c Merge pull request #1628 from apache/dependabot/maven/com.qmino-miredot-plugin-2.4.4-Java11 add eefe884c8 TIKA-4204 -- improve lookup of dataspace/storage items No new revisions were added by this update. Summary of changes: .../java/org/apache/tika/parser/microsoft/chm/ChmCommons.java | 11 +++ .../org/apache/tika/parser/microsoft/chm/ChmExtractor.java| 2 +- .../org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java | 2 +- .../org/apache/tika/parser/microsoft/chm/TestChmLzxState.java | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-)
(tika) 01/01: TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4205 in repository https://gitbox.apache.org/repos/asf/tika.git commit 3d4c94308d5b64ecc7306850b0ec935e615e7c6f Author: tallison AuthorDate: Wed Feb 28 09:26:21 2024 -0500 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler --- tika-eval/tika-eval-app/pom.xml | 2 -- .../java/org/apache/tika/eval/app/AbstractProfiler.java | 17 - .../java/org/apache/tika/eval/app/ExtractProfiler.java | 4 .../src/main/java/org/apache/tika/eval/app/db/Cols.java | 3 +++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml index 18671052c..b93783f75 100644 --- a/tika-eval/tika-eval-app/pom.xml +++ b/tika-eval/tika-eval-app/pom.xml @@ -93,10 +93,8 @@ - org.apache.tika:tika-core:jar: org.apache.tika:tika-serialization:jar: org.apache.tika:tika-langdetect-opennlp:jar: - commons-io:commons-io:jar: commons-codec:commons-codec:jar: org.apache.commons:commons-lang3:jar: org.apache.commons:commons-math3:jar: diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java index 2397bbcab..0cd609d3b 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java @@ -71,6 +71,7 @@ import org.apache.tika.eval.core.util.EvalExceptionUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.PDF; import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; @@ -387,6 +388,10 @@ public abstract class AbstractProfiler extends 
FileResourceConsumer { if (nPages != null) { data.put(Cols.NUM_PAGES, Integer.toString(nPages)); } +Integer nOCRPages = m.getInt(PDF.OCR_PAGE_COUNT); +if (nOCRPages != null) { +data.put(Cols.NUM_OCR_PAGES, Integer.toString(nOCRPages)); +} //if the outer wrapper document if (i == 0) { @@ -395,10 +400,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer { data.put(Cols.EMBEDDED_DEPTH, "0"); } else { data.put(Cols.IS_EMBEDDED, TRUE); -data.put(Cols.FILE_NAME, getFileName(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))); +String embeddedFilePath = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); +if (! StringUtils.isBlank(embeddedFilePath)) { +data.put(Cols.FILE_NAME, getFileName(m.get(embeddedFilePath))); +data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath); +} if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) { data.put(Cols.EMBEDDED_DEPTH, m.get(TikaCoreProperties.EMBEDDED_DEPTH)); } +if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) { +data.put(Cols.ATTACHMENT_TYPE, m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); +} } String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME)); ext = (ext == null) ? "" : ext.toLowerCase(Locale.US); @@ -486,6 +498,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer { Integer.toString(commonTokenResult.getUniqueAlphabeticTokens())); data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens())); +double oov = commonTokenResult.getAlphabeticTokens() > 0 ? 
commonTokenResult.getOOV() : -1.0; +data.put(Cols.OOV, Double.toString(oov)); } TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class); if (tokenCounts != null) { @@ -498,6 +512,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { Double.toString((Double) textStats.get(TokenEntropy.class))); } + SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class); if (summStats != null) { data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum())); diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/o
(tika) branch TIKA-4205 created (now 3d4c94308)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4205 in repository https://gitbox.apache.org/repos/asf/tika.git at 3d4c94308 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler This branch includes the following new commits: new 3d4c94308 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4202b in repository https://gitbox.apache.org/repos/asf/tika.git commit f8a40d9a57bca397f35ddea10ee59fe6796a1fbe Author: tallison AuthorDate: Wed Feb 28 09:23:15 2024 -0500 TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked --- tika-core/src/main/java/org/apache/tika/metadata/PDF.java| 4 .../main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 ++ .../src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java | 4 .../src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 8 .../java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java | 9 + 5 files changed, 31 insertions(+) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index a6c753fcd..b15c10383 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -209,6 +209,10 @@ public interface PDF { Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), new Property[]{ TikaCoreProperties.VERSION_COUNT }); +/** + * This counts the number of pages that would have been OCR'd or were OCR'd depending + * on the OCR settings. 
If NO_OCR is selected, this will + */ Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index e03e14a4f..4d0a08226 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -533,8 +533,14 @@ class AbstractPDF2XHTML extends PDFTextStripper { void doOCROnCurrentPage(PDPage pdPage, PDFParserConfig.OCR_STRATEGY ocrStrategy) throws IOException, TikaException, SAXException { if (ocrStrategy.equals(NO_OCR)) { +//I don't think this is reachable? 
return; } +//count the number of times that OCR would have been called +OCRPageCounter c = context.get(OCRPageCounter.class); +if (c != null) { +c.increment(); +} MediaType ocrImageMediaType = MediaType.image("ocr-" + config.getOcrImageFormatName()); if (!ocrParser.getSupportedTypes(context).contains(ocrImageMediaType)) { if (ocrStrategy == OCR_ONLY || ocrStrategy == OCR_AND_TEXT_EXTRACTION) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java index 3b382099b..418419eee 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java @@ -16,6 +16,10 @@ */ package org.apache.tika.parser.pdf; +/** + * This counts the number of pages that OCR would have been + * run or was run depending on the settings. 
+ */ public class OCRPageCounter { private int count; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 0269a58ef..9406cac53 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1432,6 +1432,14 @@ public class PDFParserTest extends TikaTest { metadataList.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION)); } +@Test +public void testDefaultPDFOCR() throws Exception { +//test that even with no ocr -- there is no tesser
(tika) branch TIKA-4202b created (now f8a40d9a5)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4202b in repository https://gitbox.apache.org/repos/asf/tika.git at f8a40d9a5 TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked This branch includes the following new commits: new f8a40d9a5 TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) branch TIKA-4202 deleted (was ebe34e048)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4202 in repository https://gitbox.apache.org/repos/asf/tika.git was ebe34e048 TIKA-4202 -- add ocr page count to PDFs -- checkstyle The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4202 -- add ocr page count to metadata for PDFs (#1621)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 72927ec17 TIKA-4202 -- add ocr page count to metadata for PDFs (#1621) 72927ec17 is described below commit 72927ec17681490655923dd83924215439b664b4 Author: Tim Allison AuthorDate: Fri Feb 23 15:54:24 2024 -0500 TIKA-4202 -- add ocr page count to metadata for PDFs (#1621) * TIKA-4202 -- add ocr page count to PDFs --- .../main/java/org/apache/tika/metadata/PDF.java| 2 ++ .../org/apache/tika/parser/pdf/OCRPageCounter.java | 30 ++ .../java/org/apache/tika/parser/pdf/PDFParser.java | 6 + .../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index c2baca0e8..a6c753fcd 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -209,4 +209,6 @@ public interface PDF { Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), new Property[]{ TikaCoreProperties.VERSION_COUNT }); +Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount"); + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java new file mode 100644 index 0..3b382099b --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one 
or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +public class OCRPageCounter { + +private int count; + +public void increment() { +count++; +} + +public int getCount() { +return count; +} +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index c93571daf..f21b65d4e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.pdf; +import static org.apache.tika.metadata.PDF.OCR_PAGE_COUNT; + import java.io.IOException; import java.io.InputStream; import java.nio.file.Path; @@ -158,6 +160,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable { PDFRenderingState incomingRenderingState = context.get(PDFRenderingState.class); TikaInputStream tstream = null; boolean shouldClose = 
false; +OCRPageCounter prevOCRCounter = context.get(OCRPageCounter.class); +context.set(OCRPageCounter.class, new OCRPageCounter()); try { if (shouldSpool(localConfig)) { if (stream instanceof TikaInputStream) { @@ -220,6 +224,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable { metadata.set(PDF.IS_ENCRYPTED, "true"); throw new EncryptedDocumentException(e); } finally { +metadata.set(OCR_PAGE_COUNT, context.get(OCRPageCounter.class).getCount()); +context.set(OCRPageCounter.class, prevOCRCounter); //reset the incrementalUpdateRecord even if null context.set(IncrementalUp
(tika) branch TIKA-4202 updated (83cc605a5 -> ebe34e048)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4202 in repository https://gitbox.apache.org/repos/asf/tika.git from 83cc605a5 TIKA-4202 -- add ocr page count to PDFs add ebe34e048 TIKA-4202 -- add ocr page count to PDFs -- checkstyle No new revisions were added by this update. Summary of changes: .../java/org/apache/tika/parser/pdf/OCRPageCounter.java | 16 1 file changed, 16 insertions(+)
(tika) branch TIKA-4202 created (now 83cc605a5)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4202 in repository https://gitbox.apache.org/repos/asf/tika.git at 83cc605a5 TIKA-4202 -- add ocr page count to PDFs This branch includes the following new commits: new 83cc605a5 TIKA-4202 -- add ocr page count to PDFs The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4202 -- add ocr page count to PDFs
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4202 in repository https://gitbox.apache.org/repos/asf/tika.git commit 83cc605a5c8f2fef735ff4f1a8f9aa676821273b Author: tallison AuthorDate: Fri Feb 23 13:30:23 2024 -0500 TIKA-4202 -- add ocr page count to PDFs --- tika-core/src/main/java/org/apache/tika/metadata/PDF.java | 2 ++ .../java/org/apache/tika/parser/pdf/OCRPageCounter.java| 14 ++ .../main/java/org/apache/tika/parser/pdf/PDFParser.java| 6 ++ .../java/org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index c2baca0e8..a6c753fcd 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -209,4 +209,6 @@ public interface PDF { Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), new Property[]{ TikaCoreProperties.VERSION_COUNT }); +Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount"); + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java new file mode 100644 index 0..d3dcc9155 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java @@ -0,0 +1,14 @@ +package org.apache.tika.parser.pdf; + +public class OCRPageCounter { + +private int count; + +public void increment() { +count++; +} + +public int getCount() { +return count; +} +} diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index c93571daf..f21b65d4e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.pdf; +import static org.apache.tika.metadata.PDF.OCR_PAGE_COUNT; + import java.io.IOException; import java.io.InputStream; import java.nio.file.Path; @@ -158,6 +160,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable { PDFRenderingState incomingRenderingState = context.get(PDFRenderingState.class); TikaInputStream tstream = null; boolean shouldClose = false; +OCRPageCounter prevOCRCounter = context.get(OCRPageCounter.class); +context.set(OCRPageCounter.class, new OCRPageCounter()); try { if (shouldSpool(localConfig)) { if (stream instanceof TikaInputStream) { @@ -220,6 +224,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable { metadata.set(PDF.IS_ENCRYPTED, "true"); throw new EncryptedDocumentException(e); } finally { +metadata.set(OCR_PAGE_COUNT, context.get(OCRPageCounter.class).getCount()); +context.set(OCRPageCounter.class, prevOCRCounter); //reset the incrementalUpdateRecord even if null context.set(IncrementalUpdateRecord.class, incomingIncrementalUpdateRecord); PDFRenderingState currState = context.get(PDFRenderingState.class); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 6e9167f37..0269a58ef 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -238,7 +238,7 @@ public class PDFParserTest extends TikaTest { assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("true", metadata.get("pdf:encrypted")); //pdf:encrypted, X-Parsed
(tika) branch main updated: TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new fd44840a1 TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608) fd44840a1 is described below commit fd44840a113b719872df1c453d46775efe850c60 Author: Tim Allison AuthorDate: Tue Feb 20 11:56:20 2024 -0500 TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608) --- .../tika/parser/iwork/IWorkPackageParser.java | 39 ++ 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java index 490cfe47e..87074304f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java @@ -29,7 +29,9 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -68,6 +70,7 @@ public class IWorkPackageParser implements Parser { * Serial version UID */ private static final long serialVersionUID = 
-2160322853809682372L; +private static final int MARK_LIMIT = 1096; /** * This parser handles all iWorks formats. */ @@ -91,9 +94,9 @@ public class IWorkPackageParser implements Parser { continue; } -InputStream entryStream = new BufferedInputStream(zip, 9216); -entryStream.mark(9216); -IWORKDocumentType type = IWORKDocumentType.detectType(entryStream); +InputStream entryStream = new BufferedInputStream(zip); +entryStream.mark(MARK_LIMIT); +IWORKDocumentType type = detectType(entryStream, MARK_LIMIT); entryStream.reset(); // 4096 fails on github if (type != null) { @@ -132,6 +135,25 @@ public class IWorkPackageParser implements Parser { // Don't close the zip InputStream (TIKA-1117). } +private IWORKDocumentType detectType(InputStream entryStream, int markLimit) throws IOException { +byte[] bytes = new byte[markLimit]; +try { +int read = IOUtils.read(entryStream, bytes, 0, markLimit); +try (InputStream bis = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes) +.setOffset(0).setLength(read).get()) { +return IWORKDocumentType.detectType(bis); +} +} catch (UnsupportedZipFeatureException e) { +// There was a problem with extracting the root type +// Password Protected iWorks files are funny, but we can usually +// spot them because they encrypt part of the zip stream + +// Compression field was likely encrypted +return IWORKDocumentType.ENCRYPTED; +} + +} + public enum IWORKDocumentType { KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")), @@ -189,17 +211,6 @@ public class IWorkPackageParser implements Parser { return type; } } -} else { -// There was a problem with extracting the root type -// Password Protected iWorks files are funny, but we can usually -// spot them because they encrypt part of the zip stream -try { -stream.read(); -} catch (UnsupportedZipFeatureException e) { -// Compression field was likely encrypted -return ENCRYPTED; -} catch (Exception ignored) { -} } return null; }
(tika) branch TIKA-4201 deleted (was e3e325169)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4201 in repository https://gitbox.apache.org/repos/asf/tika.git was e3e325169 TIKA-4201 -- add hard limit to IWorkPackageParser's detect The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch TIKA-4198 deleted (was a4991be96)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4198 in repository https://gitbox.apache.org/repos/asf/tika.git was a4991be96 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns -- revert ossindex and merge from upstream main The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 4c3625fb4 TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607) 4c3625fb4 is described below commit 4c3625fb4599980885063781aeefe441379b5c2c Author: Tim Allison AuthorDate: Tue Feb 20 11:01:21 2024 -0500 TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607) * TIKA-4198 -- add parser for geopkg --- tika-parent/pom.xml| 6 +- .../apache/tika/parser/geopkg/GeoPkgDBParser.java | 54 + .../apache/tika/parser/geopkg/GeoPkgParser.java| 127 + .../GeoPkgTableReader.java}| 59 -- .../tika/parser/sqlite3/SQLite3DBParser.java | 2 +- .../tika/parser/sqlite3/SQLite3TableReader.java| 2 +- .../services/org.apache.tika.parser.Parser | 1 + 7 files changed, 212 insertions(+), 39 deletions(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index bf116f50a..47116650a 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -1101,9 +1101,9 @@ natural language process module. Serialization is only on data that is configured in tika-config.xml. We don't think we'd be vulnerable to crafted user input. --> -org.apache.uima -uimaj-core -3.4.1 + org.apache.uima + uimaj-core + 3.4.1 true diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java new file mode 100644 index 0..d4b56127d --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.geopkg; + +import java.sql.Connection; +import java.util.Set; + +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.jdbc.JDBCTableReader; +import org.apache.tika.parser.sqlite3.SQLite3DBParser; + +/** + * This is the implementation of the db parser for SQLite. + * + * This parser is internal only; it should not be registered in the services + * file or configured in the TikaConfig xml file. 
+ */ +class GeoPkgDBParser extends SQLite3DBParser { + +private final Set ignoreBlobColumns; + +GeoPkgDBParser(Set ignoreBlobColumns) { +this.ignoreBlobColumns = ignoreBlobColumns; +} + +@Override +public JDBCTableReader getTableReader(Connection connection, String tableName, + ParseContext context) { +return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context), +ignoreBlobColumns); +} + +@Override +protected JDBCTableReader getTableReader(Connection connection, String tableName, + EmbeddedDocumentUtil embeddedDocumentUtil) { +return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil, +ignoreBlobColumns); +} +} diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java new file mode 100644 index 0..e157a09c9 --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * t
(tika) 01/01: TIKA-4201 -- add hard limit to IWorkPackageParser's detect
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4201 in repository https://gitbox.apache.org/repos/asf/tika.git commit e3e325169abcb130689acd13b9cf286040f66cab Author: tallison AuthorDate: Tue Feb 20 10:58:36 2024 -0500 TIKA-4201 -- add hard limit to IWorkPackageParser's detect --- .../tika/parser/iwork/IWorkPackageParser.java | 39 ++ 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java index 490cfe47e..87074304f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java @@ -29,7 +29,9 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -68,6 +70,7 @@ public class IWorkPackageParser implements Parser { * Serial version UID */ private static final long serialVersionUID = -2160322853809682372L; +private static final int MARK_LIMIT = 1096; /** * This parser handles all iWorks formats. 
*/ @@ -91,9 +94,9 @@ public class IWorkPackageParser implements Parser { continue; } -InputStream entryStream = new BufferedInputStream(zip, 9216); -entryStream.mark(9216); -IWORKDocumentType type = IWORKDocumentType.detectType(entryStream); +InputStream entryStream = new BufferedInputStream(zip); +entryStream.mark(MARK_LIMIT); +IWORKDocumentType type = detectType(entryStream, MARK_LIMIT); entryStream.reset(); // 4096 fails on github if (type != null) { @@ -132,6 +135,25 @@ public class IWorkPackageParser implements Parser { // Don't close the zip InputStream (TIKA-1117). } +private IWORKDocumentType detectType(InputStream entryStream, int markLimit) throws IOException { +byte[] bytes = new byte[markLimit]; +try { +int read = IOUtils.read(entryStream, bytes, 0, markLimit); +try (InputStream bis = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes) +.setOffset(0).setLength(read).get()) { +return IWORKDocumentType.detectType(bis); +} +} catch (UnsupportedZipFeatureException e) { +// There was a problem with extracting the root type +// Password Protected iWorks files are funny, but we can usually +// spot them because they encrypt part of the zip stream + +// Compression field was likely encrypted +return IWORKDocumentType.ENCRYPTED; +} + +} + public enum IWORKDocumentType { KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")), @@ -189,17 +211,6 @@ public class IWorkPackageParser implements Parser { return type; } } -} else { -// There was a problem with extracting the root type -// Password Protected iWorks files are funny, but we can usually -// spot them because they encrypt part of the zip stream -try { -stream.read(); -} catch (UnsupportedZipFeatureException e) { -// Compression field was likely encrypted -return ENCRYPTED; -} catch (Exception ignored) { -} } return null; }
(tika) branch TIKA-4201 created (now e3e325169)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4201 in repository https://gitbox.apache.org/repos/asf/tika.git at e3e325169 TIKA-4201 -- add hard limit to IWorkPackageParser's detect This branch includes the following new commits: new e3e325169 TIKA-4201 -- add hard limit to IWorkPackageParser's detect The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) branch TIKA-4198 updated (8b2ca9d08 -> a4991be96)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4198 in repository https://gitbox.apache.org/repos/asf/tika.git from 8b2ca9d08 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns -- temporarily ignore commons-compress cves add ee624917c Bump aws.version from 1.12.660 to 1.12.661 add 9de08df05 Merge pull request #1602 from apache/dependabot/maven/aws.version-1.12.661 add a3d535030 Bump logback.version from 1.4.14 to 1.5.0 add 1fe8ffe8f Merge pull request #1603 from apache/dependabot/maven/logback.version-1.5.0 add 0c5b3f9ee Bump aws.version from 1.12.661 to 1.12.662 add b37a5dca2 Merge pull request #1606 from apache/dependabot/maven/aws.version-1.12.662 add e4b23d811 Bump org.apache.commons:commons-compress from 1.25.0 to 1.26.0 add f4f65eca3 replace deprecated add 8623c9261 restore license URL add 363a20316 try larger buffer size add 3648dc3c3 try larger buffer add 123344819 download archive element to avoid trouble with commpns-compress 1.26.0 add 8cf526b56 adjust failing test add f8fec9c7e add test output to help with future debugging add 12a27e265 revert (wrong file) add c9f612ecf add code to help with future debugging add 9690bd716 add TODO add fb3f21386 Merge pull request #1605 from apache/dependabot/maven/org.apache.commons-commons-compress-1.26.0 add e5d57528d TIKA-4199: complete delegate class add b128334fe Merge remote-tracking branch 'origin/main' into TIKA-4198 add a4991be96 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns -- revert ossindex and merge from upstream main No new revisions were added by this update. 
Summary of changes: tika-parent/pom.xml| 13 +++--- .../tika/parser/iwork/IWorkPackageParser.java | 12 +++--- .../org/apache/tika/parser/pkg/PackageParser.java | 48 ++ .../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 13 +- .../apache/tika/parser/pkg/Seven7ParserTest.java | 11 - 5 files changed, 80 insertions(+), 17 deletions(-)
(tika) branch TIKA-4198 updated (6b1fd5812 -> 8b2ca9d08)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4198 in repository https://gitbox.apache.org/repos/asf/tika.git from 6b1fd5812 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns -- improve documentation add 8b2ca9d08 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns -- temporarily ignore commons-compress cves No new revisions were added by this update. Summary of changes: tika-parent/pom.xml | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-)
(tika) branch TIKA-4198 updated (7fad80367 -> 6b1fd5812)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4198 in repository https://gitbox.apache.org/repos/asf/tika.git from 7fad80367 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns add 6b1fd5812 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns -- improve documentation No new revisions were added by this update. Summary of changes: .../apache/tika/parser/geopkg/GeoPkgParser.java| 26 +- 1 file changed, 25 insertions(+), 1 deletion(-)
(tika) branch TIKA-4198 updated: TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4198 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4198 by this push: new 7fad80367 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns 7fad80367 is described below commit 7fad803673b1ae82ba4ff74aad1a9d12e356224d Author: tallison AuthorDate: Tue Feb 20 06:49:29 2024 -0500 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns --- .../apache/tika/parser/geopkg/GeoPkgDBParser.java | 30 -- .../apache/tika/parser/geopkg/GeoPkgParser.java| 16 +--- .../tika/parser/geopkg/GeoPkgTableReader.java | 14 +- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java index 5dc0f9ff2..d4b56127d 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java @@ -16,29 +16,11 @@ */ package org.apache.tika.parser.geopkg; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; import java.sql.Connection; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.sql.Statement; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; import java.util.Set; -import org.sqlite.SQLiteConfig; - import org.apache.tika.extractor.EmbeddedDocumentUtil; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Property; 
-import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.jdbc.AbstractDBParser; import org.apache.tika.parser.jdbc.JDBCTableReader; import org.apache.tika.parser.sqlite3.SQLite3DBParser; @@ -50,15 +32,23 @@ import org.apache.tika.parser.sqlite3.SQLite3DBParser; */ class GeoPkgDBParser extends SQLite3DBParser { +private final Set ignoreBlobColumns; + +GeoPkgDBParser(Set ignoreBlobColumns) { +this.ignoreBlobColumns = ignoreBlobColumns; +} + @Override public JDBCTableReader getTableReader(Connection connection, String tableName, ParseContext context) { -return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context)); +return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context), +ignoreBlobColumns); } @Override protected JDBCTableReader getTableReader(Connection connection, String tableName, EmbeddedDocumentUtil embeddedDocumentUtil) { -return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil); +return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil, +ignoreBlobColumns); } } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java index 6aae7cb04..907e6de39 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java @@ -20,22 +20,22 @@ package org.apache.tika.parser.geopkg; import java.io.IOException; import java.io.InputStream; import java.util.Collections; +import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import 
org.apache.tika.config.Initializable; +import org.apache.tika.config.Field; import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; import org.apache.tika.parser.sqlite3.SQLite3Parser; /** @@ -52,10 +52,13 @@ public class GeoPkgParser extends SQLite3Parser { private static final Set SUPPORTED_TYPES
(tika) 01/01: TIKA-4198 -- add parser for geopkg
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4198 in repository https://gitbox.apache.org/repos/asf/tika.git commit 32eb5aa9589f8365ba858aa6286e80bb8e54473c Author: tallison AuthorDate: Fri Feb 16 09:50:59 2024 -0500 TIKA-4198 -- add parser for geopkg --- .../apache/tika/parser/geopkg/GeoPkgDBParser.java | 64 +++ .../apache/tika/parser/geopkg/GeoPkgParser.java| 95 ++ .../GeoPkgTableReader.java}| 55 ++--- .../tika/parser/sqlite3/SQLite3DBParser.java | 2 +- .../tika/parser/sqlite3/SQLite3TableReader.java| 2 +- .../services/org.apache.tika.parser.Parser | 1 + 6 files changed, 186 insertions(+), 33 deletions(-) diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java new file mode 100644 index 0..5dc0f9ff2 --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.geopkg; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.sqlite.SQLiteConfig; + +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.jdbc.AbstractDBParser; +import org.apache.tika.parser.jdbc.JDBCTableReader; +import org.apache.tika.parser.sqlite3.SQLite3DBParser; + +/** + * This is the implementation of the db parser for SQLite. + * + * This parser is internal only; it should not be registered in the services + * file or configured in the TikaConfig xml file. 
+ */ +class GeoPkgDBParser extends SQLite3DBParser { + +@Override +public JDBCTableReader getTableReader(Connection connection, String tableName, + ParseContext context) { +return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context)); +} + +@Override +protected JDBCTableReader getTableReader(Connection connection, String tableName, + EmbeddedDocumentUtil embeddedDocumentUtil) { +return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil); +} +} diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java new file mode 100644 index 0..6aae7cb04 --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
(tika) branch TIKA-4198 created (now 32eb5aa95)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4198 in repository https://gitbox.apache.org/repos/asf/tika.git at 32eb5aa95 TIKA-4198 -- add parser for geopkg This branch includes the following new commits: new 32eb5aa95 TIKA-4198 -- add parser for geopkg The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) branch TIKA-4191 deleted (was a91d79c4f)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4191 in repository https://gitbox.apache.org/repos/asf/tika.git was a91d79c4f TIKA-4191 -- add back commons-cli The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4191 -- reduce tika-core's scope to "provided" where possible (#1575)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new fb6ba1a33 TIKA-4191 -- reduce tika-core's scope to "provided" where possible (#1575) fb6ba1a33 is described below commit fb6ba1a33a225d91de1e2d162317ae629ee8c3ab Author: Tim Allison AuthorDate: Mon Feb 12 14:38:06 2024 -0500 TIKA-4191 -- reduce tika-core's scope to "provided" where possible (#1575) * TIKA-4191 -- change tika-core's scope to "provided" where possible --- CHANGES.txt | 2 ++ tika-app/pom.xml | 1 + tika-batch/pom.xml | 3 +++ tika-eval/tika-eval-app/pom.xml | 5 + tika-eval/tika-eval-core/pom.xml | 1 + tika-fuzzing/pom.xml | 1 + tika-java7/pom.xml | 1 + tika-server/tika-server-core/pom.xml | 6 +- tika-translate/pom.xml | 1 + tika-xmp/pom.xml | 1 + 10 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 163753e9b..187aa2f55 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -13,6 +13,8 @@ Release 3.0.0-BETA - 12/01/2023 * Removed xerces2 as a dependency (TIKA-4135). + * tika-core now has a scope of "provided" in most non-app modules (TIKA-4191). + * Tika will look for "custom-mimetypes.xml" directly on the classpath, NOT under "/org/apache/tika/mime/". (TIKA-4147). 
diff --git a/tika-app/pom.xml b/tika-app/pom.xml index b8bd673a8..88f1157a6 100644 --- a/tika-app/pom.xml +++ b/tika-app/pom.xml @@ -73,6 +73,7 @@ tika-emitter-fs ${project.version} + ${project.groupId} tika-async-cli diff --git a/tika-batch/pom.xml b/tika-batch/pom.xml index 1a7867578..85fea830e 100644 --- a/tika-batch/pom.xml +++ b/tika-batch/pom.xml @@ -39,16 +39,19 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId} tika-serialization ${project.version} + provided org.apache.commons commons-compress + provided commons-cli diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml index 80113700f..18671052c 100644 --- a/tika-eval/tika-eval-app/pom.xml +++ b/tika-eval/tika-eval-app/pom.xml @@ -30,6 +30,11 @@ + + org.apache.tika + tika-core + ${project.version} + org.apache.tika tika-eval-core diff --git a/tika-eval/tika-eval-core/pom.xml b/tika-eval/tika-eval-core/pom.xml index 0644e72e9..ac6470bee 100644 --- a/tika-eval/tika-eval-core/pom.xml +++ b/tika-eval/tika-eval-core/pom.xml @@ -34,6 +34,7 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId} diff --git a/tika-fuzzing/pom.xml b/tika-fuzzing/pom.xml index b3d801bef..2faa23ce7 100644 --- a/tika-fuzzing/pom.xml +++ b/tika-fuzzing/pom.xml @@ -37,6 +37,7 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId} diff --git a/tika-java7/pom.xml b/tika-java7/pom.xml index 4c209898b..969c4b4ee 100644 --- a/tika-java7/pom.xml +++ b/tika-java7/pom.xml @@ -91,6 +91,7 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId} diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml index 21bc0571f..0039cce4b 100644 --- a/tika-server/tika-server-core/pom.xml +++ b/tika-server/tika-server-core/pom.xml @@ -36,12 +36,16 @@ + + org.apache.tika + tika-core + ${project.version} + ${project.groupId} tika-translate ${project.version} - ${project.groupId} 
tika-langdetect-optimaize diff --git a/tika-translate/pom.xml b/tika-translate/pom.xml index 71c6d15c7..3d1129d27 100644 --- a/tika-translate/pom.xml +++ b/tika-translate/pom.xml @@ -39,6 +39,7 @@ org.apache.tika tika-core ${project.version} + provided org.apache.tika diff --git a/tika-xmp/pom.xml b/tika-xmp/pom.xml index 68561178e..5917b584b 100644 --- a/tika-xmp/pom.xml +++ b/tika-xmp/pom.xml @@ -83,6 +83,7 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId}
(tika) branch branch_2x updated: TIKA-4197 -- downgrade jackrabbit so that 2.x can still be built with Java 8
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/branch_2x by this push: new 06d27f473 TIKA-4197 -- downgrade jackrabbit so that 2.x can still be built with Java 8 06d27f473 is described below commit 06d27f4731b9970bb751d27e91ca070e17f4098f Author: tallison AuthorDate: Mon Feb 12 13:31:54 2024 -0500 TIKA-4197 -- downgrade jackrabbit so that 2.x can still be built with Java 8 --- tika-parent/pom.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 03b6c24a6..eb3ed8736 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -343,7 +343,8 @@ 62.2 1.4.0 -2.21.23 + +2.21.22 2.16.1 1.3.2 2.0
(tika) branch TIKA-4191 updated (6a7d55edc -> a91d79c4f)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4191 in repository https://gitbox.apache.org/repos/asf/tika.git from 6a7d55edc TIKA-4191 -- change tika-core's scope to "provided" where possible add a91d79c4f TIKA-4191 -- add back commons-cli No new revisions were added by this update. Summary of changes: tika-batch/pom.xml | 1 - 1 file changed, 1 deletion(-)
(tika) branch TIKA-4195 deleted (was 3add44416)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4195 in repository https://gitbox.apache.org/repos/asf/tika.git was 3add44416 TIKA-4195 -- jsoup parser conceals backoff to default encoding -- fix unit test The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4195 -- jsoup parser shouldn't conceal backoff to default encoding (#1591)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 455409bf8 TIKA-4195 -- jsoup parser shouldn't conceal backoff to default encoding (#1591) 455409bf8 is described below commit 455409bf80801152e7c855ddc994fedc32c4cfcf Author: Tim Allison AuthorDate: Mon Feb 12 13:11:47 2024 -0500 TIKA-4195 -- jsoup parser shouldn't conceal backoff to default encoding (#1591) * TIKA-4195 -- jsoup parser conceals backoff to default encoding --- .../org/apache/tika/detect/AutoDetectReader.java | 38 -- .../tika/detect/CompositeEncodingDetector.java | 7 .../apache/tika/metadata/TikaCoreProperties.java | 16 + .../apache/tika/parser/html/HtmlParserTest.java| 2 +- .../org/apache/tika/parser/txt/TXTParserTest.java | 2 ++ .../tika/parser/RecursiveParserWrapperTest.java| 5 +-- 6 files changed, 49 insertions(+), 21 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java index 5cb920aae..bd7d4f2a9 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java +++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java @@ -22,8 +22,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; -import java.util.Collections; -import java.util.List; import org.xml.sax.InputSource; @@ -31,6 +29,7 @@ import org.apache.tika.config.LoadErrorHandler; import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.utils.CharsetUtils; @@ -68,26 +67,27 @@ public class AutoDetectReader extends BufferedReader { /** * @param 
streamstream from which to read -- make sure that it supports mark! * @param metadata - * @param detectors + * @param detector * @param handler * @throws IOException * @throws TikaException */ private AutoDetectReader(InputStream stream, Metadata metadata, - List detectors, LoadErrorHandler handler) + EncodingDetector detector, LoadErrorHandler handler) throws IOException, TikaException { -this(stream, detect(stream, metadata, detectors, handler)); +this(stream, detect(stream, metadata, detector, handler)); } public AutoDetectReader(InputStream stream, Metadata metadata, EncodingDetector encodingDetector) throws IOException, TikaException { -this(getBuffered(stream), metadata, Collections.singletonList(encodingDetector), +this(getBuffered(stream), metadata, encodingDetector, DEFAULT_LOADER.getLoadErrorHandler()); } public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader) throws IOException, TikaException { -this(getBuffered(stream), metadata, loader.loadServiceProviders(EncodingDetector.class), +this(getBuffered(stream), metadata, +new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)), loader.getLoadErrorHandler()); } @@ -101,19 +101,17 @@ public class AutoDetectReader extends BufferedReader { } private static Charset detect(InputStream input, Metadata metadata, - List detectors, LoadErrorHandler handler) + EncodingDetector detector, LoadErrorHandler handler) throws IOException, TikaException { // Ask all given detectors for the character encoding -for (EncodingDetector detector : detectors) { -try { -Charset charset = detector.detect(input, metadata); -if (charset != null) { -return charset; -} -} catch (NoClassDefFoundError e) { -// TIKA-1041: Detector dependencies not present. 
-handler.handleLoadError(detector.getClass().getName(), e); +try { +Charset charset = detector.detect(input, metadata); +if (charset != null) { +return charset; } +} catch (NoClassDefFoundError e) { +// TIKA-1041: Detector dependencies not present. +handler.handleLoadError(detector.getClass().getName(), e); } // Try determining the encoding based on hints in document metadata @@ -122,7 +120,11 @@ public class AutoDetectReader extends BufferedReader { String charset
(tika) branch branch_2x updated: [TIKA-4194] Fix for unrecognized pkcs12 keystores (#1589)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/branch_2x by this push: new 62fd50050 [TIKA-4194] Fix for unrecognized pkcs12 keystores (#1589) 62fd50050 is described below commit 62fd50050e7b06cceae68a7e7241919a5cd1d7f8 Author: Lonzak AuthorDate: Mon Feb 12 18:52:54 2024 +0100 [TIKA-4194] Fix for unrecognized pkcs12 keystores (#1589) (cherry picked from commit c2acd713bb31b88419ebc70dd31c4bfb23bd390f) --- tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index b76adebd1..7a6d660e5 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -5147,6 +5147,8 @@ +
(tika) branch main updated: [TIKA-4194] Fix for unrecognized pkcs12 keystores (#1589)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new c2acd713b [TIKA-4194] Fix for unrecognized pkcs12 keystores (#1589) c2acd713b is described below commit c2acd713bb31b88419ebc70dd31c4bfb23bd390f Author: Lonzak AuthorDate: Mon Feb 12 18:52:54 2024 +0100 [TIKA-4194] Fix for unrecognized pkcs12 keystores (#1589) --- tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 2930fa720..675ba1180 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -5253,6 +5253,8 @@ +
(tika) branch TIKA-4195 updated (748073437 -> 3add44416)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4195 in repository https://gitbox.apache.org/repos/asf/tika.git from 748073437 TIKA-4195 -- jsoup parser conceals backoff to default encoding add 3add44416 TIKA-4195 -- jsoup parser conceals backoff to default encoding -- fix unit test No new revisions were added by this update. Summary of changes: .../test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
(tika) branch TIKA-4196 deleted (was b6d4b41a3)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4196 in repository https://gitbox.apache.org/repos/asf/tika.git was b6d4b41a3 TIKA-4196 -- add a bom EncodingDetector The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4196 -- add a bom EncodingDetector (#1590)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 7c758c31e TIKA-4196 -- add a bom EncodingDetector (#1590) 7c758c31e is described below commit 7c758c31e6e3f52b4c5f8ad2ac8169dc0f8b310a Author: Tim Allison AuthorDate: Mon Feb 12 12:34:06 2024 -0500 TIKA-4196 -- add a bom EncodingDetector (#1590) --- .../org/apache/tika/parser/txt/BOMDetector.java| 93 ++ .../apache/tika/parser/txt/BOMDetectorTest.java| 91 + 2 files changed, 184 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java new file mode 100644 index 0..c96bfda5d --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.txt; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; + +import org.apache.commons.io.ByteOrderMark; +import org.apache.commons.io.IOUtils; + +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.metadata.Metadata; + +public class BOMDetector implements EncodingDetector { + +private static final ByteOrderMark[] BOMS = +//order matters -- have to try the 32 before the 16 +new ByteOrderMark[] { +ByteOrderMark.UTF_8, +ByteOrderMark.UTF_32BE, +ByteOrderMark.UTF_32LE, +ByteOrderMark.UTF_16BE, +ByteOrderMark.UTF_16LE +}; +private static final Charset[] CHARSETS = new Charset[BOMS.length]; + +private static final int MIN_BYTES = 2; +private static final int MAX_BYTES = 4; + +static { +for (int i = 0; i < BOMS.length; i++) { +try { +CHARSETS[i] = Charset.forName(BOMS[i].getCharsetName()); +} catch (UnsupportedCharsetException e) { +//log it +} +} +} +@Override +public Charset detect(InputStream input, Metadata metadata) throws IOException { +input.mark(MAX_BYTES); +byte[] bytes = new byte[MAX_BYTES]; +try { +int numRead = IOUtils.read(input, bytes); +if (numRead < MIN_BYTES) { +return null; +} else if (numRead < MAX_BYTES) { +//s +byte[] tmpBytes = new byte[numRead]; +System.arraycopy(bytes, 0, tmpBytes, 0, numRead); +bytes = tmpBytes; +} +} finally { +input.reset(); +} +for (int i = 0; i < BOMS.length; i++) { +ByteOrderMark bom = BOMS[i]; +if (startsWith(bom, bytes)) { +return CHARSETS[i]; +} +} +return null; +} + +private boolean startsWith(ByteOrderMark bom, byte[] bytes) { +byte[] bomBytes = bom.getBytes(); +if (bytes.length < bomBytes.length) { +return false; +} +for (int i = 0; i < bomBytes.length; i++) { +if (bomBytes[i] != bytes[i]) { +return false; +} +} +return true; +} +} diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java new file mode 100644 index 0..b008607dc --- /dev/null +++ b/tika-parsers/tika-parsers-stand
(tika) 01/01: TIKA-4195 -- jsoup parser conceals backoff to default encoding
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4195 in repository https://gitbox.apache.org/repos/asf/tika.git commit 7480734379928d7d32a13aa0b56b9b00183a7773 Author: tallison AuthorDate: Mon Feb 12 12:33:13 2024 -0500 TIKA-4195 -- jsoup parser conceals backoff to default encoding --- .../org/apache/tika/detect/AutoDetectReader.java | 38 -- .../tika/detect/CompositeEncodingDetector.java | 7 .../apache/tika/metadata/TikaCoreProperties.java | 16 + .../apache/tika/parser/html/HtmlParserTest.java| 2 +- .../org/apache/tika/parser/txt/TXTParserTest.java | 2 ++ 5 files changed, 46 insertions(+), 19 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java index 5cb920aae..bd7d4f2a9 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java +++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java @@ -22,8 +22,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; -import java.util.Collections; -import java.util.List; import org.xml.sax.InputSource; @@ -31,6 +29,7 @@ import org.apache.tika.config.LoadErrorHandler; import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.utils.CharsetUtils; @@ -68,26 +67,27 @@ public class AutoDetectReader extends BufferedReader { /** * @param streamstream from which to read -- make sure that it supports mark! 
* @param metadata - * @param detectors + * @param detector * @param handler * @throws IOException * @throws TikaException */ private AutoDetectReader(InputStream stream, Metadata metadata, - List detectors, LoadErrorHandler handler) + EncodingDetector detector, LoadErrorHandler handler) throws IOException, TikaException { -this(stream, detect(stream, metadata, detectors, handler)); +this(stream, detect(stream, metadata, detector, handler)); } public AutoDetectReader(InputStream stream, Metadata metadata, EncodingDetector encodingDetector) throws IOException, TikaException { -this(getBuffered(stream), metadata, Collections.singletonList(encodingDetector), +this(getBuffered(stream), metadata, encodingDetector, DEFAULT_LOADER.getLoadErrorHandler()); } public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader) throws IOException, TikaException { -this(getBuffered(stream), metadata, loader.loadServiceProviders(EncodingDetector.class), +this(getBuffered(stream), metadata, +new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)), loader.getLoadErrorHandler()); } @@ -101,19 +101,17 @@ public class AutoDetectReader extends BufferedReader { } private static Charset detect(InputStream input, Metadata metadata, - List detectors, LoadErrorHandler handler) + EncodingDetector detector, LoadErrorHandler handler) throws IOException, TikaException { // Ask all given detectors for the character encoding -for (EncodingDetector detector : detectors) { -try { -Charset charset = detector.detect(input, metadata); -if (charset != null) { -return charset; -} -} catch (NoClassDefFoundError e) { -// TIKA-1041: Detector dependencies not present. -handler.handleLoadError(detector.getClass().getName(), e); +try { +Charset charset = detector.detect(input, metadata); +if (charset != null) { +return charset; } +} catch (NoClassDefFoundError e) { +// TIKA-1041: Detector dependencies not present. 
+handler.handleLoadError(detector.getClass().getName(), e); } // Try determining the encoding based on hints in document metadata @@ -122,7 +120,11 @@ public class AutoDetectReader extends BufferedReader { String charset = type.getParameters().get("charset"); if (charset != null) { try { -return CharsetUtils.forName(charset); +Charset cs = CharsetUtils.forName(charset); +metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name()); +me
(tika) branch TIKA-4195 created (now 748073437)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4195 in repository https://gitbox.apache.org/repos/asf/tika.git at 748073437 TIKA-4195 -- jsoup parser conceals backoff to default encoding This branch includes the following new commits: new 748073437 TIKA-4195 -- jsoup parser conceals backoff to default encoding The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) branch TIKA-4196 created (now b6d4b41a3)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4196 in repository https://gitbox.apache.org/repos/asf/tika.git at b6d4b41a3 TIKA-4196 -- add a bom EncodingDetector This branch includes the following new commits: new b6d4b41a3 TIKA-4196 -- add a bom EncodingDetector The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4196 -- add a bom EncodingDetector
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4196 in repository https://gitbox.apache.org/repos/asf/tika.git commit b6d4b41a32ea5032b99e655878e57b2ce616be22 Author: tallison AuthorDate: Mon Feb 12 12:06:44 2024 -0500 TIKA-4196 -- add a bom EncodingDetector --- .../org/apache/tika/parser/txt/BOMDetector.java| 93 ++ .../apache/tika/parser/txt/BOMDetectorTest.java| 91 + 2 files changed, 184 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java new file mode 100644 index 0..c96bfda5d --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.txt; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; + +import org.apache.commons.io.ByteOrderMark; +import org.apache.commons.io.IOUtils; + +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.metadata.Metadata; + +public class BOMDetector implements EncodingDetector { + +private static final ByteOrderMark[] BOMS = +//order matters -- have to try the 32 before the 16 +new ByteOrderMark[] { +ByteOrderMark.UTF_8, +ByteOrderMark.UTF_32BE, +ByteOrderMark.UTF_32LE, +ByteOrderMark.UTF_16BE, +ByteOrderMark.UTF_16LE +}; +private static final Charset[] CHARSETS = new Charset[BOMS.length]; + +private static final int MIN_BYTES = 2; +private static final int MAX_BYTES = 4; + +static { +for (int i = 0; i < BOMS.length; i++) { +try { +CHARSETS[i] = Charset.forName(BOMS[i].getCharsetName()); +} catch (UnsupportedCharsetException e) { +//log it +} +} +} +@Override +public Charset detect(InputStream input, Metadata metadata) throws IOException { +input.mark(MAX_BYTES); +byte[] bytes = new byte[MAX_BYTES]; +try { +int numRead = IOUtils.read(input, bytes); +if (numRead < MIN_BYTES) { +return null; +} else if (numRead < MAX_BYTES) { +//s +byte[] tmpBytes = new byte[numRead]; +System.arraycopy(bytes, 0, tmpBytes, 0, numRead); +bytes = tmpBytes; +} +} finally { +input.reset(); +} +for (int i = 0; i < BOMS.length; i++) { +ByteOrderMark bom = BOMS[i]; +if (startsWith(bom, bytes)) { +return CHARSETS[i]; +} +} +return null; +} + +private boolean startsWith(ByteOrderMark bom, byte[] bytes) { +byte[] bomBytes = bom.getBytes(); +if (bytes.length < bomBytes.length) { +return false; +} +for (int i = 0; i < bomBytes.length; i++) { +if (bomBytes[i] != bytes[i]) { +return false; +} +} +return true; +} +} diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java new file mode 100644 index 0..b008607dc --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Softw
(tika) branch TIKA-4188 deleted (was f15c00962)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4188 in repository https://gitbox.apache.org/repos/asf/tika.git was f15c00962 TIKA-4188 -- upgrade jwarc and add unit test The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4188 (#1587)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 7d48d00ac TIKA-4188 (#1587) 7d48d00ac is described below commit 7d48d00ac1febfb1ac70e4887268b28fb4951b78 Author: Tim Allison AuthorDate: Fri Feb 9 10:43:40 2024 -0500 TIKA-4188 (#1587) * TIKA-4188 -- add parsing for arc files --- .../detect/gzip/GZipSpecializationDetector.java| 4 ++ .../org/apache/tika/parser/warc/WARCParser.java| 14 -- .../apache/tika/parser/warc/WARCParserTest.java| 31 - .../test/resources/test-documents/example.arc.gz | Bin 0 -> 1027 bytes .../src/test/resources/test-documents/testARC.arc | 50 + 5 files changed, 94 insertions(+), 5 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java index e3d743ad3..b87115b3b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java @@ -38,6 +38,8 @@ public class GZipSpecializationDetector implements Detector { public static MediaType GZ = MediaType.application("gzip"); public static MediaType WARC_GZ = MediaType.application("warc+gz"); +public static MediaType ARC_GZ = MediaType.application("arc+gz"); + @Override public MediaType detect(InputStream input, Metadata metadata) throws IOException { if (input == null) { @@ -84,6 +86,8 @@ public class GZipSpecializationDetector implements Detector { String s 
= new String(bytes.toByteArray(), StandardCharsets.UTF_8); if (s.startsWith("WARC/")) { return WARC_GZ; +} else if (s.startsWith("filedesc://")) { +return ARC_GZ; } return GZ; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java index 2c61cae91..ad4894b54 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java @@ -49,11 +49,16 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.StringUtils; +/** + * This uses jwarc to parse warc files and arc files + */ public class WARCParser implements Parser { private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<>(Arrays.asList(MediaType.application("warc"), -MediaType.application("warc+gz"; +MediaType.application("warc+gz"), +MediaType.application("x-internet-archive"), +MediaType.application("arc+gz"; public static String WARC_PREFIX = "warc:"; public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:"; @@ -130,9 +135,10 @@ public class WARCParser implements Parser { setNotNull(WARC.WARC_PAYLOAD_CONTENT_TYPE, warcResponse.payloadType(), metadata); processWarcMetadata(warcResponse, metadata); processHttpResponseMetadata(warcResponse.http(), metadata); - -String id = warcResponse.id().toString(); -metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id); +if (warcResponse.warcinfoID().isPresent()) { +String id = warcResponse.id().toString(); +metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id); +} 
WarcPayload payload = optionalPayload.get(); metadata.set(WARC.WARC_RECORD_CONTENT_TYPE, payload.type().toString()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size())); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standa
(tika) branch TIKA-4188 updated (6ac12b6c8 -> f15c00962)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4188 in repository https://gitbox.apache.org/repos/asf/tika.git from 6ac12b6c8 TIKA-4188 -- upgrade jwarc and add unit test add f15c00962 TIKA-4188 -- upgrade jwarc and add unit test No new revisions were added by this update. Summary of changes: .../src/test/java/org/apache/tika/parser/warc/WARCParserTest.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
(tika) branch TIKA-4188 updated: TIKA-4188 -- upgrade jwarc and add unit test
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4188 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4188 by this push: new 6ac12b6c8 TIKA-4188 -- upgrade jwarc and add unit test 6ac12b6c8 is described below commit 6ac12b6c8bff62269e4ecb5a9f6d00f8a7495d20 Author: tallison AuthorDate: Fri Feb 9 09:43:28 2024 -0500 TIKA-4188 -- upgrade jwarc and add unit test --- tika-parent/pom.xml| 2 +- .../apache/tika/parser/warc/WARCParserTest.java| 12 +++- .../src/test/resources/test-documents/example.arc | 69 -- 3 files changed, 12 insertions(+), 71 deletions(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index fe002e538..5f76e65c7 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -378,7 +378,7 @@ 5.10.2 2.4.0 7.5.5 -0.28.5 +0.28.6 3.6.1 0.9.3 2.22.1 diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java index bb7031550..8dc35bcf9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java @@ -18,7 +18,6 @@ package org.apache.tika.parser.warc; import static org.junit.jupiter.api.Assertions.assertEquals; -import java.io.File; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -81,4 +80,15 @@ public class WARCParserTest extends TikaTest { assertEquals("http://www.uq.edu.au/;, metadataList.get(1).get("warc:http:Location")); } + +@Test +public void testExampleARC() throws 
Exception { +//test file from https://github.com/webrecorder/warcio/blob/master/test/data/example.arc.gz +List metadataList = getRecursiveMetadata("example.arc.gz", +BasicContentHandlerFactory.HANDLER_TYPE.TEXT); +assertEquals(2, metadataList.size()); +assertEquals("application/arc+gz", metadataList.get(0).get(Metadata.CONTENT_TYPE)); +assertContains("This domain is established", +metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); +} } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc deleted file mode 100644 index 0d2af2bd2..0 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc +++ /dev/null @@ -1,69 +0,0 @@ -filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75 -1 0 LiveWeb Capture -URL IP-address Archive-date Content-type Archive-length - -http://example.com/ 93.184.216.119 20140216050221 text/html 1591 -HTTP/1.1 200 OK -Accept-Ranges: bytes -Cache-Control: max-age=604800 -Content-Type: text/html -Date: Sun, 16 Feb 2014 05:02:20 GMT -Etag: "359670651" -Expires: Sun, 23 Feb 2014 05:02:20 GMT -Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT -Server: ECS (sjc/4FCE) -X-Cache: HIT -x-ec-custom-error: 1 -Content-Length: 1270 - - - - -Example Domain - - - - - -body { -background-color: #f0f0f2; -margin: 0; -padding: 0; -font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; - -} -div { -width: 600px; -margin: 5em auto; -padding: 50px; -background-color: #fff; -border-radius: 1em; -} -a:link, a:visited { -color: #38488f; -text-decoration: none; -} -@media (max-width: 700px) { -body { -background-color: #fff; -} -div { -width: auto; -margin: 0 auto; -border-radius: 0; -padding: 1em; -} -} - - 
- - - -Example Domain -This domain is established to be used for illustrative examples in documents. You may use this -domain in examples without prior coordination or asking for permission. -http://www.iana.org/domains/example;>More information... - - - -
(tika) branch TIKA-4193 deleted (was d07fb16b1)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4193 in repository https://gitbox.apache.org/repos/asf/tika.git was d07fb16b1 TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter (#1582)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 16e1bc9c8 TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter (#1582) 16e1bc9c8 is described below commit 16e1bc9c8e4f5e253fc519a477da92410730d060 Author: Tim Allison AuthorDate: Thu Feb 8 15:05:02 2024 -0500 TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter (#1582) --- .../org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java| 4 .../apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java| 1 + 2 files changed, 5 insertions(+) diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java index 0ac65d240..811958af4 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java @@ -48,6 +48,9 @@ public class TikaEvalMetadataFilter extends MetadataFilter { public static Property NUM_ALPHA_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numAlphaTokens"); +public static Property NUM_COMMON_TOKENS = +Property.externalInteger(TIKA_EVAL_NS + "numCommonTokens"); + public static Property NUM_UNIQUE_ALPHA_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numUniqueAlphaTokens"); @@ -90,6 +93,7 @@ public class TikaEvalMetadataFilter extends MetadataFilter { CommonTokenResult commonTokenResult = (CommonTokenResult) results.get(CommonTokens.class); metadata.set(NUM_ALPHA_TOKENS, commonTokenResult.getAlphabeticTokens()); metadata.set(NUM_UNIQUE_ALPHA_TOKENS, commonTokenResult.getUniqueAlphabeticTokens()); +metadata.set(NUM_COMMON_TOKENS, commonTokenResult.getCommonTokens()); if 
(commonTokenResult.getAlphabeticTokens() > 0) { metadata.set(OUT_OF_VOCABULARY, commonTokenResult.getOOV()); } else { diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java index 1961698b4..f1fd21c21 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java @@ -42,6 +42,7 @@ public class TikaEvalMetadataFilterTest { assertEquals(11, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS)); assertEquals(10, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_ALPHA_TOKENS)); assertEquals(9, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_ALPHA_TOKENS)); +assertEquals(9, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_COMMON_TOKENS)); assertEquals(0.0999,
(tika) branch TIKA-4193 created (now d07fb16b1)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4193 in repository https://gitbox.apache.org/repos/asf/tika.git at d07fb16b1 TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter This branch includes the following new commits: new d07fb16b1 TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4193 in repository https://gitbox.apache.org/repos/asf/tika.git commit d07fb16b132294ced01a9ce64ae7f8263149f3d8 Author: tallison AuthorDate: Thu Feb 8 14:38:30 2024 -0500 TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter --- .../org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java| 4 .../apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java| 1 + 2 files changed, 5 insertions(+) diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java index 0ac65d240..811958af4 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java @@ -48,6 +48,9 @@ public class TikaEvalMetadataFilter extends MetadataFilter { public static Property NUM_ALPHA_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numAlphaTokens"); +public static Property NUM_COMMON_TOKENS = +Property.externalInteger(TIKA_EVAL_NS + "numCommonTokens"); + public static Property NUM_UNIQUE_ALPHA_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numUniqueAlphaTokens"); @@ -90,6 +93,7 @@ public class TikaEvalMetadataFilter extends MetadataFilter { CommonTokenResult commonTokenResult = (CommonTokenResult) results.get(CommonTokens.class); metadata.set(NUM_ALPHA_TOKENS, commonTokenResult.getAlphabeticTokens()); metadata.set(NUM_UNIQUE_ALPHA_TOKENS, commonTokenResult.getUniqueAlphabeticTokens()); +metadata.set(NUM_COMMON_TOKENS, commonTokenResult.getCommonTokens()); if (commonTokenResult.getAlphabeticTokens() > 0) { metadata.set(OUT_OF_VOCABULARY, commonTokenResult.getOOV()); } else { diff --git 
a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java index 1961698b4..f1fd21c21 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java @@ -42,6 +42,7 @@ public class TikaEvalMetadataFilterTest { assertEquals(11, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS)); assertEquals(10, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_ALPHA_TOKENS)); assertEquals(9, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_ALPHA_TOKENS)); +assertEquals(9, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_COMMON_TOKENS)); assertEquals(0.0999,
(tika) branch TIKA-4188 updated: TIKA-4188 -- WIP -- initial steps towards parsing arc files.
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4188 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4188 by this push: new 0d584aebc TIKA-4188 -- WIP -- initial steps towards parsing arc files. 0d584aebc is described below commit 0d584aebc4694643d2d3c62d49e8a4ccd8d4e6c9 Author: tallison AuthorDate: Wed Feb 7 11:07:49 2024 -0500 TIKA-4188 -- WIP -- initial steps towards parsing arc files. --- .../detect/gzip/GZipSpecializationDetector.java| 4 ++ .../org/apache/tika/parser/warc/WARCParser.java| 11 ++-- .../apache/tika/parser/warc/WARCParserTest.java| 10 ++- .../src/test/resources/test-documents/example.arc | 69 + .../test/resources/test-documents/example.arc.gz | Bin 0 -> 1027 bytes .../src/test/resources/test-documents/testARC.arc | 10 ++- 6 files changed, 97 insertions(+), 7 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java index e3d743ad3..b87115b3b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java @@ -38,6 +38,8 @@ public class GZipSpecializationDetector implements Detector { public static MediaType GZ = MediaType.application("gzip"); public static MediaType WARC_GZ = MediaType.application("warc+gz"); +public static MediaType ARC_GZ = MediaType.application("arc+gz"); + @Override public MediaType detect(InputStream input, Metadata metadata) throws 
IOException { if (input == null) { @@ -84,6 +86,8 @@ public class GZipSpecializationDetector implements Detector { String s = new String(bytes.toByteArray(), StandardCharsets.UTF_8); if (s.startsWith("WARC/")) { return WARC_GZ; +} else if (s.startsWith("filedesc://")) { +return ARC_GZ; } return GZ; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java index baf8d4a8d..ad4894b54 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java @@ -56,7 +56,9 @@ public class WARCParser implements Parser { private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<>(Arrays.asList(MediaType.application("warc"), -MediaType.application("warc+gz"), MediaType.application("x-internet-archive"; +MediaType.application("warc+gz"), +MediaType.application("x-internet-archive"), +MediaType.application("arc+gz"; public static String WARC_PREFIX = "warc:"; public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:"; @@ -133,9 +135,10 @@ public class WARCParser implements Parser { setNotNull(WARC.WARC_PAYLOAD_CONTENT_TYPE, warcResponse.payloadType(), metadata); processWarcMetadata(warcResponse, metadata); processHttpResponseMetadata(warcResponse.http(), metadata); - -String id = warcResponse.id().toString(); -metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id); +if (warcResponse.warcinfoID().isPresent()) { +String id = warcResponse.id().toString(); +metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id); +} WarcPayload payload = 
optionalPayload.get(); metadata.set(WARC.WARC_RECORD_CONTENT_TYPE, payload.type().toString()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size())); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/ja
(tika) branch main updated: TIKA-4189 -- CallablePipesIterator should wait for timeoutMillis when trying to offer the final semaphore (#1577)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new d86b7d6a0 TIKA-4189 -- CallablePipesIterator should wait for timeoutMillis when trying to offer the final semaphore (#1577) d86b7d6a0 is described below commit d86b7d6a0e88d9efffa56f2409fc33d088da12c0 Author: Tim Allison AuthorDate: Tue Feb 6 16:11:57 2024 -0500 TIKA-4189 -- CallablePipesIterator should wait for timeoutMillis when trying to offer the final semaphore (#1577) --- .../org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java index 6bde42ba0..a60784f0c 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java @@ -97,9 +97,11 @@ public class CallablePipesIterator implements Callable { added++; } for (int i = 0; i < numConsumers; i++) { -boolean offered = queue.offer(PipesIterator.COMPLETED_SEMAPHORE); +boolean offered = queue.offer(PipesIterator.COMPLETED_SEMAPHORE, timeoutMillis, +TimeUnit.MILLISECONDS); if (!offered) { -throw new TimeoutException("timed out trying to offer tuple"); +throw new TimeoutException("timed out trying to offer the completed " + +"semaphore"); } } } else {
(tika) branch TIKA-4189 deleted (was c45caf438)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4189 in repository https://gitbox.apache.org/repos/asf/tika.git was c45caf438 TIKA-4189 -- CallablePipesIterator should wait for timeoutMillis when trying to offer the final semaphore The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch TIKA-4188 created (now 5bcb9c9ce)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4188 in repository https://gitbox.apache.org/repos/asf/tika.git at 5bcb9c9ce TIKA-4188 -- add parsing for arc files This branch includes the following new commits: new 5bcb9c9ce TIKA-4188 -- add parsing for arc files The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4188 -- add parsing for arc files
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4188 in repository https://gitbox.apache.org/repos/asf/tika.git commit 5bcb9c9ce5e83abed0bec138a24c2f4cd56a890f Author: tallison AuthorDate: Tue Feb 6 13:30:56 2024 -0500 TIKA-4188 -- add parsing for arc files --- .../org/apache/tika/parser/warc/WARCParser.java| 5 ++- .../apache/tika/parser/warc/WARCParserTest.java| 13 ++- .../src/test/resources/test-documents/testARC.arc | 42 ++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java index 2c61cae91..baf8d4a8d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java @@ -49,11 +49,14 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.StringUtils; +/** + * This uses jwarc to parse warc files and arc files + */ public class WARCParser implements Parser { private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<>(Arrays.asList(MediaType.application("warc"), -MediaType.application("warc+gz"; +MediaType.application("warc+gz"), MediaType.application("x-internet-archive"; public static String WARC_PREFIX = "warc:"; public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:"; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java index c92f8ec15..57cc65bf4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java @@ -31,7 +31,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory; public class WARCParserTest extends TikaTest { -// the cc.warc.gz and gzip_extra_sl.warc.gz files come +// the cc.warc.gz and gzip_extra_sl.warc.gz and the testARC.arc files come // from the jwarc unit tests. @Test @@ -64,4 +64,15 @@ public class WARCParserTest extends TikaTest { assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE)); assertEquals("application/warc+gz", gzMetadataList.get(0).get(Metadata.CONTENT_TYPE)); } + +@Test +public void testARC() throws Exception { +//test file comes from: +// https://github.com/iipc/jwarc/blob/master/test/org/netpreserve/jwarc/apitests/ArcTest.java + +List metadataList = getRecursiveMetadata("testARC.arc", +BasicContentHandlerFactory.HANDLER_TYPE.TEXT); +debug(metadataList); + +} } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc new file mode 100644 index 0..d2b4970be --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc @@ -0,0 +1,42 @@ +filedesc://example.arc 0.0.0.0 20050614070144 text/plain 1338 +1 1 InternetArchive +URL IP-address Archive-date Content-type 
Archive-length + +http://purl.org/dc/elements/1.1/; xmlns:dcterms="http://purl.org/dc/terms/; xmlns:arc="http://archive.org/arc/1.0/; xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance; xmlns="http://archive.org/arc/1.0/; xsi:schemaLocation="http://archive.org/arc/1.0/ http://www.archive.org/arc/1.0/arc.xsd;> +Heritrix 1.5.0-200506132127 http://crawler.archive.org +example.org +127.0.0.1 +CRAWL +Example crawl +Example +Example +Example +http://purl.org/dc/elements/1.1/; xsi:type="dcterms:W3CDTF">2005-06-14T06:37:49+00:00 +Mozilla/5.0 (compatible; heritrix/1.5.0-200506132127 +http://example.o
(tika) branch main updated: TIKA-4190 -- turn off autocommit (#1576)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new c25ff4c31 TIKA-4190 -- turn off autocommit (#1576) c25ff4c31 is described below commit c25ff4c313877ffba71936d5d680176d75520bf7 Author: Tim Allison AuthorDate: Tue Feb 6 13:11:36 2024 -0500 TIKA-4190 -- turn off autocommit (#1576) --- .../org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java| 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java index e72d6ba19..0e3f2cb04 100644 --- a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java +++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java @@ -304,7 +304,17 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close insertAll(d.getEmitKey().getEmitKey(), d.getMetadataList()); } } -insertStatement.executeBatch(); +if (LOGGER.isDebugEnabled()) { +long start = System.currentTimeMillis(); +insertStatement.executeBatch(); +connection.commit(); +LOGGER.debug("took {}ms to insert {} rows ", System.currentTimeMillis() - start, +emitData.size()); +} else { +insertStatement.executeBatch(); +connection.commit(); +} + } private void insertAll(String emitKey, List metadataList) throws SQLException { @@ -361,6 +371,7 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close if (connection != null) { try { +connection.commit(); connection.close(); } catch (SQLException e) { LOGGER.warn("exception closing connection", e); @@ -370,6 +381,7 @@ public class JDBCEmitter extends AbstractEmitter implements 
Initializable, Close private void createConnection() throws SQLException { connection = DriverManager.getConnection(connectionString); +connection.setAutoCommit(false); if (postConnectionString.isPresent()) { try (Statement st = connection.createStatement()) { st.execute(postConnectionString.get());
(tika) branch TIKA-4190 deleted (was d7561152d)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4190 in repository https://gitbox.apache.org/repos/asf/tika.git was d7561152d TIKA-4190 -- turn off autocommit The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) 01/01: TIKA-4191 -- change tika-core's scope to "provided" where possible
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4191 in repository https://gitbox.apache.org/repos/asf/tika.git commit 6a7d55edc636a5a5c13d40894cad82d59395844c Author: tallison AuthorDate: Tue Feb 6 12:45:22 2024 -0500 TIKA-4191 -- change tika-core's scope to "provided" where possible --- CHANGES.txt | 2 ++ tika-app/pom.xml | 1 + tika-batch/pom.xml | 4 tika-eval/tika-eval-app/pom.xml | 5 + tika-eval/tika-eval-core/pom.xml | 1 + tika-fuzzing/pom.xml | 1 + tika-java7/pom.xml | 1 + tika-server/tika-server-core/pom.xml | 6 +- tika-translate/pom.xml | 1 + tika-xmp/pom.xml | 1 + 10 files changed, 22 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 163753e9b..187aa2f55 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -13,6 +13,8 @@ Release 3.0.0-BETA - 12/01/2023 * Removed xerces2 as a dependency (TIKA-4135). + * tika-core now has a scope of "provided" in most non-app modules (TIKA-4191). + * Tika will look for "custom-mimetypes.xml" directly on the classpath, NOT under "/org/apache/tika/mime/". (TIKA-4147). 
diff --git a/tika-app/pom.xml b/tika-app/pom.xml index b8bd673a8..88f1157a6 100644 --- a/tika-app/pom.xml +++ b/tika-app/pom.xml @@ -73,6 +73,7 @@ tika-emitter-fs ${project.version} + ${project.groupId} tika-async-cli diff --git a/tika-batch/pom.xml b/tika-batch/pom.xml index 1a7867578..f0852a288 100644 --- a/tika-batch/pom.xml +++ b/tika-batch/pom.xml @@ -39,20 +39,24 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId} tika-serialization ${project.version} + provided org.apache.commons commons-compress + provided commons-cli commons-cli + provided diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml index 80113700f..18671052c 100644 --- a/tika-eval/tika-eval-app/pom.xml +++ b/tika-eval/tika-eval-app/pom.xml @@ -30,6 +30,11 @@ + + org.apache.tika + tika-core + ${project.version} + org.apache.tika tika-eval-core diff --git a/tika-eval/tika-eval-core/pom.xml b/tika-eval/tika-eval-core/pom.xml index 0644e72e9..ac6470bee 100644 --- a/tika-eval/tika-eval-core/pom.xml +++ b/tika-eval/tika-eval-core/pom.xml @@ -34,6 +34,7 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId} diff --git a/tika-fuzzing/pom.xml b/tika-fuzzing/pom.xml index b3d801bef..2faa23ce7 100644 --- a/tika-fuzzing/pom.xml +++ b/tika-fuzzing/pom.xml @@ -37,6 +37,7 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId} diff --git a/tika-java7/pom.xml b/tika-java7/pom.xml index 4c209898b..969c4b4ee 100644 --- a/tika-java7/pom.xml +++ b/tika-java7/pom.xml @@ -91,6 +91,7 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId} diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml index 21bc0571f..0039cce4b 100644 --- a/tika-server/tika-server-core/pom.xml +++ b/tika-server/tika-server-core/pom.xml @@ -36,12 +36,16 @@ + + org.apache.tika + tika-core + ${project.version} + ${project.groupId} tika-translate ${project.version} - ${project.groupId} 
tika-langdetect-optimaize diff --git a/tika-translate/pom.xml b/tika-translate/pom.xml index 71c6d15c7..3d1129d27 100644 --- a/tika-translate/pom.xml +++ b/tika-translate/pom.xml @@ -39,6 +39,7 @@ org.apache.tika tika-core ${project.version} + provided org.apache.tika diff --git a/tika-xmp/pom.xml b/tika-xmp/pom.xml index 68561178e..5917b584b 100644 --- a/tika-xmp/pom.xml +++ b/tika-xmp/pom.xml @@ -83,6 +83,7 @@ ${project.groupId} tika-core ${project.version} + provided ${project.groupId}
(tika) branch TIKA-4191 created (now 6a7d55edc)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4191 in repository https://gitbox.apache.org/repos/asf/tika.git at 6a7d55edc TIKA-4191 -- change tika-core's scope to "provided" where possible This branch includes the following new commits: new 6a7d55edc TIKA-4191 -- change tika-core's scope to "provided" where possible The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4190 -- turn off autocommit
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4190 in repository https://gitbox.apache.org/repos/asf/tika.git commit d7561152d40706f5cac128f8af6cf733c6520189 Author: tallison AuthorDate: Mon Feb 5 12:59:29 2024 -0500 TIKA-4190 -- turn off autocommit --- .../org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java| 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java index e72d6ba19..0e3f2cb04 100644 --- a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java +++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java @@ -304,7 +304,17 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close insertAll(d.getEmitKey().getEmitKey(), d.getMetadataList()); } } -insertStatement.executeBatch(); +if (LOGGER.isDebugEnabled()) { +long start = System.currentTimeMillis(); +insertStatement.executeBatch(); +connection.commit(); +LOGGER.debug("took {}ms to insert {} rows ", System.currentTimeMillis() - start, +emitData.size()); +} else { +insertStatement.executeBatch(); +connection.commit(); +} + } private void insertAll(String emitKey, List metadataList) throws SQLException { @@ -361,6 +371,7 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close if (connection != null) { try { +connection.commit(); connection.close(); } catch (SQLException e) { LOGGER.warn("exception closing connection", e); @@ -370,6 +381,7 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close private void createConnection() throws SQLException { connection = DriverManager.getConnection(connectionString); 
+connection.setAutoCommit(false); if (postConnectionString.isPresent()) { try (Statement st = connection.createStatement()) { st.execute(postConnectionString.get());
(tika) branch TIKA-4190 created (now d7561152d)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4190 in repository https://gitbox.apache.org/repos/asf/tika.git at d7561152d TIKA-4190 -- turn off autocommit This branch includes the following new commits: new d7561152d TIKA-4190 -- turn off autocommit The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4189 -- CallablePipesIterator should wait for timeoutMillis when trying to offer the final semaphore
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4189 in repository https://gitbox.apache.org/repos/asf/tika.git commit c45caf4381fde5d3e88f9760d81f1f5b0d142c73 Author: tallison AuthorDate: Mon Feb 5 12:55:32 2024 -0500 TIKA-4189 -- CallablePipesIterator should wait for timeoutMillis when trying to offer the final semaphore --- .../org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java index 6bde42ba0..a60784f0c 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java @@ -97,9 +97,11 @@ public class CallablePipesIterator implements Callable { added++; } for (int i = 0; i < numConsumers; i++) { -boolean offered = queue.offer(PipesIterator.COMPLETED_SEMAPHORE); +boolean offered = queue.offer(PipesIterator.COMPLETED_SEMAPHORE, timeoutMillis, +TimeUnit.MILLISECONDS); if (!offered) { -throw new TimeoutException("timed out trying to offer tuple"); +throw new TimeoutException("timed out trying to offer the completed " + +"semaphore"); } } } else {
(tika) branch TIKA-4189 created (now c45caf438)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4189 in repository https://gitbox.apache.org/repos/asf/tika.git at c45caf438 TIKA-4189 -- CallablePipesIterator should wait for timeoutMillis when trying to offer the final semaphore This branch includes the following new commits: new c45caf438 TIKA-4189 -- CallablePipesIterator should wait for timeoutMillis when trying to offer the final semaphore The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) branch main updated: TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction (#1567)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new d9289fd46 TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction (#1567) d9289fd46 is described below commit d9289fd46e9619c7900086eb6572040984a7754a Author: Tim Allison AuthorDate: Wed Jan 31 08:45:18 2024 -0500 TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction (#1567) --- CHANGES.txt| 2 + .../org/apache/tika/mime/tika-mimetypes.xml| 105 - .../tika/parser/sqlite3/SQLite3DBParser.java | 32 +++ .../apache/tika/parser/sqlite3/SQLite3Parser.java | 15 +++ .../tika/parser/sqlite3/SQLite3ParserTest.java | 4 + .../apache/tika/parser/jdbc/AbstractDBParser.java | 14 +++ 6 files changed, 171 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index f9ac540e6..163753e9b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -21,6 +21,8 @@ Release 3.0.0-BETA - 12/01/2023 Other Changes/Updates + * Improve detection of sqlite3-based file formats (TIKA-4187). + * Upgrade PDFBox to 3.0.1 (TIKA-3347) * Deprecated AbstractParser for removal in 4.x (TIKA-4132). 
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 54f7cc6f6..2930fa720 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -4858,11 +4858,114 @@ + - + + +https://www.geopackage.org/ + + + + + + + + + + + +https://www.geopackage.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_comment>Stata DTA Script DO diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java index f4c9d745e..947272a0a 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java @@ -27,6 +27,7 @@ import java.sql.SQLException; import java.sql.Statement; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.Set; import org.sqlite.SQLiteConfig; @@ -34,6 +35,7 @@ import org.sqlite.SQLiteConfig; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.jdbc.AbstractDBParser; @@ -48,6 +50,12 @@ import org.apache.tika.parser.jdbc.JDBCTableReader; class SQLite3DBParser extends AbstractDBParser { protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC"; + 
+protected static final Map METADATA_KEYS = Map.of( +SQLite3Parser.SQLITE_APPLICATION_ID, "select application_id from pragma_application_id", +SQLite3Parser.SQLITE_USER_VERSION, "select user_version from pragma_user_version" +); + //If the InputStream wasn't a TikaInputStream, copy to this tmp file Path tmpFile = null; @@ -144,4 +152,28 @@ class SQLite3DBParser extends AbstractDBParser { EmbeddedDocumentUtil embeddedDocumentUtil) { return new SQLite3TableReader(connection, tableName, embeddedDocumentUtil); } + +@Override +protected void extractMetadata(Connection connection, Metadata metadata) { +//TODO -- figure out how to get the version of sqlite3 that last modified this file and +// version-valid-for. +// version-valid-for is at offset 92, last modified by app version is at offset 96 -- +// not clear how to get this info via sql +//'file' extracts this
(tika) branch TIKA-4187 deleted (was 73694d21a)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4187 in repository https://gitbox.apache.org/repos/asf/tika.git was 73694d21a TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch TIKA-4187 created (now 73694d21a)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4187 in repository https://gitbox.apache.org/repos/asf/tika.git at 73694d21a TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction This branch includes the following new commits: new 73694d21a TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4187 in repository https://gitbox.apache.org/repos/asf/tika.git commit 73694d21ab19e7e1134ee4f2bf8b76e8c35387bf Author: tallison AuthorDate: Tue Jan 30 12:22:59 2024 -0500 TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction --- CHANGES.txt| 2 + .../org/apache/tika/mime/tika-mimetypes.xml| 105 - .../tika/parser/sqlite3/SQLite3DBParser.java | 32 +++ .../apache/tika/parser/sqlite3/SQLite3Parser.java | 15 +++ .../tika/parser/sqlite3/SQLite3ParserTest.java | 4 + .../apache/tika/parser/jdbc/AbstractDBParser.java | 14 +++ 6 files changed, 171 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index f9ac540e6..163753e9b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -21,6 +21,8 @@ Release 3.0.0-BETA - 12/01/2023 Other Changes/Updates + * Improve detection of sqlite3-based file formats (TIKA-4187). + * Upgrade PDFBox to 3.0.1 (TIKA-3347) * Deprecated AbstractParser for removal in 4.x (TIKA-4132). 
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 54f7cc6f6..2930fa720 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -4858,11 +4858,114 @@ + - + + +https://www.geopackage.org/ + + + + + + + + + + + +https://www.geopackage.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_comment>Stata DTA Script DO diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java index f4c9d745e..947272a0a 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java @@ -27,6 +27,7 @@ import java.sql.SQLException; import java.sql.Statement; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.Set; import org.sqlite.SQLiteConfig; @@ -34,6 +35,7 @@ import org.sqlite.SQLiteConfig; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.jdbc.AbstractDBParser; @@ -48,6 +50,12 @@ import org.apache.tika.parser.jdbc.JDBCTableReader; class SQLite3DBParser extends AbstractDBParser { protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC"; + 
+protected static final Map METADATA_KEYS = Map.of( +SQLite3Parser.SQLITE_APPLICATION_ID, "select application_id from pragma_application_id", +SQLite3Parser.SQLITE_USER_VERSION, "select user_version from pragma_user_version" +); + //If the InputStream wasn't a TikaInputStream, copy to this tmp file Path tmpFile = null; @@ -144,4 +152,28 @@ class SQLite3DBParser extends AbstractDBParser { EmbeddedDocumentUtil embeddedDocumentUtil) { return new SQLite3TableReader(connection, tableName, embeddedDocumentUtil); } + +@Override +protected void extractMetadata(Connection connection, Metadata metadata) { +//TODO -- figure out how to get the version of sqlite3 that last modified this file and +// version-valid-for. +// version-valid-for is at offset 92, last modified by app version is at offset 96 -- +// not clear how to get this info via sql +//'file' extracts this info; we should too :\ +//See: https://www.sqlite.org/fileformat.html +for (Map.Entry e : METADATA_KEYS.entrySet()) { +try (Statement st = connection.createStatement()) { +
(tika) branch main updated: Improve one-off debugging from literal file path
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new f99e8f9e4 Improve one-off debugging from literal file path f99e8f9e4 is described below commit f99e8f9e4130047ae20d67d862f029ebd29c1fd1 Author: tallison AuthorDate: Fri Jan 26 11:25:40 2024 -0500 Improve one-off debugging from literal file path --- tika-core/src/test/java/org/apache/tika/TikaTest.java | 4 1 file changed, 4 insertions(+) diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index c20229b59..a0a6377b8 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -328,6 +328,10 @@ public abstract class TikaTest { } } +protected List getRecursiveMetadataFromFullPath(String path) throws Exception { +return getRecursiveMetadata(Paths.get(path), true); +} + protected List getRecursiveMetadata(String filePath, boolean suppressException) throws Exception { return getRecursiveMetadata(filePath, new Metadata(), new ParseContext(),
(tika) branch TIKA-4184 deleted (was 88a8efaf7)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4184 in repository https://gitbox.apache.org/repos/asf/tika.git was 88a8efaf7 TIKA-4184 -- fix couple of assertNotNulls The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4184 -- fix couple of assertNotNulls (#1560)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 33ace7f36 TIKA-4184 -- fix couple of assertNotNulls (#1560) 33ace7f36 is described below commit 33ace7f36f9e065310fcdf653a52c7f9f767358c Author: Tim Allison AuthorDate: Thu Jan 25 07:43:50 2024 -0500 TIKA-4184 -- fix couple of assertNotNulls (#1560) --- .../test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java | 8 ++-- .../src/test/java/org/apache/tika/mime/MimeDetectionTest.java | 8 ++-- .../java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java | 8 ++-- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java index af7f56bb9..293f423d2 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java @@ -17,6 +17,7 @@ package org.apache.tika.detect; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -100,7 +101,7 @@ public class MimeDetectionWithNNTest { private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException { -assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in); +assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); } @@ -120,11 +121,6 @@ public class MimeDetectionWithNNTest { } } -private void assertNotNull(String string, InputStream in) { -// TODO Auto-generated method stub - -} - /** * Test for type detection of empty documents. 
*/ diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java index 690da4f29..434ff6c20 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java @@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_16BE; import static java.nio.charset.StandardCharsets.UTF_16LE; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.ByteArrayInputStream; @@ -162,7 +163,7 @@ public class MimeDetectionTest { private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException { -assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in); +assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); } @@ -182,11 +183,6 @@ public class MimeDetectionTest { } } -private void assertNotNull(String string, InputStream in) { -// TODO Auto-generated method stub - -} - /** * Test for type detection of empty documents. 
* diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java index d9a65bf1b..c9d0073c2 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java @@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_16BE; import static java.nio.charset.StandardCharsets.UTF_16LE; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.ByteArrayInputStream; @@ -135,7 +136,7 @@ public class ProbabilisticMimeDetectionTest { private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException { -assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in); +assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); } @@ -155,11 +156,6 @@ public class ProbabilisticMimeDetectionTes
(tika) 01/01: TIKA-4184 -- fix couple of assertNotNulls
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4184 in repository https://gitbox.apache.org/repos/asf/tika.git commit 88a8efaf7926618387fd737474416b00440a7969 Author: tallison AuthorDate: Thu Jan 25 07:18:48 2024 -0500 TIKA-4184 -- fix couple of assertNotNulls --- .../test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java | 8 ++-- .../src/test/java/org/apache/tika/mime/MimeDetectionTest.java | 8 ++-- .../java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java | 8 ++-- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java index af7f56bb9..293f423d2 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java @@ -17,6 +17,7 @@ package org.apache.tika.detect; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -100,7 +101,7 @@ public class MimeDetectionWithNNTest { private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException { -assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in); +assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); } @@ -120,11 +121,6 @@ public class MimeDetectionWithNNTest { } } -private void assertNotNull(String string, InputStream in) { -// TODO Auto-generated method stub - -} - /** * Test for type detection of empty documents. 
*/ diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java index 690da4f29..434ff6c20 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java @@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_16BE; import static java.nio.charset.StandardCharsets.UTF_16LE; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.ByteArrayInputStream; @@ -162,7 +163,7 @@ public class MimeDetectionTest { private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException { -assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in); +assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); } @@ -182,11 +183,6 @@ public class MimeDetectionTest { } } -private void assertNotNull(String string, InputStream in) { -// TODO Auto-generated method stub - -} - /** * Test for type detection of empty documents. 
* diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java index d9a65bf1b..c9d0073c2 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java @@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_16BE; import static java.nio.charset.StandardCharsets.UTF_16LE; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.ByteArrayInputStream; @@ -135,7 +136,7 @@ public class ProbabilisticMimeDetectionTest { private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException { -assertNotNull("Test stream: [" + urlOrFileName + "] is null!", in); +assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); } @@ -155,11 +156,6 @@ public class ProbabilisticMimeDetectionTest { } } -private void assertNotNull(String string, InputStream in) { -// TODO Auto-generated method stub - -} - /** * Test for type detection of empty documents. *
(tika) branch TIKA-4184 created (now 88a8efaf7)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4184 in repository https://gitbox.apache.org/repos/asf/tika.git at 88a8efaf7 TIKA-4184 -- fix couple of assertNotNulls This branch includes the following new commits: new 88a8efaf7 TIKA-4184 -- fix couple of assertNotNulls The 1 revision listed above as "new" is entirely new to this repository and will be described in a separate email. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) branch main updated: [TIKA-4119]: Return media type "text/javascript" instead of "application/javascript" to follow RFC-9239 (#1556)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new ae737cd26 [TIKA-4119]: Return media type "text/javascript" instead of "application/javascript" to follow RFC-9239 (#1556) ae737cd26 is described below commit ae737cd2625b5e2659c20c27713785df8bfc1957 Author: Marcos Pereira AuthorDate: Wed Jan 24 10:56:58 2024 -0500 [TIKA-4119]: Return media type "text/javascript" instead of "application/javascript" to follow RFC-9239 (#1556) * [TIKA-4119]: Return media type "text/javascript" instead of "application/javascript" Following RFC 9239. This also adds support for ".msj" ( as documented in the RFC). --- CHANGES.txt| 3 ++ README.md | 6 +-- .../org/apache/tika/mime/tika-mimetypes.xml| 9 +++-- .../java/org/apache/tika/TikaDetectionTest.java| 3 +- .../org/apache/tika/mime/MimeTypesReaderTest.java | 43 ++ .../tika/mime/ProbabilisticMimeDetectionTest.java | 2 +- .../java/org/apache/tika/mime/TestMimeTypes.java | 12 +++--- 7 files changed, 48 insertions(+), 30 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 9953e46d3..f9ac540e6 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -16,6 +16,9 @@ Release 3.0.0-BETA - 12/01/2023 * Tika will look for "custom-mimetypes.xml" directly on the classpath, NOT under "/org/apache/tika/mime/". (TIKA-4147). + * Return media type "text/javascript" instead of "application/javascript" + to follow RFC-9239. (TIKA-4119). + Other Changes/Updates * Upgrade PDFBox to 3.0.1 (TIKA-3347) diff --git a/README.md b/README.md index 97d709656..88fe59cb8 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Tika jars can be fetched from Maven Central or your favourite Maven mirror. **Tika 1.X reached End of Life (EOL) on September 30, 2022.** -Tika is based on **Java 8** and uses the [Maven 3](https://maven.apache.org) build system. 
+Tika is based on **Java 11** and uses the [Maven 3](https://maven.apache.org) build system. **N.B.** [Docker](https://www.docker.com/products/personal) is used for tests in tika-integration-tests. As of Tika 2.5.1, if Docker is not installed, those tests are skipped. Docker is required for a successful build on earlier 2.x versions. @@ -50,7 +50,7 @@ Maven Dependencies Apache Tika provides *Bill of Material* (BOM) artifact to align Tika module versions and simplify version management. To avoid convergence errors in your own project, import this -bom or Tika's parent pom.xml in your dependencey management section. +bom or Tika's parent pom.xml in your dependency management section. If you use Apache Maven: @@ -170,7 +170,7 @@ Notification on all code changes are sent to the following mailing list: The mailing lists are open to anyone and publicly archived. You can subscribe the mailing lists by sending a message to -[LIST]-subscr...@tika.apache.org (for example user-subscribe@...). +[LIST]-subscr...@tika.apache.org (for example, user-subscribe@...). To unsubscribe, send a message to [LIST]-unsubscr...@tika.apache.org. For more instructions, send a message to [LIST]-h...@tika.apache.org. 
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index b76adebd1..54f7cc6f6 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -366,15 +366,18 @@ - + + - <_comment>JavaScript Source Code + + + - + diff --git a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java index 1cd0f40a2..215865886 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java @@ -56,7 +56,8 @@ public class TikaDetectionTest { assertEquals("application/java-archive", tika.detect("x.jar")); assertEquals("application/java-serialized-object", tika.detect("x.ser")); assertEquals("application/java-vm", tika.detect("x.class")); -assertEquals("application/javascript", tika.detect("x.js")); +assertEquals("text/javascript", tika.detect("x.js")); +assertEquals("text/javascript", tika.detect("x.mjs")); assertEquals("application/json", tika.de
(tika) branch branch_2x updated: TIKA-4179 -- downgrade zookeeper in age-recogniser so that 2.x can still be built with Java 8 (#1532)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/branch_2x by this push: new 1b9755416 TIKA-4179 -- downgrade zookeeper in age-recogniser so that 2.x can still be built with Java 8 (#1532) 1b9755416 is described below commit 1b97554166b59ecf302ca77c06b7dcf965530799 Author: Tim Allison AuthorDate: Mon Jan 8 16:47:19 2024 -0500 TIKA-4179 -- downgrade zookeeper in age-recogniser so that 2.x can still be built with Java 8 (#1532) --- tika-parsers/tika-parsers-ml/tika-age-recogniser/pom.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-ml/tika-age-recogniser/pom.xml b/tika-parsers/tika-parsers-ml/tika-age-recogniser/pom.xml index 0663d855d..88b4fad9b 100644 --- a/tika-parsers/tika-parsers-ml/tika-age-recogniser/pom.xml +++ b/tika-parsers/tika-parsers-ml/tika-age-recogniser/pom.xml @@ -97,7 +97,8 @@ org.apache.zookeeper zookeeper - 3.9.1 + + 3.7.2 io.netty
(tika) branch TIKA-4179 deleted (was 0e14c8cdb)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4179 in repository https://gitbox.apache.org/repos/asf/tika.git was 0e14c8cdb TIKA-4179 -- downgrade zookeeper in age-recogniser so that 2.x can still be built with Java 8 The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.