This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new 36a0dca43 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler (#1629) 36a0dca43 is described below commit 36a0dca435d38b123bc62567b328fc9e522ac956 Author: Tim Allison <talli...@apache.org> AuthorDate: Wed Feb 28 10:04:00 2024 -0500 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler (#1629) --- tika-eval/tika-eval-app/pom.xml | 2 -- .../java/org/apache/tika/eval/app/AbstractProfiler.java | 17 ++++++++++++++++- .../java/org/apache/tika/eval/app/ExtractProfiler.java | 4 ++++ .../src/main/java/org/apache/tika/eval/app/db/Cols.java | 3 +++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml index 18671052c..b93783f75 100644 --- a/tika-eval/tika-eval-app/pom.xml +++ b/tika-eval/tika-eval-app/pom.xml @@ -93,10 +93,8 @@ </createDependencyReducedPom> <artifactSet> <excludes> - <exclude>org.apache.tika:tika-core:jar:</exclude> <exclude>org.apache.tika:tika-serialization:jar:</exclude> <exclude>org.apache.tika:tika-langdetect-opennlp:jar:</exclude> - <exclude>commons-io:commons-io:jar:</exclude> <exclude>commons-codec:commons-codec:jar:</exclude> <exclude>org.apache.commons:commons-lang3:jar:</exclude> <exclude>org.apache.commons:commons-math3:jar:</exclude> diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java index 2397bbcab..0cd609d3b 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java @@ -71,6 +71,7 @@ import org.apache.tika.eval.core.util.EvalExceptionUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.PDF; import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; @@ -387,6 +388,10 @@ public abstract class AbstractProfiler extends FileResourceConsumer { if (nPages != null) { data.put(Cols.NUM_PAGES, Integer.toString(nPages)); } + Integer nOCRPages = m.getInt(PDF.OCR_PAGE_COUNT); + if (nOCRPages != null) { + data.put(Cols.NUM_OCR_PAGES, Integer.toString(nOCRPages)); + } //if the outer wrapper document if (i == 0) { @@ -395,10 +400,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer { data.put(Cols.EMBEDDED_DEPTH, "0"); } else { data.put(Cols.IS_EMBEDDED, TRUE); - data.put(Cols.FILE_NAME, getFileName(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))); + String embeddedFilePath = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); + if (! StringUtils.isBlank(embeddedFilePath)) { + data.put(Cols.FILE_NAME, getFileName(m.get(embeddedFilePath))); + data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath); + } if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) { data.put(Cols.EMBEDDED_DEPTH, m.get(TikaCoreProperties.EMBEDDED_DEPTH)); } + if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) { + data.put(Cols.ATTACHMENT_TYPE, m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } } String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME)); ext = (ext == null) ? "" : ext.toLowerCase(Locale.US); @@ -486,6 +498,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer { Integer.toString(commonTokenResult.getUniqueAlphabeticTokens())); data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens())); + double oov = commonTokenResult.getAlphabeticTokens() > 0 ? commonTokenResult.getOOV() : -1.0; + data.put(Cols.OOV, Double.toString(oov)); } TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class); if (tokenCounts != null) { @@ -498,6 +512,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { Double.toString((Double) textStats.get(TokenEntropy.class))); } + SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class); if (summStats != null) { data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum())); diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java index ad0ce0bac..4e7d45088 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java @@ -64,12 +64,15 @@ public class ExtractProfiler extends AbstractProfiler { new ColInfo(Cols.MD5, Types.CHAR, 32), new ColInfo(Cols.LENGTH, Types.BIGINT), new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN), new ColInfo(Cols.EMBEDDED_DEPTH, Types.INTEGER), + new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024), + new ColInfo(Cols.ATTACHMENT_TYPE, Types.VARCHAR, 32), new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12), new ColInfo(Cols.MIME_ID, Types.INTEGER), new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER), new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER), new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER), new ColInfo(Cols.NUM_PAGES, Types.INTEGER), + new ColInfo(Cols.NUM_OCR_PAGES, Types.INTEGER), new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN)); public static TableInfo EMBEDDED_FILE_PATH_TABLE = new TableInfo("emb_file_names", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), @@ -84,6 +87,7 @@ public class ExtractProfiler extends AbstractProfiler { new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER), new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER), new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER), + new ColInfo(Cols.OOV, Types.DOUBLE), new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024), new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12), new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT), diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java index b6f617ce6..35d70b430 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java @@ -26,6 +26,7 @@ public enum Cols { //profile table ID, LENGTH, FILE_NAME, FILE_EXTENSION, ELAPSED_TIME_MILLIS, NUM_METADATA_VALUES, IS_EMBEDDED, EMBEDDED_FILE_PATH, MIME_ID, TIKA_MIME_ID, FILE_MIME_ID, SHA256, MD5, NUM_ATTACHMENTS, + ATTACHMENT_TYPE, EMBEDDED_DEPTH, HAS_CONTENT, @@ -34,8 +35,10 @@ public enum Cols { NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens COMMON_TOKENS_LANG, //which language was used for the common tokens metric? NUM_UNIQUE_COMMON_TOKENS, NUM_COMMON_TOKENS, TOP_N_TOKENS, LANG_ID_1, LANG_ID_PROB_1, LANG_ID_2, + OOV, LANG_ID_PROB_2, TOKEN_ENTROPY_RATE, TOKEN_LENGTH_SUM, TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV, UNICODE_CHAR_BLOCKS, NUM_PAGES, //number of pages a document alleges it has + NUM_OCR_PAGES, CONTENT_TRUNCATED_AT_MAX_LEN, // was the string truncated at AbstractProfiler.MAX_STRING_LENGTH //content comparisons