This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4535 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 73b2583a5163498ee91116452f6f036f5c93e81e Author: tallison <[email protected]> AuthorDate: Thu Oct 30 14:11:13 2025 -0400 TIKA-4535 -- limit use of TikaConfig.getDefaultConfig to where we need the full config. --- .../java/org/apache/tika/io/FilenameUtils.java | 2 +- .../tika/eval/app/ExtractComparerRunner.java | 3 ++- .../apache/tika/eval/app/ExtractProfileRunner.java | 3 ++- .../org/apache/tika/eval/app/db/MimeBuffer.java | 15 ++++++------- .../org/apache/tika/eval/app/io/ExtractReader.java | 7 +++--- .../org/apache/tika/parser/isatab/ISATabUtils.java | 25 ++++++++++++---------- 6 files changed, 29 insertions(+), 26 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java index d4230d441..3cab9abb3 100644 --- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java @@ -31,7 +31,7 @@ import org.apache.tika.utils.StringUtils; public class FilenameUtils { - private static final MimeTypes MIME_TYPES = TikaConfig.getDefaultConfig().getMimeRepository(); + private static final MimeTypes MIME_TYPES = MimeTypes.getDefaultMimeTypes(); private static final Pattern PROTOCOL_PATTERN = Pattern.compile("[A-Za-z0-9]{1,10}://+"); /** * Reserved characters diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java index 8f86ab81e..1fde33370 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java @@ -55,6 +55,7 @@ import org.apache.tika.eval.app.io.DBWriter; import org.apache.tika.eval.app.io.ExtractReader; import org.apache.tika.eval.app.io.ExtractReaderException; import org.apache.tika.eval.app.io.IDBWriter; +import org.apache.tika.mime.MimeTypes; import org.apache.tika.pipes.core.FetchEmitTuple; import org.apache.tika.pipes.core.pipesiterator.CallablePipesIterator; import org.apache.tika.pipes.core.pipesiterator.PipesIterator; @@ -191,7 +192,7 @@ public class ExtractComparerRunner { jdbcUtil.createTables(builder.getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); //step 2. create mime buffer - return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), TikaConfig.getDefaultConfig()); + return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), MimeTypes.getDefaultMimeTypes()); } private static void USAGE() throws IOException { diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java index a73a2f579..92e7c240f 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java @@ -55,6 +55,7 @@ import org.apache.tika.eval.app.io.DBWriter; import org.apache.tika.eval.app.io.ExtractReader; import org.apache.tika.eval.app.io.ExtractReaderException; import org.apache.tika.eval.app.io.IDBWriter; +import org.apache.tika.mime.MimeTypes; import org.apache.tika.pipes.core.FetchEmitTuple; import org.apache.tika.pipes.core.pipesiterator.CallablePipesIterator; import org.apache.tika.pipes.core.pipesiterator.PipesIterator; @@ -185,7 +186,7 @@ public class ExtractProfileRunner { jdbcUtil.createTables(builder.getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); //step 2. create mime buffer - return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), TikaConfig.getDefaultConfig()); + return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), MimeTypes.getDefaultMimeTypes()); } private static void USAGE() throws IOException { diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java index 34cf18061..e3002ff3d 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java @@ -31,14 +31,14 @@ import org.apache.tika.mime.MimeTypes; public class MimeBuffer extends AbstractDBBuffer { private final PreparedStatement st; - private final TikaConfig config; + private final MimeTypes mimeTypes; private final Connection connection; - public MimeBuffer(Connection connection, TableInfo mimeTable, TikaConfig config) throws SQLException { + public MimeBuffer(Connection connection, TableInfo mimeTable, MimeTypes mimeTypes) throws SQLException { st = connection.prepareStatement( "insert into " + mimeTable.getName() + "( " + Cols.MIME_ID.name() + ", " + Cols.MIME_STRING.name() + ", " + Cols.FILE_EXTENSION.name() + ") values (?,?,?)"); - this.config = config; + this.mimeTypes = mimeTypes; this.connection = connection; } @@ -49,7 +49,7 @@ public class MimeBuffer extends AbstractDBBuffer { st.setInt(1, id); st.setString(2, value); try { - String ext = MimeUtil.getExtension(value, config); + String ext = MimeUtil.getExtension(value, mimeTypes); if (ext == null || ext.isEmpty()) { st.setNull(3, Types.VARCHAR); } else { @@ -92,13 +92,12 @@ public class MimeBuffer extends AbstractDBBuffer { * don't currently return anything for {@link MimeType#getExtension}; * * @param contentType string representing a content type, for example: "application/pdf" - * @param config config from which to get MimeRepository + * @param mimeTypes MimeRepository * @return extension or empty string * @throws MimeTypeException thrown if MimeTypes can't parse the contentType */ - public static String getExtension(String contentType, TikaConfig config) throws MimeTypeException { - MimeTypes types = config.getMimeRepository(); - MimeType mime = types.forName(contentType); + public static String getExtension(String contentType, MimeTypes mimeTypes) throws MimeTypeException { + MimeType mime = mimeTypes.forName(contentType); return getExtension(mime); } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java index bfab8b253..9acdd1c8a 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java @@ -41,6 +41,7 @@ import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeTypes; import org.apache.tika.sax.ToTextContentHandler; import org.apache.tika.sax.ToXMLContentHandler; import org.apache.tika.serialization.JsonMetadataList; @@ -52,7 +53,7 @@ public class ExtractReader { private final ALTER_METADATA_LIST alterMetadataList; private final long minExtractLength; private final long maxExtractLength; - private TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); + private final MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(); /** * Reads full extract, no modification of metadata list, no min or max extract length checking @@ -204,9 +205,7 @@ public class ExtractReader { //but better than nothing. m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileSuffixes.originalFileName); - MediaType mimeType = tikaConfig - .getMimeRepository() - .detect(null, m); + MediaType mimeType = mimeTypes.detect(null, m); if (mimeType != null) { m.set(Metadata.CONTENT_TYPE, mimeType.toString()); } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java index 507d2da4e..7577b3217 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java @@ -33,6 +33,8 @@ import org.xml.sax.SAXException; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.detect.DefaultEncodingDetector; +import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -79,12 +81,9 @@ public class ISATabUtils { throws IOException, TikaException, SAXException { TikaInputStream tis = TikaInputStream.get(stream); // Automatically detect the character encoding - TikaConfig tikaConfig = context.get(TikaConfig.class); - if (tikaConfig == null) { - tikaConfig = TikaConfig.getDefaultConfig(); - } + EncodingDetector encodingDetector = getEncodingDetector(context); try (AutoDetectReader reader = new AutoDetectReader(CloseShieldInputStream.wrap(tis), - metadata, tikaConfig.getEncodingDetector()); + metadata, encodingDetector); CSVParser csvParser = CSVParser.builder().setReader(reader).setFormat(CSVFormat.TDF).get()) { Iterator<CSVRecord> iterator = csvParser.iterator(); @@ -118,19 +117,23 @@ public class ISATabUtils { } } + private static EncodingDetector getEncodingDetector(ParseContext context) { + TikaConfig tikaConfig = context.get(TikaConfig.class); + if (tikaConfig != null) { + return tikaConfig.getEncodingDetector(); + } + return new DefaultEncodingDetector(); + } + public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { TikaInputStream tis = TikaInputStream.get(stream); // Automatically detect the character encoding - - TikaConfig tikaConfig = context.get(TikaConfig.class); - if (tikaConfig == null) { - tikaConfig = TikaConfig.getDefaultConfig(); - } + EncodingDetector encodingDetector = getEncodingDetector(context); try (AutoDetectReader reader = new AutoDetectReader(CloseShieldInputStream.wrap(tis), - metadata, tikaConfig.getEncodingDetector()); + metadata, encodingDetector); CSVParser csvParser = CSVParser.builder().setReader(reader).setFormat(CSVFormat.TDF).get()) { xhtml.startElement("table");
