This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4278 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4ae0ebda38d8b020fa40a3baefefc9b6acd32734 Author: tallison <[email protected]> AuthorDate: Thu Oct 10 17:04:06 2024 -0400 TIKA-4278 -- remove colon from default and allow users to customize delimiters --- .../org/apache/tika/parser/csv/CSVSniffer.java | 5 -- .../apache/tika/parser/csv/TextAndCSVConfig.java | 57 +++++++++++++++++++ .../apache/tika/parser/csv/TextAndCSVParser.java | 64 +++++++++++----------- .../tika/parser/csv/TextAndCSVParserTest.java | 12 ++++ .../test-configs/tika-config-colon-delimiter.xml | 30 ++++++++++ .../test-documents/testColonDelimited.txt | 13 +++++ 6 files changed, 143 insertions(+), 38 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java index 6eb53eb14..1a772c22d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java @@ -93,11 +93,6 @@ class CSVSniffer { if (bestResult.getConfidence() < minConfidence) { return CSVResult.TEXT; } - // TIKA-4278: colon isn't reliable, e.g. govdocs1/242/242970.txt - if (results.size() > 1 && bestResult.getDelimiter().equals(':') && - results.get(1).getConfidence() == bestResult.getConfidence()) { - return results.get(1); - } return bestResult; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVConfig.java new file mode 100644 index 000000000..8b7e43c22 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVConfig.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.csv; + +import java.util.HashMap; +import java.util.Map; + +public class TextAndCSVConfig { + + private static final Map<Character, String> DELIMITER_TO_NAME_MAP = new HashMap<>(); + private static final Map<String, Character> NAME_TO_DELIMITER_MAP = new HashMap<>(); + + static { + DELIMITER_TO_NAME_MAP.put(',', "comma"); + DELIMITER_TO_NAME_MAP.put('\t', "tab"); + DELIMITER_TO_NAME_MAP.put('|', "pipe"); + DELIMITER_TO_NAME_MAP.put(';', "semicolon"); + } + + static { + for (Map.Entry<Character, String> e : DELIMITER_TO_NAME_MAP.entrySet()) { + NAME_TO_DELIMITER_MAP.put(e.getValue(), e.getKey()); + } + } + + private Map<String, Character> nameToDelimiterMap = NAME_TO_DELIMITER_MAP; + private Map<Character, String> delimiterToNameMap = DELIMITER_TO_NAME_MAP; + + public Map<String, Character> getNameToDelimiterMap() { + return nameToDelimiterMap; + } + + public Map<Character, String> getDelimiterToNameMap() { + return delimiterToNameMap; + } + + public void setNameToDelimiterMap(Map<String, Character> nameToDelimiterMap) { + this.nameToDelimiterMap = new HashMap<>(nameToDelimiterMap); + this.delimiterToNameMap = new HashMap<>(); + nameToDelimiterMap.entrySet() + .forEach(e -> delimiterToNameMap.put(e.getValue(), e.getKey())); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java index e9b35da67..1f565d77f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java @@ -40,6 +40,7 @@ import org.xml.sax.SAXException; import org.apache.tika.config.Field; import org.apache.tika.detect.AutoDetectReader; import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; @@ -95,25 +96,9 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { private static final String TABLE = "table"; private static final int DEFAULT_MARK_LIMIT = 20000; - private static final Map<Character, String> CHAR_TO_STRING_DELIMITER_MAP = new HashMap<>(); - private static final Map<String, Character> STRING_TO_CHAR_DELIMITER_MAP = new HashMap<>(); private static final Set<MediaType> SUPPORTED_TYPES = Collections .unmodifiableSet(new HashSet<>(Arrays.asList(CSV, TSV, MediaType.TEXT_PLAIN))); - static { - CHAR_TO_STRING_DELIMITER_MAP.put(',', "comma"); - CHAR_TO_STRING_DELIMITER_MAP.put('\t', "tab"); - CHAR_TO_STRING_DELIMITER_MAP.put('|', "pipe"); - CHAR_TO_STRING_DELIMITER_MAP.put(';', "semicolon"); - CHAR_TO_STRING_DELIMITER_MAP.put(':', "colon"); - } - - static { - for (Map.Entry<Character, String> e : CHAR_TO_STRING_DELIMITER_MAP.entrySet()) { - STRING_TO_CHAR_DELIMITER_MAP.put(e.getValue(), e.getKey()); - } - } - /** * This is the mark limit in characters (not bytes) to * read from the stream when classifying the stream as @@ -157,6 +142,7 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { return mediaType.getBaseType().equals(TSV) || mediaType.getBaseType().equals(CSV); } + private final TextAndCSVConfig defaultTextAndCSVConfig = new TextAndCSVConfig(); @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -165,12 +151,13 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + TextAndCSVConfig textAndCSVConfig = context.get(TextAndCSVConfig.class, defaultTextAndCSVConfig); - CSVParams params = getOverride(metadata); + CSVParams params = getOverride(metadata, textAndCSVConfig); Reader reader; Charset charset; if (!params.isComplete()) { - reader = detect(params, stream, metadata, context); + reader = detect(params, textAndCSVConfig, stream, metadata, context); if (params.getCharset() != null) { charset = params.getCharset(); } else { @@ -181,7 +168,7 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { charset = params.getCharset(); } - updateMetadata(params, metadata); + updateMetadata(params, metadata, textAndCSVConfig); //if text or a non-csv/tsv category of text //treat this as text and be done @@ -193,8 +180,7 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { } CSVFormat csvFormat = CSVFormat.EXCEL.builder().setDelimiter(params.getDelimiter()).build(); - metadata.set(DELIMITER_PROPERTY, - CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiterString().charAt(0))); + metadata.set(DELIMITER_PROPERTY, textAndCSVConfig.getDelimiterToNameMap().get(csvFormat.getDelimiterString().charAt(0))); XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata); int totalRows = 0; @@ -273,7 +259,7 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { xhtml.endDocument(); } - private Reader detect(CSVParams params, InputStream stream, Metadata metadata, + private Reader detect(CSVParams params, TextAndCSVConfig textAndCSVConfig, InputStream stream, Metadata metadata, ParseContext context) throws IOException, TikaException { //if the file was already identified as not .txt, .csv or .tsv //don't even try to csv or not @@ -302,7 +288,7 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { if (params.getDelimiter() == null && (params.getMediaType() == null || isCSVOrTSV(params.getMediaType()))) { - CSVSniffer sniffer = new CSVSniffer(markLimit, CHAR_TO_STRING_DELIMITER_MAP.keySet(), minConfidence); + CSVSniffer sniffer = new CSVSniffer(markLimit, textAndCSVConfig.getDelimiterToNameMap().keySet(), minConfidence); CSVResult result = sniffer.getBest(reader, metadata); params.setMediaType(result.getMediaType()); params.setDelimiter(result.getDelimiter()); @@ -310,7 +296,7 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { return reader; } - private CSVParams getOverride(Metadata metadata) { + private CSVParams getOverride(Metadata metadata, TextAndCSVConfig textAndCSVConfig) { String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE); if (override == null) { return new CSVParams(); @@ -332,22 +318,22 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { return new CSVParams(mediaType, charset); } - String delimiterString = mediaType.getParameters().get(DELIMITER); - if (delimiterString == null) { + String delimiterName = mediaType.getParameters().get(DELIMITER); + if (delimiterName == null) { return new CSVParams(mediaType, charset); } - if (STRING_TO_CHAR_DELIMITER_MAP.containsKey(delimiterString)) { + if (textAndCSVConfig.getNameToDelimiterMap().containsKey(delimiterName)) { return new CSVParams(mediaType, charset, - (char) STRING_TO_CHAR_DELIMITER_MAP.get(delimiterString)); + (char) textAndCSVConfig.getNameToDelimiterMap().get(delimiterName)); } - if (delimiterString.length() == 1) { - return new CSVParams(mediaType, charset, delimiterString.charAt(0)); + if (delimiterName.length() == 1) { + return new CSVParams(mediaType, charset, delimiterName.charAt(0)); } //TODO: log bad/unrecognized delimiter string return new CSVParams(mediaType, charset); } - private void updateMetadata(CSVParams params, Metadata metadata) { + private void updateMetadata(CSVParams params, Metadata metadata, TextAndCSVConfig textAndCSVConfig) { MediaType mediaType = null; if (params.getMediaType().getBaseType().equals(MediaType.TEXT_PLAIN)) { mediaType = MediaType.TEXT_PLAIN; @@ -369,8 +355,8 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name()); } if (!MediaType.TEXT_PLAIN.equals(mediaType) && params.getDelimiter() != null) { - if (CHAR_TO_STRING_DELIMITER_MAP.containsKey(params.getDelimiter())) { - attrs.put(DELIMITER, CHAR_TO_STRING_DELIMITER_MAP.get(params.getDelimiter())); + if (textAndCSVConfig.getDelimiterToNameMap().containsKey(params.getDelimiter())) { + attrs.put(DELIMITER, textAndCSVConfig.getDelimiterToNameMap().get(params.getDelimiter())); } else { attrs.put(DELIMITER, Integer.toString((int) params.getDelimiter())); } @@ -379,4 +365,16 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser { metadata.set(Metadata.CONTENT_TYPE, type.toString()); } + @Field + public void setNameToDelimiterMap(Map<String, String> map) throws TikaConfigException { + Map<String, Character> m = new HashMap<>(); + for (Map.Entry<String, String> e : map.entrySet()) { + if (e.getValue().length() > 1) { + throw new TikaConfigException("delimiter must be a single character: " + e.getValue()); + } + m.put(e.getKey(), e.getValue().charAt(0)); + } + defaultTextAndCSVConfig.setNameToDelimiterMap(m); + } + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java index 0e6117277..55d229229 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java @@ -240,6 +240,18 @@ public class TextAndCSVParserTest extends TikaTest { assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE)); } + @Test + public void testCustomizingDelimiter() throws Exception { + TikaConfig tikaConfig = null; + try (InputStream is = TextAndCSVParserTest.class.getResourceAsStream("/test-configs/tika-config-colon-delimiter.xml")) { + tikaConfig = new TikaConfig(is); + } + Parser p = new AutoDetectParser(tikaConfig); + XMLResult r = getXML("testColonDelimited.txt", p); + assertEquals("colon", r.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY)); + assertContains("colon", r.metadata.get(Metadata.CONTENT_TYPE)); + } + private void assertContainsIgnoreWhiteSpaceDiffs(String expected, String xml) { assertContains(expected, xml.replaceAll("[\r\n\t ]", " ")); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-colon-delimiter.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-colon-delimiter.xml new file mode 100644 index 000000000..71f59223d --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-colon-delimiter.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.csv.TextAndCSVParser"> + <params> + <param name="nameToDelimiterMap" type="map"> + <entry key="comma" value=","/> + <entry key="colon" value=":"/> + </param> + </params> + </parser> + </parsers> +</properties> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testColonDelimited.txt b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testColonDelimited.txt new file mode 100644 index 000000000..1f3f28352 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testColonDelimited.txt @@ -0,0 +1,13 @@ +a:b:c:d +1:2:3:4 +5:6:7:8 +5:6:7:8 +5:6:7:8 +5:6:7:8 +5:6:7:8 +5:6:7:8 +5:6:7:8 +5:6:7:8 +5:6:7:8 +5:6:7:8 +5:6:7:8
