This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5f43b00ff TIKA-4278 -- remove colon from default and allow users to
customize d… (#1976)
5f43b00ff is described below
commit 5f43b00ff93cb2cd562c5e7f29f2e50b9cba37f8
Author: Tim Allison <[email protected]>
AuthorDate: Fri Oct 11 10:01:04 2024 -0400
TIKA-4278 -- remove colon from default and allow users to customize d…
(#1976)
* TIKA-4278 -- remove colon from default and allow users to customize
delimiters
---
.../org/apache/tika/parser/csv/CSVSniffer.java | 2 +-
.../apache/tika/parser/csv/TextAndCSVConfig.java | 58 ++++++++++++++++++++
.../apache/tika/parser/csv/TextAndCSVParser.java | 64 +++++++++++-----------
.../tika/parser/csv/TextAndCSVParserTest.java | 12 ++++
.../test-configs/tika-config-colon-delimiter.xml | 30 ++++++++++
.../test-documents/testColonDelimited.txt | 13 +++++
6 files changed, 145 insertions(+), 34 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
index 6eb53eb14..9075a74db 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
@@ -95,7 +95,7 @@ class CSVSniffer {
}
// TIKA-4278: colon isn't reliable, e.g. govdocs1/242/242970.txt
if (results.size() > 1 && bestResult.getDelimiter().equals(':') &&
- results.get(1).getConfidence() == bestResult.getConfidence()) {
+ Math.abs(results.get(1).getConfidence() -
bestResult.getConfidence()) < 0.0001) {
return results.get(1);
}
return bestResult;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVConfig.java
new file mode 100644
index 000000000..53c3286ed
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVConfig.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Delimiter configuration for {@link TextAndCSVParser}: maps each candidate
+ * delimiter character to the name used to report it (e.g. {@code ','} to
+ * {@code "comma"}) and back again.
+ * <p>
+ * The built-in defaults are comma, tab, pipe and semicolon. Colon is not a
+ * default (TIKA-4278); callers that need it must add it via
+ * {@link #setNameToDelimiterMap(Map)}.
+ */
+public class TextAndCSVConfig implements Serializable {
+
+    // Class-wide defaults. These are only ever copied from, never handed out,
+    // so that no instance can alter the defaults seen by the others.
+    private static final Map<Character, String> DELIMITER_TO_NAME_MAP = new HashMap<>();
+    private static final Map<String, Character> NAME_TO_DELIMITER_MAP = new HashMap<>();
+
+    static {
+        DELIMITER_TO_NAME_MAP.put(',', "comma");
+        DELIMITER_TO_NAME_MAP.put('\t', "tab");
+        DELIMITER_TO_NAME_MAP.put('|', "pipe");
+        DELIMITER_TO_NAME_MAP.put(';', "semicolon");
+    }
+
+    static {
+        // derive the reverse direction so the two defaults cannot drift apart
+        for (Map.Entry<Character, String> e : DELIMITER_TO_NAME_MAP.entrySet()) {
+            NAME_TO_DELIMITER_MAP.put(e.getValue(), e.getKey());
+        }
+    }
+
+    // Per-instance copies: the getters below expose these maps directly, so
+    // sharing the static maps would let one caller's mutation leak into every
+    // other config (including each parser's default config).
+    private Map<String, Character> nameToDelimiterMap = new HashMap<>(NAME_TO_DELIMITER_MAP);
+    private Map<Character, String> delimiterToNameMap = new HashMap<>(DELIMITER_TO_NAME_MAP);
+
+    /**
+     * @return this instance's map from delimiter name to delimiter character
+     */
+    public Map<String, Character> getNameToDelimiterMap() {
+        return nameToDelimiterMap;
+    }
+
+    /**
+     * @return this instance's map from delimiter character to delimiter name
+     */
+    public Map<Character, String> getDelimiterToNameMap() {
+        return delimiterToNameMap;
+    }
+
+    /**
+     * Replaces (does not merge with) the current delimiters. The reverse
+     * character-to-name map is rebuilt from the supplied entries; if two names
+     * share a delimiter character, only one of the names is kept for that
+     * character.
+     *
+     * @param nameToDelimiterMap delimiter name to delimiter character; copied,
+     *                           so later changes to the argument have no effect
+     */
+    public void setNameToDelimiterMap(Map<String, Character> nameToDelimiterMap) {
+        // build both directions first, then publish them together
+        Map<String, Character> names = new HashMap<>(nameToDelimiterMap);
+        Map<Character, String> delimiters = new HashMap<>();
+        for (Map.Entry<String, Character> e : nameToDelimiterMap.entrySet()) {
+            delimiters.put(e.getValue(), e.getKey());
+        }
+        this.nameToDelimiterMap = names;
+        this.delimiterToNameMap = delimiters;
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
index e9b35da67..1f565d77f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
@@ -40,6 +40,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.config.Field;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
@@ -95,25 +96,9 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
private static final String TABLE = "table";
private static final int DEFAULT_MARK_LIMIT = 20000;
- private static final Map<Character, String> CHAR_TO_STRING_DELIMITER_MAP =
new HashMap<>();
- private static final Map<String, Character> STRING_TO_CHAR_DELIMITER_MAP =
new HashMap<>();
private static final Set<MediaType> SUPPORTED_TYPES = Collections
.unmodifiableSet(new HashSet<>(Arrays.asList(CSV, TSV,
MediaType.TEXT_PLAIN)));
- static {
- CHAR_TO_STRING_DELIMITER_MAP.put(',', "comma");
- CHAR_TO_STRING_DELIMITER_MAP.put('\t', "tab");
- CHAR_TO_STRING_DELIMITER_MAP.put('|', "pipe");
- CHAR_TO_STRING_DELIMITER_MAP.put(';', "semicolon");
- CHAR_TO_STRING_DELIMITER_MAP.put(':', "colon");
- }
-
- static {
- for (Map.Entry<Character, String> e :
CHAR_TO_STRING_DELIMITER_MAP.entrySet()) {
- STRING_TO_CHAR_DELIMITER_MAP.put(e.getValue(), e.getKey());
- }
- }
-
/**
* This is the mark limit in characters (not bytes) to
* read from the stream when classifying the stream as
@@ -157,6 +142,7 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
return mediaType.getBaseType().equals(TSV) ||
mediaType.getBaseType().equals(CSV);
}
+ private final TextAndCSVConfig defaultTextAndCSVConfig = new
TextAndCSVConfig();
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
@@ -165,12 +151,13 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata
metadata,
ParseContext context) throws IOException, SAXException,
TikaException {
+ TextAndCSVConfig textAndCSVConfig =
context.get(TextAndCSVConfig.class, defaultTextAndCSVConfig);
- CSVParams params = getOverride(metadata);
+ CSVParams params = getOverride(metadata, textAndCSVConfig);
Reader reader;
Charset charset;
if (!params.isComplete()) {
- reader = detect(params, stream, metadata, context);
+ reader = detect(params, textAndCSVConfig, stream, metadata,
context);
if (params.getCharset() != null) {
charset = params.getCharset();
} else {
@@ -181,7 +168,7 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
charset = params.getCharset();
}
- updateMetadata(params, metadata);
+ updateMetadata(params, metadata, textAndCSVConfig);
//if text or a non-csv/tsv category of text
//treat this as text and be done
@@ -193,8 +180,7 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
}
CSVFormat csvFormat =
CSVFormat.EXCEL.builder().setDelimiter(params.getDelimiter()).build();
- metadata.set(DELIMITER_PROPERTY,
-
CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiterString().charAt(0)));
+ metadata.set(DELIMITER_PROPERTY,
textAndCSVConfig.getDelimiterToNameMap().get(csvFormat.getDelimiterString().charAt(0)));
XHTMLContentHandler xhtmlContentHandler = new
XHTMLContentHandler(handler, metadata);
int totalRows = 0;
@@ -273,7 +259,7 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
xhtml.endDocument();
}
- private Reader detect(CSVParams params, InputStream stream, Metadata
metadata,
+ private Reader detect(CSVParams params, TextAndCSVConfig textAndCSVConfig,
InputStream stream, Metadata metadata,
ParseContext context) throws IOException,
TikaException {
//if the file was already identified as not .txt, .csv or .tsv
//don't even try to csv or not
@@ -302,7 +288,7 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
if (params.getDelimiter() == null &&
(params.getMediaType() == null ||
isCSVOrTSV(params.getMediaType()))) {
- CSVSniffer sniffer = new CSVSniffer(markLimit,
CHAR_TO_STRING_DELIMITER_MAP.keySet(), minConfidence);
+ CSVSniffer sniffer = new CSVSniffer(markLimit,
textAndCSVConfig.getDelimiterToNameMap().keySet(), minConfidence);
CSVResult result = sniffer.getBest(reader, metadata);
params.setMediaType(result.getMediaType());
params.setDelimiter(result.getDelimiter());
@@ -310,7 +296,7 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
return reader;
}
- private CSVParams getOverride(Metadata metadata) {
+ private CSVParams getOverride(Metadata metadata, TextAndCSVConfig
textAndCSVConfig) {
String override =
metadata.get(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE);
if (override == null) {
return new CSVParams();
@@ -332,22 +318,22 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
return new CSVParams(mediaType, charset);
}
- String delimiterString = mediaType.getParameters().get(DELIMITER);
- if (delimiterString == null) {
+ String delimiterName = mediaType.getParameters().get(DELIMITER);
+ if (delimiterName == null) {
return new CSVParams(mediaType, charset);
}
- if (STRING_TO_CHAR_DELIMITER_MAP.containsKey(delimiterString)) {
+ if
(textAndCSVConfig.getNameToDelimiterMap().containsKey(delimiterName)) {
return new CSVParams(mediaType, charset,
- (char) STRING_TO_CHAR_DELIMITER_MAP.get(delimiterString));
+ (char)
textAndCSVConfig.getNameToDelimiterMap().get(delimiterName));
}
- if (delimiterString.length() == 1) {
- return new CSVParams(mediaType, charset,
delimiterString.charAt(0));
+ if (delimiterName.length() == 1) {
+ return new CSVParams(mediaType, charset, delimiterName.charAt(0));
}
//TODO: log bad/unrecognized delimiter string
return new CSVParams(mediaType, charset);
}
- private void updateMetadata(CSVParams params, Metadata metadata) {
+ private void updateMetadata(CSVParams params, Metadata metadata,
TextAndCSVConfig textAndCSVConfig) {
MediaType mediaType = null;
if (params.getMediaType().getBaseType().equals(MediaType.TEXT_PLAIN)) {
mediaType = MediaType.TEXT_PLAIN;
@@ -369,8 +355,8 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
metadata.set(Metadata.CONTENT_ENCODING,
params.getCharset().name());
}
if (!MediaType.TEXT_PLAIN.equals(mediaType) && params.getDelimiter()
!= null) {
- if
(CHAR_TO_STRING_DELIMITER_MAP.containsKey(params.getDelimiter())) {
- attrs.put(DELIMITER,
CHAR_TO_STRING_DELIMITER_MAP.get(params.getDelimiter()));
+ if
(textAndCSVConfig.getDelimiterToNameMap().containsKey(params.getDelimiter())) {
+ attrs.put(DELIMITER,
textAndCSVConfig.getDelimiterToNameMap().get(params.getDelimiter()));
} else {
attrs.put(DELIMITER, Integer.toString((int)
params.getDelimiter()));
}
@@ -379,4 +365,16 @@ public class TextAndCSVParser extends
AbstractEncodingDetectorParser {
metadata.set(Metadata.CONTENT_TYPE, type.toString());
}
+    /**
+     * Configures the delimiters this parser considers, replacing the defaults
+     * of its default {@link TextAndCSVConfig}.
+     *
+     * @param map delimiter name (e.g. "colon") to the delimiter itself, given
+     *            as a one-character string
+     * @throws TikaConfigException if any value is not exactly one character
+     */
+    @Field
+    public void setNameToDelimiterMap(Map<String, String> map) throws TikaConfigException {
+        Map<String, Character> m = new HashMap<>();
+        for (Map.Entry<String, String> e : map.entrySet()) {
+            String delimiter = e.getValue();
+            // Reject empty and missing values too: charAt(0) on an empty string
+            // would throw StringIndexOutOfBoundsException rather than a config error.
+            if (delimiter == null || delimiter.length() != 1) {
+                throw new TikaConfigException("delimiter must be a single character: " + delimiter);
+            }
+            m.put(e.getKey(), delimiter.charAt(0));
+        }
+        defaultTextAndCSVConfig.setNameToDelimiterMap(m);
+    }
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
index 0e6117277..55d229229 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
@@ -240,6 +240,18 @@ public class TextAndCSVParserTest extends TikaTest {
assertEquals("text/x-vcalendar; charset=ISO-8859-1",
r.metadata.get(Metadata.CONTENT_TYPE));
}
+    /**
+     * A config that registers colon as a named delimiter should cause a
+     * colon-separated file to be reported with the "colon" delimiter.
+     */
+    @Test
+    public void testCustomizingDelimiter() throws Exception {
+        TikaConfig customConfig;
+        try (InputStream configStream = TextAndCSVParserTest.class
+                .getResourceAsStream("/test-configs/tika-config-colon-delimiter.xml")) {
+            customConfig = new TikaConfig(configStream);
+        }
+        Parser parser = new AutoDetectParser(customConfig);
+        XMLResult result = getXML("testColonDelimited.txt", parser);
+        // the delimiter is reported both as its own property and in the content type
+        assertEquals("colon", result.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
+        assertContains("colon", result.metadata.get(Metadata.CONTENT_TYPE));
+    }
+
private void assertContainsIgnoreWhiteSpaceDiffs(String expected, String
xml) {
assertContains(expected, xml.replaceAll("[\r\n\t ]", " "));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-colon-delimiter.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-colon-delimiter.xml
new file mode 100644
index 000000000..71f59223d
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-colon-delimiter.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.csv.TextAndCSVParser">
+ <params>
+ <!-- Presumably bound to TextAndCSVParser#setNameToDelimiterMap (annotated @Field).
+      That setter replaces the default delimiters rather than adding to them, so with
+      this config only comma and colon are candidates. Each value must be one character. -->
+ <param name="nameToDelimiterMap" type="map">
+ <entry key="comma" value=","/>
+ <entry key="colon" value=":"/>
+ </param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testColonDelimited.txt
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testColonDelimited.txt
new file mode 100644
index 000000000..1f3f28352
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testColonDelimited.txt
@@ -0,0 +1,13 @@
+a:b:c:d
+1:2:3:4
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8
+5:6:7:8