This is an automated email from the ASF dual-hosted git repository.
xiedeyantu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git
The following commit(s) were added to refs/heads/main by this push:
new 118edc9e94 [CALCITE-4460] Support custom delimiter when parsing CSV tables
118edc9e94 is described below
commit 118edc9e94bb24807b4682875482ffa8acc0fecf
Author: Diveyam Mishra <[email protected]>
AuthorDate: Sat May 16 14:37:05 2026 +0530
[CALCITE-4460] Support custom delimiter when parsing CSV tables
---
.../calcite/adapter/csv/CsvFilterableTable.java | 2 +-
.../calcite/adapter/csv/CsvScannableTable.java | 2 +-
.../adapter/csv/CsvStreamScannableTable.java | 2 +-
.../org/apache/calcite/adapter/csv/CsvTable.java | 4 +-
.../calcite/adapter/csv/CsvTranslatableTable.java | 3 +-
.../apache/calcite/adapter/file/CsvEnumerator.java | 20 ++---
.../calcite/adapter/file/CsvStreamReader.java | 6 +-
.../org/apache/calcite/adapter/file/CsvTable.java | 11 ++-
.../calcite/adapter/file/CsvTableFactory.java | 15 +++-
.../calcite/adapter/file/CsvTranslatableTable.java | 10 ++-
.../apache/calcite/adapter/file/FileSchema.java | 5 +-
.../calcite/adapter/file/FileAdapterTest.java | 85 ++++++++++++++++++++++
file/src/test/resources/custom-separator.json | 36 +++++++++
.../test/resources/sales-csv/PIPE_DELIMITED.csv | 5 ++
site/_docs/file_adapter.md | 20 +++++
15 files changed, 198 insertions(+), 28 deletions(-)
diff --git
a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvFilterableTable.java
b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvFilterableTable.java
index 4a12352951..ee621740f9 100644
---
a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvFilterableTable.java
+++
b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvFilterableTable.java
@@ -66,7 +66,7 @@ public CsvFilterableTable(Source source,
return new AbstractEnumerable<@Nullable Object[]>() {
@Override public Enumerator<@Nullable Object[]> enumerator() {
return new CsvEnumerator<>(source, cancelFlag, false, filterValues,
- CsvEnumerator.arrayConverter(fieldTypes, fields, false));
+ CsvEnumerator.arrayConverter(fieldTypes, fields, false), ',');
}
};
}
diff --git
a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvScannableTable.java
b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvScannableTable.java
index 25d0295054..836af81373 100644
---
a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvScannableTable.java
+++
b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvScannableTable.java
@@ -58,7 +58,7 @@ public class CsvScannableTable extends CsvTable
return new AbstractEnumerable<@Nullable Object[]>() {
@Override public Enumerator<@Nullable Object[]> enumerator() {
return new CsvEnumerator<>(source, cancelFlag, false, null,
- CsvEnumerator.arrayConverter(fieldTypes, fields, false));
+ CsvEnumerator.arrayConverter(fieldTypes, fields, false), ',');
}
};
}
diff --git
a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvStreamScannableTable.java
b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvStreamScannableTable.java
index a7947a2f98..7c0d574cc7 100644
---
a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvStreamScannableTable.java
+++
b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvStreamScannableTable.java
@@ -65,7 +65,7 @@ public class CsvStreamScannableTable extends CsvScannableTable
return new AbstractEnumerable<@Nullable Object[]>() {
@Override public Enumerator<@Nullable Object[]> enumerator() {
return new CsvEnumerator<>(source, cancelFlag, true, null,
- CsvEnumerator.arrayConverter(fieldTypes, fields, true));
+ CsvEnumerator.arrayConverter(fieldTypes, fields, true), ',');
}
};
}
diff --git
a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvTable.java
b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvTable.java
index a882e5c824..aac56bb38c 100644
--- a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvTable.java
+++ b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvTable.java
@@ -51,7 +51,7 @@ public abstract class CsvTable extends AbstractTable {
if (rowType == null) {
rowType =
CsvEnumerator.deduceRowType((JavaTypeFactory) typeFactory, source,
- null, isStream());
+ null, isStream(), ',');
}
return rowType;
}
@@ -61,7 +61,7 @@ public List<RelDataType> getFieldTypes(RelDataTypeFactory
typeFactory) {
if (fieldTypes == null) {
fieldTypes = new ArrayList<>();
CsvEnumerator.deduceRowType((JavaTypeFactory) typeFactory, source,
- fieldTypes, isStream());
+ fieldTypes, isStream(), ',');
}
return fieldTypes;
}
diff --git
a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvTranslatableTable.java
b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvTranslatableTable.java
index 78632cfb43..1da1992b6b 100644
---
a/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvTranslatableTable.java
+++
b/example/csv/src/main/java/org/apache/calcite/adapter/csv/CsvTranslatableTable.java
@@ -66,7 +66,8 @@ public Enumerable<Object> project(final DataContext root,
source,
cancelFlag,
getFieldTypes(typeFactory),
- ImmutableIntList.of(fields));
+ ImmutableIntList.of(fields),
+ ',');
}
};
}
diff --git
a/file/src/main/java/org/apache/calcite/adapter/file/CsvEnumerator.java
b/file/src/main/java/org/apache/calcite/adapter/file/CsvEnumerator.java
index 012be61450..f62433beab 100644
--- a/file/src/main/java/org/apache/calcite/adapter/file/CsvEnumerator.java
+++ b/file/src/main/java/org/apache/calcite/adapter/file/CsvEnumerator.java
@@ -113,14 +113,15 @@ private static void clearTimeFormats() {
.compile("\"decimal\\(([0-9]+),([0-9]+)\\)");
public CsvEnumerator(Source source, AtomicBoolean cancelFlag,
- List<RelDataType> fieldTypes, List<Integer> fields) {
+ List<RelDataType> fieldTypes, List<Integer> fields, char separator) {
//noinspection unchecked
this(source, cancelFlag, false, null,
- (RowConverter<E>) converter(fieldTypes, fields));
+ (RowConverter<E>) converter(fieldTypes, fields), separator);
}
public CsvEnumerator(Source source, AtomicBoolean cancelFlag, boolean stream,
- @Nullable String @Nullable [] filterValues, RowConverter<E>
rowConverter) {
+ @Nullable String @Nullable [] filterValues, RowConverter<E> rowConverter,
+ char separator) {
this.cancelFlag = cancelFlag;
this.rowConverter = rowConverter;
this.filterValues =
@@ -128,9 +129,9 @@ public CsvEnumerator(Source source, AtomicBoolean
cancelFlag, boolean stream,
: ImmutableNullableList.copyOf(filterValues);
try {
if (stream) {
- this.reader = new CsvStreamReader(source);
+ this.reader = new CsvStreamReader(source, separator);
} else {
- this.reader = openCsv(source);
+ this.reader = openCsv(source, separator);
}
this.reader.readNext(); // skip header row
} catch (IOException e) {
@@ -156,14 +157,15 @@ private static RowConverter<?>
converter(List<RelDataType> fieldTypes,
/** Deduces the names and types of a table's columns by reading the first line
* of a CSV file. */
public static RelDataType deduceRowType(JavaTypeFactory typeFactory,
- Source source, @Nullable List<RelDataType> fieldTypes, Boolean stream) {
+ Source source, @Nullable List<RelDataType> fieldTypes, Boolean stream,
+ char separator) {
final List<RelDataType> types = new ArrayList<>();
final List<String> names = new ArrayList<>();
if (stream) {
names.add(FileSchemaFactory.ROWTIME_COLUMN_NAME);
types.add(typeFactory.createSqlType(SqlTypeName.TIMESTAMP));
}
- try (CSVReader reader = openCsv(source)) {
+ try (CSVReader reader = openCsv(source, separator)) {
String[] strings = reader.readNext();
if (strings == null) {
strings = new String[]{"EmptyFileHasNoColumns:boolean"};
@@ -247,9 +249,9 @@ public static RelDataType deduceRowType(JavaTypeFactory
typeFactory,
return typeFactory.createStructType(Pair.zip(names, types));
}
- static CSVReader openCsv(Source source) throws IOException {
+ static CSVReader openCsv(Source source, char separator) throws IOException {
requireNonNull(source, "source");
- return new CSVReader(source.reader());
+ return new CSVReader(source.reader(), separator);
}
@Override public E current() {
diff --git
a/file/src/main/java/org/apache/calcite/adapter/file/CsvStreamReader.java
b/file/src/main/java/org/apache/calcite/adapter/file/CsvStreamReader.java
index 54dd3837e2..e9113c7d1a 100644
--- a/file/src/main/java/org/apache/calcite/adapter/file/CsvStreamReader.java
+++ b/file/src/main/java/org/apache/calcite/adapter/file/CsvStreamReader.java
@@ -53,9 +53,9 @@ class CsvStreamReader extends CSVReader implements Closeable {
*/
public static final long DEFAULT_MONITOR_DELAY = 2000;
- CsvStreamReader(Source source) {
+ CsvStreamReader(Source source, char separator) {
this(source,
- CSVParser.DEFAULT_SEPARATOR,
+ separator,
CSVParser.DEFAULT_QUOTE_CHARACTER,
CSVParser.DEFAULT_ESCAPE_CHARACTER,
DEFAULT_SKIP_LINES,
@@ -106,7 +106,7 @@ private CsvStreamReader(Source source, char separator, char
quoteChar,
/**
* Reads the next line from the buffer and converts to a string array.
*
- * @return a string array with each comma-separated element as a separate entry.
+ * @return a string array with each delimited element as a separate entry.
*
* @throws IOException if bad things happen during the read
*/
diff --git a/file/src/main/java/org/apache/calcite/adapter/file/CsvTable.java
b/file/src/main/java/org/apache/calcite/adapter/file/CsvTable.java
index 5ebb2b1218..4b4c718d41 100644
--- a/file/src/main/java/org/apache/calcite/adapter/file/CsvTable.java
+++ b/file/src/main/java/org/apache/calcite/adapter/file/CsvTable.java
@@ -37,13 +37,16 @@
public abstract class CsvTable extends AbstractTable {
protected final Source source;
protected final @Nullable RelProtoDataType protoRowType;
+ protected final char separator;
private @Nullable RelDataType rowType;
private @Nullable List<RelDataType> fieldTypes;
- /** Creates a CsvTable. */
- CsvTable(Source source, @Nullable RelProtoDataType protoRowType) {
+ /** Creates a CsvTable with a custom separator. */
+ CsvTable(Source source, @Nullable RelProtoDataType protoRowType,
+ char separator) {
this.source = source;
this.protoRowType = protoRowType;
+ this.separator = separator;
}
@Override public RelDataType getRowType(RelDataTypeFactory typeFactory) {
@@ -53,7 +56,7 @@ public abstract class CsvTable extends AbstractTable {
if (rowType == null) {
rowType =
CsvEnumerator.deduceRowType((JavaTypeFactory) typeFactory, source,
- null, isStream());
+ null, isStream(), separator);
}
return rowType;
}
@@ -63,7 +66,7 @@ public List<RelDataType> getFieldTypes(RelDataTypeFactory
typeFactory) {
if (fieldTypes == null) {
fieldTypes = new ArrayList<>();
CsvEnumerator.deduceRowType((JavaTypeFactory) typeFactory, source,
- fieldTypes, isStream());
+ fieldTypes, isStream(), separator);
}
return fieldTypes;
}
diff --git
a/file/src/main/java/org/apache/calcite/adapter/file/CsvTableFactory.java
b/file/src/main/java/org/apache/calcite/adapter/file/CsvTableFactory.java
index 82e636cb61..fa37d15653 100644
--- a/file/src/main/java/org/apache/calcite/adapter/file/CsvTableFactory.java
+++ b/file/src/main/java/org/apache/calcite/adapter/file/CsvTableFactory.java
@@ -25,6 +25,8 @@
import org.apache.calcite.util.Source;
import org.apache.calcite.util.Sources;
+import au.com.bytecode.opencsv.CSVParser;
+
import org.checkerframework.checker.nullness.qual.Nullable;
import java.io.File;
@@ -50,6 +52,17 @@ public CsvTableFactory() {
final Source source = Sources.file(base, fileName);
final RelProtoDataType protoRowType =
rowType != null ? RelDataTypeImpl.proto(rowType) : null;
- return new CsvTranslatableTable(source, protoRowType);
+ final String separatorStr = (String) operand.get("separator");
+ final char separator;
+ if (separatorStr == null) {
+ separator = CSVParser.DEFAULT_SEPARATOR;
+ } else if (separatorStr.length() == 1) {
+ separator = separatorStr.charAt(0);
+ } else {
+ throw new IllegalArgumentException(
+ "Invalid separator '" + separatorStr
+ + "'. Separator must be a single character.");
+ }
+ return new CsvTranslatableTable(source, protoRowType, separator);
}
}
diff --git
a/file/src/main/java/org/apache/calcite/adapter/file/CsvTranslatableTable.java
b/file/src/main/java/org/apache/calcite/adapter/file/CsvTranslatableTable.java
index ebfc1f2e70..7f81defebe 100644
---
a/file/src/main/java/org/apache/calcite/adapter/file/CsvTranslatableTable.java
+++
b/file/src/main/java/org/apache/calcite/adapter/file/CsvTranslatableTable.java
@@ -47,9 +47,10 @@
*/
public class CsvTranslatableTable extends CsvTable
implements QueryableTable, TranslatableTable {
- /** Creates a CsvTable. */
- CsvTranslatableTable(Source source, @Nullable RelProtoDataType protoRowType)
{
- super(source, protoRowType);
+ /** Creates a CsvTranslatableTable with a custom separator. */
+ CsvTranslatableTable(Source source, @Nullable RelProtoDataType protoRowType,
+ char separator) {
+ super(source, protoRowType, separator);
}
@Override public String toString() {
@@ -65,7 +66,8 @@ public Enumerable<Object> project(final DataContext root,
@Override public Enumerator<Object> enumerator() {
JavaTypeFactory typeFactory = root.getTypeFactory();
return new CsvEnumerator<>(source, cancelFlag,
- getFieldTypes(typeFactory), ImmutableIntList.of(fields));
+ getFieldTypes(typeFactory), ImmutableIntList.of(fields),
+ separator);
}
};
}
diff --git a/file/src/main/java/org/apache/calcite/adapter/file/FileSchema.java
b/file/src/main/java/org/apache/calcite/adapter/file/FileSchema.java
index b842030ad0..f66effb4a3 100644
--- a/file/src/main/java/org/apache/calcite/adapter/file/FileSchema.java
+++ b/file/src/main/java/org/apache/calcite/adapter/file/FileSchema.java
@@ -23,6 +23,8 @@
import org.apache.calcite.util.Sources;
import org.apache.calcite.util.Util;
+import au.com.bytecode.opencsv.CSVParser;
+
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
@@ -142,7 +144,8 @@ private static boolean
addTable(ImmutableMap.Builder<String, Table> builder,
}
final Source sourceSansCsv = sourceSansGz.trimOrNull(".csv");
if (sourceSansCsv != null) {
- final Table table = new CsvTranslatableTable(source, null);
+ final Table table =
+ new CsvTranslatableTable(source, null, CSVParser.DEFAULT_SEPARATOR);
builder.put(Util.first(tableName, sourceSansCsv.path()), table);
return true;
}
diff --git
a/file/src/test/java/org/apache/calcite/adapter/file/FileAdapterTest.java
b/file/src/test/java/org/apache/calcite/adapter/file/FileAdapterTest.java
index 0d0fdc031c..ad774f3964 100644
--- a/file/src/test/java/org/apache/calcite/adapter/file/FileAdapterTest.java
+++ b/file/src/test/java/org/apache/calcite/adapter/file/FileAdapterTest.java
@@ -332,6 +332,91 @@ private static void checkEmpty(ResultSet resultSet) {
sql("model-with-custom-table", "select * from CUSTOM_TABLE.EMPS").ok();
}
+ /** Test case for
+ * <a href="https://issues.apache.org/jira/browse/CALCITE-4460">[CALCITE-4460]
+ * Support custom delimiter when parsing CSV tables</a>.
+ *
+ * <p>Reads a pipe-delimited file via CsvTableFactory with a custom
+ * separator. */
+ @Test void testCsvCustomSeparatorPipe() {
+ final String sql = "select * from CUSTOM_SEPARATOR.PIPE_DEPTS";
+ sql("custom-separator", sql)
+ .returns("DEPTNO=10; NAME=Sales",
+ "DEPTNO=20; NAME=Marketing",
+ "DEPTNO=30; NAME=Accounts",
+ "DEPTNO=40; NAME=tic|tac|toe")
+ .ok();
+ }
+
+ /** Test case for
+ * <a href="https://issues.apache.org/jira/browse/CALCITE-4460">[CALCITE-4460]
+ * Support custom delimiter when parsing CSV tables</a>.
+ *
+ * <p>Verifies quoted content is parsed correctly when it contains the custom
+ * separator character. */
+ @Test void testCsvCustomSeparatorEscaping() {
+ final String sql = "select * from CUSTOM_SEPARATOR.PIPE_DEPTS "
+ + "where NAME = 'tic|tac|toe'";
+ sql("custom-separator", sql)
+ .returns("DEPTNO=40; NAME=tic|tac|toe")
+ .ok();
+ }
+
+ /** Test case for
+ * <a href="https://issues.apache.org/jira/browse/CALCITE-4460">[CALCITE-4460]
+ * Support custom delimiter when parsing CSV tables</a>.
+ *
+ * <p>Verifies that a multi-character separator is rejected. */
+ @Test void testCsvCustomSeparatorInvalidMultiChar() throws SQLException {
+ Properties info = new Properties();
+ info.put("model",
+ "inline:"
+ + "{\n"
+ + " version: '1.0',\n"
+ + " defaultSchema: 'TEST',\n"
+ + " schemas: [\n"
+ + " {\n"
+ + " name: 'TEST',\n"
+ + " tables: [\n"
+ + " {\n"
+ + " name: 'BAD',\n"
+ + " type: 'custom',\n"
+ + " factory:
'org.apache.calcite.adapter.file.CsvTableFactory',\n"
+ + " operand: {\n"
+ + " file: 'sales-csv/DEPTS.csv',\n"
+ + " separator: '||'\n"
+ + " }\n"
+ + " }\n"
+ + " ]\n"
+ + " }\n"
+ + " ]\n"
+ + "}");
+ try {
+ Connection connection =
+ DriverManager.getConnection("jdbc:calcite:", info);
+ connection.close();
+ throw new AssertionError("expected error");
+ } catch (RuntimeException e) {
+ Throwable cause = e;
+ while (cause.getCause() != null) {
+ cause = cause.getCause();
+ }
+ assertThat(cause.getMessage(),
+ is("Invalid separator '||'. "
+ + "Separator must be a single character."));
+ }
+ }
+
+ /** Test case for
+ * <a href="https://issues.apache.org/jira/browse/CALCITE-4460">[CALCITE-4460]
+ * Support custom delimiter when parsing CSV tables</a>.
+ *
+ * <p>Verifies that omitting the separator defaults to comma. */
+ @Test void testCsvDefaultSeparatorBackwardCompat() {
+ final String sql = "select * from CUSTOM_TABLE.EMPS";
+ sql("model-with-custom-table", sql).ok();
+ }
+
@Test void testPushDownProject() {
final String sql = "explain plan for select * from EMPS";
final String expected = "PLAN=CsvTableScan(table=[[SALES, EMPS]], "
diff --git a/file/src/test/resources/custom-separator.json
b/file/src/test/resources/custom-separator.json
new file mode 100644
index 0000000000..ebc5002394
--- /dev/null
+++ b/file/src/test/resources/custom-separator.json
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+{
+ "version": "1.0",
+ "defaultSchema": "CUSTOM_SEPARATOR",
+ "schemas": [
+ {
+ "name": "CUSTOM_SEPARATOR",
+ "tables": [
+ {
+ "name": "PIPE_DEPTS",
+ "type": "custom",
+ "factory": "org.apache.calcite.adapter.file.CsvTableFactory",
+ "operand": {
+ "file": "sales-csv/PIPE_DELIMITED.csv",
+ "separator": "|"
+ }
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/file/src/test/resources/sales-csv/PIPE_DELIMITED.csv
b/file/src/test/resources/sales-csv/PIPE_DELIMITED.csv
new file mode 100644
index 0000000000..2a094fae30
--- /dev/null
+++ b/file/src/test/resources/sales-csv/PIPE_DELIMITED.csv
@@ -0,0 +1,5 @@
+DEPTNO:int|NAME:string
+10|"Sales"
+20|"Marketing"
+30|"Accounts"
+40|"tic|tac|toe"
diff --git a/site/_docs/file_adapter.md b/site/_docs/file_adapter.md
index a812558179..dc31fec413 100644
--- a/site/_docs/file_adapter.md
+++ b/site/_docs/file_adapter.md
@@ -273,6 +273,26 @@ ## CSV files and model-free browsing
3 rows selected (0.985 seconds)
{% endhighlight %}
+### CSV Custom Separator
+
+When using `CsvTableFactory` to define a table in a model, you can specify an
+optional `separator` operand to use a custom delimiter.
+
+{% highlight json %}
+{
+ "name": "PIPE_DEPTS",
+ "type": "custom",
+ "factory": "org.apache.calcite.adapter.file.CsvTableFactory",
+ "operand": {
+ "file": "sales-csv/PIPE_DELIMITED.csv",
+ "separator": "|"
+ }
+}
+{% endhighlight %}
+
+The separator must be exactly one character; any other length is rejected with
+an `IllegalArgumentException`. If not specified, it defaults to a comma.
+
## JSON files and model-free browsing
Some files describe their own schema, and for these files, we do not need a
model. For example, `DEPTS.json` has an integer `DEPTNO` column and a string
`NAME` column: