>From Ian Maxon <[email protected]>: Ian Maxon has submitted this change. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18373 )
Change subject: [ASTERIXDB-3429][EXT] Configurable CSV escape char ...................................................................... [ASTERIXDB-3429][EXT] Configurable CSV escape char - user model changes: no - storage format changes: no - interface changes: yes Details: - Allow the CSV/delimited text parser to use escapes for quotes within quoted fields other than quote itself Change-Id: I50bebc4b8b683889855cb5dd048ab27d7c93af76 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18373 Integration-Tests: Jenkins <[email protected]> Reviewed-by: Ian Maxon <[email protected]> Reviewed-by: Ali Alsuliman <[email protected]> Tested-by: Jenkins <[email protected]> --- A asterixdb/asterix-app/data/csv/nonstandard_escape.csv M asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java R asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.99.ddl.sqlpp M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java M asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp M asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java M asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java M asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp M hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java A asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java M hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java M hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp 22 files changed, 255 insertions(+), 75 deletions(-) Approvals: Ian Maxon: Looks good to me, but someone else must approve Ali Alsuliman: Looks good to me, approved Jenkins: Verified; Verified Objections: Anon. E. Moose #1000171: Violations found diff --git a/asterixdb/asterix-app/data/csv/nonstandard_escape.csv b/asterixdb/asterix-app/data/csv/nonstandard_escape.csv new file mode 100644 index 0000000..b46a9f0 --- /dev/null +++ b/asterixdb/asterix-app/data/csv/nonstandard_escape.csv @@ -0,0 +1 @@ +1,"It says \"The quick, fox jumped over the lazy dog\"" \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp index 65816f6..6b8738c 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp @@ -26,10 +26,12 @@ CREATE TYPE t2 AS {f1: string, f2: string, f3: string}; CREATE TYPE t3 AS {f1: int?, f2: boolean, f3: string?}; CREATE TYPE t4 AS {f1: string, f2: string, f3: string, f4: string}; +CREATE TYPE t5 AS {f1: int, f2: string}; CREATE EXTERNAL DATASET ds1(t1) USING localfs(("path"="asterix_nc1://data/csv/sample_09.csv"), ("format"="CSV"), ("header"="FALSE")); CREATE EXTERNAL DATASET ds2(t2) USING localfs(("path"="asterix_nc1://data/csv/sample_10.csv"), ("format"="Csv"), ("header"="False")); CREATE EXTERNAL DATASET ds3(t1) USING localfs(("path"="asterix_nc1://data/csv/sample_11.csv"), ("format"="csv"), ("header"="FALSE")); CREATE EXTERNAL DATASET ds4(t3) USING localfs(("path"="asterix_nc1://data/csv/sample_12.csv"), ("format"="csv"), ("header"="True"), ("null"="")); CREATE EXTERNAL DATASET ds5(t4) USING localfs(("path"="asterix_nc1://data/csv/sample_13.csv"), ("format"="csv"), ("header"="True")); -CREATE EXTERNAL DATASET ds6(t4) USING localfs(("path"="asterix_nc1://data/csv/empty_lines.csv"), ("format"="csv"), ("header"="false")); \ No newline at end of file +CREATE EXTERNAL DATASET ds6(t4) USING localfs(("path"="asterix_nc1://data/csv/empty_lines.csv"), ("format"="csv"), ("header"="false")); +CREATE EXTERNAL DATASET ds7(t5) USING localfs(("path"="asterix_nc1://data/csv/nonstandard_escape.csv"), ("format"="csv"), ("header"="false"),("escape"="\\")); diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp new file mode 100644 index 0000000..f3a3606 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +FROM ds7 v SELECT VALUE v ORDER BY v.f1; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.99.ddl.sqlpp similarity index 100% rename from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp rename to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.99.ddl.sqlpp diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp index 32df960..b79ae18 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp @@ -26,5 +26,5 @@ use temp; -load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_03.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\"")); +load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_03.csv"),("format"="csv"),("header"="false")); diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp index 71a79c7..156bbf2 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp @@ -26,5 +26,5 @@ use temp; -load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_04_quote_error.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\"")); +load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_04_quote_error.csv"),("format"="csv"),("header"="false")); diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp index e7b19f5..68b4f4a 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp @@ -27,5 +27,5 @@ use temp; -load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_05_space_error_1.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\"")); +load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_05_space_error_1.csv"),("format"="csv"),("header"="false")); diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp index 32988b6..103fe3a 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp @@ -26,5 +26,5 @@ use temp; -load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_06_space_error_2.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\"")); +load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_06_space_error_2.csv"),("format"="csv"),("header"="false")); diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp new file mode 100644 index 0000000..191a8ac --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * + * CSV file loading test + * Expected result: success + * + */ + +drop dataverse temp if exists; +create dataverse temp; + +use temp; + +CREATE TYPE temp.test AS {f1: int, f2: string}; + +create dataset testds(test) primary key id; + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp new file mode 100644 index 0000000..b5633a9 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * + * CSV file loading test + * Expected result: success + * + */ + +use temp; + + +load dataset testds using localfs ((`path`=`asterix_nc1://data/csv/nonstandard_escape.csv`),(`format`=`csv`),(`header`=`false`),(`null`=``),(`escape`=`\\`); + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp new file mode 100644 index 0000000..7a9bdc6 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * + * CSV file loading test + * Expected result: success + * + */ + +use temp; + +FROM testds v SELECT VALUE v ORDER BY v.f1; diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm new file mode 100644 index 0000000..73d07f3 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm @@ -0,0 +1 @@ +{ "f1": 1, "f2": "It says \"The quick, fox jumped over the lazy dog\"" } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm new file mode 100644 index 0000000..73d07f3 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm @@ -0,0 +1 @@ +{ "f1": 1, "f2": "It says \"The quick, fox jumped over the lazy dog\"" } \ No newline at end of file diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java index ad59f75..84d1541 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java @@ -47,7 +47,7 @@ IExternalDataRuntimeContext context) { IWarningCollector warningCollector = context.getTaskContext().getWarningCollector(); this.cursor = new FieldCursorForDelimitedDataParser(null, delimiter, ExternalDataConstants.QUOTE, - warningCollector, ExternalDataConstants.EMPTY_STRING); + ExternalDataConstants.QUOTE, warningCollector, ExternalDataConstants.EMPTY_STRING); this.record = new CharArrayRecord(); this.valueIndex = valueIndex; this.recordWithMetadata = new RecordWithMetadataAndPK<>(record, metaType.getFieldTypes(), recordType, @@ -64,8 +64,8 @@ int j = 0; FieldCursorForDelimitedDataParser.Result lastResult; while ((lastResult = cursor.nextField()) == FieldCursorForDelimitedDataParser.Result.OK) { - if (cursor.fieldHasDoubleQuote()) { - cursor.eliminateDoubleQuote(); + if (cursor.fieldHasEscapedQuote()) { + cursor.eliminateEscapeChar(); } if (i == valueIndex) { record.setValue(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength()); diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java index 4433b49..2035a8e 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java @@ -55,7 +55,7 @@ String quoteString = config.get(ExternalDataConstants.KEY_QUOTE); ExternalDataUtils.validateChar(quoteString, ExternalDataConstants.KEY_QUOTE); this.quote = quoteString.charAt(0); - this.escape = ExternalDataUtils.validateGetEscape(config); + this.escape = ExternalDataUtils.validateGetEscape(config, config.get(ExternalDataConstants.KEY_FORMAT)); } @Override diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java index b8a7480..7f890e1 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java @@ -61,6 +61,7 @@ private final IWarningCollector warnings; private final char fieldDelimiter; private final char quote; + private final char escape; private final boolean hasHeader; private final ARecordType recordType; private final IARecordBuilder recBuilder; @@ -79,14 +80,15 @@ private FieldCursorForDelimitedDataParser cursor; public DelimitedDataParser(IExternalDataRuntimeContext context, IValueParserFactory[] valueParserFactories, - char fieldDelimiter, char quote, boolean hasHeader, ARecordType recordType, boolean isStreamParser, - String nullString) throws HyracksDataException { + char fieldDelimiter, char quote, char escape, boolean hasHeader, ARecordType recordType, + boolean isStreamParser, String nullString) throws HyracksDataException { this.dataSourceName = context.getDatasourceNameSupplier(); this.lineNumber = context.getLineNumberSupplier(); this.warnings = context.getTaskContext().getWarningCollector(); this.valueEmbedder = context.getValueEmbedder(); this.fieldDelimiter = fieldDelimiter; this.quote = quote; + this.escape = escape; this.hasHeader = hasHeader; this.recordType = recordType; valueParsers = new IValueParser[valueParserFactories.length]; @@ -127,7 +129,7 @@ fieldNames[i] = name; } if (!isStreamParser) { - cursor = new FieldCursorForDelimitedDataParser(null, this.fieldDelimiter, quote, warnings, + cursor = new FieldCursorForDelimitedDataParser(null, this.fieldDelimiter, quote, escape, warnings, this::getDataSourceName); } this.nullChars = nullString != null ? nullString.toCharArray() : null; @@ -186,8 +188,8 @@ } fieldValueBufferOutput.writeByte(fieldTypeTags[i]); // Eliminate double quotes in the field that we are going to parse - if (cursor.fieldHasDoubleQuote()) { - cursor.eliminateDoubleQuote(); + if (cursor.fieldHasEscapedQuote()) { + cursor.eliminateEscapeChar(); } boolean success = valueParsers[i].parse(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength(), fieldValueBufferOutput); @@ -232,8 +234,8 @@ @Override public void setInputStream(InputStream in) throws IOException { // TODO(ali): revisit this in regards to stream - cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, warnings, - this::getDataSourceName); + cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, escape, + warnings, this::getDataSourceName); if (hasHeader) { cursor.nextRecord(); FieldCursorForDelimitedDataParser.Result result; @@ -249,8 +251,8 @@ @Override public boolean reset(InputStream in) throws IOException { // TODO(ali): revisit this in regards to stream - cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, warnings, - this::getDataSourceName); + cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, escape, + warnings, this::getDataSourceName); return true; } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java index 8b88a69..bad742e 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java @@ -50,9 +50,11 @@ IValueParserFactory[] valueParserFactories = ExternalDataUtils.getValueParserFactories(recordType); char delimiter = ExternalDataUtils.validateGetDelimiter(configuration); char quote = ExternalDataUtils.validateGetQuote(configuration, delimiter); + char escape = + ExternalDataUtils.validateGetEscape(configuration, configuration.get(ExternalDataConstants.KEY_FORMAT)); boolean hasHeader = ExternalDataUtils.hasHeader(configuration); String nullString = configuration.get(ExternalDataConstants.KEY_NULL_STR); - return new DelimitedDataParser(context, valueParserFactories, delimiter, quote, hasHeader, recordType, + return new DelimitedDataParser(context, valueParserFactories, delimiter, quote, escape, hasHeader, recordType, ExternalDataUtils.getDataSourceType(configuration).equals(DataSourceType.STREAM), nullString); } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java index 5dfa803..3139be7 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java @@ -246,6 +246,8 @@ * Constant characters */ public static final char ESCAPE = '\\'; + + public static final char CSV_ESCAPE = '\"'; public static final char QUOTE = '"'; public static final char SPACE = ' '; public static final char TAB = '\t'; diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java index 3298759..e5b2c9e 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java @@ -143,7 +143,10 @@ return quote; } - public static char validateGetEscape(Map<String, String> configuration) throws HyracksDataException { + public static char validateGetEscape(Map<String, String> configuration, String format) throws HyracksDataException { + if (ExternalDataConstants.FORMAT_CSV.equals(format)) { + return validateCharOrDefault(configuration, KEY_ESCAPE, ExternalDataConstants.CSV_ESCAPE); + } return validateCharOrDefault(configuration, KEY_ESCAPE, ExternalDataConstants.ESCAPE); } @@ -578,7 +581,7 @@ } char delimiter = validateGetDelimiter(configuration); validateGetQuote(configuration, delimiter); - validateGetEscape(configuration); + validateGetEscape(configuration, format); String value = configuration.get(ExternalDataConstants.KEY_REDACT_WARNINGS); if (value != null && !isBoolean(value)) { throw new RuntimeDataException(ErrorCode.INVALID_REQ_PARAM_VAL, ExternalDataConstants.KEY_REDACT_WARNINGS, diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java index b8b2ba8..34d17c9 100644 --- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java @@ -43,16 +43,18 @@ private IValueParserFactory[] valueParserFactories; private char fieldDelimiter; private char quote; + private char escape; public DelimitedDataTupleParserFactory(IValueParserFactory[] fieldParserFactories, char fieldDelimiter) { - this(fieldParserFactories, fieldDelimiter, '\"'); + this(fieldParserFactories, fieldDelimiter, '\"', '\"'); } - public DelimitedDataTupleParserFactory(IValueParserFactory[] fieldParserFactories, char fieldDelimiter, - char quote) { + public DelimitedDataTupleParserFactory(IValueParserFactory[] fieldParserFactories, char fieldDelimiter, char quote, + char escape) { this.valueParserFactories = fieldParserFactories; this.fieldDelimiter = fieldDelimiter; this.quote = quote; + this.escape = escape; } @Override @@ -74,7 +76,7 @@ DataOutput dos = tb.getDataOutput(); FieldCursorForDelimitedDataParser cursor = new FieldCursorForDelimitedDataParser( - new InputStreamReader(in), fieldDelimiter, quote, warningCollector, () -> ""); + new InputStreamReader(in), fieldDelimiter, quote, escape, warningCollector, () -> ""); while (cursor.nextRecord()) { tb.reset(); for (int i = 0; i < valueParsers.length; ++i) { @@ -88,9 +90,9 @@ default: throw new IllegalStateException(); } - // Eliminate double quotes in the field that we are going to parse - if (cursor.fieldHasDoubleQuote()) { - cursor.eliminateDoubleQuote(); + // Eliminate escaped quotes in the field that we are going to parse + if (cursor.fieldHasEscapedQuote()) { + cursor.eliminateEscapeChar(); } if (!valueParsers[i].parse(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength(), dos)) { diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java index ffc87cd..7cba88c 100644 --- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java @@ -53,8 +53,8 @@ private int fEnd; //end position for field private long lineCount; //count of lines private int fieldCount; //count of fields in current record - private int doubleQuoteCount; //count of double quotes - private boolean isDoubleQuoteIncludedInThisField; //does current field include double quotes + private int escapedQuoteCount; //count of escaped quotes + private boolean containsEscapedQuotes; //does current field contain escaped quotes private static final int INITIAL_BUFFER_SIZE = 4096;//initial buffer size private static final int INCREMENT = 4096; //increment size @@ -66,15 +66,18 @@ private State state; //state (see states above) private int lastQuotePosition; //position of last quote - private int lastDoubleQuotePosition; //position of last double quote + private int lastEscapedQuotePosition; //position of last escaped quote private int lastDelimiterPosition; //position of last delimiter + private int lastEscapePosition; //position of last escape private int quoteCount; //count of single quotes private boolean startedQuote; //whether a quote has been started private final char quote; //the quote character private final char fieldDelimiter; //the delimiter - public FieldCursorForDelimitedDataParser(Reader in, char fieldDelimiter, char quote, + private final char escape; + + public FieldCursorForDelimitedDataParser(Reader in, char fieldDelimiter, char quote, char escape, IWarningCollector warningCollector, Supplier<String> dataSourceName) { this.warnings = warningCollector; this.dataSourceName = dataSourceName; @@ -89,13 +92,15 @@ state = State.INIT; this.quote = quote; this.fieldDelimiter = fieldDelimiter; + this.escape = escape; lastDelimiterPosition = -1; lastQuotePosition = -1; - lastDoubleQuotePosition = -1; + lastEscapedQuotePosition = -1; + lastEscapePosition = -1; quoteCount = 0; - doubleQuoteCount = 0; + escapedQuoteCount = 0; startedQuote = false; - isDoubleQuoteIncludedInThisField = false; + containsEscapedQuotes = false; lineCount = 1; fieldCount = 0; } @@ -116,8 +121,8 @@ return fStart == fEnd; } - public boolean fieldHasDoubleQuote() { - return isDoubleQuoteIncludedInThisField; + public boolean fieldHasEscapedQuote() { + return containsEscapedQuotes; } public int getFieldCount() { @@ -133,11 +138,12 @@ fieldCount = 0; lastDelimiterPosition = -1; lastQuotePosition = -1; - lastDoubleQuotePosition = -1; + lastEscapedQuotePosition = -1; + lastEscapePosition = -1; quoteCount = 0; - doubleQuoteCount = 0; + escapedQuoteCount = 0; startedQuote = false; - isDoubleQuoteIncludedInThisField = false; + containsEscapedQuotes = false; start = 0; end = recordLength; state = State.IN_RECORD; @@ -171,22 +177,34 @@ } p -= (s - start); lastQuotePosition -= (s - start); - lastDoubleQuotePosition -= (s - start); + lastEscapedQuotePosition -= (s - start); lastDelimiterPosition -= (s - start); } char ch = buffer[p]; // We perform rough format correctness (delimiter, quote) check here // to set the starting position of a record. // In the field level, more checking will be conducted. + if (ch == escape) { + // this may or may not be an escape. the next character must be a quote for it to be. + lastEscapePosition = p; + } if (ch == quote) { - startedQuote = true; - // check two quotes in a row - "". This is an escaped quote - if (lastQuotePosition == p - 1 && start != p - 1 && lastDoubleQuotePosition != p - 1) { - lastDoubleQuotePosition = p; + boolean couldBeEscapedQuote = + lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1; + if (quote == escape) { + startedQuote = true; + // check two quotes in a row that aren't at the start of a field if quote is escape, e.g. "" + if (couldBeEscapedQuote && start != p - 1) { + lastEscapedQuotePosition = p; + } + } else { + if (couldBeEscapedQuote) { + lastEscapedQuotePosition = p; + } } lastQuotePosition = p; } else if (ch == fieldDelimiter) { - if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1) { + if (startedQuote && lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1) { startedQuote = false; lastDelimiterPosition = p; } @@ -266,11 +284,12 @@ fieldCount++; // reset quote related values startedQuote = false; - isDoubleQuoteIncludedInThisField = false; + containsEscapedQuotes = false; lastQuotePosition = -1; - lastDoubleQuotePosition = -1; + lastEscapedQuotePosition = -1; + lastEscapePosition = -1; quoteCount = 0; - doubleQuoteCount = 0; + escapedQuoteCount = 0; char lastChar = '\0'; int p = start; @@ -280,7 +299,7 @@ boolean eof = !readMore(); p -= (s - start); lastQuotePosition -= (lastQuotePosition > -1) ? (s - start) : 0; - lastDoubleQuotePosition -= (lastDoubleQuotePosition > -1) ? (s - start) : 0; + lastEscapedQuotePosition -= (lastEscapedQuotePosition > -1) ? (s - start) : 0; lastDelimiterPosition -= (lastDelimiterPosition > -1) ? (s - start) : 0; if (eof) { state = State.EOF; @@ -288,8 +307,8 @@ fStart = start; fEnd = p; } else { - if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 - && quoteCount == doubleQuoteCount * 2 + 2) { + if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1 + && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) { // set the position of fStart to +1, fEnd to -1 to remove quote character fStart = start + 1; fEnd = p - 1; @@ -319,16 +338,18 @@ return Result.ERROR; } } - // Check double quotes - "". We check [start != p-2] + // Check escaped quotes - \ESC". We check [start != p-2] if escape is quote // to avoid false positive where there is no value in a field, - // since it looks like a double quote. However, it's not a double quote. + // since it looks like an escaped quote. However, it's not an escaped quote. // (e.g. if field2 has no value: // field1,"",field3 ... ) - if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 - && lastQuotePosition != start) { - isDoubleQuoteIncludedInThisField = true; - doubleQuoteCount++; - lastDoubleQuotePosition = p; + boolean couldBeEscaped = lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1; + boolean isEscapedQuote = + quote == escape ? couldBeEscaped && lastQuotePosition != start : couldBeEscaped; + if (isEscapedQuote) { + containsEscapedQuotes = true; + escapedQuoteCount++; + lastEscapedQuotePosition = p; } lastQuotePosition = p; quoteCount++; @@ -343,9 +364,9 @@ return Result.OK; } - if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 + if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1 && lastQuotePosition != start) { - // There is a quote right before the delimiter (e.g. ",) and it is not two quote, + // There is a quote right before the delimiter (e.g. ",) and it is not an escaped quote, // then the field contains a valid string. // We set the position of fStart to +1, fEnd to -1 to remove quote character fStart = start + 1; @@ -354,8 +375,8 @@ lastDelimiterPosition = p; startedQuote = false; return Result.OK; - } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastDoubleQuotePosition - && quoteCount == doubleQuoteCount * 2 + 2) { + } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastEscapedQuotePosition + && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) { // There is a quote before the delimiter, however it is not directly placed before the delimiter. // In this case, we throw an exception. // quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes. @@ -376,8 +397,8 @@ state = ch == '\n' ? State.EOR : State.CR; lastDelimiterPosition = p; return Result.OK; - } else if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 - && quoteCount == doubleQuoteCount * 2 + 2) { + } else if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1 + && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) { // set the position of fStart to +1, fEnd to -1 to remove quote character fStart = start + 1; fEnd = p - 1; @@ -388,6 +409,12 @@ return Result.OK; } } + if (ch == escape) { + //RFC4180 defines the escape character for quotes as quotes. however CSV is not a well-defined + //format, and so frequently nonstandard escaping such as C-style \ escaping is used. + //Therefore, we need to track potential escapes separately to support these cases. + lastEscapePosition = p; + } // count lines inside quotes if (ch == '\r' || (ch == '\n' && lastChar != '\r')) { lineCount++; @@ -421,17 +448,17 @@ return true; } - // Eliminate escaped double quotes("") in a field - public void eliminateDoubleQuote() { - int lastDoubleQuotePosition = -1; + // Eliminate escaped quotes("" by default) in a field + public void eliminateEscapeChar() { + int lastEsc = -1; int writepos = fStart; int readpos = fStart; int length = fEnd - fStart; // Find positions where double quotes appear for (int i = 0; i < length; i++) { // Skip double quotes - if (buffer[readpos] == quote && lastDoubleQuotePosition != readpos - 1) { - lastDoubleQuotePosition = readpos; + if (buffer[readpos] == escape && lastEsc != readpos - 1) { + lastEsc = readpos; readpos++; } else { // Moving characters except double quote to the front @@ -442,8 +469,8 @@ readpos++; } } - fEnd -= doubleQuoteCount; - isDoubleQuoteIncludedInThisField = false; + fEnd -= escapedQuoteCount; + containsEscapedQuotes = false; } private void warn(String message) { diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java index 5561ad1..10b3cd3 100644 --- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java @@ -40,7 +40,7 @@ reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); // skip header final FieldCursorForDelimitedDataParser cursor = - new FieldCursorForDelimitedDataParser(reader, ',', '"', null, () -> ""); + new FieldCursorForDelimitedDataParser(reader, ',', '"', '"', null, () -> ""); // get number of fields from header (first record is header) cursor.nextRecord(); int numOfFields = 0; @@ -55,8 +55,8 @@ while (cursor.nextRecord()) { int fieldNumber = 0; while ((lastResult = cursor.nextField()) == FieldCursorForDelimitedDataParser.Result.OK) { - if (cursor.fieldHasDoubleQuote()) { - cursor.eliminateDoubleQuote(); + if (cursor.fieldHasEscapedQuote()) { + cursor.eliminateEscapeChar(); } fieldNumber++; } -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18373 To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Change-Id: I50bebc4b8b683889855cb5dd048ab27d7c93af76 Gerrit-Change-Number: 18373 Gerrit-PatchSet: 3 Gerrit-Owner: Ian Maxon <[email protected]> Gerrit-Reviewer: Ali Alsuliman <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Hussain Towaileb <[email protected]> Gerrit-Reviewer: Ian Maxon <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-CC: Mehnaz Tabassum Mahin <[email protected]> Gerrit-MessageType: merged
