This is an automated email from the ASF dual-hosted git repository.

wyk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 13ea2d83b1 [ASTERIXDB-3429][EXT] Configurable CSV escape char
13ea2d83b1 is described below

commit 13ea2d83b1c201923f48162f643b91719618f891
Author: Ian Maxon <[email protected]>
AuthorDate: Mon Jun 17 22:35:58 2024 -0700

    [ASTERIXDB-3429][EXT] Configurable CSV escape char
    
    - user model changes: no
    - storage format changes: no
    - interface changes: yes
    
    Details:
    - Allow the CSV/delimited-text parser to use a configurable
      escape character, other than the quote character itself,
      to escape quotes that appear inside quoted fields
    
    Change-Id: I50bebc4b8b683889855cb5dd048ab27d7c93af76
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18373
    Integration-Tests: Jenkins <[email protected]>
    Reviewed-by: Ian Maxon <[email protected]>
    Reviewed-by: Ali Alsuliman <[email protected]>
    Tested-by: Jenkins <[email protected]>
---
 .../asterix-app/data/csv/nonstandard_escape.csv    |   1 +
 .../csv-parser-001/csv-parser-001.1.ddl.sqlpp      |   4 +-
 ...01.8.ddl.sqlpp => csv-parser-001.8.query.sqlpp} |   4 +-
 ...001.8.ddl.sqlpp => csv-parser-001.99.ddl.sqlpp} |   0
 .../load/csv_04/csv_04.2.update.sqlpp              |   2 +-
 .../load/csv_05/csv_05.2.update.sqlpp              |   2 +-
 .../load/csv_06/csv_06.2.update.sqlpp              |   2 +-
 .../load/csv_07/csv_07.2.update.sqlpp              |   2 +-
 .../csv_nonstandard.1.ddl.sqlpp}                   |   6 +-
 .../csv_nonstandard.2.update.sqlpp}                |   2 +-
 .../csv_nonstandard/csv_nonstandard.3.query.sqlpp} |  10 +-
 .../csv-parser-001/csv-parser-001.8.adm            |   1 +
 .../load/csv_nonstandard/csv_nonstandard.1.adm     |   1 +
 .../CSVToRecordWithMetadataAndPKConverter.java     |   6 +-
 .../reader/stream/QuotedLineRecordReader.java      |   2 +-
 .../external/parser/DelimitedDataParser.java       |  20 ++--
 .../parser/factory/DelimitedDataParserFactory.java |   4 +-
 .../external/util/ExternalDataConstants.java       |   2 +
 .../asterix/external/util/ExternalDataUtils.java   |   7 +-
 .../std/file/DelimitedDataTupleParserFactory.java  |  16 +--
 .../file/FieldCursorForDelimitedDataParser.java    | 115 +++++++++++++--------
 .../hyracks/dataflow/std/file/CursorTest.java      |   6 +-
 22 files changed, 136 insertions(+), 79 deletions(-)

diff --git a/asterixdb/asterix-app/data/csv/nonstandard_escape.csv 
b/asterixdb/asterix-app/data/csv/nonstandard_escape.csv
new file mode 100644
index 0000000000..b46a9f0f58
--- /dev/null
+++ b/asterixdb/asterix-app/data/csv/nonstandard_escape.csv
@@ -0,0 +1 @@
+1,"It says \"The quick, fox jumped over the lazy dog\""
\ No newline at end of file
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp
index 65816f67c8..6b8738c886 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp
@@ -26,10 +26,12 @@ CREATE TYPE t1 AS {f1: string, f2: string, f3: string, f4: 
string, f5: string};
 CREATE TYPE t2 AS {f1: string, f2: string, f3: string};
 CREATE TYPE t3 AS {f1: int?, f2: boolean, f3: string?};
 CREATE TYPE t4 AS {f1: string, f2: string, f3: string, f4: string};
+CREATE TYPE t5 AS {f1: int, f2: string};
 
 CREATE EXTERNAL DATASET ds1(t1) USING 
localfs(("path"="asterix_nc1://data/csv/sample_09.csv"), ("format"="CSV"), 
("header"="FALSE"));
 CREATE EXTERNAL DATASET ds2(t2) USING 
localfs(("path"="asterix_nc1://data/csv/sample_10.csv"), ("format"="Csv"), 
("header"="False"));
 CREATE EXTERNAL DATASET ds3(t1) USING 
localfs(("path"="asterix_nc1://data/csv/sample_11.csv"), ("format"="csv"), 
("header"="FALSE"));
 CREATE EXTERNAL DATASET ds4(t3) USING 
localfs(("path"="asterix_nc1://data/csv/sample_12.csv"), ("format"="csv"), 
("header"="True"), ("null"=""));
 CREATE EXTERNAL DATASET ds5(t4) USING 
localfs(("path"="asterix_nc1://data/csv/sample_13.csv"), ("format"="csv"), 
("header"="True"));
-CREATE EXTERNAL DATASET ds6(t4) USING 
localfs(("path"="asterix_nc1://data/csv/empty_lines.csv"), ("format"="csv"), 
("header"="false"));
\ No newline at end of file
+CREATE EXTERNAL DATASET ds6(t4) USING 
localfs(("path"="asterix_nc1://data/csv/empty_lines.csv"), ("format"="csv"), 
("header"="false"));
+CREATE EXTERNAL DATASET ds7(t5) USING 
localfs(("path"="asterix_nc1://data/csv/nonstandard_escape.csv"), 
("format"="csv"), ("header"="false"),("escape"="\\"));
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp
similarity index 94%
copy from 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp
copy to 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp
index 86a1b59399..f3a3606ea5 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp
@@ -17,4 +17,6 @@
  * under the License.
  */
 
-DROP DATAVERSE test;
\ No newline at end of file
+USE test;
+
+FROM ds7 v SELECT VALUE v ORDER BY v.f1;
\ No newline at end of file
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.99.ddl.sqlpp
similarity index 100%
copy from 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp
copy to 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.99.ddl.sqlpp
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
index 32df9605be..b79ae18168 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
@@ -26,5 +26,5 @@
 use temp;
 
 
-load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_03.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_03.csv"),("format"="csv"),("header"="false"));
 
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp
index 71a79c7a53..156bbf2981 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp
@@ -26,5 +26,5 @@
 use temp;
 
 
-load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_04_quote_error.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_04_quote_error.csv"),("format"="csv"),("header"="false"));
 
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp
index e7b19f5d43..68b4f4a0da 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp
@@ -27,5 +27,5 @@
 use temp;
 
 
-load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_05_space_error_1.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_05_space_error_1.csv"),("format"="csv"),("header"="false"));
 
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp
index 32988b69bf..103fe3a8f3 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp
@@ -26,5 +26,5 @@
 use temp;
 
 
-load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_06_space_error_2.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_06_space_error_2.csv"),("format"="csv"),("header"="false"));
 
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp
similarity index 85%
copy from 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
copy to 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp
index 32df9605be..191a8ace06 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp
@@ -23,8 +23,12 @@
  *
  */
 
+drop  dataverse temp if exists;
+create  dataverse temp;
+
 use temp;
 
+CREATE TYPE temp.test AS {f1: int, f2: string};
 
-load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_03.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+create  dataset testds(test) primary key id;
 
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp
similarity index 84%
copy from 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
copy to 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp
index 32df9605be..b5633a9b01 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp
@@ -26,5 +26,5 @@
 use temp;
 
 
-load  dataset testds using localfs 
(("path"="asterix_nc1://data/csv/sample_03.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+load  dataset testds using localfs 
((`path`=`asterix_nc1://data/csv/nonstandard_escape.csv`),(`format`=`csv`),(`header`=`false`),(`null`=``),(`escape`=`\\`);
 
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp
similarity index 86%
rename from 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp
rename to 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp
index 86a1b59399..7a9bdc6a7f 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp
@@ -16,5 +16,13 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ *
+ */
+
+use temp;
 
-DROP DATAVERSE test;
\ No newline at end of file
+FROM testds v SELECT VALUE v ORDER BY v.f1;
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm
 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm
new file mode 100644
index 0000000000..73d07f390d
--- /dev/null
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm
@@ -0,0 +1 @@
+{ "f1": 1, "f2": "It says \"The quick, fox jumped over the lazy dog\"" }
\ No newline at end of file
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm
 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm
new file mode 100644
index 0000000000..73d07f390d
--- /dev/null
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm
@@ -0,0 +1 @@
+{ "f1": 1, "f2": "It says \"The quick, fox jumped over the lazy dog\"" }
\ No newline at end of file
diff --git 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
index ad59f75c19..84d154172a 100644
--- 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
+++ 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
@@ -47,7 +47,7 @@ public class CSVToRecordWithMetadataAndPKConverter
             IExternalDataRuntimeContext context) {
         IWarningCollector warningCollector = 
context.getTaskContext().getWarningCollector();
         this.cursor = new FieldCursorForDelimitedDataParser(null, delimiter, 
ExternalDataConstants.QUOTE,
-                warningCollector, ExternalDataConstants.EMPTY_STRING);
+                ExternalDataConstants.QUOTE, warningCollector, 
ExternalDataConstants.EMPTY_STRING);
         this.record = new CharArrayRecord();
         this.valueIndex = valueIndex;
         this.recordWithMetadata = new RecordWithMetadataAndPK<>(record, 
metaType.getFieldTypes(), recordType,
@@ -64,8 +64,8 @@ public class CSVToRecordWithMetadataAndPKConverter
         int j = 0;
         FieldCursorForDelimitedDataParser.Result lastResult;
         while ((lastResult = cursor.nextField()) == 
FieldCursorForDelimitedDataParser.Result.OK) {
-            if (cursor.fieldHasDoubleQuote()) {
-                cursor.eliminateDoubleQuote();
+            if (cursor.fieldHasEscapedQuote()) {
+                cursor.eliminateEscapeChar();
             }
             if (i == valueIndex) {
                 record.setValue(cursor.getBuffer(), cursor.getFieldStart(), 
cursor.getFieldLength());
diff --git 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
index 4433b496ed..2035a8e473 100644
--- 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
+++ 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
@@ -55,7 +55,7 @@ public class QuotedLineRecordReader extends LineRecordReader {
         String quoteString = config.get(ExternalDataConstants.KEY_QUOTE);
         ExternalDataUtils.validateChar(quoteString, 
ExternalDataConstants.KEY_QUOTE);
         this.quote = quoteString.charAt(0);
-        this.escape = ExternalDataUtils.validateGetEscape(config);
+        this.escape = ExternalDataUtils.validateGetEscape(config, 
config.get(ExternalDataConstants.KEY_FORMAT));
     }
 
     @Override
diff --git 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
index b8a7480fcb..7f890e1e2c 100644
--- 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
+++ 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
@@ -61,6 +61,7 @@ public class DelimitedDataParser extends AbstractDataParser 
implements IStreamDa
     private final IWarningCollector warnings;
     private final char fieldDelimiter;
     private final char quote;
+    private final char escape;
     private final boolean hasHeader;
     private final ARecordType recordType;
     private final IARecordBuilder recBuilder;
@@ -79,14 +80,15 @@ public class DelimitedDataParser extends AbstractDataParser 
implements IStreamDa
     private FieldCursorForDelimitedDataParser cursor;
 
     public DelimitedDataParser(IExternalDataRuntimeContext context, 
IValueParserFactory[] valueParserFactories,
-            char fieldDelimiter, char quote, boolean hasHeader, ARecordType 
recordType, boolean isStreamParser,
-            String nullString) throws HyracksDataException {
+            char fieldDelimiter, char quote, char escape, boolean hasHeader, 
ARecordType recordType,
+            boolean isStreamParser, String nullString) throws 
HyracksDataException {
         this.dataSourceName = context.getDatasourceNameSupplier();
         this.lineNumber = context.getLineNumberSupplier();
         this.warnings = context.getTaskContext().getWarningCollector();
         this.valueEmbedder = context.getValueEmbedder();
         this.fieldDelimiter = fieldDelimiter;
         this.quote = quote;
+        this.escape = escape;
         this.hasHeader = hasHeader;
         this.recordType = recordType;
         valueParsers = new IValueParser[valueParserFactories.length];
@@ -127,7 +129,7 @@ public class DelimitedDataParser extends AbstractDataParser 
implements IStreamDa
             fieldNames[i] = name;
         }
         if (!isStreamParser) {
-            cursor = new FieldCursorForDelimitedDataParser(null, 
this.fieldDelimiter, quote, warnings,
+            cursor = new FieldCursorForDelimitedDataParser(null, 
this.fieldDelimiter, quote, escape, warnings,
                     this::getDataSourceName);
         }
         this.nullChars = nullString != null ? nullString.toCharArray() : null;
@@ -186,8 +188,8 @@ public class DelimitedDataParser extends AbstractDataParser 
implements IStreamDa
                     }
                     fieldValueBufferOutput.writeByte(fieldTypeTags[i]);
                     // Eliminate double quotes in the field that we are going 
to parse
-                    if (cursor.fieldHasDoubleQuote()) {
-                        cursor.eliminateDoubleQuote();
+                    if (cursor.fieldHasEscapedQuote()) {
+                        cursor.eliminateEscapeChar();
                     }
                     boolean success = 
valueParsers[i].parse(cursor.getBuffer(), cursor.getFieldStart(),
                             cursor.getFieldLength(), fieldValueBufferOutput);
@@ -232,8 +234,8 @@ public class DelimitedDataParser extends AbstractDataParser 
implements IStreamDa
     @Override
     public void setInputStream(InputStream in) throws IOException {
         // TODO(ali): revisit this in regards to stream
-        cursor = new FieldCursorForDelimitedDataParser(new 
InputStreamReader(in), fieldDelimiter, quote, warnings,
-                this::getDataSourceName);
+        cursor = new FieldCursorForDelimitedDataParser(new 
InputStreamReader(in), fieldDelimiter, quote, escape,
+                warnings, this::getDataSourceName);
         if (hasHeader) {
             cursor.nextRecord();
             FieldCursorForDelimitedDataParser.Result result;
@@ -249,8 +251,8 @@ public class DelimitedDataParser extends AbstractDataParser 
implements IStreamDa
     @Override
     public boolean reset(InputStream in) throws IOException {
         // TODO(ali): revisit this in regards to stream
-        cursor = new FieldCursorForDelimitedDataParser(new 
InputStreamReader(in), fieldDelimiter, quote, warnings,
-                this::getDataSourceName);
+        cursor = new FieldCursorForDelimitedDataParser(new 
InputStreamReader(in), fieldDelimiter, quote, escape,
+                warnings, this::getDataSourceName);
         return true;
     }
 
diff --git 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
index 8b88a69de9..bad742e71a 100644
--- 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
+++ 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
@@ -50,9 +50,11 @@ public class DelimitedDataParserFactory extends 
AbstractRecordStreamParserFactor
         IValueParserFactory[] valueParserFactories = 
ExternalDataUtils.getValueParserFactories(recordType);
         char delimiter = ExternalDataUtils.validateGetDelimiter(configuration);
         char quote = ExternalDataUtils.validateGetQuote(configuration, 
delimiter);
+        char escape =
+                ExternalDataUtils.validateGetEscape(configuration, 
configuration.get(ExternalDataConstants.KEY_FORMAT));
         boolean hasHeader = ExternalDataUtils.hasHeader(configuration);
         String nullString = 
configuration.get(ExternalDataConstants.KEY_NULL_STR);
-        return new DelimitedDataParser(context, valueParserFactories, 
delimiter, quote, hasHeader, recordType,
+        return new DelimitedDataParser(context, valueParserFactories, 
delimiter, quote, escape, hasHeader, recordType,
                 
ExternalDataUtils.getDataSourceType(configuration).equals(DataSourceType.STREAM),
 nullString);
     }
 
diff --git 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
index 5dfa803210..3139be7f70 100644
--- 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
+++ 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
@@ -246,6 +246,8 @@ public class ExternalDataConstants {
      * Constant characters
      */
     public static final char ESCAPE = '\\';
+
+    public static final char CSV_ESCAPE = '\"';
     public static final char QUOTE = '"';
     public static final char SPACE = ' ';
     public static final char TAB = '\t';
diff --git 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
index 3298759bf3..e5b2c9e01a 100644
--- 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
+++ 
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
@@ -143,7 +143,10 @@ public class ExternalDataUtils {
         return quote;
     }
 
-    public static char validateGetEscape(Map<String, String> configuration) 
throws HyracksDataException {
+    public static char validateGetEscape(Map<String, String> configuration, 
String format) throws HyracksDataException {
+        if (ExternalDataConstants.FORMAT_CSV.equals(format)) {
+            return validateCharOrDefault(configuration, KEY_ESCAPE, 
ExternalDataConstants.CSV_ESCAPE);
+        }
         return validateCharOrDefault(configuration, KEY_ESCAPE, 
ExternalDataConstants.ESCAPE);
     }
 
@@ -578,7 +581,7 @@ public class ExternalDataUtils {
         }
         char delimiter = validateGetDelimiter(configuration);
         validateGetQuote(configuration, delimiter);
-        validateGetEscape(configuration);
+        validateGetEscape(configuration, format);
         String value = 
configuration.get(ExternalDataConstants.KEY_REDACT_WARNINGS);
         if (value != null && !isBoolean(value)) {
             throw new RuntimeDataException(ErrorCode.INVALID_REQ_PARAM_VAL, 
ExternalDataConstants.KEY_REDACT_WARNINGS,
diff --git 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
index b8b2ba89f7..34d17c9434 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
@@ -43,16 +43,18 @@ public class DelimitedDataTupleParserFactory implements 
ITupleParserFactory {
     private IValueParserFactory[] valueParserFactories;
     private char fieldDelimiter;
     private char quote;
+    private char escape;
 
     public DelimitedDataTupleParserFactory(IValueParserFactory[] 
fieldParserFactories, char fieldDelimiter) {
-        this(fieldParserFactories, fieldDelimiter, '\"');
+        this(fieldParserFactories, fieldDelimiter, '\"', '\"');
     }
 
-    public DelimitedDataTupleParserFactory(IValueParserFactory[] 
fieldParserFactories, char fieldDelimiter,
-            char quote) {
+    public DelimitedDataTupleParserFactory(IValueParserFactory[] 
fieldParserFactories, char fieldDelimiter, char quote,
+            char escape) {
         this.valueParserFactories = fieldParserFactories;
         this.fieldDelimiter = fieldDelimiter;
         this.quote = quote;
+        this.escape = escape;
     }
 
     @Override
@@ -74,7 +76,7 @@ public class DelimitedDataTupleParserFactory implements 
ITupleParserFactory {
                     DataOutput dos = tb.getDataOutput();
 
                     FieldCursorForDelimitedDataParser cursor = new 
FieldCursorForDelimitedDataParser(
-                            new InputStreamReader(in), fieldDelimiter, quote, 
warningCollector, () -> "");
+                            new InputStreamReader(in), fieldDelimiter, quote, 
escape, warningCollector, () -> "");
                     while (cursor.nextRecord()) {
                         tb.reset();
                         for (int i = 0; i < valueParsers.length; ++i) {
@@ -88,9 +90,9 @@ public class DelimitedDataTupleParserFactory implements 
ITupleParserFactory {
                                 default:
                                     throw new IllegalStateException();
                             }
-                            // Eliminate double quotes in the field that we 
are going to parse
-                            if (cursor.fieldHasDoubleQuote()) {
-                                cursor.eliminateDoubleQuote();
+                            // Eliminate escaped quotes in the field that we 
are going to parse
+                            if (cursor.fieldHasEscapedQuote()) {
+                                cursor.eliminateEscapeChar();
                             }
                             if (!valueParsers[i].parse(cursor.getBuffer(), 
cursor.getFieldStart(),
                                     cursor.getFieldLength(), dos)) {
diff --git 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
index ffc87cd3ce..7cba88cf77 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
@@ -53,8 +53,8 @@ public class FieldCursorForDelimitedDataParser {
     private int fEnd; //end position for field
     private long lineCount; //count of lines
     private int fieldCount; //count of fields in current record
-    private int doubleQuoteCount; //count of double quotes
-    private boolean isDoubleQuoteIncludedInThisField; //does current field 
include double quotes
+    private int escapedQuoteCount; //count of escaped quotes
+    private boolean containsEscapedQuotes; //does current field contain 
escaped quotes
 
     private static final int INITIAL_BUFFER_SIZE = 4096;//initial buffer size
     private static final int INCREMENT = 4096; //increment size
@@ -66,15 +66,18 @@ public class FieldCursorForDelimitedDataParser {
     private State state; //state (see states above)
 
     private int lastQuotePosition; //position of last quote
-    private int lastDoubleQuotePosition; //position of last double quote
+    private int lastEscapedQuotePosition; //position of last escaped quote
     private int lastDelimiterPosition; //position of last delimiter
+    private int lastEscapePosition; //position of last escape
     private int quoteCount; //count of single quotes
     private boolean startedQuote; //whether a quote has been started
 
     private final char quote; //the quote character
     private final char fieldDelimiter; //the delimiter
 
-    public FieldCursorForDelimitedDataParser(Reader in, char fieldDelimiter, 
char quote,
+    private final char escape;
+
+    public FieldCursorForDelimitedDataParser(Reader in, char fieldDelimiter, 
char quote, char escape,
             IWarningCollector warningCollector, Supplier<String> 
dataSourceName) {
         this.warnings = warningCollector;
         this.dataSourceName = dataSourceName;
@@ -89,13 +92,15 @@ public class FieldCursorForDelimitedDataParser {
         state = State.INIT;
         this.quote = quote;
         this.fieldDelimiter = fieldDelimiter;
+        this.escape = escape;
         lastDelimiterPosition = -1;
         lastQuotePosition = -1;
-        lastDoubleQuotePosition = -1;
+        lastEscapedQuotePosition = -1;
+        lastEscapePosition = -1;
         quoteCount = 0;
-        doubleQuoteCount = 0;
+        escapedQuoteCount = 0;
         startedQuote = false;
-        isDoubleQuoteIncludedInThisField = false;
+        containsEscapedQuotes = false;
         lineCount = 1;
         fieldCount = 0;
     }
@@ -116,8 +121,8 @@ public class FieldCursorForDelimitedDataParser {
         return fStart == fEnd;
     }
 
-    public boolean fieldHasDoubleQuote() {
-        return isDoubleQuoteIncludedInThisField;
+    public boolean fieldHasEscapedQuote() {
+        return containsEscapedQuotes;
     }
 
     public int getFieldCount() {
@@ -133,11 +138,12 @@ public class FieldCursorForDelimitedDataParser {
         fieldCount = 0;
         lastDelimiterPosition = -1;
         lastQuotePosition = -1;
-        lastDoubleQuotePosition = -1;
+        lastEscapedQuotePosition = -1;
+        lastEscapePosition = -1;
         quoteCount = 0;
-        doubleQuoteCount = 0;
+        escapedQuoteCount = 0;
         startedQuote = false;
-        isDoubleQuoteIncludedInThisField = false;
+        containsEscapedQuotes = false;
         start = 0;
         end = recordLength;
         state = State.IN_RECORD;
@@ -171,22 +177,34 @@ public class FieldCursorForDelimitedDataParser {
                             }
                             p -= (s - start);
                             lastQuotePosition -= (s - start);
-                            lastDoubleQuotePosition -= (s - start);
+                            lastEscapedQuotePosition -= (s - start);
                             lastDelimiterPosition -= (s - start);
                         }
                         char ch = buffer[p];
                         // We perform rough format correctness (delimiter, 
quote) check here
                         // to set the starting position of a record.
                         // In the field level, more checking will be conducted.
+                        if (ch == escape) {
+                            // This may or may not be an escape; it is one only if the next 
character is a quote.
+                            lastEscapePosition = p;
+                        }
                         if (ch == quote) {
-                            startedQuote = true;
-                            // check two quotes in a row - "". This is an 
escaped quote
-                            if (lastQuotePosition == p - 1 && start != p - 1 
&& lastDoubleQuotePosition != p - 1) {
-                                lastDoubleQuotePosition = p;
+                            boolean couldBeEscapedQuote =
+                                    lastEscapePosition == p - 1 && 
lastEscapedQuotePosition != p - 1;
+                            if (quote == escape) {
+                                startedQuote = true;
+                                // check two quotes in a row that aren't at 
the start of a field if quote is escape, e.g. ""
+                                if (couldBeEscapedQuote && start != p - 1) {
+                                    lastEscapedQuotePosition = p;
+                                }
+                            } else {
+                                if (couldBeEscapedQuote) {
+                                    lastEscapedQuotePosition = p;
+                                }
                             }
                             lastQuotePosition = p;
                         } else if (ch == fieldDelimiter) {
-                            if (startedQuote && lastQuotePosition == p - 1 && 
lastDoubleQuotePosition != p - 1) {
+                            if (startedQuote && lastQuotePosition == p - 1 && 
lastEscapedQuotePosition != p - 1) {
                                 startedQuote = false;
                                 lastDelimiterPosition = p;
                             }
@@ -266,11 +284,12 @@ public class FieldCursorForDelimitedDataParser {
                 fieldCount++;
                 // reset quote related values
                 startedQuote = false;
-                isDoubleQuoteIncludedInThisField = false;
+                containsEscapedQuotes = false;
                 lastQuotePosition = -1;
-                lastDoubleQuotePosition = -1;
+                lastEscapedQuotePosition = -1;
+                lastEscapePosition = -1;
                 quoteCount = 0;
-                doubleQuoteCount = 0;
+                escapedQuoteCount = 0;
 
                 char lastChar = '\0';
                 int p = start;
@@ -280,7 +299,7 @@ public class FieldCursorForDelimitedDataParser {
                         boolean eof = !readMore();
                         p -= (s - start);
                         lastQuotePosition -= (lastQuotePosition > -1) ? (s - 
start) : 0;
-                        lastDoubleQuotePosition -= (lastDoubleQuotePosition > 
-1) ? (s - start) : 0;
+                        lastEscapedQuotePosition -= (lastEscapedQuotePosition 
> -1) ? (s - start) : 0;
                         lastDelimiterPosition -= (lastDelimiterPosition > -1) 
? (s - start) : 0;
                         if (eof) {
                             state = State.EOF;
@@ -288,8 +307,8 @@ public class FieldCursorForDelimitedDataParser {
                                 fStart = start;
                                 fEnd = p;
                             } else {
-                                if (lastQuotePosition == p - 1 && 
lastDoubleQuotePosition != p - 1
-                                        && quoteCount == doubleQuoteCount * 2 
+ 2) {
+                                if (lastQuotePosition == p - 1 && 
lastEscapedQuotePosition != p - 1
+                                        && quoteCount == escapedQuoteCount * 
(escape == quote ? 2 : 1) + 2) {
                                     // set the position of fStart to +1, fEnd 
to -1 to remove quote character
                                     fStart = start + 1;
                                     fEnd = p - 1;
@@ -319,16 +338,18 @@ public class FieldCursorForDelimitedDataParser {
                                 return Result.ERROR;
                             }
                         }
-                        // Check double quotes - "". We check [start != p-2]
+                        // Check escaped quotes - \ESC". We check [start != 
p-2] if escape is quote
                         // to avoid false positive where there is no value in 
a field,
-                        // since it looks like a double quote. However, it's 
not a double quote.
+                        // since it looks like an escaped quote. However, it's 
not an escaped quote.
                         // (e.g. if field2 has no value:
                         //       field1,"",field3 ... )
-                        if (lastQuotePosition == p - 1 && 
lastDoubleQuotePosition != p - 1
-                                && lastQuotePosition != start) {
-                            isDoubleQuoteIncludedInThisField = true;
-                            doubleQuoteCount++;
-                            lastDoubleQuotePosition = p;
+                        boolean couldBeEscaped = lastEscapePosition == p - 1 
&& lastEscapedQuotePosition != p - 1;
+                        boolean isEscapedQuote =
+                                quote == escape ? couldBeEscaped && 
lastQuotePosition != start : couldBeEscaped;
+                        if (isEscapedQuote) {
+                            containsEscapedQuotes = true;
+                            escapedQuoteCount++;
+                            lastEscapedQuotePosition = p;
                         }
                         lastQuotePosition = p;
                         quoteCount++;
@@ -343,9 +364,9 @@ public class FieldCursorForDelimitedDataParser {
                             return Result.OK;
                         }
 
-                        if (lastQuotePosition == p - 1 && 
lastDoubleQuotePosition != p - 1
+                        if (lastQuotePosition == p - 1 && 
lastEscapedQuotePosition != p - 1
                                 && lastQuotePosition != start) {
-                            // There is a quote right before the delimiter 
(e.g. ",)  and it is not two quote,
+                            // There is a quote right before the delimiter 
(e.g. ",)  and it is not an escaped quote,
                             // then the field contains a valid string.
                             // We set the position of fStart to +1, fEnd to -1 
to remove quote character
                             fStart = start + 1;
@@ -354,8 +375,8 @@ public class FieldCursorForDelimitedDataParser {
                             lastDelimiterPosition = p;
                             startedQuote = false;
                             return Result.OK;
-                        } else if (lastQuotePosition < p - 1 && 
lastQuotePosition != lastDoubleQuotePosition
-                                && quoteCount == doubleQuoteCount * 2 + 2) {
+                        } else if (lastQuotePosition < p - 1 && 
lastQuotePosition != lastEscapedQuotePosition
+                                && quoteCount == escapedQuoteCount * (escape 
== quote ? 2 : 1) + 2) {
                             // There is a quote before the delimiter, however 
it is not directly placed before the delimiter.
                             // In this case, we throw an exception.
                             // quoteCount == doubleQuoteCount * 2 + 2 : only 
true when we have two quotes except double-quotes.
@@ -376,8 +397,8 @@ public class FieldCursorForDelimitedDataParser {
                             state = ch == '\n' ? State.EOR : State.CR;
                             lastDelimiterPosition = p;
                             return Result.OK;
-                        } else if (lastQuotePosition == p - 1 && 
lastDoubleQuotePosition != p - 1
-                                && quoteCount == doubleQuoteCount * 2 + 2) {
+                        } else if (lastQuotePosition == p - 1 && 
lastEscapedQuotePosition != p - 1
+                                && quoteCount == escapedQuoteCount * (escape 
== quote ? 2 : 1) + 2) {
                             // set the position of fStart to +1, fEnd to -1 to 
remove quote character
                             fStart = start + 1;
                             fEnd = p - 1;
@@ -388,6 +409,12 @@ public class FieldCursorForDelimitedDataParser {
                             return Result.OK;
                         }
                     }
+                    if (ch == escape) {
+                        // RFC 4180 defines the escape character for a quote as 
the quote itself. However, CSV is not a well-defined
+                        // format, and nonstandard escaping such as C-style \ 
escaping is frequently used.
+                        // Therefore, we need to track potential escapes 
separately to support these cases.
+                        lastEscapePosition = p;
+                    }
                     // count lines inside quotes
                     if (ch == '\r' || (ch == '\n' && lastChar != '\r')) {
                         lineCount++;
@@ -421,17 +448,17 @@ public class FieldCursorForDelimitedDataParser {
         return true;
     }
 
-    // Eliminate escaped double quotes("") in a field
-    public void eliminateDoubleQuote() {
-        int lastDoubleQuotePosition = -1;
+    // Eliminate escaped quotes("" by default) in a field
+    public void eliminateEscapeChar() {
+        int lastEsc = -1;
         int writepos = fStart;
         int readpos = fStart;
         int length = fEnd - fStart;
         // Find positions where double quotes appear
         for (int i = 0; i < length; i++) {
             // Skip double quotes
-            if (buffer[readpos] == quote && lastDoubleQuotePosition != readpos 
- 1) {
-                lastDoubleQuotePosition = readpos;
+            if (buffer[readpos] == escape && lastEsc != readpos - 1) {
+                lastEsc = readpos;
                 readpos++;
             } else {
                 // Moving characters except double quote to the front
@@ -442,8 +469,8 @@ public class FieldCursorForDelimitedDataParser {
                 readpos++;
             }
         }
-        fEnd -= doubleQuoteCount;
-        isDoubleQuoteIncludedInThisField = false;
+        fEnd -= escapedQuoteCount;
+        containsEscapedQuotes = false;
     }
 
     private void warn(String message) {
diff --git 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
index 5561ad1049..10b3cd3f4c 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
@@ -40,7 +40,7 @@ public class CursorTest {
             reader = new BufferedReader(new InputStreamReader(in, 
StandardCharsets.UTF_8));
             // skip header
             final FieldCursorForDelimitedDataParser cursor =
-                    new FieldCursorForDelimitedDataParser(reader, ',', '"', 
null, () -> "");
+                    new FieldCursorForDelimitedDataParser(reader, ',', '"', 
'"', null, () -> "");
             // get number of fields from header (first record is header)
             cursor.nextRecord();
             int numOfFields = 0;
@@ -55,8 +55,8 @@ public class CursorTest {
             while (cursor.nextRecord()) {
                 int fieldNumber = 0;
                 while ((lastResult = cursor.nextField()) == 
FieldCursorForDelimitedDataParser.Result.OK) {
-                    if (cursor.fieldHasDoubleQuote()) {
-                        cursor.eliminateDoubleQuote();
+                    if (cursor.fieldHasEscapedQuote()) {
+                        cursor.eliminateEscapeChar();
                     }
                     fieldNumber++;
                 }


Reply via email to