>From Shahrzad Shirazi <shaji...@ucr.edu>: Shahrzad Shirazi has submitted this change. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20146 )
Change subject: [NO ISSUE][COMP] Add None as quote option for CSV in external collections ...................................................................... [NO ISSUE][COMP] Add None as quote option for CSV in external collections - user model changes: no - storage format changes: no - interface changes: no Ext-ref: MB-67801 Change-Id: I3812d2d1306282e9f02e8b77e1f79ac6b203cabe Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20146 Tested-by: Ali Alsuliman <ali.al.solai...@gmail.com> Reviewed-by: Ali Alsuliman <ali.al.solai...@gmail.com> --- M asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.9.adm M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java M asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/csv/CSVUtils.java A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-009.1.ddl.sqlpp M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java M hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java A asterixdb/asterix-app/data/csv/sample_14.csv M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java M hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java M hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java 13 files changed, 104 insertions(+), 31 deletions(-) Approvals: Ali Alsuliman: Looks good to me, approved; Verified Objections: Anon. E. Moose #1000171: Violations found diff --git a/asterixdb/asterix-app/data/csv/sample_14.csv b/asterixdb/asterix-app/data/csv/sample_14.csv new file mode 100644 index 0000000..6b7178a --- /dev/null +++ b/asterixdb/asterix-app/data/csv/sample_14.csv Binary files differ diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp index 6b8738c..38f4ca4 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp @@ -35,3 +35,4 @@ CREATE EXTERNAL DATASET ds5(t4) USING localfs(("path"="asterix_nc1://data/csv/sample_13.csv"), ("format"="csv"), ("header"="True")); CREATE EXTERNAL DATASET ds6(t4) USING localfs(("path"="asterix_nc1://data/csv/empty_lines.csv"), ("format"="csv"), ("header"="false")); CREATE EXTERNAL DATASET ds7(t5) USING localfs(("path"="asterix_nc1://data/csv/nonstandard_escape.csv"), ("format"="csv"), ("header"="false"),("escape"="\\")); +CREATE EXTERNAL DATASET ds8(t4) USING localfs(("path"="asterix_nc1://data/csv/sample_14.csv"), ("format"="csv"), ("header"="true"), ("quote"="none")); diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-009.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-009.1.ddl.sqlpp new file mode 100644 index 0000000..3ac5d68 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-009.1.ddl.sqlpp @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +FROM ds8 v SELECT VALUE v ORDER BY v.f1; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.9.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.9.adm new file mode 100644 index 0000000..89f647b --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.9.adm @@ -0,0 +1,3 @@ +{ "f1": "123", "f2": "1", "f3": "good", "f4": "rec\"ommend" } +{ "f1": "45\"6", "f2": "1", "f3": "good", "f4": "\u0000l\"l" } +{ "f1": "789", "f2": "1", "f3": "\"bad\"", "f4": "\"recommend\"" } \ No newline at end of file diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java index 84d1541..d0713fd4 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java @@ -47,7 +47,7 @@ IExternalDataRuntimeContext context) { IWarningCollector warningCollector = context.getTaskContext().getWarningCollector(); this.cursor = new FieldCursorForDelimitedDataParser(null, delimiter, ExternalDataConstants.QUOTE, - ExternalDataConstants.QUOTE, warningCollector, ExternalDataConstants.EMPTY_STRING); + ExternalDataConstants.QUOTE, warningCollector, ExternalDataConstants.EMPTY_STRING, true); this.record = new CharArrayRecord(); this.valueIndex = valueIndex; this.recordWithMetadata = new RecordWithMetadataAndPK<>(record, metaType.getFieldTypes(), recordType, diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java index 5fe0589..1895e98 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Map; +import org.apache.asterix.dataflow.data.nontagged.printers.csv.CSVUtils; import org.apache.asterix.external.api.AsterixInputStream; import org.apache.asterix.external.util.ExternalDataConstants; import org.apache.asterix.external.util.ExternalDataUtils; @@ -44,6 +45,7 @@ private int readLength; private boolean inQuote; private IWarningCollector warnings; + private boolean quoteCheckNeeded; private static final List<String> recordReaderFormats = Collections.unmodifiableList( Arrays.asList(ExternalDataConstants.FORMAT_DELIMITED_TEXT, ExternalDataConstants.FORMAT_CSV)); private static final String REQUIRED_CONFIGS = KEY_QUOTE; @@ -55,7 +57,12 @@ this.warnings = ctx.getWarningCollector(); String quoteString = config.get(KEY_QUOTE); ExternalDataUtils.validateChar(quoteString, KEY_QUOTE); - this.quote = quoteString.charAt(0); + if (CSVUtils.NONE.equalsIgnoreCase(quoteString)) { + this.quoteCheckNeeded = false; + } else { + this.quoteCheckNeeded = true; + this.quote = quoteString.charAt(0); + } this.escape = ExternalDataUtils.validateGetEscape(config, config.get(ExternalDataConstants.KEY_FORMAT)); } @@ -88,6 +95,9 @@ @Override public boolean hasNext() throws IOException { + if (!quoteCheckNeeded) { + return super.hasNext(); + } while (true) { if (done) { return false; diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java index 7f890e1..8fba466 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java @@ -78,10 +78,11 @@ private final IExternalFilterValueEmbedder valueEmbedder; private final ParserContext parserContext; private FieldCursorForDelimitedDataParser cursor; + private final boolean qouteCheckNeeded; public DelimitedDataParser(IExternalDataRuntimeContext context, IValueParserFactory[] valueParserFactories, char fieldDelimiter, char quote, char escape, boolean hasHeader, ARecordType recordType, - boolean isStreamParser, String nullString) throws HyracksDataException { + boolean isStreamParser, String nullString, boolean qouteCheckNeeded) throws HyracksDataException { this.dataSourceName = context.getDatasourceNameSupplier(); this.lineNumber = context.getLineNumberSupplier(); this.warnings = context.getTaskContext().getWarningCollector(); @@ -91,6 +92,7 @@ this.escape = escape; this.hasHeader = hasHeader; this.recordType = recordType; + this.qouteCheckNeeded = qouteCheckNeeded; valueParsers = new IValueParser[valueParserFactories.length]; for (int i = 0; i < valueParserFactories.length; ++i) { valueParsers[i] = valueParserFactories[i].createValueParser(); @@ -130,7 +132,7 @@ } if (!isStreamParser) { cursor = new FieldCursorForDelimitedDataParser(null, this.fieldDelimiter, quote, escape, warnings, - this::getDataSourceName); + this::getDataSourceName, qouteCheckNeeded); } this.nullChars = nullString != null ? nullString.toCharArray() : null; this.parserContext = new ParserContext(); @@ -235,7 +237,7 @@ public void setInputStream(InputStream in) throws IOException { // TODO(ali): revisit this in regards to stream cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, escape, - warnings, this::getDataSourceName); + warnings, this::getDataSourceName, qouteCheckNeeded); if (hasHeader) { cursor.nextRecord(); FieldCursorForDelimitedDataParser.Result result; @@ -252,7 +254,7 @@ public boolean reset(InputStream in) throws IOException { // TODO(ali): revisit this in regards to stream cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, escape, - warnings, this::getDataSourceName); + warnings, this::getDataSourceName, qouteCheckNeeded); return true; } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java index 65cf552..145c065 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java @@ -51,13 +51,15 @@ private DelimitedDataParser createParser(IExternalDataRuntimeContext context) throws HyracksDataException { IValueParserFactory[] valueParserFactories = ExternalDataUtils.getValueParserFactories(recordType); char delimiter = ExternalDataUtils.validateGetDelimiter(configuration); + boolean qouteCheckNeeded = ExternalDataUtils.isQuoteNeeded(configuration); char quote = ExternalDataUtils.validateGetQuote(configuration, delimiter); char escape = ExternalDataUtils.validateGetEscape(configuration, configuration.get(ExternalDataConstants.KEY_FORMAT)); boolean hasHeader = ExternalDataUtils.hasHeader(configuration); String nullString = configuration.get(KEY_NULL_STR); return new DelimitedDataParser(context, valueParserFactories, delimiter, quote, escape, hasHeader, recordType, - ExternalDataUtils.getDataSourceType(configuration).equals(DataSourceType.STREAM), nullString); + ExternalDataUtils.getDataSourceType(configuration).equals(DataSourceType.STREAM), nullString, + qouteCheckNeeded); } @Override diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java index 591829b..9926e09 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java @@ -72,6 +72,7 @@ import org.apache.asterix.common.library.ILibraryManager; import org.apache.asterix.common.metadata.DataverseName; import org.apache.asterix.common.metadata.Namespace; +import org.apache.asterix.dataflow.data.nontagged.printers.csv.CSVUtils; import org.apache.asterix.external.api.IDataParserFactory; import org.apache.asterix.external.api.IExternalDataSourceFactory.DataSourceType; import org.apache.asterix.external.api.IInputStreamFactory; @@ -165,6 +166,10 @@ return quote; } + public static boolean isQuoteNeeded(Map<String, String> configuration) { + return !CSVUtils.NONE.equalsIgnoreCase(configuration.get(KEY_QUOTE)); + } + public static char validateGetEscape(Map<String, String> configuration, String format) throws HyracksDataException { if (ExternalDataConstants.FORMAT_CSV.equals(format)) { return validateCharOrDefault(configuration, KEY_ESCAPE, ExternalDataConstants.CSV_ESCAPE); @@ -690,10 +695,15 @@ return defaultValue; } validateChar(value, key); - return value.charAt(0); + return CSVUtils.extractSingleChar(value); + } public static void validateChar(String parameterValue, String parameterName) throws RuntimeDataException { + if (parameterName.equals(KEY_QUOTE) && CSVUtils.NONE.equalsIgnoreCase(parameterValue)) { + return; + + } if (parameterValue.length() != 1) { throw new RuntimeDataException(ErrorCode.INVALID_CHAR_LENGTH, parameterValue, parameterName); } diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/csv/CSVUtils.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/csv/CSVUtils.java index 37d4dbd..e169371 100644 --- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/csv/CSVUtils.java +++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/csv/CSVUtils.java @@ -118,7 +118,7 @@ return value != null ? extractSingleChar(value) : extractSingleChar(defaultValue); } - private static char extractSingleChar(String input) { + public static char extractSingleChar(String input) { if (CSVUtils.NONE.equalsIgnoreCase(input)) { return CSVUtils.NULL_CHAR; } else { diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java index 34d17c9..c0799d9 100644 --- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java @@ -76,7 +76,7 @@ DataOutput dos = tb.getDataOutput(); FieldCursorForDelimitedDataParser cursor = new FieldCursorForDelimitedDataParser( - new InputStreamReader(in), fieldDelimiter, quote, escape, warningCollector, () -> ""); + new InputStreamReader(in), fieldDelimiter, quote, escape, warningCollector, () -> "", true); while (cursor.nextRecord()) { tb.reset(); for (int i = 0; i < valueParsers.length; ++i) { diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java index 7cba88c..7a16e0f 100644 --- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java @@ -74,11 +74,12 @@ private final char quote; //the quote character private final char fieldDelimiter; //the delimiter + private final boolean qouteCheckNeeded; private final char escape; public FieldCursorForDelimitedDataParser(Reader in, char fieldDelimiter, char quote, char escape, - IWarningCollector warningCollector, Supplier<String> dataSourceName) { + IWarningCollector warningCollector, Supplier<String> dataSourceName, boolean qouteCheckNeeded) { this.warnings = warningCollector; this.dataSourceName = dataSourceName; this.in = in; @@ -103,6 +104,7 @@ containsEscapedQuotes = false; lineCount = 1; fieldCount = 0; + this.qouteCheckNeeded = qouteCheckNeeded; } public char[] getBuffer() { @@ -188,27 +190,30 @@ // this may or may not be an escape. the next character must be a quote for it to be. lastEscapePosition = p; } - if (ch == quote) { - boolean couldBeEscapedQuote = - lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1; - if (quote == escape) { - startedQuote = true; - // check two quotes in a row that aren't at the start of a field if quote is escape, e.g. "" - if (couldBeEscapedQuote && start != p - 1) { - lastEscapedQuotePosition = p; + if (qouteCheckNeeded) { + if (ch == quote) { + boolean couldBeEscapedQuote = + lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1; + if (quote == escape) { + startedQuote = true; + // check two quotes in a row that aren't at the start of a field if quote is escape, e.g. "" + if (couldBeEscapedQuote && start != p - 1) { + lastEscapedQuotePosition = p; + } + } else { + if (couldBeEscapedQuote) { + lastEscapedQuotePosition = p; + } } - } else { - if (couldBeEscapedQuote) { - lastEscapedQuotePosition = p; + lastQuotePosition = p; + } else if (ch == fieldDelimiter) { + if (startedQuote && lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1) { + startedQuote = false; + lastDelimiterPosition = p; } } - lastQuotePosition = p; - } else if (ch == fieldDelimiter) { - if (startedQuote && lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1) { - startedQuote = false; - lastDelimiterPosition = p; - } - } else if (ch == '\n' && !startedQuote) { + } + if (ch == '\n' && !startedQuote) { start = p + 1; state = State.EOR; lastDelimiterPosition = p; @@ -324,7 +329,7 @@ } } char ch = buffer[p]; - if (ch == quote) { + if (ch == quote && qouteCheckNeeded) { // If this is first quote in the field, then it needs to be placed in the beginning. if (!startedQuote) { if (p == start) { diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java index 10b3cd3..0ea5a9b 100644 --- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java @@ -40,7 +40,7 @@ reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); // skip header final FieldCursorForDelimitedDataParser cursor = - new FieldCursorForDelimitedDataParser(reader, ',', '"', '"', null, () -> ""); + new FieldCursorForDelimitedDataParser(reader, ',', '"', '"', null, () -> "", true); // get number of fields from header (first record is header) cursor.nextRecord(); int numOfFields = 0; -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20146 To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-Project: asterixdb Gerrit-Branch: ionic Gerrit-Change-Id: I3812d2d1306282e9f02e8b77e1f79ac6b203cabe Gerrit-Change-Number: 20146 Gerrit-PatchSet: 23 Gerrit-Owner: Shahrzad Shirazi <shaji...@ucr.edu> Gerrit-Reviewer: Ali Alsuliman <ali.al.solai...@gmail.com> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Hussain Towaileb <hussai...@gmail.com> Gerrit-Reviewer: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Gerrit-Reviewer: Michael Blow <mb...@apache.org> Gerrit-Reviewer: Murtadha Hubail <mhub...@apache.org> Gerrit-Reviewer: Shahrzad Shirazi <shaji...@ucr.edu> Gerrit-MessageType: merged