This is an automated email from the ASF dual-hosted git repository. mblow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit e91d2c6847829b22bfb9f8b7917cea1000a61980 Author: Ali Alsuliman <[email protected]> AuthorDate: Sun Apr 12 14:05:32 2020 -0700 [ASTERIXDB-2713][EXT] Add CSV & TSV support for external dataset - user model changes: no - storage format changes: no - interface changes: no Details: Add CSV support for external dataset. - support S3 - add boolean parser to Hyracks Change-Id: Id1790fa73461e9f4a5fb443c51c1905ac588cee6 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5743 Tested-by: Jenkins <[email protected]> Integration-Tests: Jenkins <[email protected]> Reviewed-by: Murtadha Hubail <[email protected]> --- asterixdb/asterix-app/data/csv/01.csv | 3 + asterixdb/asterix-app/data/csv/02.csv | 3 + asterixdb/asterix-app/data/csv/sample_09.csv | 17 ++ asterixdb/asterix-app/data/csv/sample_10.csv | 39 +++++ asterixdb/asterix-app/data/csv/sample_11.csv | 4 + asterixdb/asterix-app/data/csv/sample_12.csv | 15 ++ asterixdb/asterix-app/data/tsv/01.tsv | 3 + asterixdb/asterix-app/data/tsv/02.tsv | 3 + asterixdb/asterix-app/data/tsv/sample_01.tsv | 28 ++++ .../asterix/app/translator/QueryTranslator.java | 2 + .../aws/AwsS3ExternalDatasetTest.java | 27 ++++ .../csv-parser-001/csv-parser-001.1.ddl.sqlpp | 32 ++++ .../csv-parser-001/csv-parser-001.2.query.sqlpp | 22 +++ .../csv-parser-001/csv-parser-001.3.query.sqlpp | 22 +++ .../csv-parser-001/csv-parser-001.4.query.sqlpp | 22 +++ .../csv-parser-001/csv-parser-001.5.query.sqlpp | 22 +++ .../csv-parser-001/csv-parser-001.6.ddl.sqlpp | 20 +++ .../tsv-parser-001/tsv-parser-002.1.ddl.sqlpp | 27 ++++ .../tsv-parser-001/tsv-parser-002.2.query.sqlpp | 22 +++ .../tsv-parser-001/tsv-parser-002.3.ddl.sqlpp | 20 +++ .../aws/s3/001/query-dataset.000.ddl.sqlpp | 36 +++++ .../aws/s3/001/query-dataset.002.query.sqlpp | 22 +++ .../aws/s3/001/query-dataset.003.ddl.sqlpp | 20 +++ .../aws/s3/002/query-dataset.000.ddl.sqlpp | 36 +++++ .../aws/s3/002/query-dataset.002.query.sqlpp | 22 +++ .../aws/s3/002/query-dataset.003.ddl.sqlpp | 20 +++ .../csv-parser-001/csv-parser-001.2.adm | 15 ++ .../csv-parser-001/csv-parser-001.3.adm | 32 ++++ .../csv-parser-001/csv-parser-001.4.adm | 4 + .../csv-parser-001/csv-parser-001.5.adm | 13 ++ .../tsv-parser-001/tsv-parser-001.2.adm | 28 ++++ .../aws/s3/001/external_dataset.001.adm | 6 + .../aws/s3/002/external_dataset.001.adm | 6 + .../runtimets/testsuite_external_dataset.xml | 10 ++ .../test/resources/runtimets/testsuite_sqlpp.xml | 12 ++ .../CSVToRecordWithMetadataAndPKConverter.java | 13 +- .../record/reader/aws/AwsS3InputStreamFactory.java | 17 +- .../record/reader/stream/LineRecordReader.java | 5 +- .../reader/stream/QuotedLineRecordReader.java | 67 +++++--- .../external/parser/DelimitedDataParser.java | 31 ++-- .../parser/factory/DelimitedDataParserFactory.java | 44 +---- .../external/provider/AdapterFactoryProvider.java | 15 +- .../util/ExternalDataCompatibilityUtils.java | 17 +- .../external/util/ExternalDataConstants.java | 8 + .../asterix/external/util/ExternalDataUtils.java | 88 +++++----- .../common/data/parsers/BooleanParserFactory.java | 62 +++++++ .../std/file/DelimitedDataTupleParserFactory.java | 9 +- .../file/FieldCursorForDelimitedDataParser.java | 179 ++++++++++++--------- .../hyracks/dataflow/std/file/CursorTest.java | 5 +- 49 files changed, 957 insertions(+), 238 deletions(-) diff --git a/asterixdb/asterix-app/data/csv/01.csv b/asterixdb/asterix-app/data/csv/01.csv new file mode 100644 index 0000000..6957e76 --- /dev/null +++ b/asterixdb/asterix-app/data/csv/01.csv @@ -0,0 +1,3 @@ +1,,"good","recommend" +2,,"bad","not recommend" +3,,"good", \ No newline at end of file diff --git a/asterixdb/asterix-app/data/csv/02.csv b/asterixdb/asterix-app/data/csv/02.csv new file mode 100644 index 0000000..630843f --- /dev/null +++ b/asterixdb/asterix-app/data/csv/02.csv @@ -0,0 +1,3 @@ +4,2018,"good","recommend" +5,2018,,"not recommend" +6,2018,"good", \ No newline at end of file diff --git a/asterixdb/asterix-app/data/csv/sample_09.csv b/asterixdb/asterix-app/data/csv/sample_09.csv new file mode 100644 index 0000000..b14219d --- /dev/null +++ b/asterixdb/asterix-app/data/csv/sample_09.csv @@ -0,0 +1,17 @@ +a,b,c,d,e +0,", boo", 1,2,3 +1,"","",❤, +2,3,4,\n, +3,"quoted ""f"" field",,, +4,4,,, +5,"{""vehicle"": ""car"", ""location"": [2.0, 0.1]}",,, +6,2,3,, +7,8,9,, +8,2,3,, +9,8,9,, +10,"field +""f"" +with multiple lines",,, +11,4,,, +12,5,ʤ,, +John,Green,111 downtown st.,"city, state",99999 \ No newline at end of file diff --git a/asterixdb/asterix-app/data/csv/sample_10.csv b/asterixdb/asterix-app/data/csv/sample_10.csv new file mode 100644 index 0000000..3beee08 --- /dev/null +++ b/asterixdb/asterix-app/data/csv/sample_10.csv @@ -0,0 +1,39 @@ +1,"?/ Text ending with a backslash / \",2000-09-03 07:12:22 +2,non quoted text!yes......,2003-08-09 22:34:19 +3,Text with more sentences. Another sentence.,2003-09-12 05:29:12 +4,"Quoted text.. yes.",2003-09-13 17:21:49 +5,Another text,2003-01-21 23:31:41 +6,Text with' quotes.,2003-09-14 20:15:50 +7,Text with quote's,2003-09-14 18:34:03 +8,"Text with quotes '",2003-01-28 20:32:13 +9,"Text with quotes """,2003-01-18 11:44:15 +10,Text with question marks!?!?,2003-09-18 06:25:56 +11,""" Text that starts with quotes",2003-09-12 00:31:24 +12,"Text with \"" backslash and quotes",2003-09-13 20:30:06 +13,"Text with \"" backslash and quotes\""",2003-09-14 16:20:36 +14,"Text that has comma ,",2003-09-12 08:21:18 +15,"Text that has "","" quoted comma",2003-09-12 08:21:18 +16,",Text that has ",2003-09-12 08:21:18 +17,","",Text that has ",2003-09-12 08:21:18 +18,"Text with commas,inside it., yes",2003-09-13 23:42:14 +19,"Text that has \n inside ",2003-09-12 08:21:18 +20,"Text that has \\\n inside ",2003-09-12 08:21:18 +21,text with :),2003-09-05 19:15:34 +22,"Text that has \\\"" inside \\",2003-09-12 08:21:18 +23,"Text that has \\\"" inside \\""",2003-09-12 08:21:18 +24,"""text that spans multiple +Lines and more +Lines ane more and more +Lines ... +And yet more lines +And more""",2011-09-19 01:09:09 +25,"Text "" +more lines",2011-09-19 01:09:09 +26,""" +",2011-09-19 01:09:09 +27,"Text","" +28,"Text","2011-09-19 01:09:09" +29,"Text\.","2011-09-19 01:09:09" +30,Text\.,"2011-09-19 01:09:09" +31,"\.Text","2011-09-19 01:09:09" +32,\.Text,"2011-09-19 01:09:09" \ No newline at end of file diff --git a/asterixdb/asterix-app/data/csv/sample_11.csv b/asterixdb/asterix-app/data/csv/sample_11.csv new file mode 100644 index 0000000..b9a9571 --- /dev/null +++ b/asterixdb/asterix-app/data/csv/sample_11.csv @@ -0,0 +1,4 @@ +1,","", b", 3,4,5 +","", b",4, 3,4,5 +,,,, +"dd",,,, \ No newline at end of file diff --git a/asterixdb/asterix-app/data/csv/sample_12.csv b/asterixdb/asterix-app/data/csv/sample_12.csv new file mode 100644 index 0000000..2ab7c6d --- /dev/null +++ b/asterixdb/asterix-app/data/csv/sample_12.csv @@ -0,0 +1,15 @@ +1,true,"text" +2,false,"text" +3,true,"text" +4,true,"" +5,false, +6,true,"text"" +more lines" +7,false,""" +" +8,true,"" +9,false,"text""" +10,false,text\. +11,true,"text\." +,false,\.text +13,true,"\.text" \ No newline at end of file diff --git a/asterixdb/asterix-app/data/tsv/01.tsv b/asterixdb/asterix-app/data/tsv/01.tsv new file mode 100644 index 0000000..98876c7 --- /dev/null +++ b/asterixdb/asterix-app/data/tsv/01.tsv @@ -0,0 +1,3 @@ +1 "good" "recommend" +2 "bad" "not recommend" +3 "good" "recommend" \ No newline at end of file diff --git a/asterixdb/asterix-app/data/tsv/02.tsv b/asterixdb/asterix-app/data/tsv/02.tsv new file mode 100644 index 0000000..c01ce7c --- /dev/null +++ b/asterixdb/asterix-app/data/tsv/02.tsv @@ -0,0 +1,3 @@ +4 2018 "good" "recommend" +5 2018 "not recommend" +6 2018 "good" "recommend" \ No newline at end of file diff --git a/asterixdb/asterix-app/data/tsv/sample_01.tsv b/asterixdb/asterix-app/data/tsv/sample_01.tsv new file mode 100644 index 0000000..aab289a --- /dev/null +++ b/asterixdb/asterix-app/data/tsv/sample_01.tsv @@ -0,0 +1,28 @@ +11 55 text field wih , charrrrrrrrrrr true 90 0.666666667 +12 55 text field with " charrrrrrrrrr false 90 0.666666667 +14 55 text field with ' charrrrrrrrrr false 90 0.666666667 +15 55 text field with \ charrrrrrrrrr false 90 0.666666667 +16 55 text field wih \, char true 90 0.666666667 +17 55 text field with \" charrrrrrrrr false 90 0.666666667 +18 55 text field with \' charrrrrrrrr false 90 0.666666667 +19 55 text field with \\ charrrrrrrrr false 90 0.666666667 +20 55 text field ending with charr , false 90 0.666666667 +21 55 text field ending with charr " false 90 0.666666667 +22 55 text field ending with charr ' false 90 0.666666667 +23 55 text field ending with charr \ false 90 0.666666667 +24 55 text field ending with charr \, false 90 0.666666667 +25 55 text field ending with charr \" false 90 0.666666667 +26 55 text field ending with charr \' false 90 0.666666667 +27 55 text field ending with charr \\ false 90 0.666666667 +28 55 ,text field starting with charr false 90 0.666666667 +29 55 "text field starting with charr false 90 0.666666667 +30 55 'text field starting with charr false 90 0.666666667 +31 55 \text field starting with charr false 90 0.666666667 +32 55 \,text field starting with char false 90 0.666666667 +33 55 \"text field starting with char false 90 0.666666667 +34 55 \'text field starting with char false 90 0.666666667 +35 55 \\text field starting with char false 90 0.666666667 +36 55 "text field inside with char" false 90 0.666666667 +37 55 text field with charrrrrrrrr false 90 0.666666667 +38 55 text field with "" charrrrrrrrr false 90 0.666666667 +39 55 text field "with" charrrrrrrrrr false 90 0.666666667 \ No newline at end of file diff --git a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/translator/QueryTranslator.java b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/translator/QueryTranslator.java index c1eea7c..e211531 100644 --- a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/translator/QueryTranslator.java +++ b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/translator/QueryTranslator.java @@ -85,6 +85,7 @@ import org.apache.asterix.external.indexing.ExternalFile; import org.apache.asterix.external.indexing.IndexingConstants; import org.apache.asterix.external.operators.FeedIntakeOperatorNodePushable; import org.apache.asterix.external.util.ExternalDataConstants; +import org.apache.asterix.external.util.ExternalDataUtils; import org.apache.asterix.formats.nontagged.TypeTraitProvider; import org.apache.asterix.lang.common.base.IReturningStatement; import org.apache.asterix.lang.common.base.IRewriterFactory; @@ -727,6 +728,7 @@ public class QueryTranslator extends AbstractLangTranslator implements IStatemen case EXTERNAL: ExternalDetailsDecl externalDetails = (ExternalDetailsDecl) dd.getDatasetDetailsDecl(); Map<String, String> properties = createExternalDatasetProperties(dd, metadataProvider, mdTxnCtx); + ExternalDataUtils.defaultConfiguration(properties); datasetDetails = new ExternalDatasetDetails(externalDetails.getAdapter(), properties, new Date(), TransactionState.COMMIT); break; diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java index 3b4cdf8..d2158ba 100644 --- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java +++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java @@ -18,9 +18,12 @@ */ package org.apache.asterix.test.external_dataset.aws; +import static org.apache.hyracks.util.file.FileUtil.joinPath; + import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.URI; +import java.nio.file.Paths; import java.util.Collection; import java.util.HashMap; import java.util.Map; @@ -66,9 +69,13 @@ public class AwsS3ExternalDatasetTest { private static S3Client client; private static final String S3_MOCK_SERVER_BUCKET = "playground"; private static final String S3_MOCK_SERVER_BUCKET_DEFINITION = "json-data/reviews/"; // data resides here + private static final String S3_MOCK_SERVER_BUCKET_CSV_DEFINITION = "csv-data/reviews/"; // data resides here + private static final String S3_MOCK_SERVER_BUCKET_TSV_DEFINITION = "tsv-data/reviews/"; // data resides here private static final String S3_MOCK_SERVER_REGION = "us-west-2"; private static final int S3_MOCK_SERVER_PORT = 8001; private static final String S3_MOCK_SERVER_HOSTNAME = "http://localhost:" + S3_MOCK_SERVER_PORT; + private static final String CSV_DATA_PATH = joinPath("data", "csv"); + private static final String TSV_DATA_PATH = joinPath("data", "tsv"); @BeforeClass public static void setUp() throws Exception { @@ -210,6 +217,26 @@ public class AwsS3ExternalDatasetTest { PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET) .key(S3_MOCK_SERVER_BUCKET_DEFINITION + "2019/q2/2.json").build(), RequestBody.fromString("{\"id\": 14, \"year\": 2019, \"quarter\": 2, \"review\": \"bad\"}")); + + LOGGER.info("Adding CSV files to the bucket"); + client.putObject( + PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET) + .key(S3_MOCK_SERVER_BUCKET_CSV_DEFINITION + "01.csv").build(), + RequestBody.fromFile(Paths.get(CSV_DATA_PATH, "01.csv"))); + client.putObject( + PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET) + .key(S3_MOCK_SERVER_BUCKET_CSV_DEFINITION + "2018/01.csv").build(), + RequestBody.fromFile(Paths.get(CSV_DATA_PATH, "02.csv"))); + + LOGGER.info("Adding TSV files to the bucket"); + client.putObject( + PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET) + .key(S3_MOCK_SERVER_BUCKET_TSV_DEFINITION + "01.tsv").build(), + RequestBody.fromFile(Paths.get(TSV_DATA_PATH, "01.tsv"))); + client.putObject( + PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET) + .key(S3_MOCK_SERVER_BUCKET_TSV_DEFINITION + "2018/01.tsv").build(), + RequestBody.fromFile(Paths.get(TSV_DATA_PATH, "02.tsv"))); LOGGER.info("Files added successfully"); } } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp new file mode 100644 index 0000000..5728e78 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATAVERSE test IF EXISTS; +CREATE DATAVERSE test; + +USE test; + +CREATE TYPE t1 AS {f1: string, f2: string, f3: string, f4: string, f5: string}; +CREATE TYPE t2 AS {f1: string, f2: string, f3: string}; +CREATE TYPE t3 AS {f1: int?, f2: boolean, f3: string?}; + +CREATE EXTERNAL DATASET ds1(t1) USING localfs(("path"="asterix_nc1://data/csv/sample_09.csv"), ("format"="csv")); +CREATE EXTERNAL DATASET ds2(t2) USING localfs(("path"="asterix_nc1://data/csv/sample_10.csv"), ("format"="csv")); +CREATE EXTERNAL DATASET ds3(t1) USING localfs(("path"="asterix_nc1://data/csv/sample_11.csv"), ("format"="csv")); +CREATE EXTERNAL DATASET ds4(t3) USING localfs(("path"="asterix_nc1://data/csv/sample_12.csv"), ("format"="csv")); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.2.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.2.query.sqlpp new file mode 100644 index 0000000..d870372 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.2.query.sqlpp @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +FROM ds1 v SELECT VALUE v; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.3.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.3.query.sqlpp new file mode 100644 index 0000000..64a2f8a --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.3.query.sqlpp @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +FROM ds2 v SELECT VALUE v; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.4.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.4.query.sqlpp new file mode 100644 index 0000000..313198c --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.4.query.sqlpp @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +FROM ds3 v SELECT VALUE v; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.5.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.5.query.sqlpp new file mode 100644 index 0000000..065de4e --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.5.query.sqlpp @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +FROM ds4 v SELECT VALUE v; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.6.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.6.ddl.sqlpp new file mode 100644 index 0000000..86a1b59 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.6.ddl.sqlpp @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATAVERSE test; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.1.ddl.sqlpp new file mode 100644 index 0000000..c0faf16 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.1.ddl.sqlpp @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATAVERSE test IF EXISTS; +CREATE DATAVERSE test; + +USE test; + +CREATE TYPE t1 AS {f1: int, f2: int, f3: string, f4: boolean, f5: bigint, f6: double}; + +CREATE EXTERNAL DATASET ds1(t1) USING localfs(("path"="asterix_nc1://data/tsv/sample_01.tsv"), ("format"="tsv")) \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.2.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.2.query.sqlpp new file mode 100644 index 0000000..d870372 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.2.query.sqlpp @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +FROM ds1 v SELECT VALUE v; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.3.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.3.ddl.sqlpp new file mode 100644 index 0000000..86a1b59 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.3.ddl.sqlpp @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATAVERSE test; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.000.ddl.sqlpp new file mode 100644 index 0000000..b906039 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.000.ddl.sqlpp @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATAVERSE test IF EXISTS; +CREATE DATAVERSE test; +USE test; + +DROP TYPE test IF EXISTS; +CREATE TYPE test AS {id: int, year: int?, review: string, details: string?}; + +DROP DATASET test IF EXISTS; +CREATE EXTERNAL DATASET test(test) USING S3 ( +("accessKey"="dummyAccessKey"), +("secretKey"="dummySecretKey"), +("region"="us-west-2"), +("serviceEndpoint"="http://localhost:8001"), +("container"="playground"), +("definition"="csv-data/reviews"), +("format"="csv") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.002.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.002.query.sqlpp new file mode 100644 index 0000000..6e31eb3 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.002.query.sqlpp @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +FROM test SELECT VALUE test ORDER BY id ASC; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.003.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.003.ddl.sqlpp new file mode 100644 index 0000000..0ff713d --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.003.ddl.sqlpp @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATASET test IF EXISTS; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.000.ddl.sqlpp new file mode 100644 index 0000000..d385bee --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.000.ddl.sqlpp @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATAVERSE test IF EXISTS; +CREATE DATAVERSE test; +USE test; + +DROP TYPE test IF EXISTS; +CREATE TYPE test AS {id: int, year: int?, review: string, details: string?}; + +DROP DATASET test IF EXISTS; +CREATE EXTERNAL DATASET test(test) USING S3 ( +("accessKey"="dummyAccessKey"), +("secretKey"="dummySecretKey"), +("region"="us-west-2"), +("serviceEndpoint"="http://localhost:8001"), +("container"="playground"), +("definition"="tsv-data/reviews"), +("format"="tsv") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.002.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.002.query.sqlpp new file mode 100644 index 0000000..6e31eb3 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.002.query.sqlpp @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +FROM test SELECT VALUE test ORDER BY id ASC; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.003.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.003.ddl.sqlpp new file mode 100644 index 0000000..0ff713d --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.003.ddl.sqlpp @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATASET test IF EXISTS; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.2.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.2.adm new file mode 100644 index 0000000..5c84fb8 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.2.adm @@ -0,0 +1,15 @@ +{ "f1": "a", "f2": "b", "f3": "c", "f4": "d", "f5": "e" } +{ "f1": "0", "f2": ", boo", "f3": " 1", "f4": "2", "f5": "3" } +{ "f1": "1", "f2": "", "f3": "", "f4": "❤", "f5": "" } +{ "f1": "2", "f2": "3", "f3": "4", "f4": "\\n", "f5": "" } +{ "f1": "3", "f2": "quoted \"f\" field", "f3": "", "f4": "", "f5": "" } +{ "f1": "4", "f2": "4", "f3": "", "f4": "", "f5": "" } +{ "f1": "5", "f2": "{\"vehicle\": \"car\", \"location\": [2.0, 0.1]}", "f3": "", "f4": "", "f5": "" } +{ "f1": "6", "f2": "2", "f3": "3", "f4": "", "f5": "" } +{ "f1": "7", "f2": "8", "f3": "9", "f4": "", "f5": "" } +{ "f1": "8", "f2": "2", "f3": "3", "f4": "", "f5": "" } +{ "f1": "9", "f2": "8", "f3": "9", "f4": "", "f5": "" } +{ "f1": "10", "f2": "field\n\"f\"\nwith multiple lines", "f3": "", "f4": "", "f5": "" } +{ "f1": "11", "f2": "4", "f3": "", "f4": "", "f5": "" } +{ "f1": "12", "f2": "5", "f3": "ʤ", "f4": "", "f5": "" } +{ "f1": "John", "f2": "Green", "f3": "111 downtown st.", "f4": "city, state", "f5": "99999" } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.3.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.3.adm new file mode 100644 index 0000000..80f5fb7 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.3.adm @@ -0,0 +1,32 @@ +{ "f1": "1", "f2": "?/ Text ending with a backslash / \\", "f3": "2000-09-03 07:12:22" } +{ "f1": "2", "f2": "non quoted text!yes......", "f3": "2003-08-09 22:34:19" } +{ "f1": "3", "f2": "Text with more sentences. Another sentence.", "f3": "2003-09-12 05:29:12" } +{ "f1": "4", "f2": "Quoted text.. yes.", "f3": "2003-09-13 17:21:49" } +{ "f1": "5", "f2": "Another text", "f3": "2003-01-21 23:31:41" } +{ "f1": "6", "f2": "Text with' quotes.", "f3": "2003-09-14 20:15:50" } +{ "f1": "7", "f2": "Text with quote's", "f3": "2003-09-14 18:34:03" } +{ "f1": "8", "f2": "Text with quotes '", "f3": "2003-01-28 20:32:13" } +{ "f1": "9", "f2": "Text with quotes \"", "f3": "2003-01-18 11:44:15" } +{ "f1": "10", "f2": "Text with question marks!?!?", "f3": "2003-09-18 06:25:56" } +{ "f1": "11", "f2": "\" Text that starts with quotes", "f3": "2003-09-12 00:31:24" } +{ "f1": "12", "f2": "Text with \\\" backslash and quotes", "f3": "2003-09-13 20:30:06" } +{ "f1": "13", "f2": "Text with \\\" backslash and quotes\\\"", "f3": "2003-09-14 16:20:36" } +{ "f1": "14", "f2": "Text that has comma ,", "f3": "2003-09-12 08:21:18" } +{ "f1": "15", "f2": "Text that has \",\" quoted comma", "f3": "2003-09-12 08:21:18" } +{ "f1": "16", "f2": ",Text that has ", "f3": "2003-09-12 08:21:18" } +{ "f1": "17", "f2": ",\",Text that has ", "f3": "2003-09-12 08:21:18" } +{ "f1": "18", "f2": "Text with commas,inside it., yes", "f3": "2003-09-13 23:42:14" } +{ "f1": "19", "f2": "Text that has \\n inside ", "f3": "2003-09-12 08:21:18" } +{ "f1": "20", "f2": "Text that has \\\\\\n inside ", "f3": "2003-09-12 08:21:18" } +{ "f1": "21", "f2": "text with :)", "f3": "2003-09-05 19:15:34" } +{ "f1": "22", "f2": "Text that has \\\\\\\" inside \\\\", "f3": "2003-09-12 08:21:18" } +{ "f1": "23", "f2": "Text that has \\\\\\\" inside \\\\\"", "f3": "2003-09-12 08:21:18" } +{ "f1": "24", "f2": "\"text that spans multiple\nLines and more\nLines ane more and more\nLines ...\nAnd yet more lines\nAnd more\"", "f3": "2011-09-19 01:09:09" } +{ "f1": "25", "f2": "Text \"\nmore lines", "f3": "2011-09-19 01:09:09" } +{ "f1": "26", "f2": "\"\n", "f3": "2011-09-19 01:09:09" } +{ "f1": "27", "f2": "Text", "f3": "" } +{ "f1": "28", "f2": "Text", "f3": "2011-09-19 01:09:09" } +{ "f1": "29", "f2": "Text\\.", "f3": "2011-09-19 01:09:09" } +{ "f1": "30", "f2": "Text\\.", "f3": "2011-09-19 01:09:09" } +{ "f1": "31", "f2": "\\.Text", "f3": "2011-09-19 01:09:09" } +{ "f1": "32", "f2": "\\.Text", "f3": "2011-09-19 01:09:09" } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.4.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.4.adm new file mode 100644 index 0000000..5c61b4a --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.4.adm @@ -0,0 +1,4 @@ +{ "f1": "1", "f2": ",\", b", "f3": " 3", "f4": "4", "f5": "5" } +{ "f1": ",\", b", "f2": "4", "f3": " 3", "f4": "4", "f5": "5" } +{ "f1": "", "f2": "", "f3": "", "f4": "", "f5": "" } +{ "f1": "dd", "f2": "", "f3": "", "f4": "", "f5": "" } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.5.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.5.adm new file mode 100644 index 0000000..4b80e26 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.5.adm @@ -0,0 +1,13 @@ +{ "f1": 1, "f2": true, "f3": "text" } +{ "f1": 2, "f2": false, "f3": "text" } +{ "f1": 3, "f2": true, "f3": "text" } +{ "f1": 4, "f2": true, "f3": null } +{ "f1": 5, "f2": false, "f3": null } +{ "f1": 6, "f2": true, "f3": "text\"\nmore lines" } +{ "f1": 7, "f2": false, "f3": "\"\n" } +{ "f1": 8, "f2": true, "f3": null } +{ "f1": 9, "f2": false, "f3": "text\"" } +{ "f1": 10, "f2": false, "f3": "text\\." } +{ "f1": 11, "f2": true, "f3": "text\\." } +{ "f1": null, "f2": false, "f3": "\\.text" } +{ "f1": 13, "f2": true, "f3": "\\.text" } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/tsv-parser-001/tsv-parser-001.2.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/tsv-parser-001/tsv-parser-001.2.adm new file mode 100644 index 0000000..fbe287b --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/tsv-parser-001/tsv-parser-001.2.adm @@ -0,0 +1,28 @@ +{ "f1": 11, "f2": 55, "f3": "text field wih , charrrrrrrrrrr", "f4": true, "f5": 90, "f6": 0.666666667 } +{ "f1": 12, "f2": 55, "f3": "text field with \" charrrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 14, "f2": 55, "f3": "text field with ' charrrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 15, "f2": 55, "f3": "text field with \\ charrrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 16, "f2": 55, "f3": "text field wih \\, char ", "f4": true, "f5": 90, "f6": 0.666666667 } +{ "f1": 17, "f2": 55, "f3": "text field with \\\" charrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 18, "f2": 55, "f3": "text field with \\' charrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 19, "f2": 55, "f3": "text field with \\\\ charrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 20, "f2": 55, "f3": "text field ending with charr ,", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 21, "f2": 55, "f3": "text field ending with charr \"", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 22, "f2": 55, "f3": "text field ending with charr '", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 23, "f2": 55, "f3": "text field ending with charr \\", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 24, "f2": 55, "f3": "text field ending with charr \\,", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 25, "f2": 55, "f3": "text field ending with charr \\\"", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 26, "f2": 55, "f3": "text field ending with charr \\'", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 27, "f2": 55, "f3": "text field ending with charr \\\\", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 28, "f2": 55, "f3": ",text field starting with charr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 29, "f2": 55, "f3": "\"text field starting with charr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 30, "f2": 55, "f3": "'text field starting with charr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 31, "f2": 55, "f3": "\\text field starting with charr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 32, "f2": 55, "f3": "\\,text field starting with char", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 33, "f2": 55, "f3": "\\\"text field starting with char", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 34, "f2": 55, "f3": "\\'text field starting with char", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 35, "f2": 55, "f3": "\\\\text field starting with char", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 36, "f2": 55, "f3": "\"text field inside with char\"", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 37, "f2": 55, "f3": " text field with charrrrrrrrr ", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 38, "f2": 55, "f3": "text field with \"\" charrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 } +{ "f1": 39, "f2": 55, "f3": "text field \"with\" charrrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/001/external_dataset.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/001/external_dataset.001.adm new file mode 100644 index 0000000..93d1b57 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/001/external_dataset.001.adm @@ -0,0 +1,6 @@ +{ "id": 1, "year": null, "review": "good", "details": "recommend" } +{ "id": 2, "year": null, "review": "bad", "details": "not recommend" } +{ "id": 3, "year": null, "review": "good", "details": null } +{ "id": 4, "year": 2018, "review": "good", "details": "recommend" } +{ "id": 5, "year": 2018, "review": "", "details": "not recommend" } +{ "id": 6, "year": 2018, "review": "good", "details": null } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/002/external_dataset.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/002/external_dataset.001.adm new file mode 100644 index 0000000..1954b05 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/002/external_dataset.001.adm @@ -0,0 +1,6 @@ +{ "id": 1, "year": null, "review": "\"good\"", "details": "\"recommend\"" } +{ "id": 2, "year": null, "review": "\"bad\"", "details": "\"not recommend\"" } +{ "id": 3, "year": null, "review": "\"good\"", "details": "\"recommend\"" } +{ "id": 4, "year": 2018, "review": "\"good\"", "details": "\"recommend\"" } +{ "id": 5, "year": 2018, "review": "", "details": "\"not recommend\"" } +{ "id": 6, "year": 2018, "review": "\"good\"", "details": "\"recommend\"" } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml index cd1fb12..9948209 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml @@ -24,5 +24,15 @@ <output-dir compare="Text">aws/s3/000</output-dir> </compilation-unit> </test-case> + <test-case FilePath="external-dataset"> + <compilation-unit name="aws/s3/001"> + <output-dir compare="Text">aws/s3/001</output-dir> + </compilation-unit> + </test-case> + <test-case FilePath="external-dataset"> + <compilation-unit name="aws/s3/002"> + <output-dir compare="Text">aws/s3/002</output-dir> + </compilation-unit> + </test-case> </test-group> </test-suite> diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml index 63db153..a578690 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml @@ -12249,6 +12249,18 @@ </compilation-unit> </test-case> </test-group> + <test-group name="csv-tsv-parser"> + <test-case FilePath="csv-tsv-parser"> + <compilation-unit name="csv-parser-001"> + <output-dir compare="Text">csv-parser-001</output-dir> + </compilation-unit> + </test-case> + <test-case FilePath="csv-tsv-parser"> + <compilation-unit name="tsv-parser-001"> + <output-dir compare="Text">tsv-parser-001</output-dir> + </compilation-unit> + </test-case> + </test-group> <test-group name="binary"> <test-case FilePath="binary"> <compilation-unit name="parse"> diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java index 8255ebb..5c8f219 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java @@ -41,7 +41,7 @@ public class CSVToRecordWithMetadataAndPKConverter this.cursor = new FieldCursorForDelimitedDataParser(null, delimiter, ExternalDataConstants.QUOTE); this.record = new CharArrayRecord(); this.valueIndex = valueIndex; - this.recordWithMetadata = new RecordWithMetadataAndPK<char[]>(record, metaType.getFieldTypes(), recordType, + this.recordWithMetadata = new RecordWithMetadataAndPK<>(record, metaType.getFieldTypes(), recordType, keyIndicator, keyIndexes, keyTypes); } @@ -53,16 +53,15 @@ public class CSVToRecordWithMetadataAndPKConverter int i = 0; int j = 0; while (cursor.nextField()) { - if (cursor.isDoubleQuoteIncludedInThisField) { - cursor.eliminateDoubleQuote(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart); - cursor.fEnd -= cursor.doubleQuoteCount; - cursor.isDoubleQuoteIncludedInThisField = false; + if (cursor.fieldHasDoubleQuote()) { + cursor.eliminateDoubleQuote(); } if (i == valueIndex) { - record.setValue(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart); + record.setValue(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength()); record.endRecord(); } else { - recordWithMetadata.setRawMetadata(j, cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart); + recordWithMetadata.setRawMetadata(j, cursor.getBuffer(), cursor.getFieldStart(), + cursor.getFieldLength()); j++; } i++; diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java index a9f7898..6b8bb59 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java @@ -27,6 +27,7 @@ import java.util.List; import java.util.Map; import org.apache.asterix.common.dataflow.ICcApplicationContext; +import org.apache.asterix.common.exceptions.AsterixException; import org.apache.asterix.common.exceptions.ErrorCode; import org.apache.asterix.external.api.AsterixInputStream; import org.apache.asterix.external.api.IInputStreamFactory; @@ -35,7 +36,6 @@ import org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartit import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException; import org.apache.hyracks.api.application.IServiceContext; import org.apache.hyracks.api.context.IHyracksTaskContext; -import org.apache.hyracks.api.exceptions.HyracksDataException; import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; @@ -77,8 +77,7 @@ public class AwsS3InputStreamFactory implements IInputStreamFactory { } @Override - public void configure(IServiceContext ctx, Map<String, String> configuration) - throws AlgebricksException, HyracksDataException { + public void configure(IServiceContext ctx, Map<String, String> configuration) throws AlgebricksException { this.configuration = configuration; ICcApplicationContext ccApplicationContext = (ICcApplicationContext) ctx.getApplicationContext(); @@ -115,13 +114,13 @@ public class AwsS3InputStreamFactory implements IInputStreamFactory { * * @return A list of string paths that point to files only * - * @throws HyracksDataException HyracksDataException + * @throws AsterixException AsterixException */ - private List<S3Object> getFilesOnly(List<S3Object> s3Objects, String fileFormat) throws HyracksDataException { + private List<S3Object> getFilesOnly(List<S3Object> s3Objects, String fileFormat) throws AsterixException { List<S3Object> filesOnly = new ArrayList<>(); String fileExtension = getFileExtension(fileFormat); if (fileExtension == null) { - throw HyracksDataException.create(ErrorCode.INVALID_FORMAT); + throw AsterixException.create(ErrorCode.PROVIDER_STREAM_RECORD_READER_UNKNOWN_FORMAT, fileFormat); } s3Objects.stream().filter(object -> object.key().endsWith(fileExtension)).forEach(filesOnly::add); @@ -214,8 +213,12 @@ public class AwsS3InputStreamFactory implements IInputStreamFactory { */ private String getFileExtension(String format) { switch (format.toLowerCase()) { - case "json": + case ExternalDataConstants.FORMAT_JSON_LOWER_CASE: return ".json"; + case ExternalDataConstants.FORMAT_CSV: + return ".csv"; + case ExternalDataConstants.FORMAT_TSV: + return ".tsv"; default: return null; } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java index 0b41d4b..be600ed 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java @@ -36,8 +36,9 @@ public class LineRecordReader extends StreamRecordReader { protected int newlineLength; protected int recordNumber = 0; protected boolean nextIsHeader = false; - private static final List<String> recordReaderFormats = Collections.unmodifiableList( - Arrays.asList(ExternalDataConstants.FORMAT_DELIMITED_TEXT, ExternalDataConstants.FORMAT_CSV)); + private static final List<String> recordReaderFormats = + Collections.unmodifiableList(Arrays.asList(ExternalDataConstants.FORMAT_DELIMITED_TEXT, + ExternalDataConstants.FORMAT_CSV, ExternalDataConstants.FORMAT_TSV)); private static final String REQUIRED_CONFIGS = ""; @Override diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java index 4c4128a..1fd328b 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java @@ -32,11 +32,10 @@ import org.apache.hyracks.api.exceptions.HyracksDataException; public class QuotedLineRecordReader extends LineRecordReader { private char quote; - private boolean prevCharEscape; - private boolean inQuote; + private char quoteEscape; private static final List<String> recordReaderFormats = Collections.unmodifiableList( Arrays.asList(ExternalDataConstants.FORMAT_DELIMITED_TEXT, ExternalDataConstants.FORMAT_CSV)); - private static final String REQUIRED_CONFIGS = "quote"; + private static final String REQUIRED_CONFIGS = ExternalDataConstants.KEY_QUOTE; @Override public void configure(AsterixInputStream inputStream, Map<String, String> config) throws HyracksDataException { @@ -47,6 +46,17 @@ public class QuotedLineRecordReader extends LineRecordReader { ExternalDataConstants.PARAMETER_OF_SIZE_ONE, quoteString)); } this.quote = quoteString.charAt(0); + String escapeString = config.get(ExternalDataConstants.KEY_QUOTE_ESCAPE); + if (escapeString == null) { + quoteEscape = ExternalDataConstants.ESCAPE; + } else { + if (escapeString.length() != 1) { + throw new HyracksDataException( + ExceptionUtils.incorrectParameterMessage(ExternalDataConstants.KEY_QUOTE_ESCAPE, + ExternalDataConstants.PARAMETER_OF_SIZE_ONE, escapeString)); + } + quoteEscape = escapeString.charAt(0); + } } @Override @@ -67,31 +77,35 @@ public class QuotedLineRecordReader extends LineRecordReader { } newlineLength = 0; prevCharCR = false; - prevCharEscape = false; + boolean prevCharEscape = false; record.reset(); int readLength = 0; - inQuote = false; + boolean inQuote = false; do { int startPosn = bufferPosn; if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = reader.read(inputBuffer); if (bufferLength <= 0) { - { - if (readLength > 0) { - if (inQuote) { - throw new IOException("malformed input record ended inside quote"); - } - record.endRecord(); - recordNumber++; - return true; + if (readLength > 0) { + if (inQuote) { + throw new IOException("malformed input record ended inside quote"); } - close(); - return false; + record.endRecord(); + recordNumber++; + return true; } + close(); + return false; } } + boolean maybeInQuote = false; for (; bufferPosn < bufferLength; ++bufferPosn) { + if (inputBuffer[bufferPosn] == quote && quoteEscape == quote) { + inQuote |= maybeInQuote; + prevCharEscape |= maybeInQuote; + } + maybeInQuote = false; if (!inQuote) { if (inputBuffer[bufferPosn] == ExternalDataConstants.LF) { newlineLength = (prevCharCR) ? 2 : 1; @@ -103,24 +117,25 @@ public class QuotedLineRecordReader extends LineRecordReader { break; } prevCharCR = (inputBuffer[bufferPosn] == ExternalDataConstants.CR); - if (inputBuffer[bufferPosn] == quote) { - if (!prevCharEscape) { - inQuote = true; - } + if (inputBuffer[bufferPosn] == quote && !prevCharEscape) { + // this is an opening quote + inQuote = true; } if (prevCharEscape) { prevCharEscape = false; } else { - prevCharEscape = inputBuffer[bufferPosn] == ExternalDataConstants.ESCAPE; + // the quoteEscape != quote is for making an opening quote not an escape + prevCharEscape = inputBuffer[bufferPosn] == quoteEscape && quoteEscape != quote; } } else { - // only look for next quote - if (inputBuffer[bufferPosn] == quote) { - if (!prevCharEscape) { - inQuote = false; - } + // if quote == quoteEscape and current char is quote, then it could be closing or escaping + if (inputBuffer[bufferPosn] == quote && !prevCharEscape) { + // this is most likely a closing quote. the outcome depends on the next char + inQuote = false; + maybeInQuote = true; } - prevCharEscape = inputBuffer[bufferPosn] == ExternalDataConstants.ESCAPE; + prevCharEscape = + inputBuffer[bufferPosn] == quoteEscape && !prevCharEscape && quoteEscape != quote; } } readLength = bufferPosn - startPosn; diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java index 4e371c8..8facce6 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java @@ -57,9 +57,9 @@ public class DelimitedDataParser extends AbstractDataParser implements IStreamDa private ArrayBackedValueStorage[] nameBuffers; private boolean areAllNullFields; - public DelimitedDataParser(IValueParserFactory[] valueParserFactories, char fieldDelimter, char quote, + public DelimitedDataParser(IValueParserFactory[] valueParserFactories, char fieldDelimiter, char quote, boolean hasHeader, ARecordType recordType, boolean isStreamParser) throws HyracksDataException { - this.fieldDelimiter = fieldDelimter; + this.fieldDelimiter = fieldDelimiter; this.quote = quote; this.hasHeader = hasHeader; this.recordType = recordType; @@ -98,7 +98,7 @@ public class DelimitedDataParser extends AbstractDataParser implements IStreamDa } } if (!isStreamParser) { - cursor = new FieldCursorForDelimitedDataParser(null, fieldDelimiter, quote); + cursor = new FieldCursorForDelimitedDataParser(null, this.fieldDelimiter, quote); } } @@ -134,25 +134,23 @@ public class DelimitedDataParser extends AbstractDataParser implements IStreamDa fieldValueBuffer.reset(); try { - if (cursor.fStart == cursor.fEnd && recordType.getFieldTypes()[i].getTypeTag() != ATypeTag.STRING + if (cursor.isFieldEmpty() && recordType.getFieldTypes()[i].getTypeTag() != ATypeTag.STRING && recordType.getFieldTypes()[i].getTypeTag() != ATypeTag.NULL) { // if the field is empty and the type is optional, insert // NULL. Note that string type can also process empty field as an // empty string if (!NonTaggedFormatUtil.isOptional(recordType.getFieldTypes()[i])) { - throw new RuntimeDataException(ErrorCode.PARSER_DELIMITED_NONOPTIONAL_NULL, cursor.recordCount, - cursor.fieldCount); + throw new RuntimeDataException(ErrorCode.PARSER_DELIMITED_NONOPTIONAL_NULL, + cursor.getRecordCount(), cursor.getFieldCount()); } fieldValueBufferOutput.writeByte(ATypeTag.SERIALIZED_NULL_TYPE_TAG); } else { fieldValueBufferOutput.writeByte(fieldTypeTags[i]); - // Eliminate doule quotes in the field that we are going to parse - if (cursor.isDoubleQuoteIncludedInThisField) { - cursor.eliminateDoubleQuote(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart); - cursor.fEnd -= cursor.doubleQuoteCount; - cursor.isDoubleQuoteIncludedInThisField = false; + // Eliminate double quotes in the field that we are going to parse + if (cursor.fieldHasDoubleQuote()) { + cursor.eliminateDoubleQuote(); } - valueParsers[i].parse(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart, + valueParsers[i].parse(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength(), fieldValueBufferOutput); areAllNullFields = false; } @@ -165,15 +163,14 @@ public class DelimitedDataParser extends AbstractDataParser implements IStreamDa throw HyracksDataException.create(e); } } + if (valueParsers.length != cursor.getFieldCount()) { + throw new HyracksDataException("Record #" + cursor.getRecordCount() + " is missing some fields"); + } } @Override public void parse(IRawRecord<? extends char[]> record, DataOutput out) throws HyracksDataException { - try { - cursor.nextRecord(record.get(), record.size()); - } catch (IOException e) { - throw HyracksDataException.create(e); - } + cursor.nextRecord(record.get(), record.size()); parseRecord(); if (!areAllNullFields) { recBuilder.write(out, true); diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java index f406729..1fee49f 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java @@ -21,10 +21,7 @@ package org.apache.asterix.external.parser.factory; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Map; -import org.apache.asterix.common.exceptions.ErrorCode; -import org.apache.asterix.common.exceptions.RuntimeDataException; import org.apache.asterix.external.api.IExternalDataSourceFactory.DataSourceType; import org.apache.asterix.external.api.IRecordDataParser; import org.apache.asterix.external.api.IStreamDataParser; @@ -40,7 +37,8 @@ public class DelimitedDataParserFactory extends AbstractRecordStreamParserFactor private static final long serialVersionUID = 1L; private static final List<String> parserFormats = - Collections.unmodifiableList(Arrays.asList("csv", "delimited-text")); + Collections.unmodifiableList(Arrays.asList(ExternalDataConstants.FORMAT_CSV, + ExternalDataConstants.FORMAT_DELIMITED_TEXT, ExternalDataConstants.FORMAT_TSV)); @Override public IRecordDataParser<char[]> createRecordParser(IHyracksTaskContext ctx) throws HyracksDataException { @@ -49,8 +47,8 @@ public class DelimitedDataParserFactory extends AbstractRecordStreamParserFactor private DelimitedDataParser createParser() throws HyracksDataException { IValueParserFactory[] valueParserFactories = ExternalDataUtils.getValueParserFactories(recordType); - Character delimiter = DelimitedDataParserFactory.getDelimiter(configuration); - char quote = DelimitedDataParserFactory.getQuote(configuration, delimiter); + char delimiter = ExternalDataUtils.getDelimiter(configuration); + char quote = ExternalDataUtils.getQuote(configuration, delimiter); boolean hasHeader = ExternalDataUtils.hasHeader(configuration); return new DelimitedDataParser(valueParserFactories, delimiter, quote, hasHeader, recordType, ExternalDataUtils.getDataSourceType(configuration).equals(DataSourceType.STREAM)); @@ -67,40 +65,6 @@ public class DelimitedDataParserFactory extends AbstractRecordStreamParserFactor return createParser(); } - // Get a delimiter from the given configuration - public static char getDelimiter(Map<String, String> configuration) throws HyracksDataException { - String delimiterValue = configuration.get(ExternalDataConstants.KEY_DELIMITER); - if (delimiterValue == null) { - delimiterValue = ExternalDataConstants.DEFAULT_DELIMITER; - } else if (delimiterValue.length() != 1) { - throw new RuntimeDataException(ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_DELIMITER, - delimiterValue); - } - return delimiterValue.charAt(0); - } - - // Get a quote from the given configuration when the delimiter is given - // Need to pass delimiter to check whether they share the same character - public static char getQuote(Map<String, String> configuration, char delimiter) throws HyracksDataException { - String quoteValue = configuration.get(ExternalDataConstants.KEY_QUOTE); - if (quoteValue == null) { - quoteValue = ExternalDataConstants.DEFAULT_QUOTE; - } else if (quoteValue.length() != 1) { - throw new RuntimeDataException(ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_QUOTE, - quoteValue); - } - - // Since delimiter (char type value) can't be null, - // we only check whether delimiter and quote use the same character - if (quoteValue.charAt(0) == delimiter) { - throw new RuntimeDataException( - ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_QUOTE_DELIMITER_MISMATCH, quoteValue, - delimiter); - } - - return quoteValue.charAt(0); - } - @Override public void setMetaType(ARecordType metaType) { } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/provider/AdapterFactoryProvider.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/provider/AdapterFactoryProvider.java index 414c460..27ac10e 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/provider/AdapterFactoryProvider.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/provider/AdapterFactoryProvider.java @@ -27,7 +27,7 @@ import org.apache.asterix.external.adapter.factory.LookupAdapterFactory; import org.apache.asterix.external.api.IIndexingAdapterFactory; import org.apache.asterix.external.api.ITypedAdapterFactory; import org.apache.asterix.external.indexing.ExternalFile; -import org.apache.asterix.external.util.ExternalDataCompatibilityUtils; +import org.apache.asterix.external.util.ExternalDataUtils; import org.apache.asterix.om.types.ARecordType; import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException; import org.apache.hyracks.api.application.IServiceContext; @@ -39,11 +39,15 @@ import org.apache.hyracks.api.exceptions.HyracksDataException; */ public class AdapterFactoryProvider { - // Adapters + private AdapterFactoryProvider() { + } + + // get adapter factory. this method has the side effect of modifying the configuration as necessary public static ITypedAdapterFactory getAdapterFactory(IServiceContext serviceCtx, String adapterName, Map<String, String> configuration, ARecordType itemType, ARecordType metaType) throws HyracksDataException, AlgebricksException { - ExternalDataCompatibilityUtils.prepare(adapterName, configuration); + ExternalDataUtils.defaultConfiguration(configuration); + ExternalDataUtils.prepare(adapterName, configuration); ICcApplicationContext context = (ICcApplicationContext) serviceCtx.getApplicationContext(); ITypedAdapterFactory adapterFactory = (ITypedAdapterFactory) context.getAdapterFactoryService().createAdapterFactory(); @@ -53,11 +57,12 @@ public class AdapterFactoryProvider { return adapterFactory; } - // Indexing Adapters + // get indexing adapter factory. this method has the side effect of modifying the configuration as necessary public static IIndexingAdapterFactory getIndexingAdapterFactory(IServiceContext serviceCtx, String adapterName, Map<String, String> configuration, ARecordType itemType, List<ExternalFile> snapshot, boolean indexingOp, ARecordType metaType) throws HyracksDataException, AlgebricksException { - ExternalDataCompatibilityUtils.prepare(adapterName, configuration); + ExternalDataUtils.defaultConfiguration(configuration); + ExternalDataUtils.prepare(adapterName, configuration); GenericAdapterFactory adapterFactory = new GenericAdapterFactory(); adapterFactory.setOutputType(itemType); adapterFactory.setMetaType(metaType); diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataCompatibilityUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataCompatibilityUtils.java index e222e99..8181262 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataCompatibilityUtils.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataCompatibilityUtils.java @@ -18,8 +18,6 @@ */ package org.apache.asterix.external.util; -import java.util.Map; - import org.apache.asterix.common.exceptions.AsterixException; import org.apache.asterix.external.api.IDataParserFactory; import org.apache.asterix.external.api.IExternalDataSourceFactory; @@ -30,6 +28,9 @@ import org.apache.asterix.external.input.record.converter.IRecordConverterFactor public class ExternalDataCompatibilityUtils { + private ExternalDataCompatibilityUtils() { + } + public static void validateCompatibility(IExternalDataSourceFactory dataSourceFactory, IDataParserFactory dataParserFactory) throws AsterixException { if (dataSourceFactory.getDataSourceType() != dataParserFactory.getDataSourceType()) { @@ -58,16 +59,4 @@ public class ExternalDataCompatibilityUtils { + recordParserFactory.getRecordClass()); } } - - public static void prepare(String adapterName, Map<String, String> configuration) { - if (!configuration.containsKey(ExternalDataConstants.KEY_READER)) { - configuration.put(ExternalDataConstants.KEY_READER, adapterName); - } - if (!configuration.containsKey(ExternalDataConstants.KEY_PARSER)) { - if (configuration.containsKey(ExternalDataConstants.KEY_FORMAT)) { - configuration.put(ExternalDataConstants.KEY_PARSER, - configuration.get(ExternalDataConstants.KEY_FORMAT)); - } - } - } } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java index 26f5402..5427bc5 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java @@ -19,6 +19,10 @@ package org.apache.asterix.external.util; public class ExternalDataConstants { + + private ExternalDataConstants() { + } + // TODO: Remove unused variables. /** * Keys @@ -62,6 +66,7 @@ public class ExternalDataConstants { public static final String KEY_LOCAL_SOCKET_PATH = "local-socket-path"; public static final String KEY_FORMAT = "format"; public static final String KEY_QUOTE = "quote"; + public static final String KEY_QUOTE_ESCAPE = "quote-escape"; public static final String KEY_PARSER = "parser"; public static final String KEY_DATASET_RECORD = "dataset-record"; public static final String KEY_HIVE_SERDE = "hive-serde"; @@ -188,6 +193,8 @@ public class ExternalDataConstants { */ public static final String TRUE = "true"; public static final String FALSE = "false"; + public static final String TAB_STR = "\t"; + public static final String NULL_STR = "\0"; /** * Constant characters @@ -228,6 +235,7 @@ public class ExternalDataConstants { public static final String KEY_READER_FACTORY = "reader-factory"; public static final String READER_RSS = "rss_feed"; public static final String FORMAT_CSV = "csv"; + public static final String FORMAT_TSV = "tsv"; public static final String ERROR_PARSE_RECORD = "Parser failed to parse record"; diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java index a418cbf..443aa7e 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java @@ -34,6 +34,7 @@ import org.apache.asterix.om.types.ATypeTag; import org.apache.asterix.om.types.AUnionType; import org.apache.hyracks.algebricks.common.exceptions.NotImplementedException; import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.dataflow.common.data.parsers.BooleanParserFactory; import org.apache.hyracks.dataflow.common.data.parsers.DoubleParserFactory; import org.apache.hyracks.dataflow.common.data.parsers.FloatParserFactory; import org.apache.hyracks.dataflow.common.data.parsers.IValueParserFactory; @@ -43,48 +44,43 @@ import org.apache.hyracks.dataflow.common.data.parsers.UTF8StringParserFactory; public class ExternalDataUtils { + private ExternalDataUtils() { + } + // Get a delimiter from the given configuration - public static char getDelimiter(Map<String, String> configuration) throws AsterixException { + public static char getDelimiter(Map<String, String> configuration) throws HyracksDataException { String delimiterValue = configuration.get(ExternalDataConstants.KEY_DELIMITER); if (delimiterValue == null) { delimiterValue = ExternalDataConstants.DEFAULT_DELIMITER; } else if (delimiterValue.length() != 1) { - throw new AsterixException( - "'" + delimiterValue + "' is not a valid delimiter. The length of a delimiter should be 1."); + throw new RuntimeDataException(ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_DELIMITER, + delimiterValue); } return delimiterValue.charAt(0); } // Get a quote from the given configuration when the delimiter is given // Need to pass delimiter to check whether they share the same character - public static char getQuote(Map<String, String> configuration, char delimiter) throws AsterixException { + public static char getQuote(Map<String, String> configuration, char delimiter) throws HyracksDataException { String quoteValue = configuration.get(ExternalDataConstants.KEY_QUOTE); if (quoteValue == null) { quoteValue = ExternalDataConstants.DEFAULT_QUOTE; } else if (quoteValue.length() != 1) { - throw new AsterixException("'" + quoteValue + "' is not a valid quote. The length of a quote should be 1."); + throw new RuntimeDataException(ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_QUOTE, + quoteValue); } // Since delimiter (char type value) can't be null, // we only check whether delimiter and quote use the same character if (quoteValue.charAt(0) == delimiter) { - throw new AsterixException( - "Quote '" + quoteValue + "' cannot be used with the delimiter '" + delimiter + "'. "); + throw new RuntimeDataException( + ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_QUOTE_DELIMITER_MISMATCH, quoteValue, + delimiter); } return quoteValue.charAt(0); } - // Get the header flag - public static boolean getHasHeader(Map<String, String> configuration) { - return Boolean.parseBoolean(configuration.get(ExternalDataConstants.KEY_HEADER)); - } - - public static void validateParameters(Map<String, String> configuration) throws AsterixException { - validateDataSourceParameters(configuration); - validateDataParserParameters(configuration); - } - public static void validateDataParserParameters(Map<String, String> configuration) throws AsterixException { String parser = configuration.get(ExternalDataConstants.KEY_FORMAT); if (parser == null) { @@ -150,15 +146,6 @@ public class ExternalDataUtils { return parserFormat != null ? parserFormat : configuration.get(ExternalDataConstants.KEY_FORMAT); } - public static void setRecordFormat(Map<String, String> configuration, String format) { - if (!configuration.containsKey(ExternalDataConstants.KEY_DATA_PARSER)) { - configuration.put(ExternalDataConstants.KEY_DATA_PARSER, format); - } - if (!configuration.containsKey(ExternalDataConstants.KEY_FORMAT)) { - configuration.put(ExternalDataConstants.KEY_FORMAT, format); - } - } - private static Map<ATypeTag, IValueParserFactory> valueParserFactoryMap = initializeValueParserFactoryMap(); private static Map<ATypeTag, IValueParserFactory> initializeValueParserFactoryMap() { @@ -168,6 +155,7 @@ public class ExternalDataUtils { m.put(ATypeTag.DOUBLE, DoubleParserFactory.INSTANCE); m.put(ATypeTag.BIGINT, LongParserFactory.INSTANCE); m.put(ATypeTag.STRING, UTF8StringParserFactory.INSTANCE); + m.put(ATypeTag.BOOLEAN, BooleanParserFactory.INSTANCE); return m; } @@ -201,10 +189,6 @@ public class ExternalDataUtils { return vpf; } - public static String getRecordReaderStreamName(Map<String, String> configuration) { - return configuration.get(ExternalDataConstants.KEY_READER_STREAM); - } - public static boolean hasHeader(Map<String, String> configuration) { String value = configuration.get(ExternalDataConstants.KEY_HEADER); if (value != null) { @@ -281,12 +265,6 @@ public class ExternalDataUtils { return configuration.get(ExternalDataConstants.KEY_FEED_NAME); } - public static int getQueueSize(Map<String, String> configuration) { - return configuration.containsKey(ExternalDataConstants.KEY_QUEUE_SIZE) - ? Integer.parseInt(configuration.get(ExternalDataConstants.KEY_QUEUE_SIZE)) - : ExternalDataConstants.DEFAULT_QUEUE_SIZE; - } - public static boolean isRecordWithMeta(Map<String, String> configuration) { return configuration.containsKey(ExternalDataConstants.KEY_META_TYPE_NAME); } @@ -339,4 +317,42 @@ public class ExternalDataUtils { } return intIndicators; } + + /** + * Fills the configuration of the external dataset and its adapter with default values if not provided by user. + * + * @param configuration external data configuration + */ + public static void defaultConfiguration(Map<String, String> configuration) { + String format = configuration.get(ExternalDataConstants.KEY_FORMAT); + if (format != null) { + // default quote, escape character for quote and fields delimiter for csv and tsv format + if (format.equals(ExternalDataConstants.FORMAT_CSV)) { + configuration.putIfAbsent(ExternalDataConstants.KEY_DELIMITER, ExternalDataConstants.DEFAULT_DELIMITER); + configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE, ExternalDataConstants.DEFAULT_QUOTE); + configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE_ESCAPE, ExternalDataConstants.DEFAULT_QUOTE); + } else if (format.equals(ExternalDataConstants.FORMAT_TSV)) { + configuration.putIfAbsent(ExternalDataConstants.KEY_DELIMITER, ExternalDataConstants.TAB_STR); + configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE, ExternalDataConstants.NULL_STR); + configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE_ESCAPE, ExternalDataConstants.NULL_STR); + } + } + } + + /** + * Prepares the configuration of the external dataset and its adapter by filling the information required by + * adapters and parsers. + * + * @param adapterName adapter name + * @param configuration external data configuration + */ + public static void prepare(String adapterName, Map<String, String> configuration) { + if (!configuration.containsKey(ExternalDataConstants.KEY_READER)) { + configuration.put(ExternalDataConstants.KEY_READER, adapterName); + } + if (!configuration.containsKey(ExternalDataConstants.KEY_PARSER) + && configuration.containsKey(ExternalDataConstants.KEY_FORMAT)) { + configuration.put(ExternalDataConstants.KEY_PARSER, configuration.get(ExternalDataConstants.KEY_FORMAT)); + } + } } diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/parsers/BooleanParserFactory.java b/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/parsers/BooleanParserFactory.java new file mode 100644 index 0000000..488be04 --- /dev/null +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/parsers/BooleanParserFactory.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.hyracks.dataflow.common.data.parsers; + +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hyracks.api.exceptions.HyracksDataException; + +public class BooleanParserFactory implements IValueParserFactory { + + private static final long serialVersionUID = 1L; + + public static final IValueParserFactory INSTANCE = new BooleanParserFactory(); + + private BooleanParserFactory() { + } + + @Override + public IValueParser createValueParser() { + return BooleanParserFactory::parse; + } + + public static void parse(char[] buffer, int start, int length, DataOutput out) throws HyracksDataException { + try { + if (length == 4 && (buffer[start] == 't' || buffer[start] == 'T') + && (buffer[start + 1] == 'r' || buffer[start + 1] == 'R') + && (buffer[start + 2] == 'u' || buffer[start + 2] == 'U') + && (buffer[start + 3] == 'e' || buffer[start + 3] == 'E')) { + out.writeBoolean(true); + return; + } else if (length == 5 && (buffer[start] == 'f' || buffer[start] == 'F') + && (buffer[start + 1] == 'a' || buffer[start + 1] == 'A') + && (buffer[start + 2] == 'l' || buffer[start + 2] == 'L') + && (buffer[start + 3] == 's' || buffer[start + 3] == 'S') + && (buffer[start + 4] == 'e' || buffer[start + 4] == 'E')) { + out.writeBoolean(false); + return; + } + } catch (IOException e) { + throw HyracksDataException.create(e); + } + + throw new HyracksDataException("Invalid input data"); + } +} diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java index 9ddb4c2..2eb882a 100644 --- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java @@ -76,12 +76,11 @@ public class DelimitedDataTupleParserFactory implements ITupleParserFactory { break; } // Eliminate double quotes in the field that we are going to parse - if (cursor.isDoubleQuoteIncludedInThisField) { - cursor.eliminateDoubleQuote(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart); - cursor.fEnd -= cursor.doubleQuoteCount; - cursor.isDoubleQuoteIncludedInThisField = false; + if (cursor.fieldHasDoubleQuote()) { + cursor.eliminateDoubleQuote(); } - valueParsers[i].parse(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart, dos); + valueParsers[i].parse(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength(), + dos); tb.addFieldEndOffset(); } FrameUtils.appendToWriter(writer, appender, tb.getFieldEndOffsets(), tb.getByteArray(), 0, diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java index 7e5ee2c..fd3e4c3 100644 --- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java @@ -32,18 +32,18 @@ public class FieldCursorForDelimitedDataParser { EOF //end of stream reached } - public char[] buffer; //buffer to holds the input coming form the underlying input stream - public int fStart; //start position for field - public int fEnd; //end position for field - public int recordCount; //count of records - public int fieldCount; //count of fields in current record - public int doubleQuoteCount; //count of double quotes - public boolean isDoubleQuoteIncludedInThisField; //does current field include double quotes + private char[] buffer; //buffer to holds the input coming form the underlying input stream + private int fStart; //start position for field + private int fEnd; //end position for field + private int recordCount; //count of records + private int fieldCount; //count of fields in current record + private int doubleQuoteCount; //count of double quotes + private boolean isDoubleQuoteIncludedInThisField; //does current field include double quotes private static final int INITIAL_BUFFER_SIZE = 4096;//initial buffer size private static final int INCREMENT = 4096; //increment size - private Reader in; //the underlying buffer + private final Reader in; //the underlying buffer private int start; //start of valid buffer area private int end; //end of valid buffer area @@ -55,8 +55,8 @@ public class FieldCursorForDelimitedDataParser { private int quoteCount; //count of single quotes private boolean startedQuote; //whether a quote has been started - private char quote; //the quote character - private char fieldDelimiter; //the delimiter + private final char quote; //the quote character + private final char fieldDelimiter; //the delimiter public FieldCursorForDelimitedDataParser(Reader in, char fieldDelimiter, char quote) { this.in = in; @@ -70,9 +70,9 @@ public class FieldCursorForDelimitedDataParser { state = State.INIT; this.quote = quote; this.fieldDelimiter = fieldDelimiter; - lastDelimiterPosition = -99; - lastQuotePosition = -99; - lastDoubleQuotePosition = -99; + lastDelimiterPosition = -1; + lastQuotePosition = -1; + lastDoubleQuotePosition = -1; quoteCount = 0; doubleQuoteCount = 0; startedQuote = false; @@ -81,9 +81,44 @@ public class FieldCursorForDelimitedDataParser { fieldCount = 0; } - public void nextRecord(char[] buffer, int recordLength) throws IOException { + public char[] getBuffer() { + return buffer; + } + + public int getFieldStart() { + return fStart; + } + + public int getFieldLength() { + return fEnd - fStart; + } + + public boolean isFieldEmpty() { + return fStart == fEnd; + } + + public boolean fieldHasDoubleQuote() { + return isDoubleQuoteIncludedInThisField; + } + + public int getFieldCount() { + return fieldCount; + } + + public int getRecordCount() { + return recordCount; + } + + public void nextRecord(char[] buffer, int recordLength) { recordCount++; fieldCount = 0; + lastDelimiterPosition = -1; + lastQuotePosition = -1; + lastDoubleQuotePosition = -1; + quoteCount = 0; + doubleQuoteCount = 0; + startedQuote = false; + isDoubleQuoteIncludedInThisField = false; start = 0; end = recordLength; state = State.IN_RECORD; @@ -187,7 +222,6 @@ public class FieldCursorForDelimitedDataParser { } public boolean nextField() throws IOException { - fieldCount++; switch (state) { case INIT: case EOR: @@ -196,12 +230,12 @@ public class FieldCursorForDelimitedDataParser { return false; case IN_RECORD: - boolean eof; + fieldCount++; // reset quote related values startedQuote = false; isDoubleQuoteIncludedInThisField = false; - lastQuotePosition = -99; - lastDoubleQuotePosition = -99; + lastQuotePosition = -1; + lastDoubleQuotePosition = -1; quoteCount = 0; doubleQuoteCount = 0; @@ -209,21 +243,26 @@ public class FieldCursorForDelimitedDataParser { while (true) { if (p >= end) { int s = start; - eof = !readMore(); + boolean eof = !readMore(); p -= (s - start); - lastQuotePosition -= (s - start); - lastDoubleQuotePosition -= (s - start); - lastDelimiterPosition -= (s - start); + lastQuotePosition -= (lastQuotePosition > -1) ? (s - start) : 0; + lastDoubleQuotePosition -= (lastDoubleQuotePosition > -1) ? (s - start) : 0; + lastDelimiterPosition -= (lastDelimiterPosition > -1) ? (s - start) : 0; if (eof) { state = State.EOF; - if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 - && quoteCount == doubleQuoteCount * 2 + 2) { - // set the position of fStart to +1, fEnd to -1 to remove quote character - fStart = start + 1; - fEnd = p - 1; - } else { + if (!startedQuote) { fStart = start; fEnd = p; + } else { + if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 + && quoteCount == doubleQuoteCount * 2 + 2) { + // set the position of fStart to +1, fEnd to -1 to remove quote character + fStart = start + 1; + fEnd = p - 1; + } else { + throw new IOException("At record: " + recordCount + ", field#: " + fieldCount + + " - missing a closing quote"); + } } return true; } @@ -232,12 +271,12 @@ public class FieldCursorForDelimitedDataParser { if (ch == quote) { // If this is first quote in the field, then it needs to be placed in the beginning. if (!startedQuote) { - if (lastDelimiterPosition == p - 1 || lastDelimiterPosition == -99) { + if (p == start) { startedQuote = true; } else { // In this case, we don't have a quote in the beginning of a field. throw new IOException("At record: " + recordCount + ", field#: " + fieldCount - + " - a quote enclosing a field needs to be placed in the beginning of that field."); + + " - a quote enclosing a field needs to be placed in the beginning of that field"); } } // Check double quotes - "". We check [start != p-2] @@ -245,8 +284,8 @@ public class FieldCursorForDelimitedDataParser { // since it looks like a double quote. However, it's not a double quote. // (e.g. if field2 has no value: // field1,"",field3 ... ) - if (lastQuotePosition == p - 1 && lastDelimiterPosition != p - 2 - && lastDoubleQuotePosition != p - 1) { + if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 + && lastQuotePosition != start) { isDoubleQuoteIncludedInThisField = true; doubleQuoteCount++; lastDoubleQuotePosition = p; @@ -262,64 +301,46 @@ public class FieldCursorForDelimitedDataParser { start = p + 1; lastDelimiterPosition = p; return true; - } else if (startedQuote) { - if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1) { - // There is a quote right before the delimiter (e.g. ",) and it is not two quote, - // then the field contains a valid string. - // We set the position of fStart to +1, fEnd to -1 to remove quote character - fStart = start + 1; - fEnd = p - 1; - start = p + 1; - lastDelimiterPosition = p; - startedQuote = false; - return true; - } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastDoubleQuotePosition - && quoteCount == doubleQuoteCount * 2 + 2) { - // There is a quote before the delimiter, however it is not directly placed before the delimiter. - // In this case, we throw an exception. - // quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes. - throw new IOException("At record: " + recordCount + ", field#: " + fieldCount - + " - A quote enclosing a field needs to be followed by the delimiter."); - } } - // If the control flow reaches here: we have a delimiter in this field and - // there should be a quote in the beginning and the end of - // this field. So, just continue reading next character - } else if (ch == '\n') { - if (!startedQuote) { - fStart = start; - fEnd = p; - start = p + 1; - state = State.EOR; - lastDelimiterPosition = p; - return true; - } else if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 - && quoteCount == doubleQuoteCount * 2 + 2) { - // set the position of fStart to +1, fEnd to -1 to remove quote character + + if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 + && lastQuotePosition != start) { + // There is a quote right before the delimiter (e.g. ",) and it is not two quote, + // then the field contains a valid string. + // We set the position of fStart to +1, fEnd to -1 to remove quote character fStart = start + 1; fEnd = p - 1; - lastDelimiterPosition = p; start = p + 1; - state = State.EOR; + lastDelimiterPosition = p; startedQuote = false; return true; + } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastDoubleQuotePosition + && quoteCount == doubleQuoteCount * 2 + 2) { + // There is a quote before the delimiter, however it is not directly placed before the delimiter. + // In this case, we throw an exception. + // quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes. + throw new IOException("At record: " + recordCount + ", field#: " + fieldCount + + " - A quote enclosing a field needs to be followed by the delimiter."); } - } else if (ch == '\r') { + // If the control flow reaches here: we have a delimiter in this field and + // there should be a quote in the beginning and the end of + // this field. So, just continue reading next character + } else if (ch == '\n' || ch == '\r') { if (!startedQuote) { fStart = start; fEnd = p; start = p + 1; - state = State.CR; + state = ch == '\n' ? State.EOR : State.CR; lastDelimiterPosition = p; return true; - } else if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 + } else if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1 && quoteCount == doubleQuoteCount * 2 + 2) { // set the position of fStart to +1, fEnd to -1 to remove quote character fStart = start + 1; fEnd = p - 1; lastDelimiterPosition = p; start = p + 1; - state = State.CR; + state = ch == '\n' ? State.EOR : State.CR; startedQuote = false; return true; } @@ -330,7 +351,10 @@ public class FieldCursorForDelimitedDataParser { throw new IllegalStateException(); } - protected boolean readMore() throws IOException { + private boolean readMore() throws IOException { + if (in == null) { + return false; + } if (start > 0) { System.arraycopy(buffer, start, buffer, 0, end - start); } @@ -350,10 +374,11 @@ public class FieldCursorForDelimitedDataParser { } // Eliminate escaped double quotes("") in a field - public void eliminateDoubleQuote(char[] buffer, int start, int length) { - int lastDoubleQuotePosition = -99; - int writepos = start; - int readpos = start; + public void eliminateDoubleQuote() { + int lastDoubleQuotePosition = -1; + int writepos = fStart; + int readpos = fStart; + int length = fEnd - fStart; // Find positions where double quotes appear for (int i = 0; i < length; i++) { // Skip double quotes @@ -369,5 +394,7 @@ public class FieldCursorForDelimitedDataParser { readpos++; } } + fEnd -= doubleQuoteCount; + isDoubleQuoteIncludedInThisField = false; } } diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java index e663179..8edcafc 100644 --- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java +++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java @@ -51,9 +51,8 @@ public class CursorTest { while (cursor.nextRecord()) { int fieldNumber = 0; while (cursor.nextField()) { - if (cursor.isDoubleQuoteIncludedInThisField) { - cursor.eliminateDoubleQuote(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart); - cursor.fEnd -= cursor.doubleQuoteCount; + if (cursor.fieldHasDoubleQuote()) { + cursor.eliminateDoubleQuote(); } fieldNumber++; }
