This is an automated email from the ASF dual-hosted git repository.
wyk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git
The following commit(s) were added to refs/heads/master by this push:
new 5b136edd6d [ASTERIXDB-3392] Add parquet format for COPY TO
5b136edd6d is described below
commit 5b136edd6d16dd9fca24353b3651e20edbadf2b0
Author: preetham0202 <[email protected]>
AuthorDate: Fri Mar 8 14:02:14 2024 +0530
[ASTERIXDB-3392] Add parquet format for COPY TO
Change-Id: I40dc16969e66af09cde04b460f441af666b39d51
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18209
Integration-Tests: Jenkins <[email protected]>
Reviewed-by: <[email protected]>
Reviewed-by: Wail Alkowaileet <[email protected]>
Tested-by: Wail Alkowaileet <[email protected]>
---
.../parquet-error-checks.01.ddl.sqlpp | 32 +++
.../parquet-error-checks.02.update.sqlpp | 38 ++++
.../parquet-error-checks.03.update.sqlpp | 49 +++++
.../parquet-error-checks.04.update.sqlpp | 47 ++++
.../parquet-error-checks.05.update.sqlpp | 43 ++++
.../parquet-error-checks.06.update.sqlpp | 48 +++++
.../parquet-error-checks.07.update.sqlpp | 40 ++++
.../parquet-error-checks.08.update.sqlpp | 37 ++++
.../parquet-error-checks.09.update.sqlpp | 37 ++++
.../parquet-cover-data-types.01.ddl.sqlpp | 30 +++
.../parquet-cover-data-types.02.update.sqlpp | 33 +++
.../parquet-cover-data-types.03.update.sqlpp | 46 ++++
.../parquet-cover-data-types.04.ddl.sqlpp | 37 ++++
.../parquet-cover-data-types.05.query.sqlpp | 27 +++
.../parquet-simple/parquet-simple.01.ddl.sqlpp | 25 +++
.../parquet-simple/parquet-simple.02.update.sqlpp | 39 ++++
.../parquet-simple/parquet-simple.03.ddl.sqlpp | 35 +++
.../parquet-simple/parquet-simple.04.query.sqlpp | 25 +++
.../parquet-tweet/parquet-tweet.01.ddl.sqlpp | 35 +++
.../parquet-tweet/parquet-tweet.02.update.sqlpp | 26 +++
.../parquet-tweet/parquet-tweet.03.update.sqlpp | 240 +++++++++++++++++++++
.../parquet-tweet/parquet-tweet.04.ddl.sqlpp | 34 +++
.../parquet-tweet/parquet-tweet.05.query.sqlpp | 27 +++
.../copy-to/parquet-utf8/parquet-utf8.01.ddl.sqlpp | 35 +++
.../parquet-utf8/parquet-utf8.02.update.sqlpp | 26 +++
.../parquet-utf8/parquet-utf8.03.update.sqlpp | 42 ++++
.../copy-to/parquet-utf8/parquet-utf8.04.ddl.sqlpp | 34 +++
.../parquet-utf8/parquet-utf8.05.query.sqlpp | 27 +++
.../parquet-cover-data-types.05.adm | 1 +
.../copy-to/parquet-simple/parquet-simple.04.adm | 1 +
.../copy-to/parquet-tweet/parquet-tweet.05.adm | 2 +
.../copy-to/parquet-utf8/parquet-utf8.05.adm | 8 +
.../runtimets/testsuite_external_dataset_s3.xml | 37 +++-
.../asterix/common/exceptions/ErrorCode.java | 2 +
.../src/main/resources/asx_errormsg/en.properties | 2 +
.../external/util/ExternalDataConstants.java | 19 +-
.../apache/asterix/external/util/HDFSUtils.java | 5 +
.../external/util/WriterValidationUtil.java | 84 +++++++-
.../writer/printer/ParquetExternalFilePrinter.java | 110 ++++++++++
.../printer/ParquetExternalFilePrinterFactory.java | 47 ++++
.../external/writer/printer/ParquetOutputFile.java | 57 +++++
.../printer/parquet/AsterixParquetWriter.java | 94 ++++++++
.../printer/parquet/FieldNamesDictionary.java | 59 +++++
.../writer/printer/parquet/ObjectWriteSupport.java | 66 ++++++
.../printer/parquet/ParquetRecordLazyVisitor.java | 180 ++++++++++++++++
.../printer/parquet/ParquetRecordVisitorUtils.java | 202 +++++++++++++++++
.../metadata/provider/ExternalWriterProvider.java | 54 ++++-
.../apache/hyracks/api/exceptions/ErrorCode.java | 3 +
.../src/main/resources/errormsg/en.properties | 3 +
49 files changed, 2214 insertions(+), 16 deletions(-)
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.01.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.01.ddl.sqlpp
new file mode 100644
index 0000000000..36d00be8f4
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.01.ddl.sqlpp
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test if exists;
+CREATE DATAVERSE test;
+USE test;
+
+
+CREATE TYPE ColumnType1 AS {
+ id: integer
+};
+
+CREATE COLLECTION TestCollection(ColumnType1) PRIMARY KEY id;
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.02.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.02.update.sqlpp
new file mode 100644
index 0000000000..23d9316934
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.02.update.sqlpp
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+COPY (
+ select c.* from TestCollection c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-error-checks2")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message schema{"
+}
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.03.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.03.update.sqlpp
new file mode 100644
index 0000000000..2e811f28ce
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.03.update.sqlpp
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+insert into TestCollection({"id":1, "name": "John", "nested" : { "first" : "john" , "second":"JOHN" } });
+
+COPY (
+ select c.* from TestCollection c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-error-checks3")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message schema {
+ optional binary id (UTF8);
+ optional binary name (UTF8);
+ optional group nested {
+ optional binary first (UTF8);
+ optional binary second (UTF8);
+ }
+ }
+"
+}
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.04.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.04.update.sqlpp
new file mode 100644
index 0000000000..9933fe29dd
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.04.update.sqlpp
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+
+COPY (
+ select c.* from TestCollection c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-error-checks4")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message schema {
+ optional int64 id;
+ optional binary name (UTF8);
+ optional group nested {
+ optional binary first (UTF8);
+ }
+ }
+"
+}
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.05.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.05.update.sqlpp
new file mode 100644
index 0000000000..cf3a450488
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.05.update.sqlpp
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+COPY (
+ select c.* from TestCollection c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-error-checks5")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message schema {
+ optional int64 id;
+ optional binary name (UTF8);
+ optional binary nested (UTF8);
+ }"
+}
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.06.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.06.update.sqlpp
new file mode 100644
index 0000000000..2d53754e42
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.06.update.sqlpp
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+COPY (
+ select c.* from TestCollection c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-error-checks6")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message spark_schema {
+ optional int64 id;
+ optional group name {
+ optional binary first (UTF8);
+ }
+ optional group nested {
+ optional binary first (UTF8);
+ optional binary second (UTF8);
+ }
+ }"
+}
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.07.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.07.update.sqlpp
new file mode 100644
index 0000000000..dcb0dcdcae
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.07.update.sqlpp
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+COPY (
+ select c.* from TestCollection c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-error-checks7")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message schema{}",
+ "row-group-size":"random"
+}
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.08.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.08.update.sqlpp
new file mode 100644
index 0000000000..9856c4ab76
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.08.update.sqlpp
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+COPY (
+ select c.* from TestCollection c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-error-checks8")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message schema{}",
+ "page-size":"random"
+}
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.09.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.09.update.sqlpp
new file mode 100644
index 0000000000..b8579a59ca
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.09.update.sqlpp
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+COPY (
+ select c.* from TestCollection c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-error-checks9")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "compression":"rar",
+ "schema":""
+}
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.01.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.01.ddl.sqlpp
new file mode 100644
index 0000000000..56e79c8247
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.01.ddl.sqlpp
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test if exists;
+CREATE DATAVERSE test;
+USE test;
+
+
+CREATE TYPE ColumnType1 AS {
+ id: integer,
+ name : string
+};
+
+CREATE COLLECTION TestCollection(ColumnType1) PRIMARY KEY id;
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.02.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.02.update.sqlpp
new file mode 100644
index 0000000000..ec1ac0c708
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.02.update.sqlpp
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Description : create a dataset using year-month-duration as the primary key
+ * Expected Res : Success
+ * Date : 7 May 2013
+ * Issue : 363
+ */
+
+use test;
+/*
+insert into TestCollection({"id":`year-month-duration`("P16Y"), "name": "John"});
+insert into TestCollection({"id":`day-time-duration`("-P3829H849.392S"), "name": "Alex"});
+*/
+
+insert into TestCollection({"id":18, "name": "Virat" , "dateType":date("1988-11-05"), "timeType": time("03:10:00.493Z") , "boolType" : false , "doubleType" : 0.75, "datetimeType" : datetime("1900-02-01T00:00:00") });
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.03.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.03.update.sqlpp
new file mode 100644
index 0000000000..c2606b7797
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.03.update.sqlpp
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+COPY (
+ select c.* from TestCollection c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-cover-data-types")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message schema {
+ optional binary name (UTF8);
+ optional int32 id;
+ optional int32 dateType (DATE);
+ optional int32 timeType (TIME_MILLIS);
+ optional boolean boolType ;
+ optional double doubleType ;
+ optional int64 datetimeType (TIMESTAMP_MILLIS);
+ }"
+};
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.04.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.04.ddl.sqlpp
new file mode 100644
index 0000000000..310198ed41
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.04.ddl.sqlpp
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+CREATE TYPE ColumnType2 AS {
+};
+
+
+
+CREATE EXTERNAL DATASET TestDataset(ColumnType2) USING S3
+(
+ ("serviceEndpoint"="http://127.0.0.1:8001"),
+ ("region"="us-west-2"),
+ ("container"="playground"),
+ ("definition"="copy-to-result/parquet-cover-data-types/"),
+ ("include"="*.parquet"),
+ ("requireVersionChangeDetection"="false"),
+ ("format" = "parquet")
+);
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.05.query.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.05.query.sqlpp
new file mode 100644
index 0000000000..b03fc5e726
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-cover-data-types/parquet-cover-data-types.05.query.sqlpp
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+SELECT c.*
+FROM TestDataset c
+ORDER BY c.id;
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp
new file mode 100644
index 0000000000..76970a5579
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test if exists;
+CREATE DATAVERSE test;
+USE test;
+
+CREATE TYPE ColumnType2 AS {
+};
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp
new file mode 100644
index 0000000000..745ac872b1
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+COPY (
+ select "123" as id
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-simple")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message schema {
+ optional binary id (UTF8);
+ }"
+
+};
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.03.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.03.ddl.sqlpp
new file mode 100644
index 0000000000..a5d7789f46
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.03.ddl.sqlpp
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+/* External dataset over the *.parquet files found under copy-to-result/parquet-simple in container playground. */
+CREATE EXTERNAL DATASET DatasetCopy(ColumnType2)
+USING S3 (
+ ("accessKeyId"="dummyAccessKey"),
+ ("secretAccessKey"="dummySecretKey"),
+ ("sessionToken"="dummySessionToken"),
+ ("region"="us-west-2"),
+ ("serviceEndpoint"="http://127.0.0.1:8001"),
+ ("container"="playground"),
+ ("definition"="copy-to-result/parquet-simple"),
+ ("format"="parquet"),
+ ("include"="*.parquet"),
+ ("requireVersionChangeDetection"="false")
+);
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.04.query.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.04.query.sqlpp
new file mode 100644
index 0000000000..5aeedb8cdf
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.04.query.sqlpp
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+/* Read the id field back from the external dataset DatasetCopy, ordered by id. */
+SELECT id
+FROM DatasetCopy AS c
+ORDER BY c.id ASC;
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.01.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.01.ddl.sqlpp
new file mode 100644
index 0000000000..f890e0d9fc
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.01.ddl.sqlpp
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test IF EXISTS;
+CREATE DATAVERSE test;
+USE test;
+
+/* Type of the source dataset; only the primary-key field id is declared. */
+CREATE TYPE ColumnType1 AS { id: string };
+
+/* Source dataset, created with the column storage format. */
+CREATE DATASET DummyTweetDataset(ColumnType1)
+PRIMARY KEY id
+WITH {
+ "storage-format": { "format": "column" }
+};
+
+/* Type with no declared fields. */
+CREATE TYPE ColumnType2 AS { };
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.02.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.02.update.sqlpp
new file mode 100644
index 0000000000..83a1140f96
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.02.update.sqlpp
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+/* Load DummyTweetDataset from the JSON file dummy_tweet.json through the localfs adapter. */
+LOAD DATASET DummyTweetDataset
+USING localfs (
+ ("path"="asterix_nc1://data/hdfs/parquet/dummy_tweet.json"),
+ ("format"="json")
+);
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.03.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.03.update.sqlpp
new file mode 100644
index 0000000000..179d25cc6d
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.03.update.sqlpp
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+/* Copy every record of DummyTweetDataset to S3 as Parquet, under the PATH components copy-to-result and parquet-tweet, using the explicit nested schema given in the "schema" option. */
+COPY (
+ SELECT c.* FROM DummyTweetDataset c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-tweet")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message schema {
+ optional group coordinates {
+ optional group coordinates (LIST) {
+ repeated group list {
+ optional double element;
+ }
+ }
+ optional binary type (UTF8);
+ }
+ optional binary created_at (UTF8);
+ optional group entities {
+ optional group urls (LIST) {
+ repeated group list {
+ optional group element {
+ optional binary display_url (UTF8);
+ optional binary expanded_url (UTF8);
+ optional group indices (LIST) {
+ repeated group list {
+ optional int64 element;
+ }
+ }
+ optional binary url (UTF8);
+ }
+ }
+ }
+ optional group user_mentions (LIST) {
+ repeated group list {
+ optional group element {
+ optional int64 id;
+ optional binary id_str (UTF8);
+ optional group indices (LIST) {
+ repeated group list {
+ optional int64 element;
+ }
+ }
+ optional binary name (UTF8);
+ optional binary screen_name (UTF8);
+ }
+ }
+ }
+ }
+ optional int64 favorite_count;
+ optional boolean favorited;
+ optional binary filter_level (UTF8);
+ optional group geo {
+ optional group coordinates (LIST) {
+ repeated group list {
+ optional double element;
+ }
+ }
+ optional binary type (UTF8);
+ }
+ optional binary id (UTF8);
+ optional binary id_str (UTF8);
+ optional binary in_reply_to_screen_name (UTF8);
+ optional int64 in_reply_to_status_id;
+ optional binary in_reply_to_status_id_str (UTF8);
+ optional int64 in_reply_to_user_id;
+ optional binary in_reply_to_user_id_str (UTF8);
+ optional boolean is_quote_status;
+ optional binary lang (UTF8);
+ optional group place {
+ optional group bounding_box {
+ optional group coordinates (LIST) {
+ repeated group list {
+ optional group element (LIST) {
+ repeated group list {
+ optional group element (LIST) {
+ repeated group list {
+ optional double element;
+ }
+ }
+ }
+ }
+ }
+ }
+ optional binary type (UTF8);
+ }
+ optional binary country (UTF8);
+ optional binary country_code (UTF8);
+ optional binary full_name (UTF8);
+ optional binary id (UTF8);
+ optional binary name (UTF8);
+ optional binary place_type (UTF8);
+ optional binary url (UTF8);
+ }
+ optional boolean possibly_sensitive;
+ optional group quoted_status {
+ optional binary created_at (UTF8);
+ optional group entities {
+ optional group user_mentions (LIST) {
+ repeated group list {
+ optional group element {
+ optional int64 id;
+ optional binary id_str (UTF8);
+ optional group indices (LIST) {
+ repeated group list {
+ optional int64 element;
+ }
+ }
+ optional binary name (UTF8);
+ optional binary screen_name (UTF8);
+ }
+ }
+ }
+ }
+ optional int64 favorite_count;
+ optional boolean favorited;
+ optional binary filter_level (UTF8);
+ optional int64 id;
+ optional binary id_str (UTF8);
+ optional binary in_reply_to_screen_name (UTF8);
+ optional int64 in_reply_to_status_id;
+ optional binary in_reply_to_status_id_str (UTF8);
+ optional int64 in_reply_to_user_id;
+ optional binary in_reply_to_user_id_str (UTF8);
+ optional boolean is_quote_status;
+ optional binary lang (UTF8);
+ optional int64 retweet_count;
+ optional boolean retweeted;
+ optional binary source (UTF8);
+ optional binary text (UTF8);
+ optional boolean truncated;
+ optional group user {
+ optional boolean contributors_enabled;
+ optional binary created_at (UTF8);
+ optional boolean default_profile;
+ optional boolean default_profile_image;
+ optional binary description (UTF8);
+ optional int64 favourites_count;
+ optional int64 followers_count;
+ optional int64 friends_count;
+ optional boolean geo_enabled;
+ optional int64 id;
+ optional binary id_str (UTF8);
+ optional boolean is_translator;
+ optional binary lang (UTF8);
+ optional int64 listed_count;
+ optional binary name (UTF8);
+ optional binary profile_background_color (UTF8);
+ optional binary profile_background_image_url (UTF8);
+ optional binary profile_background_image_url_https (UTF8);
+ optional boolean profile_background_tile;
+ optional binary profile_banner_url (UTF8);
+ optional binary profile_image_url (UTF8);
+ optional binary profile_image_url_https (UTF8);
+ optional binary profile_link_color (UTF8);
+ optional binary profile_sidebar_border_color (UTF8);
+ optional binary profile_sidebar_fill_color (UTF8);
+ optional binary profile_text_color (UTF8);
+ optional boolean profile_use_background_image;
+ optional boolean protected;
+ optional binary screen_name (UTF8);
+ optional int64 statuses_count;
+ optional boolean verified;
+ }
+ }
+ optional int64 quoted_status_id;
+ optional binary quoted_status_id_str (UTF8);
+ optional int64 retweet_count;
+ optional boolean retweeted;
+ optional binary source (UTF8);
+ optional binary text (UTF8);
+ optional binary timestamp_ms (UTF8);
+ optional boolean truncated;
+ optional group user {
+ optional boolean contributors_enabled;
+ optional binary created_at (UTF8);
+ optional boolean default_profile;
+ optional boolean default_profile_image;
+ optional binary description (UTF8);
+ optional int64 favourites_count;
+ optional int64 followers_count;
+ optional int64 friends_count;
+ optional boolean geo_enabled;
+ optional int64 id;
+ optional binary id_str (UTF8);
+ optional boolean is_translator;
+ optional binary lang (UTF8);
+ optional int64 listed_count;
+ optional binary location (UTF8);
+ optional binary name (UTF8);
+ optional binary profile_background_color (UTF8);
+ optional binary profile_background_image_url (UTF8);
+ optional binary profile_background_image_url_https (UTF8);
+ optional boolean profile_background_tile;
+ optional binary profile_banner_url (UTF8);
+ optional binary profile_image_url (UTF8);
+ optional binary profile_image_url_https (UTF8);
+ optional binary profile_link_color (UTF8);
+ optional binary profile_sidebar_border_color (UTF8);
+ optional binary profile_sidebar_fill_color (UTF8);
+ optional binary profile_text_color (UTF8);
+ optional boolean profile_use_background_image;
+ optional boolean protected;
+ optional binary screen_name (UTF8);
+ optional int64 statuses_count;
+ optional binary time_zone (UTF8);
+ optional binary url (UTF8);
+ optional int64 utc_offset;
+ optional boolean verified;
+ }
+ }"
+};
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.04.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.04.ddl.sqlpp
new file mode 100644
index 0000000000..1cf0c78f6d
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.04.ddl.sqlpp
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+/*
+ * External dataset over the *.parquet files found under copy-to-result/parquet-tweet/ in container playground.
+ */
+CREATE EXTERNAL DATASET DummyTweetDatasetCopy(ColumnType2)
+USING S3 (
+ ("region"="us-west-2"),
+ ("serviceEndpoint"="http://127.0.0.1:8001"),
+ ("container"="playground"),
+ ("definition"="copy-to-result/parquet-tweet/"),
+ ("format"="parquet"),
+ ("include"="*.parquet"),
+ ("requireVersionChangeDetection"="false")
+);
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.05.query.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.05.query.sqlpp
new file mode 100644
index 0000000000..13587f67b5
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-tweet/parquet-tweet.05.query.sqlpp
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+/* Read every field of every record back from DummyTweetDatasetCopy, ordered by id. */
+SELECT c.*
+FROM DummyTweetDatasetCopy AS c
+ORDER BY c.id ASC;
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.01.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.01.ddl.sqlpp
new file mode 100644
index 0000000000..dfc64ce0aa
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.01.ddl.sqlpp
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test IF EXISTS;
+CREATE DATAVERSE test;
+USE test;
+
+/* Type of the source dataset; only the primary-key field id is declared. */
+CREATE TYPE ColumnType1 AS { id: int };
+
+/* Source dataset, created with the column storage format. */
+CREATE DATASET NameCommentDataset(ColumnType1)
+PRIMARY KEY id
+WITH {
+ "storage-format": { "format": "column" }
+};
+
+/* Type with no declared fields. */
+CREATE TYPE ColumnType2 AS { };
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.02.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.02.update.sqlpp
new file mode 100644
index 0000000000..85913693c8
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.02.update.sqlpp
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+/* Load NameCommentDataset from the JSON file id_name_comment.json through the localfs adapter. */
+LOAD DATASET NameCommentDataset
+USING localfs (
+ ("path"="asterix_nc1://data/hdfs/parquet/id_name_comment.json"),
+ ("format"="json")
+);
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.03.update.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.03.update.sqlpp
new file mode 100644
index 0000000000..ef76ad1f77
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.03.update.sqlpp
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+/* Copy every record of NameCommentDataset to S3 as Parquet, under the PATH components copy-to-result and parquet-utf8, using a three-field schema (comment, id, name). */
+COPY (
+ SELECT c.* FROM NameCommentDataset c
+) toWriter
+TO S3
+PATH ("copy-to-result", "parquet-utf8")
+WITH {
+ "accessKeyId":"dummyAccessKey",
+ "secretAccessKey":"dummySecretKey",
+ "region":"us-west-2",
+ "serviceEndpoint":"http://127.0.0.1:8001",
+ "container":"playground",
+ "format":"parquet",
+ "schema":"message spark_schema {
+ optional binary comment (UTF8);
+ optional int64 id;
+ optional binary name (UTF8);
+ }"
+};
+
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.04.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.04.ddl.sqlpp
new file mode 100644
index 0000000000..4fd41f6ec2
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.04.ddl.sqlpp
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+/*
+ * External dataset over the *.parquet files found under copy-to-result/parquet-utf8/ in container playground.
+ */
+CREATE EXTERNAL DATASET NameCommentDatasetCopy(ColumnType2)
+USING S3 (
+ ("region"="us-west-2"),
+ ("serviceEndpoint"="http://127.0.0.1:8001"),
+ ("container"="playground"),
+ ("definition"="copy-to-result/parquet-utf8/"),
+ ("format"="parquet"),
+ ("include"="*.parquet"),
+ ("requireVersionChangeDetection"="false")
+);
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.05.query.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.05.query.sqlpp
new file mode 100644
index 0000000000..17cd027abd
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-utf8/parquet-utf8.05.query.sqlpp
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+/* Read every field of every record back from NameCommentDatasetCopy, ordered by id. */
+SELECT c.*
+FROM NameCommentDatasetCopy AS c
+ORDER BY c.id ASC;
+
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-cover-data-types/parquet-cover-data-types.05.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-cover-data-types/parquet-cover-data-types.05.adm
new file mode 100644
index 0000000000..8fc863ef6c
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-cover-data-types/parquet-cover-data-types.05.adm
@@ -0,0 +1 @@
+{ "name": "Virat", "id": 18, "dateType": date("1988-11-05"), "timeType": time("03:10:00.493"), "boolType": false, "doubleType": 0.75, "datetimeType": datetime("1900-02-01T00:00:00.000") }
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-simple/parquet-simple.04.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-simple/parquet-simple.04.adm
new file mode 100644
index 0000000000..bf567b24c1
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-simple/parquet-simple.04.adm
@@ -0,0 +1 @@
+{ "id": "123" }
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-tweet/parquet-tweet.05.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-tweet/parquet-tweet.05.adm
new file mode 100644
index 0000000000..5e0df967f3
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-tweet/parquet-tweet.05.adm
@@ -0,0 +1,2 @@
+{ "coordinates": { "coordinates": [ 1.1 ], "type": "string" }, "created_at":
"string", "entities": { "urls": [ { "display_url": "string", "expanded_url":
"string", "indices": [ 1 ], "url": "string" } ], "user_mentions": [ { "id": 1,
"id_str": "string", "indices": [ 1 ], "name": "string", "screen_name": "string"
} ] }, "favorite_count": 1, "favorited": true, "filter_level": "string", "geo":
{ "coordinates": [ 1.1 ], "type": "string" }, "id": "0000000", "id_str":
"string", "in_reply_to_scr [...]
+{ "coordinates": { "coordinates": [ 1.1 ], "type": "string" }, "created_at":
"string", "favorite_count": 1, "favorited": true, "filter_level": "string",
"geo": { "coordinates": [ 1.1 ], "type": "string" }, "id":
"11111111111111111111", "id_str": "string", "in_reply_to_screen_name":
"string", "in_reply_to_status_id": 1, "in_reply_to_status_id_str": "string",
"in_reply_to_user_id": 1, "in_reply_to_user_id_str": "string",
"is_quote_status": true, "lang": "string", "place": { "bounding_box": [...]
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-utf8/parquet-utf8.05.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-utf8/parquet-utf8.05.adm
new file mode 100644
index 0000000000..c60145d7f4
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-utf8/parquet-utf8.05.adm
@@ -0,0 +1,8 @@
+{ "id": 1, "name": "John" }
+{ "id": 2, "name": "Abel" }
+{ "id": 3, "name": "Sandy" }
+{ "id": 4, "name": "Alex" }
+{ "id": 5, "name": "Mike" }
+{ "id": 6, "name": "Tom" }
+{ "comment": "😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا", "id": 7, "name": "Jerry" }
+{ "comment": "😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃.
حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽
😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee
☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉
= 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉.
Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffe
e ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حس [...]
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
index db612826e0..8e31aa424c 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
@@ -39,6 +39,26 @@
<output-dir compare="Text">default-namespace</output-dir>
</compilation-unit>
</test-case>
+ <test-case FilePath="copy-to">
+ <compilation-unit name="parquet-simple">
+ <output-dir compare="Text">parquet-simple</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="copy-to">
+ <compilation-unit name="parquet-tweet">
+ <output-dir compare="Text">parquet-tweet</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="copy-to">
+ <compilation-unit name="parquet-utf8">
+ <output-dir compare="Text">parquet-utf8</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="copy-to">
+ <compilation-unit name="parquet-cover-data-types">
+ <output-dir compare="Text">parquet-cover-data-types</output-dir>
+ </compilation-unit>
+ </test-case>
<test-case FilePath="copy-to">
<compilation-unit name="empty-path">
<output-dir compare="Text">empty-path</output-dir>
@@ -80,8 +100,21 @@
<compilation-unit name="supported-adapter-format-compression">
<output-dir compare="Text">supported-adapter-format-compression</output-dir>
<expected-error>ASX1188: Unsupported writing adapter 'AZUREBLOB'. Supported adapters: [gcs, localfs, s3]</expected-error>
- <expected-error>ASX1189: Unsupported writing format 'csv'. Supported formats: [json]</expected-error>
- <expected-error>ASX1096: Unknown compression scheme rar. Supported schemes are [gzip]</expected-error>
+ <expected-error>ASX1189: Unsupported writing format 'csv'. Supported formats: [json, parquet]</expected-error>
+ <expected-error>ASX1202: Unsupported compression scheme rar. Supported schemes for json are [gzip]</expected-error>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="copy-to/negative">
+ <compilation-unit name="parquet-error-checks">
+ <output-dir compare="Text">parquet-error-checks</output-dir>
+ <expected-error>HYR0131: Invalid parquet schema provided</expected-error>
+ <expected-error>ASX0037: Type mismatch: expected value of type integer, but got the value of type BINARY</expected-error>
+ <expected-error>HYR0133: Extra field in the result, field 'second' does not exist at 'nested' in the schema</expected-error>
+ <expected-error>HYR0132: Result does not follow the schema, group type expected but found primitive type at 'nested'</expected-error>
+ <expected-error>HYR0132: Result does not follow the schema, primitive type expected but found group type at 'name'</expected-error>
+ <expected-error>ASX1201: Storage units expected for the field 'row-group-size' (e.g., 0.1KB, 100kb, 1mb, 3MB, 8.5GB ...). Provided 'random'</expected-error>
+ <expected-error>ASX1201: Storage units expected for the field 'page-size' (e.g., 0.1KB, 100kb, 1mb, 3MB, 8.5GB ...). Provided 'random'</expected-error>
+ <expected-error>ASX1202: Unsupported compression scheme rar. Supported schemes for parquet are [gzip, snappy, zstd]</expected-error>
</compilation-unit>
</test-case>
<test-case FilePath="copy-to/negative">
diff --git
a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
index 3364600ffe..5e59b97870 100644
---
a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
+++
b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
@@ -304,6 +304,8 @@ public enum ErrorCode implements IError {
DUPLICATE_FIELD_IN_PRIMARY_KEY(1198),
INCOMPATIBLE_FIELDS_IN_PRIMARY_KEY(1199),
PREFIX_SHOULD_NOT_START_WITH_SLASH(1200),
+ ILLEGAL_SIZE_PROVIDED(1201),
+ UNSUPPORTED_WRITER_COMPRESSION_SCHEME(1202),
// Feed errors
DATAFLOW_ILLEGAL_STATE(3001),
diff --git
a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
index 4b7da0c1ec..a07dddf2bb 100644
--- a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
+++ b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
@@ -305,6 +305,8 @@
1198 = Duplicate field '%1$s' in primary key
1199 = Fields '%1$s' and '%2$s' are incompatible for primary key
1200 = Prefix should not start with "/". Prefix: '%1$s'
+1201 = Storage units expected for the field '%1$s' (e.g., 0.1KB, 100kb, 1mb,
3MB, 8.5GB ...). Provided '%2$s'
+1202 = Unsupported compression scheme %1$s. Supported schemes for %2$s are %3$s
# Feed Errors
3001 = Illegal state.
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
index 3139be7f70..6a4b336100 100644
---
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
@@ -82,6 +82,11 @@ public class ExternalDataConstants {
public static final String KEY_EXPRESSION = "expression";
public static final String KEY_LOCAL_SOCKET_PATH = "local-socket-path";
public static final String KEY_FORMAT = "format";
+ public static final String KEY_SCHEMA = "schema";
+ public static final String KEY_PARQUET_ROW_GROUP_SIZE = "row-group-size";
+ public static final String PARQUET_DEFAULT_ROW_GROUP_SIZE = "10MB";
+ public static final String KEY_PARQUET_PAGE_SIZE = "page-size";
+ public static final String PARQUET_DEFAULT_PAGE_SIZE = "8KB";
public static final String KEY_INCLUDE = "include";
public static final String KEY_EXCLUDE = "exclude";
public static final String KEY_QUOTE = "quote";
@@ -161,6 +166,8 @@ public class ExternalDataConstants {
public static final String CLASS_NAME_PARQUET_INPUT_FORMAT =
"org.apache.asterix.external.input.record.reader.hdfs.parquet.MapredParquetInputFormat";
public static final String CLASS_NAME_HDFS_FILESYSTEM =
"org.apache.hadoop.hdfs.DistributedFileSystem";
+ public static final String S3A_CHANGE_DETECTION_REQUIRED =
"requireVersionChangeDetection";
+ public static final String S3A_CHANGE_DETECTION_REQUIRED_CONFIG_KEY =
"fs.s3a.change.detection.version.required";
/**
* input formats aliases
*/
@@ -306,6 +313,8 @@ public class ExternalDataConstants {
* Compression constants
*/
public static final String KEY_COMPRESSION_GZIP = "gzip";
+ public static final String KEY_COMPRESSION_SNAPPY = "snappy";
+ public static final String KEY_COMPRESSION_ZSTD = "zstd";
public static final String KEY_COMPRESSION_GZIP_COMPRESSION_LEVEL =
"gzipCompressionLevel";
/**
@@ -318,13 +327,17 @@ public class ExternalDataConstants {
public static final int WRITER_MAX_RESULT_MINIMUM = 1000;
public static final Set<String> WRITER_SUPPORTED_FORMATS;
public static final Set<String> WRITER_SUPPORTED_ADAPTERS;
- public static final Set<String> WRITER_SUPPORTED_COMPRESSION;
+ public static final Set<String> TEXTUAL_WRITER_SUPPORTED_COMPRESSION;
+ public static final Set<String> PARQUET_WRITER_SUPPORTED_COMPRESSION;
+ public static final int PARQUET_DICTIONARY_PAGE_SIZE = 1048576;
static {
- WRITER_SUPPORTED_FORMATS = Set.of(FORMAT_JSON_LOWER_CASE);
+ WRITER_SUPPORTED_FORMATS = Set.of(FORMAT_JSON_LOWER_CASE,
FORMAT_PARQUET);
WRITER_SUPPORTED_ADAPTERS =
Set.of(ALIAS_LOCALFS_ADAPTER.toLowerCase(),
KEY_ADAPTER_NAME_AWS_S3.toLowerCase(),
KEY_ADAPTER_NAME_GCS.toLowerCase());
- WRITER_SUPPORTED_COMPRESSION = Set.of(KEY_COMPRESSION_GZIP);
+ TEXTUAL_WRITER_SUPPORTED_COMPRESSION = Set.of(KEY_COMPRESSION_GZIP);
+ PARQUET_WRITER_SUPPORTED_COMPRESSION =
+ Set.of(KEY_COMPRESSION_GZIP, KEY_COMPRESSION_SNAPPY,
KEY_COMPRESSION_ZSTD);
}
public static class ParquetOptions {
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/HDFSUtils.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/HDFSUtils.java
index f7638b4696..6bc013a7bc 100644
---
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/HDFSUtils.java
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/HDFSUtils.java
@@ -218,6 +218,11 @@ public class HDFSUtils {
configureParquet(configuration, conf);
}
+ if
(configuration.containsKey(ExternalDataConstants.S3A_CHANGE_DETECTION_REQUIRED))
{
+
conf.set(ExternalDataConstants.S3A_CHANGE_DETECTION_REQUIRED_CONFIG_KEY,
+
configuration.get(ExternalDataConstants.S3A_CHANGE_DETECTION_REQUIRED));
+ }
+
return conf;
}
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/WriterValidationUtil.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/WriterValidationUtil.java
index 843600e6e6..0d82dd9724 100644
---
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/WriterValidationUtil.java
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/WriterValidationUtil.java
@@ -20,6 +20,10 @@ package org.apache.asterix.external.util;
import static
org.apache.asterix.common.exceptions.ErrorCode.INVALID_REQ_PARAM_VAL;
import static
org.apache.asterix.common.exceptions.ErrorCode.MINIMUM_VALUE_ALLOWED_FOR_PARAM;
+import static
org.apache.asterix.external.util.ExternalDataConstants.FORMAT_JSON_LOWER_CASE;
+import static
org.apache.asterix.external.util.ExternalDataConstants.FORMAT_PARQUET;
+import static
org.apache.asterix.external.util.ExternalDataConstants.KEY_PARQUET_PAGE_SIZE;
+import static
org.apache.asterix.external.util.ExternalDataConstants.KEY_PARQUET_ROW_GROUP_SIZE;
import static
org.apache.asterix.external.util.ExternalDataConstants.KEY_WRITER_MAX_RESULT;
import static
org.apache.asterix.external.util.ExternalDataConstants.WRITER_MAX_RESULT_MINIMUM;
@@ -31,6 +35,7 @@ import java.util.stream.Collectors;
import org.apache.asterix.common.exceptions.CompilationException;
import org.apache.asterix.common.exceptions.ErrorCode;
import org.apache.hyracks.api.exceptions.SourceLocation;
+import org.apache.hyracks.util.StorageUtil;
public class WriterValidationUtil {
@@ -41,7 +46,6 @@ public class WriterValidationUtil {
Map<String, String> configuration, SourceLocation sourceLocation)
throws CompilationException {
validateAdapter(adapter, supportedAdapters, sourceLocation);
validateFormat(configuration, sourceLocation);
- validateCompression(configuration, sourceLocation);
validateMaxResult(configuration, sourceLocation);
}
@@ -56,14 +60,66 @@ public class WriterValidationUtil {
String format = configuration.get(ExternalDataConstants.KEY_FORMAT);
checkSupported(ExternalDataConstants.KEY_FORMAT, format,
ExternalDataConstants.WRITER_SUPPORTED_FORMATS,
ErrorCode.UNSUPPORTED_WRITING_FORMAT, sourceLocation, false);
+ switch (format.toLowerCase()) {
+ case FORMAT_JSON_LOWER_CASE:
+ validateJSON(configuration, sourceLocation);
+ break;
+ case FORMAT_PARQUET:
+ validateParquet(configuration, sourceLocation);
+ break;
+ }
+ }
+
+    /**
+     * Validates the parquet-specific writer options: the compression scheme and the optional
+     * row-group and page sizes.
+     *
+     * @param configuration  writer configuration
+     * @param sourceLocation location attached to any reported error
+     * @throws CompilationException if one of the parquet options is invalid
+     */
+    private static void validateParquet(Map<String, String> configuration, SourceLocation sourceLocation)
+            throws CompilationException {
+        validateParquetCompression(configuration, sourceLocation);
+        validateParquetRowGroupSize(configuration, sourceLocation);
+        validateParquetPageSize(configuration, sourceLocation);
+    }
+
+    /**
+     * Validates the optional 'row-group-size' option.
+     *
+     * @throws CompilationException if the value cannot be parsed as a storage size
+     */
+    private static void validateParquetRowGroupSize(Map<String, String> configuration,
+            SourceLocation sourceLocation) throws CompilationException {
+        validateParquetSize(configuration, KEY_PARQUET_ROW_GROUP_SIZE, sourceLocation);
+    }
+
+    /**
+     * Validates the optional 'page-size' option.
+     *
+     * @throws CompilationException if the value cannot be parsed as a storage size
+     */
+    private static void validateParquetPageSize(Map<String, String> configuration, SourceLocation sourceLocation)
+            throws CompilationException {
+        validateParquetSize(configuration, KEY_PARQUET_PAGE_SIZE, sourceLocation);
+    }
+
+    /**
+     * Checks that the value stored under {@code sizeKey}, when present, is accepted by
+     * {@link StorageUtil#getByteValue(String)}.
+     *
+     * @param configuration  writer configuration
+     * @param sizeKey        configuration key holding the size
+     * @param sourceLocation location attached to the reported error, like the sibling compression check
+     * @throws CompilationException with {@code ILLEGAL_SIZE_PROVIDED} if the value is rejected
+     */
+    private static void validateParquetSize(Map<String, String> configuration, String sizeKey,
+            SourceLocation sourceLocation) throws CompilationException {
+        String size = configuration.get(sizeKey);
+        if (size == null) {
+            // Optional parameter: nothing to validate when it is absent
+            return;
+        }
+        try {
+            // Only the parse outcome matters here; the parsed value is discarded
+            StorageUtil.getByteValue(size);
+        } catch (IllegalArgumentException e) {
+            throw CompilationException.create(ErrorCode.ILLEGAL_SIZE_PROVIDED, sourceLocation, sizeKey, size);
+        }
+    }
+
+ private static void validateJSON(Map<String, String> configuration,
SourceLocation sourceLocation)
+ throws CompilationException { // JSON-specific writer checks, invoked from the format switch for the JSON case
+ validateTextualCompression(configuration, sourceLocation); // the only JSON-specific check: the compression option
+ }
+
+    /**
+     * Checks that the requested compression scheme, when one is given, belongs to the set of
+     * schemes supported for parquet output. The option is optional.
+     *
+     * @throws CompilationException if the scheme is not supported for parquet
+     */
+    private static void validateParquetCompression(Map<String, String> configuration, SourceLocation sourceLocation)
+            throws CompilationException {
+        final String requestedScheme = configuration.get(ExternalDataConstants.KEY_WRITER_COMPRESSION);
+        final Set<String> parquetSchemes = ExternalDataConstants.PARQUET_WRITER_SUPPORTED_COMPRESSION;
+        checkCompressionSupported(ExternalDataConstants.KEY_WRITER_COMPRESSION, requestedScheme, parquetSchemes,
+                ErrorCode.UNSUPPORTED_WRITER_COMPRESSION_SCHEME, sourceLocation, FORMAT_PARQUET, true);
     }
- private static void validateCompression(Map<String, String> configuration,
SourceLocation sourceLocation)
+ private static void validateTextualCompression(Map<String, String>
configuration, SourceLocation sourceLocation)
throws CompilationException {
String compression =
configuration.get(ExternalDataConstants.KEY_WRITER_COMPRESSION);
- checkSupported(ExternalDataConstants.KEY_WRITER_COMPRESSION,
compression,
- ExternalDataConstants.WRITER_SUPPORTED_COMPRESSION,
ErrorCode.UNKNOWN_COMPRESSION_SCHEME,
- sourceLocation, true);
+
checkCompressionSupported(ExternalDataConstants.KEY_WRITER_COMPRESSION,
compression,
+ ExternalDataConstants.TEXTUAL_WRITER_SUPPORTED_COMPRESSION,
+ ErrorCode.UNSUPPORTED_WRITER_COMPRESSION_SCHEME,
sourceLocation,
+ configuration.get(ExternalDataConstants.KEY_FORMAT), true);
if (ExternalDataUtils.isGzipCompression(compression)) {
validateGzipCompressionLevel(configuration, sourceLocation);
}
@@ -121,4 +177,22 @@ public class WriterValidationUtil {
}
}
+    /**
+     * Verifies that a compression scheme is one of the schemes supported for the given format.
+     *
+     * @param paramKey       name of the configuration parameter, reported when a required value is missing
+     * @param value          the user-provided compression scheme (may be {@code null})
+     * @param supportedSet   supported schemes; the lookup uses the lower-cased value
+     * @param errorCode      error raised when the scheme is not supported
+     * @param sourceLocation location attached to the reported error
+     * @param format         output format named in the error message
+     * @param optional       whether a missing value is acceptable
+     * @throws CompilationException if a required value is missing or the scheme is unsupported
+     */
+    private static void checkCompressionSupported(String paramKey, String value, Set<String> supportedSet,
+            ErrorCode errorCode, SourceLocation sourceLocation, String format, boolean optional)
+            throws CompilationException {
+        if (value == null) {
+            if (optional) {
+                return;
+            }
+            throw new CompilationException(ErrorCode.PARAMETERS_REQUIRED, sourceLocation, paramKey);
+        }
+
+        // Locale.ROOT keeps the match independent of the JVM default locale
+        // (e.g., "GZIP" would not lower-case to "gzip" under a Turkish locale)
+        String normalizedValue = value.toLowerCase(java.util.Locale.ROOT);
+        if (!supportedSet.contains(normalizedValue)) {
+            // Sorted so that the list in the error message is deterministic
+            List<String> sorted = supportedSet.stream().sorted().collect(Collectors.toList());
+            throw CompilationException.create(errorCode, sourceLocation, value, format, sorted.toString());
+        }
+    }
+
}
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/ParquetExternalFilePrinter.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/ParquetExternalFilePrinter.java
new file mode 100644
index 0000000000..b07e857d5d
--- /dev/null
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/ParquetExternalFilePrinter.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.asterix.external.writer.printer;
+
+import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+import
org.apache.asterix.external.input.record.reader.hdfs.parquet.AsterixParquetRuntimeException;
+import org.apache.asterix.external.util.ExternalDataConstants;
+import org.apache.asterix.external.writer.printer.parquet.AsterixParquetWriter;
+import org.apache.asterix.om.types.IAType;
+import org.apache.asterix.runtime.writer.IExternalPrinter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hyracks.api.exceptions.ErrorCode;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IValueReference;
+import org.apache.parquet.column.ParquetProperties;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.schema.MessageType;
+
+/**
+ * {@link IExternalPrinter} that writes values as parquet through an {@link AsterixParquetWriter}.
+ * The schema text is parsed in {@link #open()}; a new writer is built for every stream handed to
+ * {@link #newStream(OutputStream)}.
+ */
+public class ParquetExternalFilePrinter implements IExternalPrinter {
+    private final IAType typeInfo;
+    private final CompressionCodecName compressionCodecName;
+    private final String schemaString;
+    private final long rowGroupSize;
+    private final int pageSize;
+    private MessageType schema;
+    private ParquetOutputFile parquetOutputFile;
+    private ParquetWriter<IValueReference> writer;
+
+    /**
+     * @param compressionCodecName codec passed to the parquet writer
+     * @param schemaString         textual parquet schema; carriage returns are replaced by spaces
+     * @param typeInfo             type information passed to the parquet writer
+     * @param rowGroupSize         row-group size passed to the parquet writer
+     * @param pageSize             page size passed to the parquet writer
+     */
+    public ParquetExternalFilePrinter(CompressionCodecName compressionCodecName, String schemaString,
+            IAType typeInfo, long rowGroupSize, int pageSize) {
+        this.compressionCodecName = compressionCodecName;
+        this.schemaString = schemaString.replace('\r', ' ');
+        this.typeInfo = typeInfo;
+        this.rowGroupSize = rowGroupSize;
+        this.pageSize = pageSize;
+    }
+
+    /**
+     * Parses the schema text into a {@link MessageType}.
+     *
+     * @throws HyracksDataException if the schema text is rejected by the parquet schema parser
+     */
+    @Override
+    public void open() throws HyracksDataException {
+        try {
+            this.schema = parseMessageType(schemaString);
+        } catch (IllegalArgumentException e) {
+            // Keep the parser failure as the cause so that the reason for the rejection is not lost
+            throw HyracksDataException.create(ErrorCode.ILLGEAL_PARQUET_SCHEMA, e);
+        }
+    }
+
+    /**
+     * Closes the writer of the previous stream (if any) and builds a writer on top of the new stream.
+     *
+     * @throws HyracksDataException if closing the previous writer or building the new one fails
+     */
+    @Override
+    public void newStream(OutputStream outputStream) throws HyracksDataException {
+        if (parquetOutputFile != null) {
+            close();
+        }
+        parquetOutputFile = new ParquetOutputFile(outputStream);
+        Configuration conf = new Configuration();
+
+        try {
+            writer = AsterixParquetWriter.builder(parquetOutputFile).withCompressionCodec(compressionCodecName)
+                    .withType(schema).withTypeInfo(typeInfo).withRowGroupSize(rowGroupSize).withPageSize(pageSize)
+                    .withDictionaryPageSize(ExternalDataConstants.PARQUET_DICTIONARY_PAGE_SIZE)
+                    .enableDictionaryEncoding().withValidation(false)
+                    .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0).withConf(conf).build();
+        } catch (IOException e) {
+            throw HyracksDataException.create(e);
+        }
+    }
+
+    /**
+     * Writes one value through the current writer.
+     *
+     * @throws HyracksDataException the exception carried by an {@link AsterixParquetRuntimeException},
+     *                              or a wrapped {@link IOException}
+     */
+    @Override
+    public void print(IValueReference value) throws HyracksDataException {
+        try {
+            this.writer.write(value);
+        } catch (AsterixParquetRuntimeException e) {
+            // The write path reports failures through a runtime exception; surface the original one
+            throw e.getHyracksDataException();
+        } catch (IOException e) {
+            throw HyracksDataException.create(e);
+        }
+    }
+
+    /**
+     * Closes the current writer, if one was created.
+     *
+     * @throws HyracksDataException if closing the writer fails
+     */
+    @Override
+    public void close() throws HyracksDataException {
+        if (this.writer != null) {
+            try {
+                this.writer.close();
+            } catch (IOException e) {
+                throw HyracksDataException.create(e);
+            }
+        }
+    }
+}
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/ParquetExternalFilePrinterFactory.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/ParquetExternalFilePrinterFactory.java
new file mode 100644
index 0000000000..53975d2c11
--- /dev/null
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/ParquetExternalFilePrinterFactory.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.external.writer.printer;
+
+import org.apache.asterix.om.types.IAType;
+import org.apache.asterix.runtime.writer.IExternalPrinter;
+import org.apache.asterix.runtime.writer.IExternalPrinterFactory;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+
+/**
+ * Factory that captures the parquet writer settings once and passes them unchanged to every
+ * {@link ParquetExternalFilePrinter} it creates.
+ */
+public class ParquetExternalFilePrinterFactory implements IExternalPrinterFactory {
+    private static final long serialVersionUID = 8971234908711234L;
+    private final CompressionCodecName compressionCodecName;
+    private final String schema;
+    private final IAType typeInfo;
+    private final long rowGroupSize;
+    private final int pageSize;
+
+    /**
+     * @param compressionCodecName codec forwarded to the printer
+     * @param schema               textual parquet schema forwarded to the printer
+     * @param typeInfo             type information forwarded to the printer
+     * @param rowGroupSize         row-group size forwarded to the printer
+     * @param pageSize             page size forwarded to the printer
+     */
+    public ParquetExternalFilePrinterFactory(CompressionCodecName compressionCodecName, String schema,
+            IAType typeInfo, long rowGroupSize, int pageSize) {
+        this.pageSize = pageSize;
+        this.rowGroupSize = rowGroupSize;
+        this.typeInfo = typeInfo;
+        this.schema = schema;
+        this.compressionCodecName = compressionCodecName;
+    }
+
+    /** Creates a new printer configured with the captured settings. */
+    @Override
+    public IExternalPrinter createPrinter() {
+        final IExternalPrinter printer =
+                new ParquetExternalFilePrinter(compressionCodecName, schema, typeInfo, rowGroupSize, pageSize);
+        return printer;
+    }
+}
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/ParquetOutputFile.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/ParquetOutputFile.java
new file mode 100644
index 0000000000..cc54676b56
--- /dev/null
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/ParquetOutputFile.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.asterix.external.writer.printer;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.parquet.hadoop.util.HadoopStreams;
+import org.apache.parquet.io.OutputFile;
+import org.apache.parquet.io.PositionOutputStream;
+
+/**
+ * {@link OutputFile} backed by an already opened {@link OutputStream}. Both creation methods
+ * return a wrapper over that same stream and ignore the block size hint.
+ */
+public class ParquetOutputFile implements OutputFile {
+    // 32 * 1024 * 1024 = 33554432 bytes
+    private static final long DEFAULT_BLOCK_SIZE = 32L * 1024 * 1024;
+
+    private final FSDataOutputStream fs;
+
+    /**
+     * @param os stream that receives the parquet bytes
+     */
+    public ParquetOutputFile(OutputStream os) {
+        // NOTE(review): "test" is the label given to the statistics object; confirm it is intended
+        final FileSystem.Statistics statistics = new FileSystem.Statistics("test");
+        this.fs = new FSDataOutputStream(os, statistics);
+    }
+
+    @Override
+    public PositionOutputStream create(long blockSizeHint) throws IOException {
+        return wrapStream();
+    }
+
+    @Override
+    public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
+        return wrapStream();
+    }
+
+    @Override
+    public boolean supportsBlockSize() {
+        return false;
+    }
+
+    @Override
+    public long defaultBlockSize() {
+        return DEFAULT_BLOCK_SIZE;
+    }
+
+    /** Wraps the underlying stream as a {@link PositionOutputStream}. */
+    private PositionOutputStream wrapStream() {
+        return HadoopStreams.wrap(fs);
+    }
+}
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetWriter.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetWriter.java
new file mode 100644
index 0000000000..edeab1fdf9
--- /dev/null
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetWriter.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.external.writer.printer.parquet;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.asterix.om.types.IAType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hyracks.data.std.api.IValueReference;
+import org.apache.parquet.column.ParquetProperties;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.api.WriteSupport;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.io.OutputFile;
+import org.apache.parquet.schema.MessageType;
+
+/**
+ * {@link ParquetWriter} for {@link IValueReference} values. Instances are configured through
+ * {@link Builder}, which supplies an {@link ObjectWriteSupport} built from the parquet schema,
+ * the type information and the extra metadata.
+ */
+public class AsterixParquetWriter extends ParquetWriter<IValueReference> {
+    /** Creates a builder that writes to the given path. */
+    public static Builder builder(Path file) {
+        return new Builder(file);
+    }
+
+    /** Creates a builder that writes to the given output file. */
+    public static Builder builder(OutputFile file) {
+        return new Builder(file);
+    }
+
+    AsterixParquetWriter(Path file, WriteSupport<IValueReference> writeSupport,
+            CompressionCodecName compressionCodecName, int blockSize, int pageSize, boolean enableDictionary,
+            boolean enableValidation, ParquetProperties.WriterVersion writerVersion, Configuration conf)
+            throws IOException {
+        // pageSize is intentionally passed twice: once as the page size and once for the argument
+        // that follows it in the super constructor (presumably the dictionary page size -- confirm)
+        super(file, writeSupport, compressionCodecName, blockSize, pageSize, pageSize, enableDictionary,
+                enableValidation, writerVersion, conf);
+    }
+
+    /**
+     * Builder that collects the parquet schema, the type information and optional extra metadata.
+     */
+    public static class Builder extends ParquetWriter.Builder<IValueReference, Builder> {
+        private MessageType type;
+        private IAType typeInfo;
+        private Map<String, String> extraMetaData = new HashMap<>();
+
+        private Builder(Path file) {
+            super(file);
+        }
+
+        private Builder(OutputFile file) {
+            super(file);
+        }
+
+        /** Sets the parquet schema of the written records. */
+        public Builder withType(MessageType type) {
+            this.type = type;
+            return this;
+        }
+
+        /** Sets the type information handed to the write support. */
+        public Builder withTypeInfo(IAType typeInfo) {
+            this.typeInfo = typeInfo;
+            return this;
+        }
+
+        /** Replaces the extra metadata handed to the write support (empty by default). */
+        public Builder withExtraMetaData(Map<String, String> extraMetaData) {
+            this.extraMetaData = extraMetaData;
+            return this;
+        }
+
+        @Override
+        protected Builder self() {
+            return this;
+        }
+
+        @Override
+        protected WriteSupport<IValueReference> getWriteSupport(Configuration conf) {
+            return new ObjectWriteSupport(this.type, this.typeInfo, this.extraMetaData);
+        }
+    }
+
+}
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/FieldNamesDictionary.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/FieldNamesDictionary.java
new file mode 100644
index 0000000000..f36df1a9c6
--- /dev/null
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/FieldNamesDictionary.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.external.writer.printer.parquet;
+
+import org.apache.hyracks.api.dataflow.value.IBinaryHashFunction;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import
org.apache.hyracks.data.std.accessors.PointableBinaryHashFunctionFactory;
+import org.apache.hyracks.data.std.api.IValueReference;
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+
+import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
+import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
+
+/**
+ * Caches decoded field names by the hash of their serialized bytes so that a field name seen
+ * repeatedly is decoded into a {@link String} only once.
+ */
+public class FieldNamesDictionary {
+    private final IBinaryHashFunction fieldNameHashFunction;
+    private final Int2ObjectMap<CachedFieldName> hashToFieldNameIndexMap;
+
+    public FieldNamesDictionary() {
+        fieldNameHashFunction =
+                new PointableBinaryHashFunctionFactory(UTF8StringPointable.FACTORY).createBinaryHashFunction();
+        hashToFieldNameIndexMap = new Int2ObjectOpenHashMap<>();
+    }
+
+    /**
+     * Returns the field name serialized in {@code pointable}, decoding it on first use.
+     * <p>
+     * A cached name is returned only if the cached bytes equal the bytes of {@code pointable}.
+     * When two different names share a hash, the later one is decoded on every call instead of
+     * silently resolving to the name cached first.
+     *
+     * @param pointable serialized field name
+     * @return the decoded field name
+     * @throws HyracksDataException if hashing the field name fails
+     */
+    public String getOrCreateFieldNameIndex(IValueReference pointable) throws HyracksDataException {
+        int hash = getHash(pointable);
+        CachedFieldName cached = hashToFieldNameIndexMap.get(hash);
+        if (cached == null) {
+            String fieldName = decode(pointable);
+            hashToFieldNameIndexMap.put(hash, new CachedFieldName(pointable, fieldName));
+            return fieldName;
+        }
+        if (cached.matches(pointable)) {
+            return cached.fieldName;
+        }
+        // Hash collision: the slot belongs to a different name, so decode without caching
+        return decode(pointable);
+    }
+
+    private static String decode(IValueReference pointable) {
+        return UTF8StringUtil.toString(pointable.getByteArray(), pointable.getStartOffset());
+    }
+
+    private int getHash(IValueReference fieldName) throws HyracksDataException {
+        byte[] object = fieldName.getByteArray();
+        int start = fieldName.getStartOffset();
+        int length = fieldName.getLength();
+        return fieldNameHashFunction.hash(object, start, length);
+    }
+
+    /**
+     * A decoded field name together with a private copy of the bytes it was decoded from.
+     * The copy covers the same range that is hashed (start offset and length of the pointable);
+     * this assumes that range spans the whole serialized name -- TODO confirm.
+     */
+    private static final class CachedFieldName {
+        private final byte[] serialized;
+        private final String fieldName;
+
+        private CachedFieldName(IValueReference pointable, String fieldName) {
+            int start = pointable.getStartOffset();
+            // Copy: the pointable's backing array is not owned by the dictionary
+            this.serialized = java.util.Arrays.copyOfRange(pointable.getByteArray(), start,
+                    start + pointable.getLength());
+            this.fieldName = fieldName;
+        }
+
+        private boolean matches(IValueReference pointable) {
+            int start = pointable.getStartOffset();
+            return java.util.Arrays.equals(serialized, 0, serialized.length, pointable.getByteArray(), start,
+                    start + pointable.getLength());
+        }
+    }
+}
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ObjectWriteSupport.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ObjectWriteSupport.java
new file mode 100644
index 0000000000..512b523061
--- /dev/null
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ObjectWriteSupport.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.external.writer.printer.parquet;
+
+import java.util.Map;
+
+import
org.apache.asterix.external.input.record.reader.hdfs.parquet.AsterixParquetRuntimeException;
+import org.apache.asterix.om.types.IAType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IValueReference;
+import org.apache.parquet.hadoop.api.WriteSupport;
+import org.apache.parquet.io.api.RecordConsumer;
+import org.apache.parquet.schema.MessageType;
+
+/**
+ * {@link WriteSupport} that hands every {@link IValueReference} to a
+ * {@link ParquetRecordLazyVisitor}, which emits it to the {@link RecordConsumer} provided by parquet.
+ */
+public class ObjectWriteSupport extends WriteSupport<IValueReference> {
+    private final MessageType schema;
+    private final Map<String, String> extraMetaData;
+    // Provided by parquet through prepareForWrite(...)
+    private RecordConsumer recordConsumer;
+    // Visibility and mutability kept as declared: the field is reachable from the package
+    ParquetRecordLazyVisitor parquetRecordLazyVisitor;
+
+    /**
+     * @param schema        parquet schema of the written records
+     * @param typeInfo      type information passed to the record visitor
+     * @param extraMetaData extra metadata placed in the write context
+     */
+    public ObjectWriteSupport(MessageType schema, IAType typeInfo, Map<String, String> extraMetaData) {
+        this.schema = schema;
+        this.extraMetaData = extraMetaData;
+        parquetRecordLazyVisitor = new ParquetRecordLazyVisitor(schema, typeInfo);
+    }
+
+    @Override
+    public String getName() {
+        return "asterix";
+    }
+
+    @Override
+    public WriteSupport.WriteContext init(Configuration configuration) {
+        return new WriteSupport.WriteContext(this.schema, this.extraMetaData);
+    }
+
+    @Override
+    public void prepareForWrite(RecordConsumer recordConsumer) {
+        this.recordConsumer = recordConsumer;
+    }
+
+    /**
+     * Emits one record to the record consumer.
+     *
+     * @throws AsterixParquetRuntimeException wrapping the {@link HyracksDataException} raised while
+     *                                        consuming the record (this method cannot declare it)
+     */
+    @Override
+    public void write(IValueReference valueReference) {
+        try {
+            parquetRecordLazyVisitor.consumeRecord(valueReference, recordConsumer);
+        } catch (HyracksDataException e) {
+            throw new AsterixParquetRuntimeException(e);
+        }
+    }
+
+}
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordLazyVisitor.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordLazyVisitor.java
new file mode 100644
index 0000000000..6ad9608106
--- /dev/null
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordLazyVisitor.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.external.writer.printer.parquet;
+
+import static
org.apache.asterix.external.writer.printer.parquet.ParquetRecordVisitorUtils.ELEMENT_FIELD;
+import static
org.apache.asterix.external.writer.printer.parquet.ParquetRecordVisitorUtils.GROUP_TYPE_ERROR_FIELD;
+import static
org.apache.asterix.external.writer.printer.parquet.ParquetRecordVisitorUtils.LIST_FIELD;
+import static
org.apache.asterix.external.writer.printer.parquet.ParquetRecordVisitorUtils.PRIMITIVE_TYPE_ERROR_FIELD;
+
+import org.apache.asterix.om.lazy.AbstractLazyVisitablePointable;
+import org.apache.asterix.om.lazy.AbstractListLazyVisitablePointable;
+import org.apache.asterix.om.lazy.FlatLazyVisitablePointable;
+import org.apache.asterix.om.lazy.ILazyVisitablePointableVisitor;
+import org.apache.asterix.om.lazy.RecordLazyVisitablePointable;
+import org.apache.asterix.om.lazy.TypedRecordLazyVisitablePointable;
+import org.apache.asterix.om.types.ARecordType;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.IAType;
+import org.apache.hyracks.api.exceptions.ErrorCode;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IValueReference;
+import org.apache.parquet.io.api.RecordConsumer;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.Type;
+
+public class ParquetRecordLazyVisitor implements
ILazyVisitablePointableVisitor<Void, Type> {
+
+ private final MessageType schema;
+ private final IAType typeInfo;
+ private final RecordLazyVisitablePointable rec;
+ private RecordConsumer recordConsumer;
+ private FieldNamesDictionary fieldNamesDictionary;
+
+ private final ParquetRecordVisitorUtils parquetRecordVisitorUtils;
+
+ public ParquetRecordLazyVisitor(MessageType schema, IAType typeInfo) {
+ this.schema = schema;
+ this.typeInfo = typeInfo;
+ if (typeInfo.getTypeTag() == ATypeTag.OBJECT) {
+ this.rec = new TypedRecordLazyVisitablePointable((ARecordType)
typeInfo);
+ } else if (typeInfo.getTypeTag() == ATypeTag.ANY) {
+ this.rec = new RecordLazyVisitablePointable(true);
+ } else {
+ throw new RuntimeException("Type Unsupported for parquet
printing");
+ }
+ this.fieldNamesDictionary = new FieldNamesDictionary();
+ this.parquetRecordVisitorUtils = new ParquetRecordVisitorUtils();
+ }
+
+ public MessageType getSchema() {
+ return schema;
+ }
+
+ @Override
+ public Void visit(RecordLazyVisitablePointable pointable, Type type)
throws HyracksDataException {
+
+ if (type.isPrimitive()) {
+ throw new
HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA,
GROUP_TYPE_ERROR_FIELD,
+ PRIMITIVE_TYPE_ERROR_FIELD, type.getName());
+ }
+ GroupType groupType = type.asGroupType();
+ recordConsumer.startGroup();
+
+ for (int i = 0; i < pointable.getNumberOfChildren(); i++) {
+ pointable.nextChild();
+ AbstractLazyVisitablePointable child =
pointable.getChildVisitablePointable();
+ String columnName =
fieldNamesDictionary.getOrCreateFieldNameIndex(pointable.getFieldName());
+
+ if (!groupType.containsField(columnName)) {
+ throw new
HyracksDataException(ErrorCode.EXTRA_FIELD_IN_RESULT_NOT_FOUND_IN_SCHEMA,
columnName,
+ groupType.getName());
+ }
+ recordConsumer.startField(columnName,
groupType.getFieldIndex(columnName));
+ child.accept(this, groupType.getType(columnName));
+ recordConsumer.endField(columnName,
groupType.getFieldIndex(columnName));
+ }
+ recordConsumer.endGroup();
+ return null;
+ }
+
+ @Override
+ public Void visit(AbstractListLazyVisitablePointable pointable, Type type)
throws HyracksDataException {
+
+ if (type.isPrimitive()) {
+ throw new
HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA,
GROUP_TYPE_ERROR_FIELD,
+ PRIMITIVE_TYPE_ERROR_FIELD, type.getName());
+ }
+ GroupType groupType = type.asGroupType();
+
+ if (!groupType.containsField(LIST_FIELD)) {
+ throw new
HyracksDataException(ErrorCode.EXTRA_FIELD_IN_RESULT_NOT_FOUND_IN_SCHEMA,
LIST_FIELD,
+ groupType.getName());
+ }
+
+ if (groupType.getType(LIST_FIELD).isPrimitive()) {
+ throw new
HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA,
GROUP_TYPE_ERROR_FIELD,
+ PRIMITIVE_TYPE_ERROR_FIELD, LIST_FIELD);
+ }
+
+ GroupType listType = groupType.getType(LIST_FIELD).asGroupType();
+
+ if (!listType.containsField(ELEMENT_FIELD)) {
+ throw new
HyracksDataException(ErrorCode.EXTRA_FIELD_IN_RESULT_NOT_FOUND_IN_SCHEMA,
ELEMENT_FIELD,
+ listType.getName());
+ }
+
+ recordConsumer.startGroup();
+ recordConsumer.startField(LIST_FIELD,
groupType.getFieldIndex(LIST_FIELD));
+
+ for (int i = 0; i < pointable.getNumberOfChildren(); i++) {
+ pointable.nextChild();
+ AbstractLazyVisitablePointable child =
pointable.getChildVisitablePointable();
+
+ recordConsumer.startGroup();
+ recordConsumer.startField(ELEMENT_FIELD,
listType.getFieldIndex(ELEMENT_FIELD));
+
+ child.accept(this, listType.getType(ELEMENT_FIELD));
+
+ recordConsumer.endField(ELEMENT_FIELD,
listType.getFieldIndex(ELEMENT_FIELD));
+ recordConsumer.endGroup();
+
+ }
+
+ recordConsumer.endField(LIST_FIELD,
groupType.getFieldIndex(LIST_FIELD));
+ recordConsumer.endGroup();
+ return null;
+ }
+
+ @Override
+ public Void visit(FlatLazyVisitablePointable pointable, Type type) throws
HyracksDataException {
+
+ if (!type.isPrimitive()) {
+ throw new
HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA,
PRIMITIVE_TYPE_ERROR_FIELD,
+ GROUP_TYPE_ERROR_FIELD, type.getName());
+ }
+ parquetRecordVisitorUtils.addValueToColumn(recordConsumer, pointable,
type.asPrimitiveType());
+ return null;
+ }
+
+ public void consumeRecord(IValueReference valueReference, RecordConsumer
recordConsumer)
+ throws HyracksDataException {
+ rec.set(valueReference);
+ this.recordConsumer = recordConsumer;
+
+ recordConsumer.startMessage();
+ for (int i = 0; i < rec.getNumberOfChildren(); i++) {
+ rec.nextChild();
+ String columnName =
fieldNamesDictionary.getOrCreateFieldNameIndex(rec.getFieldName());
+ AbstractLazyVisitablePointable child =
rec.getChildVisitablePointable();
+
+ if (!schema.containsField(columnName)) {
+ throw new
HyracksDataException(ErrorCode.EXTRA_FIELD_IN_RESULT_NOT_FOUND_IN_SCHEMA,
columnName,
+ schema.getName());
+ }
+
+ recordConsumer.startField(columnName,
schema.getFieldIndex(columnName));
+ child.accept(this, schema.getType(columnName));
+ recordConsumer.endField(columnName,
schema.getFieldIndex(columnName));
+ }
+ recordConsumer.endMessage();
+ }
+
+}
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordVisitorUtils.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordVisitorUtils.java
new file mode 100644
index 0000000000..54978c340d
--- /dev/null
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordVisitorUtils.java
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.external.writer.printer.parquet;
+
+import java.io.IOException;
+
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
+import org.apache.asterix.dataflow.data.nontagged.printers.PrintTools;
+import
org.apache.asterix.dataflow.data.nontagged.serde.ABooleanSerializerDeserializer;
+import
org.apache.asterix.dataflow.data.nontagged.serde.ADateSerializerDeserializer;
+import
org.apache.asterix.dataflow.data.nontagged.serde.ADateTimeSerializerDeserializer;
+import
org.apache.asterix.dataflow.data.nontagged.serde.ADoubleSerializerDeserializer;
+import
org.apache.asterix.dataflow.data.nontagged.serde.AFloatSerializerDeserializer;
+import
org.apache.asterix.dataflow.data.nontagged.serde.AInt16SerializerDeserializer;
+import
org.apache.asterix.dataflow.data.nontagged.serde.AInt32SerializerDeserializer;
+import
org.apache.asterix.dataflow.data.nontagged.serde.AInt64SerializerDeserializer;
+import
org.apache.asterix.dataflow.data.nontagged.serde.AInt8SerializerDeserializer;
+import
org.apache.asterix.dataflow.data.nontagged.serde.ATimeSerializerDeserializer;
+import org.apache.asterix.om.lazy.FlatLazyVisitablePointable;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.utils.ResettableByteArrayOutputStream;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.io.api.RecordConsumer;
+import org.apache.parquet.schema.PrimitiveType;
+
+/**
+ * Converts a flat AsterixDB value into the matching {@code add*} call on a Parquet
+ * {@link RecordConsumer}, based on the primitive type declared in the Parquet schema.
+ * <p>
+ * An instance reuses a single output buffer for string values across calls.
+ */
+public class ParquetRecordVisitorUtils {
+
+    public static final String LIST_FIELD = "list";
+    public static final String ELEMENT_FIELD = "element";
+
+    public static final String GROUP_TYPE_ERROR_FIELD = "group";
+    public static final String PRIMITIVE_TYPE_ERROR_FIELD = "primitive";
+
+    // NOTE(review): this pointable is set on every call but never read within this class.
+    private final VoidPointable voidPointable;
+    private final ResettableByteArrayOutputStream byteArrayOutputStream;
+
+    public ParquetRecordVisitorUtils() {
+        this.voidPointable = VoidPointable.FACTORY.createPointable();
+        byteArrayOutputStream = new ResettableByteArrayOutputStream();
+    }
+
+    /**
+     * Adds an integral value as INT32, INT64, FLOAT or DOUBLE.
+     *
+     * @throws HyracksDataException if the schema declares any other primitive type
+     */
+    private void addIntegerType(long value, PrimitiveType.PrimitiveTypeName primitiveTypeName, ATypeTag typeTag,
+            RecordConsumer recordConsumer) throws HyracksDataException {
+        switch (primitiveTypeName) {
+            case INT32:
+                recordConsumer.addInteger((int) value);
+                break;
+            case INT64:
+                recordConsumer.addLong(value);
+                break;
+            case FLOAT:
+                recordConsumer.addFloat(value);
+                break;
+            case DOUBLE:
+                recordConsumer.addDouble(value);
+                break;
+            default:
+                throw RuntimeDataException.create(ErrorCode.TYPE_MISMATCH_GENERIC, typeTag, primitiveTypeName);
+        }
+    }
+
+    /**
+     * Adds a single-precision value as INT32, INT64, FLOAT or DOUBLE.
+     *
+     * @throws HyracksDataException if the schema declares any other primitive type
+     */
+    private void addFloatType(float value, PrimitiveType.PrimitiveTypeName primitiveTypeName, ATypeTag typeTag,
+            RecordConsumer recordConsumer) throws HyracksDataException {
+        switch (primitiveTypeName) {
+            case INT32:
+                recordConsumer.addInteger((int) value);
+                break;
+            case INT64:
+                recordConsumer.addLong((long) value);
+                break;
+            case FLOAT:
+                recordConsumer.addFloat(value);
+                break;
+            case DOUBLE:
+                recordConsumer.addDouble(value);
+                break;
+            default:
+                throw RuntimeDataException.create(ErrorCode.TYPE_MISMATCH_GENERIC, typeTag, primitiveTypeName);
+        }
+    }
+
+    /**
+     * Adds a double-precision value as INT32, INT64, FLOAT or DOUBLE.
+     *
+     * @throws HyracksDataException if the schema declares any other primitive type
+     */
+    private void addDoubleType(double value, PrimitiveType.PrimitiveTypeName primitiveTypeName, ATypeTag typeTag,
+            RecordConsumer recordConsumer) throws HyracksDataException {
+        switch (primitiveTypeName) {
+            case INT32:
+                recordConsumer.addInteger((int) value);
+                break;
+            case INT64:
+                recordConsumer.addLong((long) value);
+                break;
+            case FLOAT:
+                recordConsumer.addFloat((float) value);
+                break;
+            case DOUBLE:
+                recordConsumer.addDouble(value);
+                break;
+            default:
+                throw RuntimeDataException.create(ErrorCode.TYPE_MISMATCH_GENERIC, typeTag, primitiveTypeName);
+        }
+    }
+
+    /**
+     * Emits the value held by {@code pointable} to {@code recordConsumer}.
+     * <p>
+     * NULL and MISSING values emit nothing.
+     *
+     * @param recordConsumer consumer receiving the value
+     * @param pointable      flat value to write
+     * @param type           primitive type declared for the column in the Parquet schema
+     * @throws HyracksDataException if the value's type tag is not handled here or cannot be written
+     *                              as the declared primitive type
+     */
+    public void addValueToColumn(RecordConsumer recordConsumer, FlatLazyVisitablePointable pointable,
+            PrimitiveType type) throws HyracksDataException {
+
+        ATypeTag typeTag = pointable.getTypeTag();
+        byte[] bytes = pointable.getByteArray();
+
+        // A tagged value carries one leading byte that is skipped before decoding.
+        int start;
+        int length;
+        if (pointable.isTagged()) {
+            start = pointable.getStartOffset() + 1;
+            length = pointable.getLength() - 1;
+        } else {
+            start = pointable.getStartOffset();
+            length = pointable.getLength();
+        }
+        voidPointable.set(bytes, start, length);
+
+        PrimitiveType.PrimitiveTypeName primitiveTypeName = type.getPrimitiveTypeName();
+
+        switch (typeTag) {
+            case TINYINT:
+                byte tinyIntValue = AInt8SerializerDeserializer.getByte(bytes, start);
+                addIntegerType(tinyIntValue, primitiveTypeName, typeTag, recordConsumer);
+                break;
+            case SMALLINT:
+                short smallIntValue = AInt16SerializerDeserializer.getShort(bytes, start);
+                addIntegerType(smallIntValue, primitiveTypeName, typeTag, recordConsumer);
+                break;
+            case INTEGER:
+                int intValue = AInt32SerializerDeserializer.getInt(bytes, start);
+                addIntegerType(intValue, primitiveTypeName, typeTag, recordConsumer);
+                break;
+            case BIGINT:
+                long bigIntValue = AInt64SerializerDeserializer.getLong(bytes, start);
+                addIntegerType(bigIntValue, primitiveTypeName, typeTag, recordConsumer);
+                break;
+            case FLOAT:
+                float floatValue = AFloatSerializerDeserializer.getFloat(bytes, start);
+                addFloatType(floatValue, primitiveTypeName, typeTag, recordConsumer);
+                break;
+            case DOUBLE:
+                double doubleValue = ADoubleSerializerDeserializer.getDouble(bytes, start);
+                addDoubleType(doubleValue, primitiveTypeName, typeTag, recordConsumer);
+                break;
+            case STRING:
+                int utfLength = UTF8StringUtil.getUTFLength(bytes, start);
+                if (primitiveTypeName == PrimitiveType.PrimitiveTypeName.BINARY) {
+                    byteArrayOutputStream.reset();
+                    try {
+                        PrintTools.writeUTF8StringAsJSONUnquoted(bytes, start, length, utfLength,
+                                byteArrayOutputStream);
+                    } catch (IOException e) {
+                        throw HyracksDataException.create(e);
+                    }
+                    recordConsumer.addBinary(Binary.fromReusedByteArray(byteArrayOutputStream.getByteArray(), 0,
+                            byteArrayOutputStream.getLength()));
+                } else {
+                    throw RuntimeDataException.create(ErrorCode.TYPE_MISMATCH_GENERIC, typeTag, primitiveTypeName);
+                }
+                break;
+            case BOOLEAN:
+                boolean booleanValue = ABooleanSerializerDeserializer.getBoolean(bytes, start);
+                if (primitiveTypeName == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
+                    recordConsumer.addBoolean(booleanValue);
+                } else {
+                    throw RuntimeDataException.create(ErrorCode.TYPE_MISMATCH_GENERIC, typeTag, primitiveTypeName);
+                }
+                break;
+            case DATE:
+                int dateValue = ADateSerializerDeserializer.getChronon(bytes, start);
+                addIntegerType(dateValue, primitiveTypeName, typeTag, recordConsumer);
+                break;
+            case TIME:
+                int timeValue = ATimeSerializerDeserializer.getChronon(bytes, start);
+                addIntegerType(timeValue, primitiveTypeName, typeTag, recordConsumer);
+                break;
+            case DATETIME:
+                long dateTimeValue = ADateTimeSerializerDeserializer.getChronon(bytes, start);
+                addIntegerType(dateTimeValue, primitiveTypeName, typeTag, recordConsumer);
+                // Explicit break: do not rely on falling through into the NULL/MISSING case.
+                break;
+            case NULL:
+            case MISSING:
+                break;
+            default:
+                throw RuntimeDataException.create(ErrorCode.TYPE_MISMATCH_GENERIC, typeTag, primitiveTypeName);
+        }
+
+    }
+
+}
diff --git
a/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/provider/ExternalWriterProvider.java
b/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/provider/ExternalWriterProvider.java
index b8583d0c0d..c931a93f56 100644
---
a/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/provider/ExternalWriterProvider.java
+++
b/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/provider/ExternalWriterProvider.java
@@ -31,8 +31,10 @@ import
org.apache.asterix.external.writer.LocalFSExternalFileWriterFactory;
import
org.apache.asterix.external.writer.compressor.GzipExternalFileCompressStreamFactory;
import
org.apache.asterix.external.writer.compressor.IExternalFileCompressStreamFactory;
import
org.apache.asterix.external.writer.compressor.NoOpExternalFileCompressStreamFactory;
+import
org.apache.asterix.external.writer.printer.ParquetExternalFilePrinterFactory;
import
org.apache.asterix.external.writer.printer.TextualExternalFilePrinterFactory;
import org.apache.asterix.formats.nontagged.CleanJSONPrinterFactoryProvider;
+import org.apache.asterix.om.types.IAType;
import org.apache.asterix.runtime.writer.ExternalFileWriterConfiguration;
import org.apache.asterix.runtime.writer.IExternalFileWriterFactory;
import org.apache.asterix.runtime.writer.IExternalFileWriterFactoryProvider;
@@ -41,6 +43,8 @@ import
org.apache.hyracks.algebricks.core.algebra.metadata.IWriteDataSink;
import org.apache.hyracks.algebricks.data.IPrinterFactory;
import org.apache.hyracks.api.exceptions.SourceLocation;
import org.apache.hyracks.control.cc.ClusterControllerService;
+import org.apache.hyracks.util.StorageUtil;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
public class ExternalWriterProvider {
private static final Map<String, IExternalFileWriterFactoryProvider>
CREATOR_MAP;
@@ -111,16 +115,54 @@ public class ExternalWriterProvider {
Map<String, String> configuration = sink.getConfiguration();
String format = configuration.get(ExternalDataConstants.KEY_FORMAT);
- // Only JSON is supported for now
- if
(!ExternalDataConstants.FORMAT_JSON_LOWER_CASE.equalsIgnoreCase(format)) {
+ // Only JSON and Parquet are supported for now
+ if
(!ExternalDataConstants.FORMAT_JSON_LOWER_CASE.equalsIgnoreCase(format)
+ &&
!ExternalDataConstants.FORMAT_PARQUET.equalsIgnoreCase(format)) {
throw new UnsupportedOperationException("Unsupported format " +
format);
}
String compression = getCompression(configuration);
- IExternalFileCompressStreamFactory compressStreamFactory =
- createCompressionStreamFactory(appCtx, compression,
configuration);
- IPrinterFactory printerFactory =
CleanJSONPrinterFactoryProvider.INSTANCE.getPrinterFactory(sourceType);
- return new TextualExternalFilePrinterFactory(printerFactory,
compressStreamFactory);
+
+ switch (format) {
+ case ExternalDataConstants.FORMAT_JSON_LOWER_CASE:
+ IExternalFileCompressStreamFactory compressStreamFactory =
+ createCompressionStreamFactory(appCtx, compression,
configuration);
+ IPrinterFactory printerFactory =
CleanJSONPrinterFactoryProvider.INSTANCE.getPrinterFactory(sourceType);
+ return new TextualExternalFilePrinterFactory(printerFactory,
compressStreamFactory);
+ case ExternalDataConstants.FORMAT_PARQUET:
+
+ if
(!configuration.containsKey(ExternalDataConstants.KEY_SCHEMA)) {
+ throw new UnsupportedOperationException("Schema not
provided for parquet");
+ }
+ String schema =
configuration.get(ExternalDataConstants.KEY_SCHEMA);
+ CompressionCodecName compressionCodecName;
+ if (compression == null || compression.equals("") ||
compression.equals("none")) {
+ compressionCodecName = CompressionCodecName.UNCOMPRESSED;
+ } else {
+ compressionCodecName =
CompressionCodecName.valueOf(compression.toUpperCase());
+ }
+
+ String rowGroupSizeString = getRowGroupSize(configuration);
+ String pageSizeString = getPageSize(configuration);
+
+ long rowGroupSize =
StorageUtil.getByteValue(rowGroupSizeString);
+ int pageSize = (int) StorageUtil.getByteValue(pageSizeString);
+
+ return new
ParquetExternalFilePrinterFactory(compressionCodecName, schema, (IAType)
sourceType,
+ rowGroupSize, pageSize);
+ default:
+ throw new UnsupportedOperationException("Unsupported format "
+ format);
+ }
+ }
+
+ /**
+ * Returns the value configured under KEY_PARQUET_ROW_GROUP_SIZE, or
+ * PARQUET_DEFAULT_ROW_GROUP_SIZE when the key is absent. The result is a
+ * size string that the caller converts to bytes with StorageUtil.getByteValue.
+ */
+ private static String getRowGroupSize(Map<String, String> configuration) {
+ return
configuration.getOrDefault(ExternalDataConstants.KEY_PARQUET_ROW_GROUP_SIZE,
+ ExternalDataConstants.PARQUET_DEFAULT_ROW_GROUP_SIZE);
+ }
+
+ /**
+ * Returns the value configured under KEY_PARQUET_PAGE_SIZE, or
+ * PARQUET_DEFAULT_PAGE_SIZE when the key is absent. The result is a size
+ * string that the caller converts to bytes with StorageUtil.getByteValue.
+ */
+ private static String getPageSize(Map<String, String> configuration) {
+ return
configuration.getOrDefault(ExternalDataConstants.KEY_PARQUET_PAGE_SIZE,
+ ExternalDataConstants.PARQUET_DEFAULT_PAGE_SIZE);
}
private static String getFormat(Map<String, String> configuration) {
diff --git
a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java
b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java
index 59a4da43f3..319d440e06 100644
---
a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java
+++
b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java
@@ -158,6 +158,9 @@ public enum ErrorCode implements IError {
UNSUPPORTED_WRITE_SPEC(128),
JOB_REJECTED(129),
FRAME_BIGGER_THAN_SORT_MEMORY(130),
+ ILLGEAL_PARQUET_SCHEMA(131),
+ RESULT_DOES_NOT_FOLLOW_SCHEMA(132),
+ EXTRA_FIELD_IN_RESULT_NOT_FOUND_IN_SCHEMA(133),
// Compilation error codes.
RULECOLLECTION_NOT_INSTANCE_OF_LIST(10000),
diff --git
a/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties
b/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties
index e94c12e30c..51acdc024a 100644
---
a/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties
+++
b/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties
@@ -148,6 +148,9 @@
128 = Unsupported copy to specification: PARTITION BY %1$s, ORDER BY %2$s
129 = Job %1$s failed to run. Cluster is not accepting jobs.
130 = Frame data=%1$s (requiring %2$s) is bigger than the sort budget.
Used=%3$s, max=%4$s. Please increase the sort memory budget.
+131 = Invalid parquet schema provided
+132 = Result does not follow the schema, %1$s type expected but found %2$s
type at '%3$s'
+133 = Extra field in the result, field '%1$s' does not exist at '%2$s' in the
schema
10000 = The given rule collection %1$s is not an instance of the List class.
10001 = Cannot compose partition constraint %1$s with %2$s