This is an automated email from the ASF dual-hosted git repository. mhubail pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit a694220b7ea2a86909c8bc0e447138219349b88f Merge: 313b9c2f9e 8ba8273518 Author: Michael Blow <[email protected]> AuthorDate: Tue Nov 19 00:01:13 2024 -0500 Merge branch 'gerrit/goldfish' into 'master' Change-Id: Ic8b85f85eb80fcfd046ebfca721de09c0ad5c753 .../deltalake/DeltaTableGenerator.java | 2 +- .../deltalake-empty/deltalake-empty.00.ddl.sqlpp | 2 +- asterixdb/asterix-external-data/pom.xml | 16 + .../reader/aws/delta/AwsS3DeltaReaderFactory.java | 116 ++++- .../record/reader/aws/delta/DataTypeJsonSerDe.java | 511 +++++++++++++++++++++ .../reader/aws/delta/DeltaFileRecordReader.java | 171 +++++++ .../input/record/reader/aws/delta/RowSerDe.java | 138 ++++++ .../aws/delta/converter/DeltaConverterContext.java | 99 ++++ .../asterix/external/parser/DeltaDataParser.java | 330 +++++++++++++ .../factory/DeltaTableDataParserFactory.java | 65 +++ .../provider/StreamRecordReaderProvider.java | 6 +- .../external/util/ExternalDataConstants.java | 16 +- .../asterix/external/util/ExternalDataUtils.java | 34 +- .../asterix/external/util/google/gcs/GCSUtils.java | 7 +- ....apache.asterix.external.api.IDataParserFactory | 3 +- 15 files changed, 1474 insertions(+), 42 deletions(-) diff --cc asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java index 2bc65f2941,ffe75cb68b..202e1315f1 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java @@@ -18,9 -18,7 +18,8 @@@ */ package org.apache.asterix.external.util; +import java.util.List; import java.util.Set; - import java.util.TimeZone; import java.util.function.LongSupplier; import java.util.function.Supplier; import java.util.regex.Pattern; @@@ -366,30 -318,25 +365,40 @@@ public class ExternalDataConstants public static final int WRITER_MAX_RESULT_MINIMUM = 1000; public static final Set<String> WRITER_SUPPORTED_FORMATS; public static final Set<String> WRITER_SUPPORTED_ADAPTERS; - public static final Set<String> WRITER_SUPPORTED_COMPRESSION; + public static final Set<String> TEXTUAL_WRITER_SUPPORTED_COMPRESSION; + public static final Set<String> PARQUET_WRITER_SUPPORTED_COMPRESSION; + public static final Set<String> PARQUET_WRITER_SUPPORTED_VERSION; + public static final int PARQUET_DICTIONARY_PAGE_SIZE = 1048576; + public static final List<String> WRITER_SUPPORTED_QUOTES; + public static final List<ATypeTag> CSV_WRITER_SUPPORTED_DATA_TYPES = + List.of(ATypeTag.TINYINT, ATypeTag.SMALLINT, ATypeTag.INTEGER, ATypeTag.BIGINT, ATypeTag.UINT8, + ATypeTag.UINT16, ATypeTag.UINT64, ATypeTag.FLOAT, ATypeTag.DOUBLE, ATypeTag.STRING, + ATypeTag.BOOLEAN, ATypeTag.DATETIME, ATypeTag.UINT32, ATypeTag.DATE, ATypeTag.TIME); + public static final String PARQUET_MAX_SCHEMAS_KEY = "max-schemas"; + public static final int PARQUET_MAX_SCHEMAS_DEFAULT_VALUE = 5; + public static final int PARQUET_MAX_SCHEMAS_MAX_VALUE = 10; static { - WRITER_SUPPORTED_FORMATS = Set.of(FORMAT_JSON_LOWER_CASE); + WRITER_SUPPORTED_FORMATS = Set.of(FORMAT_JSON_LOWER_CASE, FORMAT_PARQUET, FORMAT_CSV_LOWER_CASE); WRITER_SUPPORTED_ADAPTERS = Set.of(ALIAS_LOCALFS_ADAPTER.toLowerCase(), KEY_ADAPTER_NAME_AWS_S3.toLowerCase(), - KEY_ADAPTER_NAME_GCS.toLowerCase()); - WRITER_SUPPORTED_COMPRESSION = Set.of(KEY_COMPRESSION_GZIP); + KEY_ADAPTER_NAME_GCS.toLowerCase(), KEY_ADAPTER_NAME_HDFS.toLowerCase()); + TEXTUAL_WRITER_SUPPORTED_COMPRESSION = Set.of(KEY_COMPRESSION_GZIP); + PARQUET_WRITER_SUPPORTED_COMPRESSION = + Set.of(KEY_COMPRESSION_GZIP, KEY_COMPRESSION_SNAPPY, KEY_COMPRESSION_ZSTD); + PARQUET_WRITER_SUPPORTED_VERSION = Set.of(PARQUET_WRITER_VERSION_VALUE_1, PARQUET_WRITER_VERSION_VALUE_2); + WRITER_SUPPORTED_QUOTES = List.of(DEFAULT_QUOTE, DEFAULT_SINGLE_QUOTE, NONE); } + public static class DeltaOptions { + private DeltaOptions() { + } + + public static final String DECIMAL_TO_DOUBLE = "decimal-to-double"; + public static final String TIMESTAMP_AS_LONG = "timestamp-to-long"; + public static final String DATE_AS_INT = "date-to-int"; + public static final String TIMEZONE = "timezone"; + } + public static class ParquetOptions { private ParquetOptions() { }
