This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new c12ac6c27 IMPALA-11607: Support for TPC-H and TPC-DS test datasets
stored as Iceberg tables
c12ac6c27 is described below
commit c12ac6c27b2df1eae693b44c157d65499f491d21
Author: LPL <[email protected]>
AuthorDate: Fri Sep 30 17:09:48 2022 +0800
IMPALA-11607: Support for TPC-H and TPC-DS test datasets stored as Iceberg
tables
Impala 4.1.0 officially supports Iceberg tables, but we don't have
tools to generate TPC-H and TPC-DS test datasets for Iceberg tables. We
should use load-data.py to generate test datasets, e.g.:
"${IMPALA_HOME}/bin/load-data.py -w tpcds -f -s 10
--table_formats=text/none,iceberg/none".
Testing:
- load-data.py -w tpch -s 1 --table_formats=text/none,parquet/none,
iceberg/none -e exhaustive -f
- load-data.py -w tpcds -s 1 --table_formats=text/none,parquet/zstd,
iceberg/zstd -e exhaustive -f
- single_node_perf_run.py --workloads=tpch --scale=30 --iterations=3 --
table_formats=parquet/lz4,iceberg/lz4 --num_impalads=1 --load 3ee3dc847
Workload: TPCH, Scale Factor: 30
Table Format: parquet/lz4/block
+----------+----------------+-----------+
| Query | Time Taken (s) | Client ID |
+----------+----------------+-----------+
| TPCH-Q1 | 7.73 | 1 |
| TPCH-Q1 | 7.50 | 1 |
| TPCH-Q1 | 7.55 | 1 |
+----------+----------------+-----------+
| TPCH-Q10 | 3.65 | 1 |
| TPCH-Q10 | 3.70 | 1 |
| TPCH-Q10 | 3.69 | 1 |
+----------+----------------+-----------+
| TPCH-Q11 | 0.68 | 1 |
| TPCH-Q11 | 0.68 | 1 |
| TPCH-Q11 | 0.67 | 1 |
+----------+----------------+-----------+
| TPCH-Q12 | 1.33 | 1 |
| TPCH-Q12 | 1.08 | 1 |
| TPCH-Q12 | 1.08 | 1 |
+----------+----------------+-----------+
| TPCH-Q13 | 10.63 | 1 |
| TPCH-Q13 | 10.36 | 1 |
| TPCH-Q13 | 10.52 | 1 |
+----------+----------------+-----------+
| TPCH-Q14 | 1.63 | 1 |
| TPCH-Q14 | 1.79 | 1 |
| TPCH-Q14 | 1.63 | 1 |
+----------+----------------+-----------+
| TPCH-Q15 | 1.44 | 1 |
| TPCH-Q15 | 1.54 | 1 |
| TPCH-Q15 | 1.45 | 1 |
+----------+----------------+-----------+
| TPCH-Q16 | 2.35 | 1 |
| TPCH-Q16 | 2.35 | 1 |
| TPCH-Q16 | 2.40 | 1 |
+----------+----------------+-----------+
| TPCH-Q17 | 1.20 | 1 |
| TPCH-Q17 | 1.13 | 1 |
| TPCH-Q17 | 1.10 | 1 |
+----------+----------------+-----------+
| TPCH-Q18 | 17.50 | 1 |
| TPCH-Q18 | 17.04 | 1 |
| TPCH-Q18 | 14.43 | 1 |
+----------+----------------+-----------+
| TPCH-Q19 | 1.61 | 1 |
| TPCH-Q19 | 1.56 | 1 |
| TPCH-Q19 | 1.56 | 1 |
+----------+----------------+-----------+
| TPCH-Q2 | 0.68 | 1 |
| TPCH-Q2 | 0.68 | 1 |
| TPCH-Q2 | 0.68 | 1 |
+----------+----------------+-----------+
| TPCH-Q20 | 1.09 | 1 |
| TPCH-Q20 | 1.09 | 1 |
| TPCH-Q20 | 1.15 | 1 |
+----------+----------------+-----------+
| TPCH-Q21 | 24.94 | 1 |
| TPCH-Q21 | 26.42 | 1 |
| TPCH-Q21 | 25.62 | 1 |
+----------+----------------+-----------+
| TPCH-Q22 | 3.69 | 1 |
| TPCH-Q22 | 3.58 | 1 |
| TPCH-Q22 | 3.24 | 1 |
+----------+----------------+-----------+
| TPCH-Q3 | 3.58 | 1 |
| TPCH-Q3 | 3.68 | 1 |
| TPCH-Q3 | 3.89 | 1 |
+----------+----------------+-----------+
| TPCH-Q4 | 1.37 | 1 |
| TPCH-Q4 | 1.37 | 1 |
| TPCH-Q4 | 1.32 | 1 |
+----------+----------------+-----------+
| TPCH-Q5 | 1.68 | 1 |
| TPCH-Q5 | 1.69 | 1 |
| TPCH-Q5 | 1.65 | 1 |
+----------+----------------+-----------+
| TPCH-Q6 | 0.46 | 1 |
| TPCH-Q6 | 0.47 | 1 |
| TPCH-Q6 | 0.47 | 1 |
+----------+----------------+-----------+
| TPCH-Q7 | 2.54 | 1 |
| TPCH-Q7 | 2.49 | 1 |
| TPCH-Q7 | 2.54 | 1 |
+----------+----------------+-----------+
| TPCH-Q8 | 1.90 | 1 |
| TPCH-Q8 | 1.90 | 1 |
| TPCH-Q8 | 1.90 | 1 |
+----------+----------------+-----------+
| TPCH-Q9 | 12.72 | 1 |
| TPCH-Q9 | 12.02 | 1 |
| TPCH-Q9 | 12.67 | 1 |
+----------+----------------+-----------+
Table Format: iceberg/lz4/block
+----------+----------------+-----------+
| Query | Time Taken (s) | Client ID |
+----------+----------------+-----------+
| TPCH-Q1 | 7.51 | 1 |
| TPCH-Q1 | 7.61 | 1 |
| TPCH-Q1 | 7.60 | 1 |
+----------+----------------+-----------+
| TPCH-Q10 | 3.79 | 1 |
| TPCH-Q10 | 3.73 | 1 |
| TPCH-Q10 | 3.67 | 1 |
+----------+----------------+-----------+
| TPCH-Q11 | 0.75 | 1 |
| TPCH-Q11 | 0.88 | 1 |
| TPCH-Q11 | 0.69 | 1 |
+----------+----------------+-----------+
| TPCH-Q12 | 1.18 | 1 |
| TPCH-Q12 | 1.13 | 1 |
| TPCH-Q12 | 1.04 | 1 |
+----------+----------------+-----------+
| TPCH-Q13 | 10.11 | 1 |
| TPCH-Q13 | 9.76 | 1 |
| TPCH-Q13 | 10.31 | 1 |
+----------+----------------+-----------+
| TPCH-Q14 | 1.64 | 1 |
| TPCH-Q14 | 1.64 | 1 |
| TPCH-Q14 | 1.53 | 1 |
+----------+----------------+-----------+
| TPCH-Q15 | 1.44 | 1 |
| TPCH-Q15 | 1.39 | 1 |
| TPCH-Q15 | 1.35 | 1 |
+----------+----------------+-----------+
| TPCH-Q16 | 2.24 | 1 |
| TPCH-Q16 | 2.15 | 1 |
| TPCH-Q16 | 2.15 | 1 |
+----------+----------------+-----------+
| TPCH-Q17 | 1.00 | 1 |
| TPCH-Q17 | 1.00 | 1 |
| TPCH-Q17 | 0.99 | 1 |
+----------+----------------+-----------+
| TPCH-Q18 | 15.14 | 1 |
| TPCH-Q18 | 14.94 | 1 |
| TPCH-Q18 | 15.04 | 1 |
+----------+----------------+-----------+
| TPCH-Q19 | 1.63 | 1 |
| TPCH-Q19 | 1.66 | 1 |
| TPCH-Q19 | 1.60 | 1 |
+----------+----------------+-----------+
| TPCH-Q2 | 0.69 | 1 |
| TPCH-Q2 | 0.75 | 1 |
| TPCH-Q2 | 0.69 | 1 |
+----------+----------------+-----------+
| TPCH-Q20 | 1.10 | 1 |
| TPCH-Q20 | 1.09 | 1 |
| TPCH-Q20 | 1.21 | 1 |
+----------+----------------+-----------+
| TPCH-Q21 | 25.38 | 1 |
| TPCH-Q21 | 25.48 | 1 |
| TPCH-Q21 | 25.18 | 1 |
+----------+----------------+-----------+
| TPCH-Q22 | 3.54 | 1 |
| TPCH-Q22 | 3.59 | 1 |
| TPCH-Q22 | 3.49 | 1 |
+----------+----------------+-----------+
| TPCH-Q3 | 3.95 | 1 |
| TPCH-Q3 | 3.96 | 1 |
| TPCH-Q3 | 3.79 | 1 |
+----------+----------------+-----------+
| TPCH-Q4 | 1.43 | 1 |
| TPCH-Q4 | 1.43 | 1 |
| TPCH-Q4 | 1.43 | 1 |
+----------+----------------+-----------+
| TPCH-Q5 | 1.84 | 1 |
| TPCH-Q5 | 1.80 | 1 |
| TPCH-Q5 | 1.71 | 1 |
+----------+----------------+-----------+
| TPCH-Q6 | 0.56 | 1 |
| TPCH-Q6 | 0.47 | 1 |
| TPCH-Q6 | 0.53 | 1 |
+----------+----------------+-----------+
| TPCH-Q7 | 2.75 | 1 |
| TPCH-Q7 | 2.59 | 1 |
| TPCH-Q7 | 2.49 | 1 |
+----------+----------------+-----------+
| TPCH-Q8 | 1.90 | 1 |
| TPCH-Q8 | 2.00 | 1 |
| TPCH-Q8 | 1.91 | 1 |
+----------+----------------+-----------+
| TPCH-Q9 | 13.34 | 1 |
| TPCH-Q9 | 12.18 | 1 |
| TPCH-Q9 | 12.71 | 1 |
+----------+----------------+-----------+
Change-Id: Ic8bbc5209e7e649f67a48144a2844b35d9f9c7f1
Reviewed-on: http://gerrit.cloudera.org:8080/19114
Reviewed-by: Zoltan Borok-Nagy <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
testdata/bin/generate-schema-statements.py | 39 ++++++++++++++++++++++++------
tests/common/test_dimensions.py | 3 ++-
2 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/testdata/bin/generate-schema-statements.py
b/testdata/bin/generate-schema-statements.py
index e37e485b8..8f28e310d 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -203,6 +203,7 @@ FILE_FORMAT_MAP = {
'avro': 'AVRO',
'hbase': "'org.apache.hadoop.hive.hbase.HBaseStorageHandler'",
'kudu': "KUDU",
+ 'iceberg': "ICEBERG",
}
HIVE_TO_AVRO_TYPE_MAP = {
@@ -309,7 +310,8 @@ def build_table_template(file_format, columns,
partition_columns, row_format, tb
tblproperties_clause = "TBLPROPERTIES (\n{0}\n)"
- external = "" if is_transactional(tblproperties) else "EXTERNAL"
+ external = "" if is_transactional(tblproperties) or
is_iceberg_table(file_format) \
+ else "EXTERNAL"
if file_format == 'avro':
# TODO Is this flag ever used?
@@ -319,7 +321,8 @@ def build_table_template(file_format, columns,
partition_columns, row_format, tb
else:
tblproperties["avro.schema.url"] = "hdfs://%s/%s/%s/{table_name}.json" \
% (options.hdfs_namenode, options.hive_warehouse_dir, avro_schema_dir)
- elif file_format in ['parquet', 'orc']: # columnar formats don't need row
format
+ # columnar formats don't need row format
+ elif file_format in ['parquet', 'orc', 'iceberg']:
row_format_stmt = str()
elif file_format == 'kudu':
# Use partitioned_by to set a trivial hash distribution
@@ -341,6 +344,22 @@ def build_table_template(file_format, columns,
partition_columns, row_format, tb
else:
tblproperties_clause =
tblproperties_clause.format(",\n".join(all_tblproperties))
+ columns_str = ""
+ if is_iceberg_table(file_format):
+ for col in columns.split("\n"):
+ # The primary keys and foreign keys of the Iceberg tables should be
omitted
+ if col.lower().startswith(("primary key", "foreign key")):
+ continue
+ # Omit PRIMARY KEY declaration e.g 'col_i INT PRIMARY KEY,'
+ col = re.sub(r"(?i)\s*primary\s*key\s*", " ", col)
+ # Iceberg tables do not support TINYINT and SMALLINT
+ col = re.sub(r"(?i)\s*tinyint\s*", " INT", col)
+ col = re.sub(r"(?i)\s*smallint\s*", " INT", col)
+ columns_str = columns_str + col + ",\n"
+ columns_str = columns_str.strip().strip(",")
+ else:
+ columns_str = ",\n".join(columns.split("\n"))
+
# Note: columns are ignored but allowed if a custom serde is specified
# (e.g. Avro)
stmt = """
@@ -357,13 +376,12 @@ LOCATION '{{hdfs_location}}'
external=external,
table_comment=table_comment_stmt,
row_format=row_format_stmt,
- columns=',\n'.join(columns.split('\n')),
+ columns=columns_str,
primary_keys=primary_keys_clause,
partitioned_by=partitioned_by,
tblproperties=tblproperties_clause,
file_format_string=file_format_string
).strip()
-
# Remove empty lines from the stmt string. There is an empty line for
# each of the sections that didn't have anything (e.g. partitioned_by)
stmt = os.linesep.join([s for s in stmt.splitlines() if s])
@@ -454,8 +472,9 @@ def build_impala_parquet_codec_statement(codec):
def build_insert_into_statement(insert, db_name, db_suffix, table_name,
file_format,
hdfs_path, for_impala=False):
- insert_hint = "/* +shuffle, clustered */" \
- if for_impala and file_format == 'parquet' else ""
+ insert_hint = ""
+ if for_impala and (file_format == 'parquet' or
is_iceberg_table(file_format)):
+ insert_hint = "/* +shuffle, clustered */"
insert_statement = insert.format(db_name=db_name,
db_suffix=db_suffix,
table_name=table_name,
@@ -509,7 +528,7 @@ def build_insert(insert, db_name, db_suffix, file_format,
output += build_codec_enabled_statement(codec) + "\n"
output += build_compression_codec_statement(codec, compression_type,
file_format) + "\n"
- elif file_format == 'parquet':
+ elif file_format == 'parquet' or is_iceberg_table(file_format):
# This is for Impala parquet, add the appropriate codec statement
output += build_impala_parquet_codec_statement(codec) + "\n"
output += build_insert_into_statement(insert, db_name, db_suffix,
@@ -807,7 +826,7 @@ def generate_statements(output_name, test_vectors, sections,
db_suffix,
table_name))
else:
print 'Empty base table load for %s. Skipping load generation' %
table_name
- elif file_format in ['kudu', 'parquet']:
+ elif file_format in ['kudu', 'parquet', 'iceberg']:
if insert_hive:
hive_output.load.append(build_insert(insert_hive, db_name,
db_suffix,
file_format, codec, compression_type, table_name, data_path))
@@ -845,6 +864,10 @@ def is_transactional(table_properties):
return table_properties.get('transactional', "").lower() == 'true'
+def is_iceberg_table(file_format):
+ return file_format == 'iceberg'
+
+
def parse_schema_template_file(file_name):
VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS',
'PARTITION_COLUMNS',
'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'CREATE_KUDU',
'COMMENT',
diff --git a/tests/common/test_dimensions.py b/tests/common/test_dimensions.py
index 5a5788aac..e551d572b 100644
--- a/tests/common/test_dimensions.py
+++ b/tests/common/test_dimensions.py
@@ -31,7 +31,8 @@ WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
# of what specific table format to target along with the exec options
(num_nodes, etc)
# to use when running the query.
class TableFormatInfo(object):
- KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'orc', 'avro',
'hbase', 'kudu']
+ KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'orc', 'avro', 'hbase',
+ 'kudu', 'iceberg']
KNOWN_COMPRESSION_CODECS = ['none', 'snap', 'gzip', 'bzip', 'def', 'zstd',
'lz4']
KNOWN_COMPRESSION_TYPES = ['none', 'block', 'record']