Repository: incubator-impala

Updated Branches:
  refs/heads/master ca55b5926 -> e4f585240
IMPALA-6068: Fix dataload for complextypes_fileformat

Dataload typically follows a pattern of loading data into a text version of a
table and then using an INSERT OVERWRITE from the text table to populate the
table for the other file formats. This insert is always done in Impala for
Parquet and Kudu; otherwise it runs in Hive. Since Impala doesn't support
writing nested data, the population of complextypes_fileformat tries to hack
the insert to run in Hive by including it in the ALTER part of the table
definition. ALTER runs immediately after CREATE and always runs in Hive. The
problem is that ALTER also runs before the base table
(functional.complextypes_fileformat) is populated, so the insert succeeds but
inserts zero rows.

This change introduces a way to force the Parquet load to run in Hive. This
lets complextypes_fileformat specify that the insert should happen in Hive and
fixes the ordering so that the table is populated correctly.

This is also useful for loading custom Parquet files into Parquet tables. Hive
supports the LOAD DATA LOCAL syntax, which reads a file from the local
filesystem, so several locations that currently shell out to the hdfs
commandline can be switched to this SQL. This speeds up dataload by a few
minutes, as it avoids the overhead of the hdfs commandline. Any other location
that could use LOAD DATA LOCAL is also switched over to it; this includes the
testescape* tables, for which text_delims_table.py now prints the appropriate
LOAD DATA commands. Any location that already uses LOAD DATA LOCAL is switched
to indicate that it must run in Hive, and any location that was running an HDFS
command in the LOAD section is moved to the DEPENDENT_LOAD_HIVE section.

Testing: Ran dataload and core tests. Also verified that
functional_parquet.complextypes_fileformat has rows.
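For reference, the new DEPENDENT_LOAD_HIVE section always runs in Hive and
typically holds a LOAD DATA LOCAL statement. A minimal sketch of the pattern,
using a made-up file name ({impala_home}, {db_name}, {db_suffix}, and
{table_name} are the template's own substitution variables; the real sections
are in the functional_schema_template.sql diff below):

---- DEPENDENT_LOAD_HIVE
-- Runs only in Hive, which can read the file from the local filesystem.
LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/some_custom_file.parquet'
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};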
Change-Id: I7152306b2907198204a6d8d282a0bad561129b82 Reviewed-on: http://gerrit.cloudera.org:8080/8350 Reviewed-by: Joe McDonnell <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/e4f58524 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/e4f58524 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/e4f58524 Branch: refs/heads/master Commit: e4f585240ac8f478e25402806f4ea38531b4bf84 Parents: ca55b59 Author: Joe McDonnell <[email protected]> Authored: Fri Oct 20 11:41:59 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Wed Oct 25 03:43:26 2017 +0000 ---------------------------------------------------------------------- testdata/bin/create-load-data.sh | 28 --- testdata/bin/generate-schema-statements.py | 24 +- testdata/common/text_delims_table.py | 9 +- testdata/common/widetable.py | 5 +- .../functional/functional_schema_template.sql | 246 ++++++++++--------- 5 files changed, 160 insertions(+), 152 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/e4f58524/testdata/bin/create-load-data.sh ---------------------------------------------------------------------- diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh index 099fe59..c5207a9 100755 --- a/testdata/bin/create-load-data.sh +++ b/testdata/bin/create-load-data.sh @@ -348,34 +348,6 @@ function load-custom-data { hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/ - # IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0 - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \ - /test-warehouse/bad_parquet_parquet - - # Data file produced by parquet-mr with repeated values (produces 0 bit width dictionary) - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/repeated_values.parquet \ - /test-warehouse/bad_parquet_parquet - - # IMPALA-720: data file produced by parquet-mr with multiple row groups - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \ - /test-warehouse/bad_parquet_parquet - - # IMPALA-1401: data file produced by Hive 13 containing page statistics with long min/max - # string values - hadoop fs -put -f ${IMPALA_HOME}/testdata/data/long_page_header.parquet \ - /test-warehouse/bad_parquet_parquet - - # IMPALA-3732: parquet files with corrupt strings - local parq_file - for parq_file in dict-encoded-negative-len.parq plain-encoded-negative-len.parq; do - hadoop fs -put -f ${IMPALA_HOME}/testdata/bad_parquet_data/$parq_file \ - /test-warehouse/bad_parquet_strings_negative_len_parquet - done - for parq_file in dict-encoded-out-of-bounds.parq plain-encoded-out-of-bounds.parq; do - hadoop fs -put -f ${IMPALA_HOME}/testdata/bad_parquet_data/$parq_file \ - /test-warehouse/bad_parquet_strings_out_of_bounds_parquet - done - # Remove all index files in this partition. 
hadoop fs -rm -f /test-warehouse/alltypes_text_lzo/year=2009/month=1/*.lzo.index http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/e4f58524/testdata/bin/generate-schema-statements.py ---------------------------------------------------------------------- diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py index a31eca1..b8f6e8c 100755 --- a/testdata/bin/generate-schema-statements.py +++ b/testdata/bin/generate-schema-statements.py @@ -523,9 +523,13 @@ def generate_statements(output_name, test_vectors, sections, alter = section.get('ALTER') create = section['CREATE'] create_hive = section['CREATE_HIVE'] + assert not (create and create_hive), "Can't set both CREATE and CREATE_HIVE" table_properties = section['TABLE_PROPERTIES'] insert = eval_section(section['DEPENDENT_LOAD']) + insert_hive = eval_section(section['DEPENDENT_LOAD_HIVE']) + assert not (insert and insert_hive),\ + "Can't set both DEPENDENT_LOAD and DEPENDENT_LOAD_HIVE" load = eval_section(section['LOAD']) if file_format == 'kudu': @@ -570,7 +574,8 @@ def generate_statements(output_name, test_vectors, sections, # HBASE we need to create these tables with a supported insert format. create_file_format = file_format create_codec = codec - if not (section['LOAD'] or section['LOAD_LOCAL'] or section['DEPENDENT_LOAD']): + if not (section['LOAD'] or section['LOAD_LOCAL'] or section['DEPENDENT_LOAD'] \ + or section['DEPENDENT_LOAD_HIVE']): create_codec = 'none' create_file_format = file_format if file_format not in IMPALA_SUPPORTED_INSERT_FORMATS: @@ -665,19 +670,23 @@ def generate_statements(output_name, test_vectors, sections, else: print 'Empty base table load for %s. Skipping load generation' % table_name elif file_format in ['kudu', 'parquet']: - if insert: + if insert_hive: + hive_output.load.append(build_insert(insert_hive, db_name, db_suffix, + file_format, codec, compression_type, table_name, data_path)) + elif insert: impala_load.load.append(build_insert_into_statement(insert, db_name, db_suffix, table_name, file_format, data_path, for_impala=True)) else: print 'Empty parquet/kudu load for table %s. Skipping insert generation' \ % table_name else: + if insert_hive: + insert = insert_hive if insert: hive_output.load.append(build_insert(insert, db_name, db_suffix, file_format, - codec, compression_type, table_name, data_path, - create_hive=create_hive)) + codec, compression_type, table_name, data_path, create_hive=create_hive)) else: - print 'Empty insert for table %s. Skipping insert generation' % table_name + print 'Empty insert for table %s. 
Skipping insert generation' % table_name impala_output.write_to_file("load-%s-impala-generated-%s-%s-%s.sql" % (output_name, file_format, codec, compression_type)) @@ -694,8 +703,9 @@ def generate_statements(output_name, test_vectors, sections, def parse_schema_template_file(file_name): VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS', 'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'CREATE_KUDU', - 'DEPENDENT_LOAD', 'DEPENDENT_LOAD_KUDU', 'LOAD', - 'LOAD_LOCAL', 'ALTER', 'HBASE_COLUMN_FAMILIES', 'TABLE_PROPERTIES'] + 'DEPENDENT_LOAD', 'DEPENDENT_LOAD_KUDU', 'DEPENDENT_LOAD_HIVE', + 'LOAD', 'LOAD_LOCAL', 'ALTER', 'HBASE_COLUMN_FAMILIES', + 'TABLE_PROPERTIES'] return parse_test_file(file_name, VALID_SECTION_NAMES, skip_unknown_sections=False) if __name__ == "__main__": http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/e4f58524/testdata/common/text_delims_table.py ---------------------------------------------------------------------- diff --git a/testdata/common/text_delims_table.py b/testdata/common/text_delims_table.py index 68d5823..b51441b 100755 --- a/testdata/common/text_delims_table.py +++ b/testdata/common/text_delims_table.py @@ -18,7 +18,9 @@ # under the License. # Functions for generating test files with specific length, and ended with all -# permutation (with replacement) of items in suffix_list. +# permutation (with replacement) of items in suffix_list. When run from the +# command line, will generate data files in the specified directory and a +# print a SQL load statement to incorporate into dataload SQL script generation. from shutil import rmtree from optparse import OptionParser @@ -55,4 +57,9 @@ if __name__ == "__main__": if not options.table_dir: parser.error("--table_dir option must be specified") + # Generate data locally, and output the SQL load command for use in dataload generate_testescape_files(options.table_dir, options.only_newline, options.file_len) + + print ("LOAD DATA LOCAL INPATH '%s' " + "OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};" + % options.table_dir) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/e4f58524/testdata/common/widetable.py ---------------------------------------------------------------------- diff --git a/testdata/common/widetable.py b/testdata/common/widetable.py index c95d5e6..d83c62e 100755 --- a/testdata/common/widetable.py +++ b/testdata/common/widetable.py @@ -19,7 +19,8 @@ # Functions for creating wide (i.e. many-column) tables. When run from the command line, # specify either --get_columns to generate column descriptors, or --create_data to -# generate a CSV data file. +# generate a CSV data file and prints a SQL load statement to incorporate +# into dataload SQL script generation. 
from datetime import datetime, timedelta import itertools @@ -121,7 +122,7 @@ if __name__ == "__main__": print '\n'.join(get_columns(options.num_columns)) if options.create_data: - # Generate data locally, and output the command template to load it into HDFS + # Generate data locally, and output the SQL load command for use in dataload if not options.output_file: parser.error("--output_file option must be specified") http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/e4f58524/testdata/datasets/functional/functional_schema_template.sql ---------------------------------------------------------------------- diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index ff7b00d..a876e44 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -690,12 +690,11 @@ int_array_array array<array<int>> int_map map<string, int> int_map_array array<map<string, int>> nested_struct struct<a: int, b: array<int>, c: struct<d: array<array<struct<e: int, f: string>>>>, g: map<string, struct<h: struct<i: array<double>>>>> ----- DEPENDENT_LOAD -`hadoop fs -mkdir -p /test-warehouse/complextypestbl_parquet && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/ComplexTypesTbl/nullable.parq \ -/test-warehouse/complextypestbl_parquet/ && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/ComplexTypesTbl/nonnullable.parq \ -/test-warehouse/complextypestbl_parquet/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/ComplexTypesTbl/nullable.parq' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +LOAD DATA LOCAL INPATH '{impala_home}/testdata/ComplexTypesTbl/nonnullable.parq' +INTO TABLE {db_name}{db_suffix}.{table_name}; ---- LOAD ==== ---- DATASET @@ -712,13 +711,12 @@ CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( a array<int>, m map<string,bigint>) STORED AS {file_format}; ----- ALTER --- This INSERT is placed in the ALTER section and not in the DEPENDENT_LOAD section because --- it must always be executed in Hive. The DEPENDENT_LOAD section is sometimes executed in --- Impala, but Impala currently does not support inserting into tables with complex types. -INSERT OVERWRITE TABLE {table_name} SELECT * FROM functional.{table_name}; ---- LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny; +---- DEPENDENT_LOAD_HIVE +-- This INSERT must run in Hive, because Impala doesn't support inserting into tables +-- with complex types. +INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM functional.{table_name}; ==== ---- DATASET functional @@ -1477,8 +1475,9 @@ old_rcfile_table ---- COLUMNS key INT value STRING ----- DEPENDENT_LOAD -LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/data/oldrcfile.rc' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/oldrcfile.rc' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -1486,9 +1485,10 @@ functional bad_text_lzo ---- COLUMNS field STRING ----- DEPENDENT_LOAD +---- DEPENDENT_LOAD_HIVE -- Error recovery test data for LZO compression. 
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_text_lzo/bad_text.lzo' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_text_lzo/bad_text.lzo' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -1497,8 +1497,9 @@ bad_text_gzip ---- COLUMNS s STRING i INT ----- DEPENDENT_LOAD -LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_text_gzip/file_not_finished.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_text_gzip/file_not_finished.gz' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -1506,9 +1507,10 @@ functional bad_seq_snap ---- COLUMNS field STRING ----- DEPENDENT_LOAD +---- DEPENDENT_LOAD_HIVE -- This data file contains format errors and is accessed by the unit test: sequence-file-recover-test. -LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_seq_snap/bad_file' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_seq_snap/bad_file' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -1516,10 +1518,13 @@ functional bad_avro_snap_strings ---- COLUMNS s STRING ----- DEPENDENT_LOAD -LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/negative_string_len.avro' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; -LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/invalid_union.avro' INTO TABLE {db_name}{db_suffix}.{table_name}; -LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/truncated_string.avro' INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/negative_string_len.avro' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/invalid_union.avro' +INTO TABLE {db_name}{db_suffix}.{table_name}; +LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/truncated_string.avro' +INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -1527,8 +1532,9 @@ functional bad_avro_snap_floats ---- COLUMNS c1 FLOAT ----- DEPENDENT_LOAD -LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/truncated_float.avro' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/truncated_float.avro' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -1537,33 +1543,61 @@ bad_avro_decimal_schema ---- COLUMNS name STRING value DECIMAL(5,2) ----- DEPENDENT_LOAD -LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/invalid_decimal_schema.avro' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/invalid_decimal_schema.avro' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET -- IMPALA-694: uses data file produced by parquet-mr version 1.2.5-cdh4.5.0 --- (can't use LOAD DATA LOCAL with Impala so copied in create-load-data.sh) functional ---- BASE_TABLE_NAME bad_parquet ---- COLUMNS field STRING +---- DEPENDENT_LOAD_HIVE +-- IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0 +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/bad_parquet_data.parquet' +OVERWRITE INTO TABLE 
{db_name}{db_suffix}.{table_name}; +-- Data file produced by parquet-mr with repeated values (produces 0 bit width dictionary) +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/repeated_values.parquet' +INTO TABLE {db_name}{db_suffix}.{table_name}; +-- IMPALA-720: data file produced by parquet-mr with multiple row groups +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/multiple_rowgroups.parquet' +INTO TABLE {db_name}{db_suffix}.{table_name}; +-- IMPALA-1401: data file produced by Hive 13 containing page statistics with long min/max +-- string values +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/long_page_header.parquet' +INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET --- Can't use LOAD DATA LOCAL with Impala so copied in create-load-data.sh. functional ---- BASE_TABLE_NAME bad_parquet_strings_negative_len ---- COLUMNS s STRING +---- DEPENDENT_LOAD_HIVE +-- IMPALA-3732: parquet files with corrupt strings +LOAD DATA LOCAL INPATH +'{impala_home}/testdata/bad_parquet_data/dict-encoded-negative-len.parq' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +LOAD DATA LOCAL INPATH +'{impala_home}/testdata/bad_parquet_data/plain-encoded-negative-len.parq' +INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET --- Can't use LOAD DATA LOCAL with Impala so copied in create-load-data.sh. functional ---- BASE_TABLE_NAME bad_parquet_strings_out_of_bounds ---- COLUMNS s STRING +---- DEPENDENT_LOAD_HIVE +-- IMPALA-3732: parquet files with corrupt strings +LOAD DATA LOCAL INPATH +'{impala_home}/testdata/bad_parquet_data/dict-encoded-out-of-bounds.parq' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +LOAD DATA LOCAL INPATH +'{impala_home}/testdata/bad_parquet_data/plain-encoded-out-of-bounds.parq' +INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET -- IMPALA-2130: Wrong verification of parquet file version @@ -1572,10 +1606,9 @@ functional bad_magic_number ---- COLUMNS field STRING ----- LOAD -`hadoop fs -mkdir -p /test-warehouse/bad_magic_number_parquet && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_magic_number.parquet \ -/test-warehouse/bad_magic_number_parquet/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/bad_magic_number.parquet' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET -- IMPALA-1658: Timestamps written by Hive are local-to-UTC adjusted. @@ -1597,10 +1630,9 @@ timestamp_col timestamp year int month int day int ----- LOAD -`hadoop fs -mkdir -p /test-warehouse/alltypesagg_hive_13_1_parquet && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/alltypesagg_hive_13_1.parquet \ -/test-warehouse/alltypesagg_hive_13_1_parquet/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/alltypesagg_hive_13_1.parquet' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET -- Parquet file with invalid metadata size in the file footer. @@ -1609,10 +1641,9 @@ functional bad_metadata_len ---- COLUMNS field TINYINT ----- LOAD -`hadoop fs -mkdir -p /test-warehouse/bad_metadata_len_parquet && hadoop fs -put -f \ -${IMPALA_HOME}/testdata/data/bad_metadata_len.parquet \ -/test-warehouse/bad_metadata_len_parquet/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/bad_metadata_len.parquet' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET -- Parquet file with invalid column dict_page_offset. 
@@ -1621,10 +1652,9 @@ functional bad_dict_page_offset ---- COLUMNS field TINYINT ----- LOAD -`hadoop fs -mkdir -p /test-warehouse/bad_dict_page_offset_parquet && hadoop fs -put -f \ -${IMPALA_HOME}/testdata/data/bad_dict_page_offset.parquet \ -/test-warehouse/bad_dict_page_offset_parquet/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/bad_dict_page_offset.parquet' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET -- Parquet file with invalid column total_compressed_size. @@ -1633,10 +1663,9 @@ functional bad_compressed_size ---- COLUMNS field TINYINT ----- LOAD -`hadoop fs -mkdir -p /test-warehouse/bad_compressed_size_parquet && hadoop fs -put -f \ -${IMPALA_HOME}/testdata/data/bad_compressed_size.parquet \ -/test-warehouse/bad_compressed_size_parquet/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/bad_compressed_size.parquet' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET -- Parquet file with required columns written by Kite. Hive and Impala always write files @@ -1655,10 +1684,9 @@ opt_int_2 bigint opt_int_3 bigint req_int_2 bigint req_int_3 bigint ----- LOAD -`hadoop fs -mkdir -p /test-warehouse/kite_required_fields_parquet && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/kite_required_fields.parquet \ -/test-warehouse/kite_required_fields_parquet/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/kite_required_fields.parquet' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET -- Parquet file with incorrect column metadata in multiple row groups @@ -1668,10 +1696,9 @@ bad_column_metadata ---- COLUMNS id bigint int_array array<int> ----- LOAD -`hadoop fs -mkdir -p /test-warehouse/bad_column_metadata_parquet && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_column_metadata.parquet \ -/test-warehouse/bad_column_metadata_parquet +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/bad_column_metadata.parquet' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -1713,8 +1740,8 @@ ALTER TABLE {table_name} ADD IF NOT EXISTS PARTITION(d6=1); ---- ROW_FORMAT delimited fields terminated by ',' ---- LOAD -`hadoop fs -mkdir -p /test-warehouse/decimal_tbl/d6=1 && hadoop fs -put -f \ -${IMPALA_HOME}/testdata/data/decimal_tbl.txt /test-warehouse/decimal_tbl/d6=1/ +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/decimal_tbl.txt' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name} PARTITION(d6=1); ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} partition(d6) select * from functional.{table_name}; @@ -1730,8 +1757,8 @@ c3 DECIMAL(1,1) ---- ROW_FORMAT delimited fields terminated by ',' ---- LOAD -`hadoop fs -mkdir -p /test-warehouse/decimal_tiny && hadoop fs -put -f \ -${IMPALA_HOME}/testdata/data/decimal-tiny.txt /test-warehouse/decimal_tiny/ +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/decimal-tiny.txt' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} select * from functional.{table_name}; @@ -1747,8 +1774,8 @@ vc VARCHAR(32) ---- ROW_FORMAT delimited fields terminated by ',' ---- LOAD -`hadoop fs -mkdir -p /test-warehouse/chars_tiny && hadoop fs -put -f \ -${IMPALA_HOME}/testdata/data/chars-tiny.txt /test-warehouse/chars_tiny/ +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/chars-tiny.txt' +OVERWRITE INTO TABLE 
{db_name}{db_suffix}.{table_name}; ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} select * from functional.{table_name}; @@ -1802,8 +1829,8 @@ avro_decimal_tbl ---- COLUMNS name STRING value DECIMAL(5,2) ----- DEPENDENT_LOAD -LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/data/avro_decimal_tbl.avro' +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/avro_decimal_tbl.avro' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET @@ -1844,8 +1871,8 @@ id INT, col_1 BOOLEAN, col_2 DOUBLE, col_3 TIMESTAMP) row format delimited fields terminated by ',' LOCATION '/test-warehouse/{table_name}'; ---- LOAD -`hadoop fs -mkdir -p /test-warehouse/table_no_newline && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_no_newline.csv /test-warehouse/table_no_newline +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_no_newline.csv' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -1860,11 +1887,10 @@ LOCATION '/test-warehouse/{table_name}'; ALTER TABLE {db_name}{db_suffix}.{table_name} ADD IF NOT EXISTS PARTITION (year=2015, month=3); ALTER TABLE {db_name}{db_suffix}.{table_name} ADD IF NOT EXISTS PARTITION (year=2010, month=3); ---- LOAD -`hadoop fs -mkdir -p /test-warehouse/table_no_newline_part && \ -hadoop fs -mkdir -p /test-warehouse/table_no_newline_part/year=2010/month=3 && \ -hadoop fs -mkdir -p /test-warehouse/table_no_newline_part/year=2015/month=3 && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_no_newline.csv /test-warehouse/table_no_newline_part/year=2010/month=3 && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_missing_columns.csv /test-warehouse/table_no_newline_part/year=2015/month=3 +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_no_newline.csv' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name} PARTITION(year=2010, month=3); +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_missing_columns.csv' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name} PARTITION(year=2015, month=3); ==== ---- DATASET functional @@ -1876,9 +1902,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( row format delimited fields terminated by ',' escaped by '\\' LOCATION '/test-warehouse/{table_name}'; ---- LOAD -`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_16_lf' --file_len 16 --only_newline && \ -hadoop fs -mkdir -p /test-warehouse/testescape_16_lf && \ -hadoop fs -put -f /tmp/testescape_16_lf/* /test-warehouse/testescape_16_lf/ +`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_16_lf' --file_len 16 --only_newline ==== ---- DATASET functional @@ -1890,9 +1914,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( row format delimited fields terminated by ',' escaped by '\\' LOCATION '/test-warehouse/{table_name}'; ---- LOAD -`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_16_crlf' --file_len 16 && \ -hadoop fs -mkdir -p /test-warehouse/testescape_16_crlf && \ -hadoop fs -put -f /tmp/testescape_16_crlf/* /test-warehouse/testescape_16_crlf/ +`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_16_crlf' --file_len 16 ==== ---- DATASET functional @@ -1904,9 +1926,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( row format delimited fields terminated by ',' escaped by '\\' LOCATION '/test-warehouse/{table_name}'; ---- LOAD 
-`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_17_lf' --file_len 17 --only_newline && \ -hadoop fs -mkdir -p /test-warehouse/testescape_17_lf && \ -hadoop fs -put -f /tmp/testescape_17_lf/* /test-warehouse/testescape_17_lf/ +`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_17_lf' --file_len 17 --only_newline ==== ---- DATASET functional @@ -1918,9 +1938,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( row format delimited fields terminated by ',' escaped by '\\' LOCATION '/test-warehouse/{table_name}'; ---- LOAD -`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_17_crlf' --file_len 17 && \ -hadoop fs -mkdir -p /test-warehouse/testescape_17_crlf && \ -hadoop fs -put -f /tmp/testescape_17_crlf/* /test-warehouse/testescape_17_crlf/ +`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_17_crlf' --file_len 17 ==== ---- DATASET functional @@ -1932,9 +1950,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( row format delimited fields terminated by ',' escaped by '\\' LOCATION '/test-warehouse/{table_name}'; ---- LOAD -`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_32_lf' --file_len 32 --only_newline && \ -hadoop fs -mkdir -p /test-warehouse/testescape_32_lf && \ -hadoop fs -put -f /tmp/testescape_32_lf/* /test-warehouse/testescape_32_lf/ +`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_32_lf' --file_len 32 --only_newline ==== ---- DATASET functional @@ -1946,9 +1962,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( row format delimited fields terminated by ',' escaped by '\\' LOCATION '/test-warehouse/{table_name}'; ---- LOAD -`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_32_crlf' --file_len 32 && \ -hadoop fs -mkdir -p /test-warehouse/testescape_32_crlf && \ -hadoop fs -put -f /tmp/testescape_32_crlf/* /test-warehouse/testescape_32_crlf/ +`${IMPALA_HOME}/testdata/common/text_delims_table.py --table_dir '/tmp/testescape_32_crlf' --file_len 32 ==== ---- DATASET functional @@ -1960,8 +1974,8 @@ timezone STRING, utctime TIMESTAMP, localtime TIMESTAMP) row format delimited fields terminated by ',' LOCATION '/test-warehouse/{table_name}'; ---- LOAD -`hadoop fs -mkdir -p /test-warehouse/alltimezones && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/timezoneverification.csv /test-warehouse/alltimezones +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/timezoneverification.csv' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -1969,9 +1983,9 @@ functional avro_unicode_nulls ---- CREATE_HIVE create external table if not exists {db_name}{db_suffix}.{table_name} like {db_name}.liketbl stored as avro LOCATION '/test-warehouse/avro_null_char'; ----- LOAD -`hdfs dfs -mkdir -p /test-warehouse/avro_null_char && \ -hdfs dfs -put -f ${IMPALA_HOME}/testdata/avro_null_char/000000_0 /test-warehouse/avro_null_char/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/avro_null_char/000000_0' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET -- IMPALA-1881: Maximize data locality when scanning Parquet files with multiple row groups. 
@@ -2048,9 +2062,9 @@ functional bzip2_tbl ---- COLUMNS col string ----- DEPENDENT_LOAD -`hadoop fs -mkdir -p /test-warehouse/bzip2_tbl_text_bzip/ && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/data-bzip2.bz2 /test-warehouse/bzip2_tbl_text_bzip/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/data-bzip2.bz2' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -2058,9 +2072,9 @@ functional large_bzip2_tbl ---- COLUMNS col string ----- DEPENDENT_LOAD -`hadoop fs -mkdir -p /test-warehouse/large_bzip2_tbl_text_bzip/ && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/large_bzip2.bz2 /test-warehouse/large_bzip2_tbl_text_bzip/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/large_bzip2.bz2' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -2068,9 +2082,9 @@ functional multistream_bzip2_tbl ---- COLUMNS col string ----- DEPENDENT_LOAD -`hadoop fs -mkdir -p /test-warehouse/multistream_bzip2_tbl_text_bzip/ && \ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/data-pbzip2.bz2 /test-warehouse/multistream_bzip2_tbl_text_bzip/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/data-pbzip2.bz2' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -2078,9 +2092,9 @@ functional large_multistream_bzip2_tbl ---- COLUMNS col string ----- DEPENDENT_LOAD -`hdfs dfs -mkdir -p /test-warehouse/large_multistream_bzip2_tbl_text_bzip/ && \ -hdfs dfs -put -f ${IMPALA_HOME}/testdata/data/large_pbzip2.bz2 /test-warehouse/large_multistream_bzip2_tbl_text_bzip/ +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/large_pbzip2.bz2' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -2094,9 +2108,11 @@ delimited fields terminated by ',' escaped by '\\' ---- ALTER ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='1'); ---- LOAD -LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ----- DEPENDENT_LOAD -LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.csv' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.gz' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -2110,9 +2126,11 @@ delimited fields terminated by ',' escaped by '\\' ---- ALTER ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='2'); ---- LOAD -LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ----- DEPENDENT_LOAD -LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.csv' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD_HIVE +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.gz' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional
