IMPALA-5287: Test skip.header.line.count on gzip This change fixes IMPALA-4873 by adding the capability to supply a dict 'test_file_vars' to run_test_case(). Keys in this dict will be replaced with their values inside test queries before they are executed.
Change-Id: Ie3f3c29a42501cfb2751f7ad0af166eb88f63b70 Reviewed-on: http://gerrit.cloudera.org:8080/6817 Reviewed-by: Michael Brown <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/12f3ecce Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/12f3ecce Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/12f3ecce Branch: refs/heads/master Commit: 12f3ecceabc5a7cdf401956376ebcd483d0c2376 Parents: fd62a7f Author: Lars Volker <[email protected]> Authored: Sat May 6 22:17:05 2017 +0200 Committer: Impala Public Jenkins <[email protected]> Committed: Tue May 9 01:36:46 2017 +0000 ---------------------------------------------------------------------- testdata/bin/generate-schema-statements.py | 3 +- testdata/data/README | 19 ++++++-- testdata/data/table_with_header.gz | Bin 0 -> 64 bytes testdata/data/table_with_header_2.gz | Bin 0 -> 82 bytes .../functional/functional_schema_template.sql | 4 ++ .../datasets/functional/schema_constraints.csv | 4 ++ .../QueryTest/hdfs-text-scan-with-header.test | 48 ++++++++++--------- tests/common/impala_test_suite.py | 14 +++++- tests/query_test/test_scanners.py | 22 +++++---- 9 files changed, 75 insertions(+), 39 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/bin/generate-schema-statements.py ---------------------------------------------------------------------- diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py index a214822..fdb9c64 100755 --- a/testdata/bin/generate-schema-statements.py +++ b/testdata/bin/generate-schema-statements.py @@ -359,7 +359,8 @@ def build_insert_into_statement(insert, db_name, db_suffix, table_name, file_for insert_statement = insert.format(db_name=db_name, db_suffix=db_suffix, 
table_name=table_name, - hdfs_location=hdfs_path) + hdfs_location=hdfs_path, + impala_home = os.getenv("IMPALA_HOME")) # Kudu tables are managed and don't support OVERWRITE, so we replace OVERWRITE # with INTO to make this a regular INSERT. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/data/README ---------------------------------------------------------------------- diff --git a/testdata/data/README b/testdata/data/README index 465d80b..23b0586 100644 --- a/testdata/data/README +++ b/testdata/data/README @@ -68,24 +68,23 @@ first rowgroup column metadata for 'int_array' incorrectly states there are 50 v (instead of 100), and the second rowgroup column metadata for 'id' incorrectly states there are 11 values (instead of 10). The third rowgroup has the correct metadata. -data-bzip2.bz2 +data-bzip2.bz2: Generated with bzip2, contains single bzip2 stream Contains 1 column, uncompressed data size < 8M -large_bzip2.bz2 +large_bzip2.bz2: Generated with bzip2, contains single bzip2 stream Contains 1 column, uncompressed data size > 8M -data-pbzip2.bz2 +data-pbzip2.bz2: Generated with pbzip2, contains multiple bzip2 streams Contains 1 column, uncompressed data size < 8M -large_pbzip2.bz2 +large_pbzip2.bz2: Generated with pbzip2, contains multiple bzip2 stream Contains 1 column, uncompressed data size > 8M out_of_range_timestamp.parquet: ------------ Generated with a hacked version of Impala parquet writer. Contains a single timestamp column with 4 values, 2 of which are out of range and should be read as NULL by Impala: @@ -93,3 +92,13 @@ and should be read as NULL by Impala: 1400-01-01 00:00:00 9999-12-31 00:00:00 10000-01-01 00:00:00 (invalid - date too large) + +table_with_header.csv: +Created with a text editor, contains a header line before the data rows. + +table_with_header_2.csv: +Created with a text editor, contains two header lines before the data rows. 
+ +table_with_header.gz, table_with_header_2.gz: +Generated by gzip'ing table_with_header.csv and table_with_header_2.csv. + http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/data/table_with_header.gz ---------------------------------------------------------------------- diff --git a/testdata/data/table_with_header.gz b/testdata/data/table_with_header.gz new file mode 100644 index 0000000..a7c86df Binary files /dev/null and b/testdata/data/table_with_header.gz differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/data/table_with_header_2.gz ---------------------------------------------------------------------- diff --git a/testdata/data/table_with_header_2.gz b/testdata/data/table_with_header_2.gz new file mode 100644 index 0000000..d8600fd Binary files /dev/null and b/testdata/data/table_with_header_2.gz differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/datasets/functional/functional_schema_template.sql ---------------------------------------------------------------------- diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index 1aacf43..e7b8a07 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -2088,6 +2088,8 @@ delimited fields terminated by ',' escaped by '\\' ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='1'); ---- LOAD LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional @@ -2102,6 +2104,8 @@ delimited fields terminated by ',' escaped by '\\' ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='2'); 
---- LOAD LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET functional http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/datasets/functional/schema_constraints.csv ---------------------------------------------------------------------- diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index d6d1111..bb3487f 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -189,6 +189,10 @@ table_name:nullescapedtable, constraint:only, table_format:kudu/none/none table_name:table_with_header, constraint:restrict_to, table_format:text/none/none table_name:table_with_header_2, constraint:restrict_to, table_format:text/none/none table_name:table_with_header_insert, constraint:restrict_to, table_format:text/none/none +# We also test that skipping header lines works on compressed tables (IMPALA-5287) +table_name:table_with_header, constraint:restrict_to, table_format:text/gzip/block +table_name:table_with_header_2, constraint:restrict_to, table_format:text/gzip/block +table_name:table_with_header_insert, constraint:restrict_to, table_format:text/gzip/block # Inserting into parquet tables should not be affected by the 'skip.header.line.count' # property, so we test parquet format as well. 
table_name:table_with_header_insert, constraint:restrict_to, table_format:parquet/none/none http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test index 4aab121..d5f92f7 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test +++ b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test @@ -1,7 +1,7 @@ ==== ---- QUERY set max_scan_range_length=0; -select c1, c2 from functional.table_with_header +select c1, c2 from table_with_header ---- RESULTS 1,2 3,4 @@ -11,7 +11,7 @@ INT,DOUBLE ==== ---- QUERY set max_scan_range_length=0; -select count(*) from functional.table_with_header +select count(*) from table_with_header ---- RESULTS 3 ---- TYPES @@ -19,7 +19,7 @@ BIGINT ==== ---- QUERY set max_scan_range_length=2; -select c1, c2 from functional.table_with_header +select c1, c2 from table_with_header ---- RESULTS 1,2 3,4 @@ -29,7 +29,7 @@ INT,DOUBLE ==== ---- QUERY set max_scan_range_length=2; -select count(*) from functional.table_with_header +select count(*) from table_with_header ---- RESULTS 3 ---- TYPES @@ -37,7 +37,7 @@ BIGINT ==== ---- QUERY set max_scan_range_length=30; -select c1, c2 from functional.table_with_header +select c1, c2 from table_with_header ---- RESULTS 1,2 3,4 @@ -47,7 +47,7 @@ INT,DOUBLE ==== ---- QUERY set max_scan_range_length=30; -select count(*) from functional.table_with_header +select count(*) from table_with_header ---- RESULTS 3 ---- TYPES @@ -55,7 +55,7 @@ BIGINT ==== ---- QUERY set max_scan_range_length=0; -select c1, c2 from functional.table_with_header_2 +select c1, c2 from table_with_header_2 ---- RESULTS 1,2 
3,4 @@ -65,13 +65,15 @@ INT,DOUBLE ==== ---- QUERY set max_scan_range_length=0; -select count(*) from functional.table_with_header_2 +select count(*) from table_with_header_2 ---- RESULTS 3 ---- TYPES BIGINT ==== ---- QUERY +# This test is only supported on uncompressed tables, since we always only issue one +# single scan range for a compressed file. set max_scan_range_length=2; set abort_on_error=1; select c1, c2 from functional.table_with_header_2 @@ -81,6 +83,8 @@ increasing max_scan_range_length to a value larger than the size of the file's h INT,DOUBLE ==== ---- QUERY +# This test is only supported on uncompressed tables, since we always only issue one +# single scan range for a compressed file. set max_scan_range_length=2; set abort_on_error=0; select c1, c2 from functional.table_with_header_2 @@ -91,7 +95,7 @@ INT,DOUBLE ==== ---- QUERY set max_scan_range_length=30; -select c1, c2 from functional.table_with_header_2 +select c1, c2 from table_with_header_2 ---- RESULTS 1,2 3,4 @@ -101,24 +105,24 @@ INT,DOUBLE ==== ---- QUERY set max_scan_range_length=30; -select count(*) from functional.table_with_header_2 +select count(*) from table_with_header_2 ---- RESULTS 3 ---- TYPES BIGINT ==== ---- QUERY -drop table if exists mixed; -create table mixed (kf smallint) partitioned by (year smallint) stored as textfile; -alter table mixed add partition (year=2012); -alter table mixed add partition (year=2013); -alter table mixed partition (year=2013) set fileformat parquet; -insert into mixed partition (year=2012) values (1),(2),(3); -insert into mixed partition (year=2013) values (4),(5),(6); -alter table mixed set tblproperties("skip.header.line.count"="1"); -alter table mixed set fileformat parquet; -alter table mixed set tblproperties("skip.header.line.count"="2"); -select * from mixed; +drop table if exists $UNIQUE_DB.mixed; +create table $UNIQUE_DB.mixed (kf smallint) partitioned by (year smallint) stored as textfile; +alter table $UNIQUE_DB.mixed add partition 
(year=2012); +alter table $UNIQUE_DB.mixed add partition (year=2013); +alter table $UNIQUE_DB.mixed partition (year=2013) set fileformat parquet; +insert into $UNIQUE_DB.mixed partition (year=2012) values (1),(2),(3); +insert into $UNIQUE_DB.mixed partition (year=2013) values (4),(5),(6); +alter table $UNIQUE_DB.mixed set tblproperties("skip.header.line.count"="1"); +alter table $UNIQUE_DB.mixed set fileformat parquet; +alter table $UNIQUE_DB.mixed set tblproperties("skip.header.line.count"="2"); +select * from $UNIQUE_DB.mixed; ---- RESULTS 3,2012 4,2013 @@ -128,5 +132,5 @@ select * from mixed; SMALLINT,SMALLINT ==== ---- QUERY -drop table mixed; +drop table $UNIQUE_DB.mixed; ==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/tests/common/impala_test_suite.py ---------------------------------------------------------------------- diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py index af0ed1d..f7602af 100644 --- a/tests/common/impala_test_suite.py +++ b/tests/common/impala_test_suite.py @@ -274,7 +274,7 @@ class ImpalaTestSuite(BaseTestSuite): def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False, - encoding=None): + encoding=None, test_file_vars=None): """ Runs the queries in the specified test based on the vector values @@ -285,6 +285,9 @@ class ImpalaTestSuite(BaseTestSuite): Additionally, the encoding for all test data can be specified using the 'encoding' parameter. This is useful when data is ingested in a different encoding (ex. latin). If not set, the default system encoding will be used. + If a dict 'test_file_vars' is provided, then all keys will be replaced with their + values in queries before they are executed. Callers need to avoid using reserved key + names, see 'reserved_keywords' below. 
""" table_format_info = vector.get_value('table_format') exec_options = vector.get_value('exec_option') @@ -336,6 +339,15 @@ class ImpalaTestSuite(BaseTestSuite): .replace('$SECONDARY_FILESYSTEM', os.getenv("SECONDARY_FILESYSTEM") or str())) if use_db: query = query.replace('$DATABASE', use_db) + reserved_keywords = ["$DATABASE", "$FILESYSTEM_PREFIX", "$GROUP_NAME", + "$IMPALA_HOME", "$NAMENODE", "$QUERY", "$SECONDARY_FILESYSTEM"] + + if test_file_vars: + for key, value in test_file_vars.iteritems(): + if key in reserved_keywords: + raise RuntimeError("Key {0} is reserved".format(key)) + query = query.replace(key, value) + if 'QUERY_NAME' in test_section: LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME']) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index b0e2e80..5dbe02a 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -122,7 +122,7 @@ class TestUnmatchedSchema(ImpalaTestSuite): cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension()) # Avro has a more advanced schema evolution process which is covered in more depth # in the test_avro_schema_evolution test suite. 
- cls.ImpalaTestMatrix.add_constraint(\ + cls.ImpalaTestMatrix.add_constraint( lambda v: v.get_value('table_format').file_format != 'avro') def _create_test_table(self, vector): @@ -574,8 +574,8 @@ class TestTextScanRangeLengths(ImpalaTestSuite): super(TestTextScanRangeLengths, cls).add_test_dimensions() cls.ImpalaTestMatrix.add_dimension( ImpalaTestDimension('max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS)) - cls.ImpalaTestMatrix.add_constraint(lambda v:\ - v.get_value('table_format').file_format == 'text' and\ + cls.ImpalaTestMatrix.add_constraint(lambda v: + v.get_value('table_format').file_format == 'text' and v.get_value('table_format').compression_codec == 'none') def test_text_scanner(self, vector): @@ -605,8 +605,8 @@ class TestTextSplitDelimiters(ImpalaTestSuite): @classmethod def add_test_dimensions(cls): super(TestTextSplitDelimiters, cls).add_test_dimensions() - cls.ImpalaTestMatrix.add_constraint(lambda v:\ - v.get_value('table_format').file_format == 'text' and\ + cls.ImpalaTestMatrix.add_constraint(lambda v: + v.get_value('table_format').file_format == 'text' and v.get_value('table_format').compression_codec == 'none') def test_text_split_delimiters(self, vector, unique_database): @@ -682,11 +682,13 @@ class TestTextScanRangeLengths(ImpalaTestSuite): @classmethod def add_test_dimensions(cls): super(TestTextScanRangeLengths, cls).add_test_dimensions() - cls.ImpalaTestMatrix.add_constraint( - lambda v: v.get_value('table_format').file_format == 'text') + cls.ImpalaTestMatrix.add_constraint(lambda v: + v.get_value('table_format').file_format == 'text' and + v.get_value('table_format').compression_codec in ['none', 'gzip']) def test_text_scanner_with_header(self, vector, unique_database): - self.run_test_case('QueryTest/hdfs-text-scan-with-header', vector, unique_database) + self.run_test_case('QueryTest/hdfs-text-scan-with-header', vector, + test_file_vars={'$UNIQUE_DB': unique_database}) # Missing Coverage: No coverage for truncated files errors or 
scans. @@ -708,8 +710,8 @@ class TestScanTruncatedFiles(ImpalaTestSuite): # strategy. # TODO: Test other file formats if cls.exploration_strategy() == 'exhaustive': - cls.ImpalaTestMatrix.add_constraint(lambda v:\ - v.get_value('table_format').file_format == 'text' and\ + cls.ImpalaTestMatrix.add_constraint(lambda v: + v.get_value('table_format').file_format == 'text' and v.get_value('table_format').compression_codec == 'none') else: cls.ImpalaTestMatrix.add_constraint(lambda v: False)
