Repository: incubator-impala Updated Branches: refs/heads/master d7d6c0367 -> 57d7c614b
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/57d7c614/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test deleted file mode 100644 index d03b4c9..0000000 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test +++ /dev/null @@ -1,460 +0,0 @@ -==== ----- QUERY -select id, bool_col from functional_parquet.alltypessmall where int_col < 0 ----- RESULTS ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -set explain_level=2; -explain select id, bool_col from functional_parquet.alltypessmall where int_col < 0; ----- RESULTS: VERIFY_IS_SUBSET -' parquet statistics predicates: int_col < 0' -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where tinyint_col < 0 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where smallint_col < 0 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where int_col < 0 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where bigint_col < 0 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where float_col < 0 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where double_col < 0 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -# Test with inverted predicate -select id, bool_col from functional_parquet.alltypessmall where -1 > int_col ----- RESULTS ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where tinyint_col > 9 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where smallint_col > 9 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select id, bool_col from functional_parquet.alltypessmall where int_col > 9 ----- RESULTS ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where bigint_col > 90 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where float_col > 9.9 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where double_col > 99 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where tinyint_col >= 10 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where tinyint_col <= 0 ----- RESULTS -12 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where tinyint_col >= 9 ----- RESULTS -8 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where tinyint_col = -1 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where tinyint_col = 10 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -set explain_level=2; -explain select count(*) from functional_parquet.alltypessmall where tinyint_col = 10 ----- RESULTS: VERIFY_IS_SUBSET -' parquet statistics predicates: tinyint_col = 10' -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where id >= 30 and id <= 80 ----- RESULTS -51 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 1 -==== ----- QUERY -# Mix with partitioning columns -select count(*) from functional_parquet.alltypessmall where int_col < 0 and year < 2012 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3 ----- RESULTS ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3 ----- RESULTS ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -# Test that without expr rewrite and thus without constant folding, constant exprs still -# can be used to prune row groups. -set enable_expr_rewrites=0; -select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3 ----- RESULTS ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col ----- RESULTS ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -# Test that without expr rewrite and thus without constant folding, constant exprs still -# can be used to prune row groups. -set enable_expr_rewrites=0; -select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col ----- RESULTS ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -# Test name based column resolution -create table name_resolve stored as parquet as select * from functional_parquet.alltypessmall; -alter table name_resolve replace columns (int_col int, bool_col boolean, tinyint_col tinyint, smallint_col smallint, id int); -set parquet_fallback_schema_resolution=NAME; -# If this picks up the stats from int_col, then it will filter all row groups and return -# an incorrect result. -select count(*) from name_resolve where id > 10; ----- RESULTS -89 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 1 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -# Query that has an implicit cast to a larger integer type -select count(*) from functional_parquet.alltypessmall where tinyint_col > 1000000000000 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -# Predicates with explicit casts are not supported when evaluating parquet::Statistics. -select count(*) from functional_parquet.alltypessmall where '0' > cast(tinyint_col as string) ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -# Explicit casts between numerical types can violate the transitivity of "min()", so they -# are not supported when evaluating parquet::Statistics. -select count(*) from functional_parquet.alltypes where cast(id as tinyint) < 10; ----- RESULTS -3878 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 24 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -select count(*) from functional_parquet.complextypestbl.int_array where pos < 5; ----- RESULTS -9 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 2 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -# Test the conversion of constant IN lists to min/max predicats -select count(*) from functional_parquet.alltypes where int_col in (-1,-2,-3,-4); ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 24 -aggregation(SUM, NumStatsFilteredRowGroups): 24 -==== ----- QUERY -select count(*) from functional_parquet.alltypes where id IN (1,25,49); ----- RESULTS -3 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 24 -aggregation(SUM, NumStatsFilteredRowGroups): 23 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where string_col < "0" ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where string_col <= "/" ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where string_col < "1" ----- RESULTS -12 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where string_col >= "9" ----- RESULTS -8 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where string_col > ":" ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where timestamp_col < "2009-01-01 00:00:00" ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where timestamp_col <= "2009-01-01 00:00:00" ----- RESULTS -1 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 3 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-01-01 00:00:00" ----- RESULTS -1 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 3 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where timestamp_col > "2009-04-03 00:24:00.96" ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 4 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where timestamp_col >= "2009-04-03 00:24:00.96" ----- RESULTS -1 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 3 -==== ----- QUERY -select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-04-03 00:24:00.96" ----- RESULTS -1 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 4 -aggregation(SUM, NumStatsFilteredRowGroups): 3 -==== ----- QUERY -select count(*) from functional_parquet.decimal_tbl where d1 < 1234 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 1 -aggregation(SUM, NumStatsFilteredRowGroups): 1 -==== ----- QUERY -select count(*) from functional_parquet.decimal_tbl where d3 < 1.23456789 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 1 -aggregation(SUM, NumStatsFilteredRowGroups): 1 -==== ----- QUERY -select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456788 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 1 -aggregation(SUM, NumStatsFilteredRowGroups): 1 -==== ----- QUERY -select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456789 ----- RESULTS -1 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 1 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -select count(*) from functional_parquet.decimal_tbl where d4 > 0.123456789 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 1 -aggregation(SUM, NumStatsFilteredRowGroups): 1 -==== ----- QUERY -select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345678 ----- RESULTS -5 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 1 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345679 ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 1 -aggregation(SUM, NumStatsFilteredRowGroups): 1 -==== ----- QUERY -# Test that stats are disabled for CHAR type columns. -create table chars (id int, c char(4)) stored as parquet; -insert into chars values (1, cast("abaa" as char(4))), (2, cast("abab" as char(4))); -select count(*) from chars; ----- RESULTS -2 -==== ----- QUERY -select count(*) from chars where c <= "aaaa" ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 1 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== ----- QUERY -# IMPALA-4988: Test that stats support can be disabled using the parquet_read_statistics -# query option. -set parquet_read_statistics=0; -select count(*) from functional_parquet.alltypes where id < 0; ----- RESULTS -0 ----- RUNTIME_PROFILE -aggregation(SUM, NumRowGroups): 24 -aggregation(SUM, NumStatsFilteredRowGroups): 0 -==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/57d7c614/tests/query_test/test_aggregation.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_aggregation.py b/tests/query_test/test_aggregation.py index a257bc0..289a867 100644 --- a/tests/query_test/test_aggregation.py +++ b/tests/query_test/test_aggregation.py @@ -271,6 +271,15 @@ class TestAggregationQueries(ImpalaTestSuite): # Verify codegen was enabled for all four stages of the aggregation. assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6]) + def test_parquet_count_star_optimization(self, vector): + if (vector.get_value('table_format').file_format != 'text' or + vector.get_value('table_format').compression_codec != 'none'): + # No need to run this test on all file formats + pytest.skip() + self.run_test_case('QueryTest/parquet-stats-agg', vector) + vector.get_value('exec_option')['batch_size'] = 1 + self.run_test_case('QueryTest/parquet-stats-agg', vector) + class TestWideAggregationQueries(ImpalaTestSuite): """Test that aggregations with many grouping columns work""" @classmethod http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/57d7c614/tests/query_test/test_parquet_stats.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_parquet_stats.py b/tests/query_test/test_parquet_stats.py index 93fc06d..00afd09 100644 --- a/tests/query_test/test_parquet_stats.py +++ b/tests/query_test/test_parquet_stats.py @@ -46,7 +46,7 @@ class TestParquetStats(ImpalaTestSuite): # The test makes assumptions about the number of row groups that are processed and # skipped inside a fragment, so we ensure that the tests run in a single fragment. vector.get_value('exec_option')['num_nodes'] = 1 - self.run_test_case('QueryTest/parquet_stats', vector, use_db=unique_database) + self.run_test_case('QueryTest/parquet-stats', vector, use_db=unique_database) def test_deprecated_stats(self, vector, unique_database): """Test that reading parquet files with statistics with deprecated 'min'/'max' fields @@ -69,4 +69,3 @@ class TestParquetStats(ImpalaTestSuite): # skipped inside a fragment, so we ensure that the tests run in a single fragment. vector.get_value('exec_option')['num_nodes'] = 1 self.run_test_case('QueryTest/parquet-deprecated-stats', vector, unique_database) -
