Repository: incubator-impala Updated Branches: refs/heads/master 56e371664 -> 6441ca65b
IMPALA-5039: Fix variability in parquet dictionary filtering test The tests for dictionary filtering look at how many row groups are processed and how many are filtered by matching text in the profile. However, the number of row groups processed and filtered by any individual fragment depends on how the work is split and how many impalads are running. This causes variability in the test output. To fix this, the test needs a way to aggregate the results across fragments. This fix introduces the following syntax for specifying these aggregates: aggregation(function_name, field_name): expected_value This searches the runtime profile for lines that contain 'field_name: number'. It skips the averaged fragment, as this is derived from all the other fragments. Currently, only SUM is implemented, and the expected_value is required to be an integer. It should be easy to implement other interesting functions like COUNT and MIN/MAX. It would also be possible to extend it to floats. Switching the dictionary filtering tests over to this new syntax eliminates the variability in the tests. 
Change-Id: I6b7b84d973b3ac678a24e82900f2637d569158bb Reviewed-on: http://gerrit.cloudera.org:8080/6301 Tested-by: Impala Public Jenkins Reviewed-by: Alex Behm <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/6441ca65 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/6441ca65 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/6441ca65 Branch: refs/heads/master Commit: 6441ca65bda83c23dacfed8a27d944a0dabe6b65 Parents: 56e3716 Author: Joe McDonnell <[email protected]> Authored: Tue Mar 7 12:20:09 2017 -0800 Committer: Alex Behm <[email protected]> Committed: Mon Mar 13 17:37:15 2017 +0000 ---------------------------------------------------------------------- .../QueryTest/mt-dop-parquet-filtering.test | 250 ------------------- .../queries/QueryTest/parquet-filtering.test | 96 +++---- tests/common/test_result_verifier.py | 83 +++++- tests/query_test/test_mt_dop.py | 18 +- tests/query_test/test_scanners.py | 4 - 5 files changed, 134 insertions(+), 317 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/testdata/workloads/functional-query/queries/QueryTest/mt-dop-parquet-filtering.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/mt-dop-parquet-filtering.test b/testdata/workloads/functional-query/queries/QueryTest/mt-dop-parquet-filtering.test deleted file mode 100644 index 295a1ae..0000000 --- a/testdata/workloads/functional-query/queries/QueryTest/mt-dop-parquet-filtering.test +++ /dev/null @@ -1,250 +0,0 @@ -# This tests parquet dictionary filtering. It is mirrored without mt_dop -# in parquet-filtering.test. 
Since the two rely on counting -# the number of row groups filtered, differing parallelism changes -# the counts seen in the output. -# TODO: Fix test execution to allow aggregation of these counts -# so that the results do not depend on the number of threads. -==== ----- QUERY -# id: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where id < 10000; ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# id: Some values pass -# Filters 2/3 row groups -select count(*) from functional_parquet.alltypes where mod(id, 10000) < 20; ----- RESULTS -20 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 2.* -==== ----- QUERY -# id: No values pass -# Filters 3/3 row groups -select count(*) from functional_parquet.alltypes where mod(id,10000) = 7301; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 3.* -==== ----- QUERY -# tinyint_col: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where tinyint_col < 10; ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# tinyint_col: No values pass -# Filters 3/3 row groups -select count(*) from functional_parquet.alltypes where mod(tinyint_col,50) > 10; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 3.* -==== ----- QUERY -# smallint_col: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where smallint_col < 10 ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# smallint_col: No values pass -# Filters 3/3 row groups -select count(*) from functional_parquet.alltypes where mod(smallint_col,50) > 
10; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 3.* -==== ----- QUERY -# int_col: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where int_col < 10 ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# int_col: No values pass -# Filters 3/3 row groups -select count(*) from functional_parquet.alltypes where mod(int_col, 50) > 10; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 3.* -==== ----- QUERY -# bigint_col: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where bigint_col < 100 ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# bigint_col: No values pass -# Filters 3/3 row groups -select count(*) from functional_parquet.alltypes where mod(bigint_col, 500) > 100; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 3.* -==== ----- QUERY -# float_col: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where float_col < 10 ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# float_col: No values pass -# Filters 3/3 row groups -select count(*) from functional_parquet.alltypes where mod(float_col, 100) > 10; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 3.* -==== ----- QUERY -# double_col: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where double_col < 100 ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY 
-# double_col: No values pass -# Filters 3/3 row groups -select count(*) from functional_parquet.alltypes where mod(double_col, 100) > 100; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 3.* -==== ----- QUERY -# date_string_col: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where date_string_col like '%/%/%'; ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# date_string_col: Half of the values pass -# Filters 1/3 row groups -select count(*) from functional_parquet.alltypes where date_string_col like '%/10'; ----- RESULTS -3650 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 1.* -==== ----- QUERY -# date_string_col: No values pass -# Filters 3/3 row groups -select count(*) from functional_parquet.alltypes where date_string_col = '01/01/11'; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 3.* -==== ----- QUERY -# string_col: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where length(string_col) = 1 ; ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# string_col: No values pass -# Filters 3/3 row groups -select count(*) from functional_parquet.alltypes where string_col = '10'; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 3.* -==== ----- QUERY -# timestamp_col: All values pass -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where timestamp_col >= '2009-01-01 00:00:00'; ----- RESULTS -7300 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# timestamp_col: No values pass -# Note: 
dictionary filtering currently does not work on timestamps -# Filters 0/3 row groups -select count(*) from functional_parquet.alltypes where timestamp_col = '2009-01-01 00:00:01'; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 3.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# TPC-H lineitem.l_orderkey has high cardinality (1500000) -# It always uses mixed encoding (PLAIN_DICTIONARY + PLAIN) -# Verify that no dictionary filtering is used even for a predicate -# that eliminates all rows. -select count(*) from tpch_parquet.lineitem where l_orderkey = 50; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 1.* -row_regex: .*NumDictFilteredRowGroups: 0.* -==== ----- QUERY -# Verify dictionary filtering on top level of a schema with nested -# data. -select count(*) from tpch_nested_parquet.customer where c_mktsegment = 'COMEDY'; ----- RESULTS -0 ----- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 1.* -row_regex: .*NumDictFilteredRowGroups: 1.* -==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test index 189c226..932ba60 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test @@ -10,8 +10,8 @@ select count(*) from functional_parquet.alltypes where id < 10000; ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # id: Some values pass @@ -20,8 +20,8 @@ select count(*) from functional_parquet.alltypes where mod(id, 
10000) < 20; ---- RESULTS 20 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 7.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 23 ==== ---- QUERY # id: No values pass @@ -30,8 +30,8 @@ select count(*) from functional_parquet.alltypes where mod(id,10000) = 7301; ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 8.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 24 ==== ---- QUERY # tinyint_col: All values pass @@ -40,8 +40,8 @@ select count(*) from functional_parquet.alltypes where tinyint_col < 10; ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # tinyint_col: No values pass @@ -50,8 +50,8 @@ select count(*) from functional_parquet.alltypes where mod(tinyint_col,50) > 10; ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 8.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 24 ==== ---- QUERY # smallint_col: All values pass @@ -60,8 +60,8 @@ select count(*) from functional_parquet.alltypes where smallint_col < 10 ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # smallint_col: No values pass @@ -70,8 +70,8 @@ select count(*) from functional_parquet.alltypes where mod(smallint_col,50) > 10 ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 8.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 24 ==== ---- QUERY # int_col: All values pass @@ -80,8 +80,8 @@ select count(*) from 
functional_parquet.alltypes where int_col < 10 ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # int_col: No values pass @@ -90,8 +90,8 @@ select count(*) from functional_parquet.alltypes where mod(int_col, 50) > 10; ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 8.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 24 ==== ---- QUERY # bigint_col: All values pass @@ -100,8 +100,8 @@ select count(*) from functional_parquet.alltypes where bigint_col < 100 ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # bigint_col: No values pass @@ -110,8 +110,8 @@ select count(*) from functional_parquet.alltypes where mod(bigint_col, 500) > 10 ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 8.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 24 ==== ---- QUERY # float_col: All values pass @@ -120,8 +120,8 @@ select count(*) from functional_parquet.alltypes where float_col < 10 ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # float_col: No values pass @@ -130,8 +130,8 @@ select count(*) from functional_parquet.alltypes where mod(float_col, 100) > 10; ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 8.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 24 ==== ---- QUERY # double_col: All values pass @@ -140,8 +140,8 @@ 
select count(*) from functional_parquet.alltypes where double_col < 100 ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # double_col: No values pass @@ -150,8 +150,8 @@ select count(*) from functional_parquet.alltypes where mod(double_col, 100) > 10 ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 8.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 24 ==== ---- QUERY # date_string_col: All values pass @@ -160,8 +160,8 @@ select count(*) from functional_parquet.alltypes where date_string_col like '%/% ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # date_string_col: Half of the values pass @@ -170,8 +170,8 @@ select count(*) from functional_parquet.alltypes where date_string_col like '%/1 ---- RESULTS 3650 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 4.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 12 ==== ---- QUERY # date_string_col: No values pass @@ -180,8 +180,8 @@ select count(*) from functional_parquet.alltypes where date_string_col = '01/01/ ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 8.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 24 ==== ---- QUERY # string_col: All values pass @@ -190,8 +190,8 @@ select count(*) from functional_parquet.alltypes where length(string_col) = 1 ; ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, 
NumDictFilteredRowGroups): 0 ==== ---- QUERY # string_col: No values pass @@ -200,8 +200,8 @@ select count(*) from functional_parquet.alltypes where string_col = '10'; ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 8.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 24 ==== ---- QUERY # timestamp_col: All values pass @@ -210,8 +210,8 @@ select count(*) from functional_parquet.alltypes where timestamp_col >= '2009-01 ---- RESULTS 7300 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # timestamp_col: No values pass @@ -221,8 +221,8 @@ select count(*) from functional_parquet.alltypes where timestamp_col = '2009-01- ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 8.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # TPC-H lineitem.l_orderkey has high cardinality (1500000) @@ -233,8 +233,8 @@ select count(*) from tpch_parquet.lineitem where l_orderkey = 50; ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 1.* -row_regex: .*NumDictFilteredRowGroups: 0.* +aggregation(SUM, NumRowGroups): 3 +aggregation(SUM, NumDictFilteredRowGroups): 0 ==== ---- QUERY # Verify dictionary filtering on top level of a schema with nested @@ -243,6 +243,6 @@ select count(*) from tpch_nested_parquet.customer where c_mktsegment = 'COMEDY'; ---- RESULTS 0 ---- RUNTIME_PROFILE -row_regex: .*NumRowGroups: 1.* -row_regex: .*NumDictFilteredRowGroups: 1.* +aggregation(SUM, NumRowGroups): 4 +aggregation(SUM, NumDictFilteredRowGroups): 4 ==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/tests/common/test_result_verifier.py ---------------------------------------------------------------------- diff --git 
a/tests/common/test_result_verifier.py b/tests/common/test_result_verifier.py index c7f890a..a816eaf 100644 --- a/tests/common/test_result_verifier.py +++ b/tests/common/test_result_verifier.py @@ -456,18 +456,87 @@ def parse_result_rows(exec_result): result.append(','.join(new_cols)) return result +# Special syntax for basic aggregation over fields in the runtime profile. +# The syntax is: +# aggregation(function, field_name): expected_value +# Currently, the only implemented function is SUM and only integers are supported. +AGGREGATION_PREFIX_PATTERN = 'aggregation\(' +AGGREGATION_PREFIX = re.compile(AGGREGATION_PREFIX_PATTERN) +AGGREGATION_SYNTAX_MATCH_PATTERN = 'aggregation\((\w+)[ ]*,[ ]*(\w+)\):[ ]*(\d+)' + +def try_compile_aggregation(row_string): + """ + Check to see if this row string specifies an aggregation. If the row string contains + an aggregation, it returns a tuple with all the information for evaluating the + aggregation. Otherwise, it returns None. + """ + if row_string and AGGREGATION_PREFIX.match(row_string): + function, field, value = re.findall(AGGREGATION_SYNTAX_MATCH_PATTERN, row_string)[0] + # Validate function + assert(function == 'SUM') + # Validate value is integer + expected_value = int(value) + return (function, field, expected_value) + return None + +def compute_aggregation(function, field, runtime_profile): + """ + Evaluate an aggregation function over a field on the runtime_profile. This skips + the averaged fragment and returns the aggregate value. It currently supports only + integer values and the SUM function. + """ + start_avg_fragment_re = re.compile('[ ]*Averaged Fragment') + field_regex = "{0}: (\d+)".format(field) + field_regex_re = re.compile(field_regex) + inside_avg_fragment = False + avg_fragment_indent = None + past_avg_fragment = False + match_list = [] + for line in runtime_profile.splitlines(): + # Detect the boundaries of the averaged fragment by looking at indentation. 
+ # The averaged fragment starts with a particular indentation level. All of + # its children are at a greater indent. When the indentation gets back to + # the level of the the averaged fragment start, then the averaged fragment + # is done. + if inside_avg_fragment: + indentation = len(line) - len(line.lstrip()) + if indentation > avg_fragment_indent: + continue + else: + inside_avg_fragment = False + past_avg_fragment = True + + if not past_avg_fragment and start_avg_fragment_re.match(line): + inside_avg_fragment = True + avg_fragment_indent = len(line) - len(line.lstrip()) + continue + + if (field_regex_re.search(line)): + match_list.extend(re.findall(field_regex, line)) + + int_match_list = map(int, match_list) + result = None + if function == 'SUM': + result = sum(int_match_list) + + return result + def verify_runtime_profile(expected, actual): """ Check that lines matching all of the expected runtime profile entries are present in the actual text runtime profile. The check passes if, for each of the expected rows, at least one matching row is present in the actual runtime profile. Rows - with the "row_regex:" prefix are treated as regular expressions. + with the "row_regex:" prefix are treated as regular expressions. Rows with + the "aggregation(function,field): value" syntax specifies an aggregation over + the runtime profile. """ expected_lines = remove_comments(expected).splitlines() matched = [False] * len(expected_lines) expected_regexes = [] + expected_aggregations = [] for expected_line in expected_lines: expected_regexes.append(try_compile_regex(expected_line)) + expected_aggregations.append(try_compile_aggregation(expected_line)) # Check the expected and actual rows pairwise. 
for line in actual.splitlines(): @@ -475,6 +544,9 @@ def verify_runtime_profile(expected, actual): if matched[i]: continue if expected_regexes[i] is not None: match = expected_regexes[i].match(line) + elif expected_aggregations[i] is not None: + # Aggregations are enforced separately + match = True else: match = expected_lines[i].strip() == line.strip() if match: @@ -489,6 +561,15 @@ def verify_runtime_profile(expected, actual): "\nEXPECTED LINES:\n%s\n\nACTUAL PROFILE:\n%s" % ('\n'.join(unmatched_lines), actual)) + # Compute the aggregations and check against values + for i in xrange(len(expected_aggregations)): + if (expected_aggregations[i] is None): continue + function, field, expected_value = expected_aggregations[i] + actual_value = compute_aggregation(function, field, actual) + assert actual_value == expected_value, ("Aggregation of %s over %s did not match " + "expected results.\nEXPECTED VALUE:\n%d\n\nACTUAL VALUE:\n%d" + "\n\nPROFILE:\n%s\n" % (function, field, expected_value, actual_value, actual)) + def get_node_exec_options(profile_string, exec_node_id): """ Return a list with all of the ExecOption strings for the given exec node id. 
""" results = [] http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/tests/query_test/test_mt_dop.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_mt_dop.py b/tests/query_test/test_mt_dop.py index 1096e64..6ba8184 100644 --- a/tests/query_test/test_mt_dop.py +++ b/tests/query_test/test_mt_dop.py @@ -98,18 +98,8 @@ class TestMtDopParquet(ImpalaTestSuite): vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') self.run_test_case('QueryTest/mt-dop-parquet-nested', vector) -# Parquet filtering test rlies on a specific mt_dop value, so keep in its own test -class TestMtDopParquetFiltering(ImpalaTestSuite): - @classmethod - def get_workload(cls): - return 'functional-query' - - @classmethod - def add_test_dimensions(cls): - super(TestMtDopParquetFiltering, cls).add_test_dimensions() - cls.ImpalaTestMatrix.add_constraint( - lambda v: v.get_value('table_format').file_format == 'parquet') - def test_parquet_filtering(self, vector): - vector.get_value('exec_option')['mt_dop'] = 3 - self.run_test_case('QueryTest/mt-dop-parquet-filtering', vector) + """IMPALA-4624: Test that dictionary filtering eliminates row groups correctly.""" + vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') + self.run_test_case('QueryTest/parquet-filtering', vector) + http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index cb24923..8ba2e0b 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -327,10 +327,6 @@ class TestParquet(ImpalaTestSuite): self.run_test_case('QueryTest/parquet-corrupt-rle-counts-abort', vector, unique_database) - def test_filtering(self, vector): - """IMPALA-4624: Test that dictionary filtering eliminates row groups 
correctly.""" - self.run_test_case('QueryTest/parquet-filtering', vector) - @SkipIfS3.hdfs_block_size @SkipIfIsilon.hdfs_block_size @SkipIfLocal.multiple_impalad
