Repository: incubator-impala
Updated Branches:
  refs/heads/master 56e371664 -> 6441ca65b


IMPALA-5039: Fix variability in parquet dictionary filtering test

The dictionary filtering tests check how many row groups are processed
and how many are filtered out by matching text in the runtime profile.
However, the number of row groups processed and filtered by any
individual fragment depends on how the work is split and how many
impalads are running, which makes the test output variable.

To fix this, the test needs a way to aggregate the results across
fragments. This fix introduces the following syntax for specifying
these aggregates:
aggregation(function_name, field_name): expected_value
This searches the runtime profile for lines that contain
'field_name: number'. It skips the averaged fragment, as this is
derived from all the other fragments.
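
For example, a RUNTIME_PROFILE section can assert totals across all
fragments and impalads with lines such as:

  aggregation(SUM, NumRowGroups): 24
  aggregation(SUM, NumDictFilteredRowGroups): 0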

Currently, only SUM is implemented, and the expected_value is
required to be an integer. It should be easy to implement other
interesting functions like COUNT and MIN/MAX. It would also be
possible to extend it to floats.
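
In essence, SUM adds up every matching 'field: value' counter outside
the averaged fragment. A minimal Python sketch of that logic (the
helper name sum_profile_counter is illustrative; it simplifies the new
compute_aggregation() added to test_result_verifier.py):

  import re

  def sum_profile_counter(runtime_profile, field):
    """Sum every 'field: <integer>' occurrence in a text runtime profile,
    skipping the 'Averaged Fragment' subtree (detected via indentation)."""
    field_re = re.compile(r'%s: (\d+)' % re.escape(field))
    avg_start_re = re.compile(r' *Averaged Fragment')
    total = 0
    inside_avg = False
    avg_indent = 0
    for line in runtime_profile.splitlines():
      indent = len(line) - len(line.lstrip())
      if inside_avg:
        if indent > avg_indent:
          continue           # still inside the averaged fragment; ignore it
        inside_avg = False   # back at the starting indent, so the subtree ended
      if avg_start_re.match(line):
        inside_avg = True
        avg_indent = indent
        continue
      m = field_re.search(line)
      if m:
        total += int(m.group(1))
    return total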

Switching the dictionary filtering tests over to this new syntax
eliminates the variability in the tests.

Change-Id: I6b7b84d973b3ac678a24e82900f2637d569158bb
Reviewed-on: http://gerrit.cloudera.org:8080/6301
Tested-by: Impala Public Jenkins
Reviewed-by: Alex Behm <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/6441ca65
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/6441ca65
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/6441ca65

Branch: refs/heads/master
Commit: 6441ca65bda83c23dacfed8a27d944a0dabe6b65
Parents: 56e3716
Author: Joe McDonnell <[email protected]>
Authored: Tue Mar 7 12:20:09 2017 -0800
Committer: Alex Behm <[email protected]>
Committed: Mon Mar 13 17:37:15 2017 +0000

----------------------------------------------------------------------
 .../QueryTest/mt-dop-parquet-filtering.test     | 250 -------------------
 .../queries/QueryTest/parquet-filtering.test    |  96 +++----
 tests/common/test_result_verifier.py            |  83 +++++-
 tests/query_test/test_mt_dop.py                 |  18 +-
 tests/query_test/test_scanners.py               |   4 -
 5 files changed, 134 insertions(+), 317 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/testdata/workloads/functional-query/queries/QueryTest/mt-dop-parquet-filtering.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/mt-dop-parquet-filtering.test b/testdata/workloads/functional-query/queries/QueryTest/mt-dop-parquet-filtering.test
deleted file mode 100644
index 295a1ae..0000000
--- a/testdata/workloads/functional-query/queries/QueryTest/mt-dop-parquet-filtering.test
+++ /dev/null
@@ -1,250 +0,0 @@
-# This tests parquet dictionary filtering. It is mirrored without mt_dop
-# in parquet-filtering.test. Since the two rely on counting
-# the number of row groups filtered, differing parallelism changes
-# the counts seen in the output.
-# TODO: Fix test execution to allow aggregation of these counts
-# so that the results do not depend on the number of threads.
-====
----- QUERY
-# id: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where id < 10000;
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# id: Some values pass
-# Filters 2/3 row groups
-select count(*) from functional_parquet.alltypes where mod(id, 10000) < 20;
----- RESULTS
-20
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 2.*
-====
----- QUERY
-# id: No values pass
-# Filters 3/3 row groups
-select count(*) from functional_parquet.alltypes where mod(id,10000) = 7301;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 3.*
-====
----- QUERY
-# tinyint_col: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where tinyint_col < 10;
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# tinyint_col: No values pass
-# Filters 3/3 row groups
-select count(*) from functional_parquet.alltypes where mod(tinyint_col,50) > 10;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 3.*
-====
----- QUERY
-# smallint_col: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where smallint_col < 10
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# smallint_col: No values pass
-# Filters 3/3 row groups
-select count(*) from functional_parquet.alltypes where mod(smallint_col,50) > 10;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 3.*
-====
----- QUERY
-# int_col: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where int_col < 10
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# int_col: No values pass
-# Filters 3/3 row groups
-select count(*) from functional_parquet.alltypes where mod(int_col, 50) > 10;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 3.*
-====
----- QUERY
-# bigint_col: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where bigint_col < 100
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# bigint_col: No values pass
-# Filters 3/3 row groups
-select count(*) from functional_parquet.alltypes where mod(bigint_col, 500) > 100;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 3.*
-====
----- QUERY
-# float_col: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where float_col < 10
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# float_col: No values pass
-# Filters 3/3 row groups
-select count(*) from functional_parquet.alltypes where mod(float_col, 100) > 10;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 3.*
-====
----- QUERY
-# double_col: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where double_col < 100
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# double_col: No values pass
-# Filters 3/3 row groups
-select count(*) from functional_parquet.alltypes where mod(double_col, 100) > 100;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 3.*
-====
----- QUERY
-# date_string_col: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where date_string_col like '%/%/%';
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# date_string_col: Half of the values pass
-# Filters 1/3 row groups
-select count(*) from functional_parquet.alltypes where date_string_col like '%/10';
----- RESULTS
-3650
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 1.*
-====
----- QUERY
-# date_string_col: No values pass
-# Filters 3/3 row groups
-select count(*) from functional_parquet.alltypes where date_string_col = '01/01/11';
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 3.*
-====
----- QUERY
-# string_col: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where length(string_col) = 1 ;
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# string_col: No values pass
-# Filters 3/3 row groups
-select count(*) from functional_parquet.alltypes where string_col = '10';
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 3.*
-====
----- QUERY
-# timestamp_col: All values pass
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where timestamp_col >= '2009-01-01 00:00:00';
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# timestamp_col: No values pass
-# Note: dictionary filtering currently does not work on timestamps
-# Filters 0/3 row groups
-select count(*) from functional_parquet.alltypes where timestamp_col = '2009-01-01 00:00:01';
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 3.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# TPC-H lineitem.l_orderkey has high cardinality (1500000)
-# It always uses mixed encoding (PLAIN_DICTIONARY + PLAIN)
-# Verify that no dictionary filtering is used even for a predicate
-# that eliminates all rows.
-select count(*) from tpch_parquet.lineitem where l_orderkey = 50;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 1.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
-====
----- QUERY
-# Verify dictionary filtering on top level of a schema with nested
-# data.
-select count(*) from tpch_nested_parquet.customer where c_mktsegment = 'COMEDY';
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 1.*
-row_regex: .*NumDictFilteredRowGroups: 1.*
-====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test
index 189c226..932ba60 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-filtering.test
@@ -10,8 +10,8 @@ select count(*) from functional_parquet.alltypes where id < 10000;
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # id: Some values pass
@@ -20,8 +20,8 @@ select count(*) from functional_parquet.alltypes where mod(id, 10000) < 20;
 ---- RESULTS
 20
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 7.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 23
 ====
 ---- QUERY
 # id: No values pass
@@ -30,8 +30,8 @@ select count(*) from functional_parquet.alltypes where mod(id,10000) = 7301;
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 8.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 24
 ====
 ---- QUERY
 # tinyint_col: All values pass
@@ -40,8 +40,8 @@ select count(*) from functional_parquet.alltypes where tinyint_col < 10;
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # tinyint_col: No values pass
@@ -50,8 +50,8 @@ select count(*) from functional_parquet.alltypes where mod(tinyint_col,50) > 10;
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 8.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 24
 ====
 ---- QUERY
 # smallint_col: All values pass
@@ -60,8 +60,8 @@ select count(*) from functional_parquet.alltypes where smallint_col < 10
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # smallint_col: No values pass
@@ -70,8 +70,8 @@ select count(*) from functional_parquet.alltypes where mod(smallint_col,50) > 10
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 8.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 24
 ====
 ---- QUERY
 # int_col: All values pass
@@ -80,8 +80,8 @@ select count(*) from functional_parquet.alltypes where int_col < 10
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # int_col: No values pass
@@ -90,8 +90,8 @@ select count(*) from functional_parquet.alltypes where mod(int_col, 50) > 10;
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 8.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 24
 ====
 ---- QUERY
 # bigint_col: All values pass
@@ -100,8 +100,8 @@ select count(*) from functional_parquet.alltypes where bigint_col < 100
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # bigint_col: No values pass
@@ -110,8 +110,8 @@ select count(*) from functional_parquet.alltypes where mod(bigint_col, 500) > 10
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 8.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 24
 ====
 ---- QUERY
 # float_col: All values pass
@@ -120,8 +120,8 @@ select count(*) from functional_parquet.alltypes where float_col < 10
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # float_col: No values pass
@@ -130,8 +130,8 @@ select count(*) from functional_parquet.alltypes where mod(float_col, 100) > 10;
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 8.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 24
 ====
 ---- QUERY
 # double_col: All values pass
@@ -140,8 +140,8 @@ select count(*) from functional_parquet.alltypes where double_col < 100
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # double_col: No values pass
@@ -150,8 +150,8 @@ select count(*) from functional_parquet.alltypes where mod(double_col, 100) > 10
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 8.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 24
 ====
 ---- QUERY
 # date_string_col: All values pass
@@ -160,8 +160,8 @@ select count(*) from functional_parquet.alltypes where date_string_col like '%/%
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # date_string_col: Half of the values pass
@@ -170,8 +170,8 @@ select count(*) from functional_parquet.alltypes where date_string_col like '%/1
 ---- RESULTS
 3650
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 4.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 12
 ====
 ---- QUERY
 # date_string_col: No values pass
@@ -180,8 +180,8 @@ select count(*) from functional_parquet.alltypes where date_string_col = '01/01/
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 8.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 24
 ====
 ---- QUERY
 # string_col: All values pass
@@ -190,8 +190,8 @@ select count(*) from functional_parquet.alltypes where length(string_col) = 1 ;
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # string_col: No values pass
@@ -200,8 +200,8 @@ select count(*) from functional_parquet.alltypes where string_col = '10';
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 8.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 24
 ====
 ---- QUERY
 # timestamp_col: All values pass
@@ -210,8 +210,8 @@ select count(*) from functional_parquet.alltypes where timestamp_col >= '2009-01
 ---- RESULTS
 7300
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # timestamp_col: No values pass
@@ -221,8 +221,8 @@ select count(*) from functional_parquet.alltypes where timestamp_col = '2009-01-
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 8.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 24
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # TPC-H lineitem.l_orderkey has high cardinality (1500000)
@@ -233,8 +233,8 @@ select count(*) from tpch_parquet.lineitem where l_orderkey = 50;
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 1.*
-row_regex: .*NumDictFilteredRowGroups: 0.*
+aggregation(SUM, NumRowGroups): 3
+aggregation(SUM, NumDictFilteredRowGroups): 0
 ====
 ---- QUERY
 # Verify dictionary filtering on top level of a schema with nested
@@ -243,6 +243,6 @@ select count(*) from tpch_nested_parquet.customer where c_mktsegment = 'COMEDY';
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE
-row_regex: .*NumRowGroups: 1.*
-row_regex: .*NumDictFilteredRowGroups: 1.*
+aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumDictFilteredRowGroups): 4
 ====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/tests/common/test_result_verifier.py
----------------------------------------------------------------------
diff --git a/tests/common/test_result_verifier.py b/tests/common/test_result_verifier.py
index c7f890a..a816eaf 100644
--- a/tests/common/test_result_verifier.py
+++ b/tests/common/test_result_verifier.py
@@ -456,18 +456,87 @@ def parse_result_rows(exec_result):
     result.append(','.join(new_cols))
   return result
 
+# Special syntax for basic aggregation over fields in the runtime profile.
+# The syntax is:
+# aggregation(function, field_name): expected_value
+# Currently, the only implemented function is SUM and only integers are supported.
+AGGREGATION_PREFIX_PATTERN = 'aggregation\('
+AGGREGATION_PREFIX = re.compile(AGGREGATION_PREFIX_PATTERN)
+AGGREGATION_SYNTAX_MATCH_PATTERN = 'aggregation\((\w+)[ ]*,[ ]*(\w+)\):[ ]*(\d+)'
+
+def try_compile_aggregation(row_string):
+  """
+  Check to see if this row string specifies an aggregation. If the row string contains
+  an aggregation, it returns a tuple with all the information for evaluating the
+  aggregation. Otherwise, it returns None.
+  """
+  if row_string and AGGREGATION_PREFIX.match(row_string):
+    function, field, value = re.findall(AGGREGATION_SYNTAX_MATCH_PATTERN, row_string)[0]
+    # Validate function
+    assert(function == 'SUM')
+    # Validate value is integer
+    expected_value = int(value)
+    return (function, field, expected_value)
+  return None
+
+def compute_aggregation(function, field, runtime_profile):
+  """
+  Evaluate an aggregation function over a field on the runtime_profile. This skips
+  the averaged fragment and returns the aggregate value. It currently supports only
+  integer values and the SUM function.
+  """
+  start_avg_fragment_re = re.compile('[ ]*Averaged Fragment')
+  field_regex = "{0}: (\d+)".format(field)
+  field_regex_re = re.compile(field_regex)
+  inside_avg_fragment = False
+  avg_fragment_indent = None
+  past_avg_fragment = False
+  match_list = []
+  for line in runtime_profile.splitlines():
+    # Detect the boundaries of the averaged fragment by looking at indentation.
+    # The averaged fragment starts with a particular indentation level. All of
+    # its children are at a greater indent. When the indentation gets back to
+    # the level of the averaged fragment start, then the averaged fragment
+    # is done.
+    if inside_avg_fragment:
+      indentation = len(line) - len(line.lstrip())
+      if indentation > avg_fragment_indent:
+        continue
+      else:
+        inside_avg_fragment = False
+        past_avg_fragment = True
+
+    if not past_avg_fragment and start_avg_fragment_re.match(line):
+      inside_avg_fragment = True
+      avg_fragment_indent = len(line) - len(line.lstrip())
+      continue
+
+    if (field_regex_re.search(line)):
+      match_list.extend(re.findall(field_regex, line))
+
+  int_match_list = map(int, match_list)
+  result = None
+  if function == 'SUM':
+    result = sum(int_match_list)
+
+  return result
+
 def verify_runtime_profile(expected, actual):
   """
   Check that lines matching all of the expected runtime profile entries are present
   in the actual text runtime profile. The check passes if, for each of the expected
   rows, at least one matching row is present in the actual runtime profile. Rows
-  with the "row_regex:" prefix are treated as regular expressions.
+  with the "row_regex:" prefix are treated as regular expressions. Rows with
+  the "aggregation(function,field): value" syntax specify an aggregation over
+  the runtime profile.
   """
   expected_lines = remove_comments(expected).splitlines()
   matched = [False] * len(expected_lines)
   expected_regexes = []
+  expected_aggregations = []
   for expected_line in expected_lines:
     expected_regexes.append(try_compile_regex(expected_line))
+    expected_aggregations.append(try_compile_aggregation(expected_line))
 
   # Check the expected and actual rows pairwise.
   for line in actual.splitlines():
@@ -475,6 +544,9 @@ def verify_runtime_profile(expected, actual):
       if matched[i]: continue
       if expected_regexes[i] is not None:
         match = expected_regexes[i].match(line)
+      elif expected_aggregations[i] is not None:
+        # Aggregations are enforced separately
+        match = True
       else:
         match = expected_lines[i].strip() == line.strip()
       if match:
@@ -489,6 +561,15 @@ def verify_runtime_profile(expected, actual):
      "\nEXPECTED LINES:\n%s\n\nACTUAL PROFILE:\n%s" % ('\n'.join(unmatched_lines),
         actual))
 
+  # Compute the aggregations and check against values
+  for i in xrange(len(expected_aggregations)):
+    if (expected_aggregations[i] is None): continue
+    function, field, expected_value = expected_aggregations[i]
+    actual_value = compute_aggregation(function, field, actual)
+    assert actual_value == expected_value, ("Aggregation of %s over %s did not match "
+        "expected results.\nEXPECTED VALUE:\n%d\n\nACTUAL VALUE:\n%d"
+        "\n\nPROFILE:\n%s\n" % (function, field, expected_value, actual_value, actual))
+
 def get_node_exec_options(profile_string, exec_node_id):
   """ Return a list with all of the ExecOption strings for the given exec node id. """
   results = []

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/tests/query_test/test_mt_dop.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_mt_dop.py b/tests/query_test/test_mt_dop.py
index 1096e64..6ba8184 100644
--- a/tests/query_test/test_mt_dop.py
+++ b/tests/query_test/test_mt_dop.py
@@ -98,18 +98,8 @@ class TestMtDopParquet(ImpalaTestSuite):
     vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
     self.run_test_case('QueryTest/mt-dop-parquet-nested', vector)
 
-# Parquet filtering test rlies on a specific mt_dop value, so keep in its own test
-class TestMtDopParquetFiltering(ImpalaTestSuite):
-  @classmethod
-  def get_workload(cls):
-    return 'functional-query'
-
-  @classmethod
-  def add_test_dimensions(cls):
-    super(TestMtDopParquetFiltering, cls).add_test_dimensions()
-    cls.ImpalaTestMatrix.add_constraint(
-      lambda v: v.get_value('table_format').file_format == 'parquet')
-
   def test_parquet_filtering(self, vector):
-    vector.get_value('exec_option')['mt_dop'] = 3
-    self.run_test_case('QueryTest/mt-dop-parquet-filtering', vector)
+    """IMPALA-4624: Test that dictionary filtering eliminates row groups correctly."""
+    vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
+    self.run_test_case('QueryTest/parquet-filtering', vector)
+

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6441ca65/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index cb24923..8ba2e0b 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -327,10 +327,6 @@ class TestParquet(ImpalaTestSuite):
     self.run_test_case('QueryTest/parquet-corrupt-rle-counts-abort',
                        vector, unique_database)
 
-  def test_filtering(self, vector):
-    """IMPALA-4624: Test that dictionary filtering eliminates row groups correctly."""
-    self.run_test_case('QueryTest/parquet-filtering', vector)
-
   @SkipIfS3.hdfs_block_size
   @SkipIfIsilon.hdfs_block_size
   @SkipIfLocal.multiple_impalad
