This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new e7839c4 IMPALA-10416: Add raw string mode for testfiles to verify
non-ascii results
e7839c4 is described below
commit e7839c4530df7161240eac9852c87a4c37c53fd1
Author: stiga-huang <[email protected]>
AuthorDate: Mon Jan 4 11:00:09 2021 +0800
IMPALA-10416: Add raw string mode for testfiles to verify non-ascii results
Currently, the result section of the testfile is required to use
escaped strings. Take the following result section as an example:
---- RESULTS
'Alice\nBob'
'Alice\\nBob'
The first line is a string with a newline character. The second line is
a string with a '\' and an 'n' character. When comparing with the actual
query results, we need to escape the special characters in the actual
results, e.g. replace newline characters with '\n'. This is done by
invoking encode('unicode_escape') on the actual result strings. However,
the input type of this method is unicode instead of str. When calling it
on str vars, Python will implicitly convert the input vars to unicode
type. The default encoding, ascii, is used. This causes
UnicodeDecodeError when the str contains non-ascii bytes. To fix this,
this patch explicitly decodes the input str using 'utf-8' encoding.
After fixing the logic of escaping the actual result strings, the next
problem is that it's painful to write unicode-escaped expected results.
Here is an example:
---- QUERY
select "你好\n你好"
---- RESULTS
'\u4f60\u597d\n\u4f60\u597d'
---- TYPES
STRING
It's painful to manually translate the unicode characters.
This patch adds a new comment, RAW_STRING, for the result section to use
raw strings instead of unicode-escaped strings. Here is an example:
---- QUERY
select "你好"
---- RESULTS: RAW_STRING
'你好'
---- TYPES
STRING
If the result contains special characters, it's recommended to use the
default string mode. If the special characters only contain newline
characters, we can use RAW_STRING and the existing MULTI_LINE comment
together.
This patch also fixes the issue that pytest fails to report assertion
failures if any of the compared str values contain non-ascii bytes
(IMPALA-10419). However, pytest works if the compared values are both
in unicode type. So we explicitly convert the actual and expected str
values to unicode type.
Test:
- Add tests in special-strings.test for raw string mode and the escaped
string mode (default).
- Run test_exprs.py::TestExprs::test_special_strings locally.
Change-Id: I7cc2ea3e5849bd3d973f0cb91322633bcc0ffa4b
Reviewed-on: http://gerrit.cloudera.org:8080/16919
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
.../queries/QueryTest/special-strings.test | 82 ++++++++++++++++++++++
tests/common/test_result_verifier.py | 59 +++++++++++-----
tests/util/test_file_parser.py | 2 +
3 files changed, 125 insertions(+), 18 deletions(-)
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/special-strings.test
b/testdata/workloads/functional-query/queries/QueryTest/special-strings.test
index 99a694c..495682d 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/special-strings.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/special-strings.test
@@ -24,3 +24,85 @@ select "'"
---- TYPES
STRING
====
+---- QUERY
+select "你好"
+---- RESULTS
+'\u4f60\u597d'
+---- TYPES
+STRING
+====
+---- QUERY
+select "你好"
+---- RESULTS: RAW_STRING
+'你好'
+---- TYPES
+STRING
+====
+---- QUERY
+select "你好\n你好"
+---- RESULTS
+'\u4f60\u597d\n\u4f60\u597d'
+---- TYPES
+STRING
+====
+---- QUERY
+select "你好\n你好"
+---- RESULTS: RAW_STRING,MULTI_LINE
+['你好
+你好']
+---- TYPES
+STRING
+====
+---- QUERY
+select "你好\\n你好"
+---- RESULTS
+'\u4f60\u597d\\n\u4f60\u597d'
+---- TYPES
+STRING
+====
+---- QUERY
+select "你好\\n你好"
+---- RESULTS: RAW_STRING
+'你好\n你好'
+---- TYPES
+STRING
+====
+---- QUERY
+select "你好", "Halló", "여보세요"
+---- RESULTS
+'\u4f60\u597d','Hall\xf3','\uc5ec\ubcf4\uc138\uc694'
+---- TYPES
+STRING,STRING,STRING
+====
+---- QUERY
+select "你好", "Halló", "여보세요"
+---- RESULTS: RAW_STRING
+'你好','Halló','여보세요'
+---- TYPES
+STRING,STRING,STRING
+====
+---- QUERY
+values ("你好"), ("Halló"), ("여보세요")
+---- RESULTS: RAW_STRING,VERIFY_IS_SUBSET
+'你好'
+'여보세요'
+---- TYPES
+STRING
+====
+---- QUERY
+values ("你好"), ("Halló"), ("여보세요")
+---- RESULTS: RAW_STRING,VERIFY_IS_SUPERSET
+'你好'
+'여보세요'
+'Halló'
+'hello'
+---- TYPES
+STRING
+====
+---- QUERY
+values ("你好"), ("Halló"), ("여보세요")
+---- RESULTS: RAW_STRING,VERIFY_IS_NOT_IN
+'hello'
+---- TYPES
+STRING
+====
diff --git a/tests/common/test_result_verifier.py
b/tests/common/test_result_verifier.py
index 22ad1fc..124fa86 100644
--- a/tests/common/test_result_verifier.py
+++ b/tests/common/test_result_verifier.py
@@ -240,8 +240,8 @@ def verify_query_result_is_subset(expected_results,
actual_results):
"""Check whether the results in expected_results are a subset of the results
in
actual_results. This uses set semantics, i.e. any duplicates are ignored."""
expected_literals, expected_non_literals = expected_results.separate_rows()
- expected_literal_strings = set([str(row) for row in expected_literals])
- actual_literal_strings = set([str(row) for row in actual_results.rows])
+ expected_literal_strings = set([unicode(row) for row in expected_literals])
+ actual_literal_strings = set([unicode(row) for row in actual_results.rows])
# Expected literal strings must all be present in the actual strings.
assert expected_literal_strings <= actual_literal_strings
# Expected patterns must be present in the actual strings.
@@ -251,18 +251,18 @@ def verify_query_result_is_subset(expected_results,
actual_results):
if actual_row == expected_row:
matched = True
break
- assert matched, "Could not find expected row {0} in actual
rows:\n{1}".format(
- str(expected_row), str(actual_results))
+ assert matched, u"Could not find expected row {0} in actual
rows:\n{1}".format(
+ unicode(expected_row), unicode(actual_results))
def verify_query_result_is_superset(expected_results, actual_results):
"""Check whether the results in expected_results are a superset of the
results in
actual_results. This uses set semantics, i.e. any duplicates are ignored."""
expected_literals, expected_non_literals = expected_results.separate_rows()
- expected_literal_strings = set([str(row) for row in expected_literals])
+ expected_literal_strings = set([unicode(row) for row in expected_literals])
# Check that all actual rows are present in either expected_literal_strings
or
# expected_non_literals.
for actual_row in actual_results.rows:
- if str(actual_row) in expected_literal_strings:
+ if unicode(actual_row) in expected_literal_strings:
# Matched to a literal string
continue
matched = False
@@ -270,8 +270,8 @@ def verify_query_result_is_superset(expected_results,
actual_results):
if actual_row == expected_row:
matched = True
break
- assert matched, "Could not find actual row {0} in expected
rows:\n{1}".format(
- str(actual_row), str(expected_results))
+ assert matched, u"Could not find actual row {0} in expected
rows:\n{1}".format(
+ unicode(actual_row), unicode(expected_results))
def verify_query_result_is_equal(expected_results, actual_results):
assert_args_not_none(expected_results, actual_results)
@@ -279,8 +279,8 @@ def verify_query_result_is_equal(expected_results,
actual_results):
def verify_query_result_is_not_in(expected_results, actual_results):
assert_args_not_none(expected_results, actual_results)
- expected_set = set(map(str, expected_results.rows))
- actual_set = set(map(str, actual_results.rows))
+ expected_set = set(map(unicode, expected_results.rows))
+ actual_set = set(map(unicode, actual_results.rows))
assert expected_set.isdisjoint(actual_set)
# Global dictionary that maps the verification type to appropriate verifier.
@@ -357,6 +357,15 @@ def verify_raw_results(test_section, exec_result,
file_format, result_section,
expected_results = None
if result_section in test_section:
expected_results = remove_comments(test_section[result_section])
+ if isinstance(expected_results, str):
+ # Always convert 'str' to 'unicode' since pytest will fail to report
assertion
+ # failures when any 'str' values contain non-ascii bytes (IMPALA-10419).
+ try:
+ expected_results = expected_results.decode('utf-8')
+ except UnicodeDecodeError as e:
+ LOG.info("Illegal UTF-8 characters in expected results:
{0}\n{1}".format(
+ expected_results, e))
+ assert False
else:
assert 'ERRORS' not in test_section, "'ERRORS' section must have
accompanying 'RESULTS' section"
LOG.info("No results found. Skipping verification")
@@ -442,15 +451,19 @@ def verify_raw_results(test_section, exec_result,
file_format, result_section,
if verifier and verifier.upper() == 'VERIFY_IS_EQUAL_SORTED':
order_matters = False
expected_results_list = []
+ is_raw_string = 'RAW_STRING' in test_section
if 'MULTI_LINE' in test_section:
- expected_results_list = map(lambda s: s.replace('\n', '\\n'),
- re.findall(r'\[(.*?)\]', expected_results, flags=re.DOTALL))
+ expected_results_list = re.findall(r'\[(.*?)\]', expected_results,
flags=re.DOTALL)
+ if not is_raw_string:
+ # Needs escaping
+ expected_results_list = map(lambda s: s.replace('\n', '\\n'),
expected_results_list)
else:
expected_results_list = split_section_lines(expected_results)
expected = QueryTestResult(expected_results_list, expected_types,
actual_labels, order_matters)
- actual = QueryTestResult(parse_result_rows(exec_result), actual_types,
- actual_labels, order_matters)
+ actual = QueryTestResult(
+ parse_result_rows(exec_result, escape_strings=(not is_raw_string)),
+ actual_types, actual_labels, order_matters)
assert verifier in VERIFIER_MAP.keys(), "Unknown verifier: " + verifier
try:
VERIFIER_MAP[verifier](expected, actual)
@@ -470,7 +483,8 @@ def create_query_result(exec_result, order_matters=False):
return QueryTestResult(data, exec_result.column_types,
exec_result.column_labels,
order_matters)
-def parse_result_rows(exec_result):
+
+def parse_result_rows(exec_result, escape_strings=True):
"""
Parses a query result set and transforms it to the format used by the query
test files
"""
@@ -490,9 +504,18 @@ def parse_result_rows(exec_result):
new_cols = list()
for i in xrange(len(cols)):
if col_types[i] in ['STRING', 'CHAR', 'VARCHAR']:
- col = cols[i].encode('unicode_escape')
- # Escape single quotes to match .test file format.
- col = col.replace("'", "''")
+ col = cols[i]
+ if isinstance(col, str):
+ try:
+ col = col.decode('utf-8')
+ except UnicodeDecodeError as e:
+ LOG.info("Illegal UTF-8 characters in actual results:
{0}\n{1}".format(
+ col, e))
+ assert False
+ if escape_strings:
+ col = col.encode('unicode_escape').decode('utf-8')
+ # Escape single quotes to match .test file format.
+ col = col.replace("'", "''")
new_cols.append("'%s'" % col)
else:
new_cols.append(cols[i])
diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py
index 095312f..b4d4a17 100644
--- a/tests/util/test_file_parser.py
+++ b/tests/util/test_file_parser.py
@@ -215,6 +215,8 @@ def parse_test_file_text(text, valid_section_names,
skip_unknown_sections=True):
for comment in subsection_comment.split(','):
if comment == 'MULTI_LINE':
parsed_sections['MULTI_LINE'] = comment
+ elif comment == 'RAW_STRING':
+ parsed_sections['RAW_STRING'] = comment
elif comment.startswith('VERIFY'):
parsed_sections['VERIFIER'] = comment
else: