IMPALA-2717: fix output of formatted unicode to non-TTY. The bug is that PrettyOutputFormatter.format() returned a unicode object, and Python cannot automatically write unicode objects to output streams that have no default encoding (such as a non-terminal).
The fix is to encode the result as UTF-8 in a regular string, which can be written to any output device. This makes the output type consistent with DelimitedOutputFormatter.format(). Based on code by Marcell Szabo. Testing: Added a basic test. Played around in an interactive shell to make sure that unicode characters still work in interactive mode. Change-Id: I9de641ecf767a2feef3b9f48b344ef2d55e17a7f Reviewed-on: http://gerrit.cloudera.org:8080/9928 Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/371107ab Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/371107ab Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/371107ab Branch: refs/heads/2.x Commit: 371107abd66fd9f519c49559a850d10245958a2d Parents: 1c896ef Author: Tim Armstrong <tarmstr...@cloudera.com> Authored: Wed Apr 4 11:51:51 2018 -0700 Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org> Committed: Fri Apr 13 03:26:26 2018 +0000 ---------------------------------------------------------------------- shell/impala_shell.py | 4 +++- shell/shell_output.py | 8 +++++++- tests/shell/test_shell_commandline.py | 25 +++++++++++++++++++++---- 3 files changed, 31 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/371107ab/shell/impala_shell.py ---------------------------------------------------------------------- diff --git a/shell/impala_shell.py b/shell/impala_shell.py index 67391bf..a2dd89f 100755 --- a/shell/impala_shell.py +++ b/shell/impala_shell.py @@ -70,7 +70,9 @@ class CmdStatus: ERROR = False class ImpalaPrettyTable(prettytable.PrettyTable): - """Patched version of PrettyTable that TODO""" + """Patched version of PrettyTable with different unicode handling - instead of throwing + 
exceptions when a character can't be converted to unicode, it is replaced with a + placeholder character.""" def _unicode(self, value): if not isinstance(value, basestring): value = str(value) http://git-wip-us.apache.org/repos/asf/impala/blob/371107ab/shell/shell_output.py ---------------------------------------------------------------------- diff --git a/shell/shell_output.py b/shell/shell_output.py index f0cecc8..8ab3bee 100644 --- a/shell/shell_output.py +++ b/shell/shell_output.py @@ -28,11 +28,16 @@ class PrettyOutputFormatter(object): self.prettytable = prettytable def format(self, rows): + """Returns string containing UTF-8-encoded representation of the table data.""" # Clear rows that already exist in the table. self.prettytable.clear_rows() try: map(self.prettytable.add_row, rows) - return self.prettytable.get_string() + # PrettyTable.get_string() converts UTF-8-encoded strs added via add_row() into + # Python unicode strings. We need to convert it back to a UTF-8-encoded str for + # output, since Python won't do the encoding automatically when outputting to a + # non-terminal (see IMPALA-2717). + return self.prettytable.get_string().encode('utf-8') except Exception, e: # beeswax returns each row as a tab separated string. If a string column # value in a row has tabs, it will break the row split. Default to displaying @@ -53,6 +58,7 @@ class DelimitedOutputFormatter(object): raise ValueError, error_msg def format(self, rows): + """Returns string containing UTF-8-encoded representation of the table data.""" # csv.writer expects a file handle to the input. # cStringIO is used as the temporary buffer. 
temp_buffer = StringIO() http://git-wip-us.apache.org/repos/asf/impala/blob/371107ab/tests/shell/test_shell_commandline.py ---------------------------------------------------------------------- diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py index f5f67c0..10513b6 100644 --- a/tests/shell/test_shell_commandline.py +++ b/tests/shell/test_shell_commandline.py @@ -33,6 +33,8 @@ from util import assert_var_substitution, run_impala_shell_cmd, ImpalaShell DEFAULT_QUERY = 'select 1' QUERY_FILE_PATH = os.path.join(os.environ['IMPALA_HOME'], 'tests', 'shell') +RUSSIAN_CHARS = (u"А, Б, В, Г, Д, Е, Ё, Ж, З, И, Й, К, Л, М, Н, О, П, Р," + u"С, Т, У, Ф, Х, Ц,Ч, Ш, Щ, Ъ, Ы, Ь, Э, Ю, Я") @pytest.fixture def empty_table(unique_database, request): @@ -406,12 +408,27 @@ class TestImpalaShell(ImpalaTestSuite): def test_international_characters(self): """Sanity test to ensure that the shell can read international characters.""" - russian_chars = (u"А, Б, В, Г, Д, Е, Ё, Ж, З, И, Й, К, Л, М, Н, О, П, Р," - u"С, Т, У, Ф, Х, Ц,Ч, Ш, Щ, Ъ, Ы, Ь, Э, Ю, Я") - args = """-B -q "select '%s'" """ % russian_chars + args = """-B -q "select '%s'" """ % RUSSIAN_CHARS result = run_impala_shell_cmd(args.encode('utf-8')) assert 'UnicodeDecodeError' not in result.stderr - assert russian_chars.encode('utf-8') in result.stdout + assert RUSSIAN_CHARS.encode('utf-8') in result.stdout + + def test_international_characters_prettyprint(self): + """IMPALA-2717: ensure we can handle international characters in pretty-printed + output""" + args = """-q "select '%s'" """ % RUSSIAN_CHARS + result = run_impala_shell_cmd(args.encode('utf-8')) + assert 'UnicodeDecodeError' not in result.stderr + assert RUSSIAN_CHARS.encode('utf-8') in result.stdout + + def test_international_characters_prettyprint_tabs(self): + """IMPALA-2717: ensure we can handle international characters in pretty-printed + output when pretty-printing falls back to delimited output.""" + args = """-q 
"select '%s\\t'" """ % RUSSIAN_CHARS + result = run_impala_shell_cmd(args.encode('utf-8')) + assert 'Reverting to tab delimited text' in result.stderr + assert 'UnicodeDecodeError' not in result.stderr + assert RUSSIAN_CHARS.encode('utf-8') in result.stdout @pytest.mark.execute_serially # This tests invalidates metadata, and must run serially def test_config_file(self):