IMPALA-2717: fix output of formatted unicode to non-TTY

The bug is that PrettyOutputFormatter.format() returned a unicode
object, and Python cannot automatically write unicode objects to
output streams that have no default encoding (e.g. a non-TTY stdout).

The fix is to encode the result as UTF-8 into a regular (byte) string,
which can be written to any output device. This makes the return type
consistent with DelimitedOutputFormatter.format().

Based on code by Marcell Szabo.

Testing:
Added basic tests for pretty-printed output and for the fallback to
tab-delimited output.

Manually checked in an interactive shell that unicode
characters still work in interactive mode.

Change-Id: I9de641ecf767a2feef3b9f48b344ef2d55e17a7f
Reviewed-on: http://gerrit.cloudera.org:8080/9928
Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/318051cc
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/318051cc
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/318051cc

Branch: refs/heads/master
Commit: 318051cc21cc7fbe96886e30b3f13b90bbb7b50a
Parents: 75b612a
Author: Tim Armstrong <tarmstr...@cloudera.com>
Authored: Wed Apr 4 11:51:51 2018 -0700
Committer: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
Committed: Thu Apr 12 20:34:47 2018 +0000

----------------------------------------------------------------------
 shell/impala_shell.py                 |  4 +++-
 shell/shell_output.py                 |  8 +++++++-
 tests/shell/test_shell_commandline.py | 25 +++++++++++++++++++++----
 3 files changed, 31 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/318051cc/shell/impala_shell.py
----------------------------------------------------------------------
diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index 93bdafb..55ea692 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -70,7 +70,9 @@ class CmdStatus:
   ERROR = False
 
 class ImpalaPrettyTable(prettytable.PrettyTable):
-  """Patched version of PrettyTable that TODO"""
+  """Patched version of PrettyTable with different unicode handling - instead 
of throwing
+  exceptions when a character can't be converted to unicode, it is replaced 
with a
+  placeholder character."""
   def _unicode(self, value):
     if not isinstance(value, basestring):
       value = str(value)

http://git-wip-us.apache.org/repos/asf/impala/blob/318051cc/shell/shell_output.py
----------------------------------------------------------------------
diff --git a/shell/shell_output.py b/shell/shell_output.py
index f0cecc8..8ab3bee 100644
--- a/shell/shell_output.py
+++ b/shell/shell_output.py
@@ -28,11 +28,16 @@ class PrettyOutputFormatter(object):
     self.prettytable = prettytable
 
   def format(self, rows):
+    """Returns string containing UTF-8-encoded representation of the table 
data."""
     # Clear rows that already exist in the table.
     self.prettytable.clear_rows()
     try:
       map(self.prettytable.add_row, rows)
-      return self.prettytable.get_string()
+      # PrettyTable.get_string() converts UTF-8-encoded strs added via add_row() into
+      # Python unicode strings. We need to convert it back to a UTF-8-encoded str for
+      # output, since Python won't do the encoding automatically when outputting to a
+      # non-terminal (see IMPALA-2717).
+      return self.prettytable.get_string().encode('utf-8')
     except Exception, e:
       # beeswax returns each row as a tab separated string. If a string column
       # value in a row has tabs, it will break the row split. Default to displaying
@@ -53,6 +58,7 @@ class DelimitedOutputFormatter(object):
         raise ValueError, error_msg
 
   def format(self, rows):
+    """Returns string containing UTF-8-encoded representation of the table 
data."""
     # csv.writer expects a file handle to the input.
     # cStringIO is used as the temporary buffer.
     temp_buffer = StringIO()

http://git-wip-us.apache.org/repos/asf/impala/blob/318051cc/tests/shell/test_shell_commandline.py
----------------------------------------------------------------------
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index e69c512..6aa05f6 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -33,6 +33,8 @@ from util import assert_var_substitution, run_impala_shell_cmd, ImpalaShell
 DEFAULT_QUERY = 'select 1'
 QUERY_FILE_PATH = os.path.join(os.environ['IMPALA_HOME'], 'tests', 'shell')
 
+RUSSIAN_CHARS = (u"А, Б, В, Г, Д, Е, Ё, Ж, З, И, Й, К, Л, М, Н, 
О, П, Р,"
+                 u"С, Т, У, Ф, Х, Ц,Ч, Ш, Щ, Ъ, Ы, Ь, Э, Ю, Я")
 
 @pytest.fixture
 def empty_table(unique_database, request):
@@ -405,12 +407,27 @@ class TestImpalaShell(ImpalaTestSuite):
 
   def test_international_characters(self):
     """Sanity test to ensure that the shell can read international 
characters."""
-    russian_chars = (u"А, Б, В, Г, Д, Е, Ё, Ж, З, И, Й, К, Л, М, 
Н, О, П, Р,"
-                     u"С, Т, У, Ф, Х, Ц,Ч, Ш, Щ, Ъ, Ы, Ь, Э, Ю, 
Я")
-    args = """-B -q "select '%s'" """ % russian_chars
+    args = """-B -q "select '%s'" """ % RUSSIAN_CHARS
     result = run_impala_shell_cmd(args.encode('utf-8'))
     assert 'UnicodeDecodeError' not in result.stderr
-    assert russian_chars.encode('utf-8') in result.stdout
+    assert RUSSIAN_CHARS.encode('utf-8') in result.stdout
+
+  def test_international_characters_prettyprint(self):
+    """IMPALA-2717: ensure we can handle international characters in 
pretty-printed
+    output"""
+    args = """-q "select '%s'" """ % RUSSIAN_CHARS
+    result = run_impala_shell_cmd(args.encode('utf-8'))
+    assert 'UnicodeDecodeError' not in result.stderr
+    assert RUSSIAN_CHARS.encode('utf-8') in result.stdout
+
+  def test_international_characters_prettyprint_tabs(self):
+    """IMPALA-2717: ensure we can handle international characters in 
pretty-printed
+    output when pretty-printing falls back to delimited output."""
+    args = """-q "select '%s\\t'" """ % RUSSIAN_CHARS
+    result = run_impala_shell_cmd(args.encode('utf-8'))
+    assert 'Reverting to tab delimited text' in result.stderr
+    assert 'UnicodeDecodeError' not in result.stderr
+    assert RUSSIAN_CHARS.encode('utf-8') in result.stdout
 
   @pytest.mark.execute_serially  # This tests invalidates metadata, and must 
run serially
   def test_config_file(self):

Reply via email to