Repository: impala Updated Branches: refs/heads/master 70e2d57fc -> 2a40e8f2a
IMPALA-7259: Improve Impala shell performance This patch fixes the slow performance in Impala shell, especially for large queries by replacing all calls to sqlparse.format(sql_string, strip_comments=True) with the custom implementation of strip comments that does not use grouping. The code to strip leading comments was also refactored to not use grouping. * Benchmark running a query with 12K columns * Before the patch: $ time impala-shell.sh -f large.sql --quiet real 2m4.154s user 2m0.536s sys 0m0.088s After the patch: $ time impala-shell.sh -f large.sql --quiet real 0m3.885s user 0m1.516s sys 0m0.048s Testing: - Added a new test to test the Impala shell performance - Ran all shell tests on Python 2.6 and Python 2.7 Change-Id: Idac9f3caed7c44846a8c922dbe5ca3bf3b095b81 Reviewed-on: http://gerrit.cloudera.org:8080/10939 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1d491d64 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1d491d64 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1d491d64 Branch: refs/heads/master Commit: 1d491d64803908ece376df2d445c7a7f6d5920c8 Parents: 70e2d57 Author: Fredy Wijaya <[email protected]> Authored: Fri Jul 13 03:23:37 2018 -0500 Committer: Fredy Wijaya <[email protected]> Committed: Thu Jul 19 15:32:46 2018 +0000 ---------------------------------------------------------------------- shell/impala_shell.py | 47 +++++++++++++++++++++--------- tests/shell/test_shell_commandline.py | 38 +++++++++++++++++++++++- 2 files changed, 70 insertions(+), 15 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/1d491d64/shell/impala_shell.py ---------------------------------------------------------------------- diff --git a/shell/impala_shell.py b/shell/impala_shell.py index aea8350..95ff8ab 100755 --- a/shell/impala_shell.py +++ b/shell/impala_shell.py @@ -59,6 +59,18 @@ try: except Exception: pass +def strip_comments(sql): + """sqlparse default implementation of strip comments has a bad performance when parsing + very large SQL due to the grouping. This is because the default implementation tries to + format the SQL for pretty-printing. Impala shell use of strip comments is mostly for + checking and not for altering the actual SQL, so having a pretty-formatted SQL is + irrelevant in Impala shell. Removing the grouping gives a significant performance boost. + """ + stack = sqlparse.engine.FilterStack() + stack.stmtprocess.append(sqlparse.filters.StripCommentsFilter()) + stack.postprocess.append(sqlparse.filters.SerializerUnicode()) + return ''.join(stack.run(sql, 'utf-8')).strip() + class CmdStatus: """Values indicate the execution status of a command to the cmd shell driver module SUCCESS and ERROR continue running the shell and ABORT exits the shell @@ -400,13 +412,13 @@ class ImpalaShell(object, cmd.Cmd): # Strip any comments to make a statement such as the following be considered as # ending with a delimiter: # select 1 + 1; -- this is a comment - line = sqlparse.format(line, strip_comments=True).encode('utf-8').rstrip() + line = strip_comments(line).encode('utf-8').rstrip() if line.endswith(ImpalaShell.CMD_DELIM): try: # Look for an open quotation in the entire command, and not just the # current line. if self.partial_cmd: - line = sqlparse.format('%s %s' % (self.partial_cmd, line), strip_comments=True) + line = strip_comments('%s %s' % (self.partial_cmd, line)) self._shlex_split(line) return True # If the command ends with a delimiter, check if it has an open quotation. @@ -1138,7 +1150,7 @@ class ImpalaShell(object, cmd.Cmd): query = self._create_beeswax_query(args) # Set posix=True and add "'" to escaped quotes # to deal with escaped quotes in string literals - lexer = shlex.shlex(sqlparse.format(query.query.lstrip(), strip_comments=True) + lexer = shlex.shlex(strip_comments(query.query.lstrip()) .encode('utf-8'), posix=True) lexer.escapedquotes += "'" try: @@ -1341,24 +1353,31 @@ class ImpalaShell(object, cmd.Cmd): def _process(self, tlist): token = tlist.token_first() if self._is_comment(token): - self.comment = token.value - tidx = tlist.token_index(token) - tlist.tokens.pop(tidx) + self.comment = '' + while token: + if self._is_comment(token) or self._is_whitespace(token): + tidx = tlist.token_index(token) + self.comment += token.value + tlist.tokens.pop(tidx) + tidx -= 1 + token = tlist.token_next(tidx, False) + else: + break def _is_comment(self, token): - if isinstance(token, sqlparse.sql.Comment): - return True - for comment in sqlparse.tokens.Comment: - if token.ttype == comment: - return True - return False + return isinstance(token, sqlparse.sql.Comment) or \ + token.ttype == sqlparse.tokens.Comment.Single or \ + token.ttype == sqlparse.tokens.Comment.Multiline + + def _is_whitespace(self, token): + return token.ttype == sqlparse.tokens.Whitespace or \ + token.ttype == sqlparse.tokens.Newline def process(self, stack, stmt): [self.process(stack, sgroup) for sgroup in stmt.get_sublists()] self._process(stmt) stack = sqlparse.engine.FilterStack() - stack.enable_grouping() strip_leading_comment_filter = StripLeadingCommentFilter() stack.stmtprocess.append(strip_leading_comment_filter) stack.postprocess.append(sqlparse.filters.SerializerUnicode()) @@ -1481,7 +1500,7 @@ def parse_query_text(query_text, utf8_encode_policy='strict'): # "--comment2" is sent as is. Impala's parser however doesn't consider it a valid SQL # and throws an exception. We identify such trailing comments and ignore them (do not # send them to Impala). - if query_list and not sqlparse.format(query_list[-1], strip_comments=True).strip("\n"): + if query_list and not strip_comments(query_list[-1]).strip("\n"): query_list.pop() return query_list http://git-wip-us.apache.org/repos/asf/impala/blob/1d491d64/tests/shell/test_shell_commandline.py ---------------------------------------------------------------------- diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py index 93387fa..40453c8 100644 --- a/tests/shell/test_shell_commandline.py +++ b/tests/shell/test_shell_commandline.py @@ -23,12 +23,13 @@ import re import signal import shlex import socket +import tempfile from subprocess import call, Popen from tests.common.impala_service import ImpaladService from tests.common.impala_test_suite import ImpalaTestSuite from tests.common.skip import SkipIf -from time import sleep +from time import sleep, time from util import IMPALAD, SHELL_CMD from util import assert_var_substitution, run_impala_shell_cmd, ImpalaShell @@ -659,3 +660,38 @@ class TestImpalaShell(ImpalaTestSuite): args = "-q \"with v as (select 1) \;\"" result = run_impala_shell_cmd(args, expect_success=False) assert "Encountered: Unexpected character" in result.stderr + + def test_large_sql(self, unique_database): + sql_file, sql_path = tempfile.mkstemp() + os.write(sql_file, "create table large_table (\n") + num_cols = 4000 + for i in xrange(num_cols): + os.write(sql_file, "col_{0} int".format(i)) + if i < num_cols - 1: + os.write(sql_file, ",") + os.write(sql_file, "\n") + os.write(sql_file, ");") + os.write(sql_file, "select \n") + + for i in xrange(num_cols): + if i < num_cols: + os.write(sql_file, 'col_{0} as a{1},\n'.format(i, i)) + os.write(sql_file, 'col_{0} as b{1},\n'.format(i, i)) + os.write(sql_file, 'col_{0} as c{1}{2}\n'.format(i, i, + ',' if i < num_cols - 1 else '')) + os.write(sql_file, 'from large_table;') + os.close(sql_file) + + try: + args = "-f {0} -d {1}".format(sql_path, unique_database) + start_time = time() + result = run_impala_shell_cmd(args) + end_time = time() + assert "Fetched 0 row(s)" in result.stderr + time_limit_s = 30 + actual_time_s = end_time - start_time + assert actual_time_s <= time_limit_s, ( + "It took {0} seconds to execute the query. Time limit is {1} seconds.".format( + actual_time_s, time_limit_s)) + finally: + os.remove(sql_path)
