Repository: incubator-impala Updated Branches: refs/heads/master 60c41c4f0 -> 2665866c1
IMPALA-4351,IMPALA-4353: [qgen] randomly generate INSERT statements - Generate INSERT statements that are either INSERT ... VALUES or INSERT ... SELECT - On both types of INSERTs, we either insert into all columns, or into some column list. If the column list exists, all primary keys will be present, and 0 or more additional columns will also be in the list. The ordering of the column list is random. - For INSERT ... SELECT, occasionally generate a WITH clause - For INSERT ... VALUES, generate non-null constants for the primary keys, but for the non-primary keys, randomly generate a value expression. The type system in the random statement/query generator isn't sophisticated enough to the implicit type of a SELECT item or a value expression. It knows it will be some INT-based type, but not if it's going to be a SMALLINT or a BIGINT. To get around this, the easiest thing seems to be to explicitly cast the SELECT items or value expressions to the columns' so-called exact_type attribute. Much of the testing here involved running discrepancy_searcher.py --explain-only on both tpch_kudu and a random HDFS table, using both the default profile and DML-only profile. This was done to quickly find bugs in the statement generation, as they tend to bubble up as analysis errors. I expect to make other changes as follow on patches and more random statements find small test issues. For actual use against Kudu data, you need to migrate data from Kudu into PostgreSQL 5 (instructions tests/comparison/POSTGRES.txt) and run something like: tests/comparison/discrepancy_searcher.py \ --use-postgresql \ --postgresql-port 5433 \ --profile dmlonly \ --timeout 300 \ --db-name tpch_kudu \ --query-count 10 Change-Id: I842b41f0eed07ab30ec76d8fc3cdd5affb525af6 Reviewed-on: http://gerrit.cloudera.org:8080/5486 Reviewed-by: Jim Apple <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/db7facde Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/db7facde Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/db7facde Branch: refs/heads/master Commit: db7facdee0017965e276c830e2aee16590f7ce37 Parents: 60c41c4 Author: Michael Brown <[email protected]> Authored: Wed Dec 7 14:20:05 2016 -0800 Committer: Impala Public Jenkins <[email protected]> Committed: Fri Jan 13 01:31:47 2017 +0000 ---------------------------------------------------------------------- tests/comparison/common.py | 2 +- tests/comparison/db_connection.py | 2 +- tests/comparison/funcs.py | 13 ++ tests/comparison/model_translator.py | 43 +++++-- tests/comparison/query_generator.py | 28 ++--- tests/comparison/query_profile.py | 114 +++++++++++++----- tests/comparison/statement_generator.py | 119 +++++++++++++++---- tests/comparison/tests/query_object_testdata.py | 34 ++++-- 8 files changed, 265 insertions(+), 90 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/db7facde/tests/comparison/common.py ---------------------------------------------------------------------- diff --git a/tests/comparison/common.py b/tests/comparison/common.py index 0736f5a..2b3217f 100644 --- a/tests/comparison/common.py +++ b/tests/comparison/common.py @@ -289,7 +289,7 @@ class Column(ValExpr): def __repr__(self): return '%s<name: %s, type: %s>' % ( - type(self).__name__, self.name, self.type.__name__) + type(self).__name__, self.name, self.exact_type.__name__) def __deepcopy__(self, memo): # Don't return a deep copy of owner, since that is a circular reference http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/db7facde/tests/comparison/db_connection.py ---------------------------------------------------------------------- diff --git a/tests/comparison/db_connection.py b/tests/comparison/db_connection.py index 040d30a..7117bff 100644 --- a/tests/comparison/db_connection.py +++ b/tests/comparison/db_connection.py @@ -123,7 +123,7 @@ class DbCursor(object): mismatch = True break for left, right in izip(common_table.cols, table.cols): - if not left.name == right.name and left.type == right.type: + if not (left.name == right.name and left.type == right.type): LOG.debug('Ignoring table %s. It has different columns %s vs %s.' % (table_name, left, right)) mismatch = True http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/db7facde/tests/comparison/funcs.py ---------------------------------------------------------------------- diff --git a/tests/comparison/funcs.py b/tests/comparison/funcs.py index b6d7b92..a534c00 100644 --- a/tests/comparison/funcs.py +++ b/tests/comparison/funcs.py @@ -458,6 +458,19 @@ def create_analytic( return func +class CastFunc(Func): + """ + This function is used internally by the InsertStatementGenerator to cast ValExprs into + the proper exact types of columns. + + Arguments: + val_expr: ValExpr to cast + type_: Type to cast ValExpr + """ + def __init__(self, val_expr, type_): + self.args = [val_expr, type_] + + create_func('IsNull', returns=Boolean, accepts=[DataType]) create_func('IsNotNull', returns=Boolean, accepts=[DataType]) create_func('And', returns=Boolean, accepts=[Boolean, Boolean]) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/db7facde/tests/comparison/model_translator.py ---------------------------------------------------------------------- diff --git a/tests/comparison/model_translator.py b/tests/comparison/model_translator.py index 14e70b7..ef87bed 100644 --- a/tests/comparison/model_translator.py +++ b/tests/comparison/model_translator.py @@ -24,10 +24,12 @@ from tests.comparison.common import StructColumn, CollectionColumn from tests.comparison.db_types import ( Char, Decimal, + Double, Float, Int, String, Timestamp, + TinyInt, VarChar) from tests.comparison.query import InsertStatement, Query from tests.comparison.query_flattener import QueryFlattener @@ -314,6 +316,12 @@ class SqlWriter(object): def _write_cast(self, arg, type): return 'CAST(%s AS %s)' % (self._write(arg), type) + def _write_cast_func(self, func): + val_expr = func.args[0] + type_ = func.args[1] + return 'CAST({val_expr} AS {type_})'.format( + val_expr=self._write(val_expr), type_=self._write(type_)) + def _write_date_add_year(self, func): return "%s + INTERVAL %s YEAR" \ % (self._write(func.args[0]), self._write(func.args[1])) @@ -399,11 +407,17 @@ class SqlWriter(object): return sql def _write_data_type_metaclass(self, data_type_class): - '''Write a data type class such as Int or Boolean.''' - aliases = getattr(data_type_class, self.DIALECT, None) - if aliases: - return aliases[0] - return data_type_class.__name__.upper() + '''Write a data type class such as Int, Boolean, or Decimal(4, 2).''' + if data_type_class == Char: + return 'CHAR({0})'.format(data_type_class.MAX) + elif data_type_class == VarChar: + return 'VARCHAR({0})'.format(data_type_class.MAX) + elif data_type_class == Decimal: + return 'DECIMAL({scale},{precision})'.format( + scale=data_type_class.MAX_DIGITS, + precision=data_type_class.MAX_FRACTIONAL_DIGITS) + else: + return data_type_class.__name__.upper() def _write_subquery(self, subquery): return '({0})'.format(self._write(subquery.query)) @@ -449,9 +463,9 @@ class SqlWriter(object): """ Return a string representing the VALUES clause of an INSERT query. """ - return 'VALUES {values_rows}'.format( - values_rows=', '.join([self._write(values_row) - for values_row in values_clause.values_rows])) + return 'VALUES\n{values_rows}'.format( + values_rows=',\n'.join([self._write(values_row) + for values_row in values_clause.values_rows])) def _write(self, object_): '''Return a sql string representation of the given object.''' @@ -694,9 +708,18 @@ class PostgresqlSqlWriter(SqlWriter): def _write_data_type_metaclass(self, data_type_class): '''Write a data type class such as Int or Boolean.''' - if data_type_class in (Char, String, VarChar): + if data_type_class == Double: + return 'DOUBLE PRECISION' + elif data_type_class == Float: + return 'REAL' + elif data_type_class == String: return 'VARCHAR(%s)' % data_type_class.MAX - return data_type_class.__name__.upper() + elif data_type_class == Timestamp: + return 'TIMESTAMP WITHOUT TIME ZONE' + elif data_type_class == TinyInt: + return 'SMALLINT' + else: + return super(PostgresqlSqlWriter, self)._write_data_type_metaclass(data_type_class) def _write_order_by_clause(self, order_by_clause): sql = 'ORDER BY ' http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/db7facde/tests/comparison/query_generator.py ---------------------------------------------------------------------- diff --git a/tests/comparison/query_generator.py b/tests/comparison/query_generator.py index dd30287..22e92f2 100644 --- a/tests/comparison/query_generator.py +++ b/tests/comparison/query_generator.py @@ -189,7 +189,7 @@ class QueryGenerator(object): and not select_clause.analytic_items \ and not select_clause.contains_approximate_types \ and self.profile.use_group_by_clause()): - group_by_items = [item for item in select_clause.basic_items + group_by_items = [deepcopy(item) for item in select_clause.basic_items if not item.val_expr.is_constant] # TODO: What if there are select_clause.analytic_items? if group_by_items: @@ -376,15 +376,15 @@ class QueryGenerator(object): def _create_basic_select_item(self, table_exprs, return_type): max_children = self.profile.choose_nested_expr_count() if max_children: - value = self._create_func_tree(return_type) - value = self._populate_func_with_vals(value, table_exprs) + value = self.create_func_tree(return_type) + value = self.populate_func_with_vals(value, table_exprs) elif return_type in table_exprs.col_types: value = self.profile.choose_val_expr(table_exprs.cols_by_type[return_type]) else: value = self.profile.choose_constant(return_type) return SelectItem(value) - def _create_func_tree(self, return_type, allow_subquery=False): + def create_func_tree(self, return_type, allow_subquery=False): '''Returns an instance of a basic function that has all of it's arguments either set to None or another instance of a function that has it's arguments set likewise. The caller should replace the None values with column references or constants as @@ -468,7 +468,7 @@ class QueryGenerator(object): matching_signatures.append(signature) return matching_signatures - def _populate_func_with_vals(self, + def populate_func_with_vals(self, func, table_exprs=TableExprList(), val_exprs=ValExprList(), @@ -538,8 +538,8 @@ class QueryGenerator(object): query.where_clause = WhereClause(correlation_condition) func.args[idx] = Subquery(query) else: - replacement_func = self._create_func_tree(func.type) - return self._populate_func_with_vals( + replacement_func = self.create_func_tree(func.type) + return self.populate_func_with_vals( replacement_func, table_exprs=table_exprs, val_exprs=val_exprs, @@ -565,7 +565,7 @@ class QueryGenerator(object): func.args[idx] = val arg = val elif arg.is_func: - func.args[idx] = self._populate_func_with_vals( + func.args[idx] = self.populate_func_with_vals( arg, table_exprs=table_exprs, val_exprs=val_exprs, @@ -579,7 +579,7 @@ class QueryGenerator(object): def _create_agg_select_item(self, table_exprs, basic_select_item_exprs, return_type): value = self._create_agg_func_tree(return_type) - value = self._populate_func_with_vals(value, table_exprs, basic_select_item_exprs) + value = self.populate_func_with_vals(value, table_exprs, basic_select_item_exprs) return SelectItem(value) def _create_agg_func_tree(self, return_type): @@ -589,7 +589,7 @@ class QueryGenerator(object): basic_funcs=FUNCS): '''Returns an instance of a function that is guaranteed to either be or contain an aggregate or analytic function. The arguments of the returned function will either - be None or an instance of a function as in _create_func_tree. + be None or an instance of a function as in create_func_tree. The chosen aggregate or analytic functions will be restricted to the list of functions in agg_funcs and analytic_funcs. @@ -934,7 +934,7 @@ class QueryGenerator(object): allow_agg = any(ifilter(lambda expr: expr.contains_agg, select_item_exprs)) value = self._create_analytic_func_tree(return_type, excluded_designs, allow_agg) - value = self._populate_func_with_vals( + value = self.populate_func_with_vals( value, table_exprs=TableExprList(), val_exprs=ValExprList()) @@ -1274,7 +1274,7 @@ class QueryGenerator(object): table_exprs = TableExprList(possible_left_table_exprs) table_exprs.append(right_table_expr) - self._populate_func_with_vals(root_predicate, table_exprs=table_exprs) + self.populate_func_with_vals(root_predicate, table_exprs=table_exprs) return root_predicate def _populate_null_args_list(self, @@ -1306,7 +1306,7 @@ class QueryGenerator(object): table_exprs, table_alias_prefix): predicate, _ = self._create_boolean_func_tree(allow_subquery=True) - predicate = self._populate_func_with_vals( + predicate = self.populate_func_with_vals( predicate, table_exprs=table_exprs, table_alias_prefix=table_alias_prefix) @@ -1320,7 +1320,7 @@ class QueryGenerator(object): predicate = self._create_agg_func_tree(Boolean) else: predicate, _ = self._create_boolean_func_tree() - predicate = self._populate_func_with_vals( + predicate = self.populate_func_with_vals( predicate, val_exprs=basic_select_item_exprs) # https://issues.cloudera.org/browse/IMPALA-1423 # Make sure any cols used have a table identifier. As of this writing the only http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/db7facde/tests/comparison/query_profile.py ---------------------------------------------------------------------- diff --git a/tests/comparison/query_profile.py b/tests/comparison/query_profile.py index b1e4ab4..06b8815 100644 --- a/tests/comparison/query_profile.py +++ b/tests/comparison/query_profile.py @@ -16,7 +16,7 @@ # under the License. from logging import getLogger -from random import choice, randint, random +from random import choice, randint, random, shuffle from tests.comparison.db_types import ( Boolean, @@ -26,7 +26,11 @@ from tests.comparison.db_types import ( Int, TYPES, Timestamp) -from tests.comparison.query import InsertStatement, Query, StatementExecutionMode +from tests.comparison.query import ( + InsertStatement, + Query, + StatementExecutionMode, + ValuesClause) from tests.comparison.funcs import ( AnalyticAvg, AnalyticCount, @@ -76,7 +80,8 @@ class DefaultProfile(object): 'WITH_TABLE_COUNT': (1, 3), 'TABLE_COUNT': (1, 2), 'ANALYTIC_LEAD_LAG_OFFSET': (1, 100), - 'ANALYTIC_WINDOW_OFFSET': (1, 100)} + 'ANALYTIC_WINDOW_OFFSET': (1, 100), + 'INSERT_VALUES_ROWS': (1, 10)} # Below are interdependent weights used to determine probabilities. The probability # of any item being selected should be (item weight) / sum(weights). A weight of @@ -94,13 +99,14 @@ class DefaultProfile(object): Int: 10, Timestamp: 1}, 'RELATIONAL_FUNCS': { - # The weights below are "best effort" suggestions. Because QueryGenerator - # prefers to set column types first, and some functions are "supported" only by - # some types, it means functions can be pruned off from this dictionary, and - # that will shift the probabilities. A quick example if that if a Char column is - # chosen: LessThan may not have a pre-defined signature for Char comparison, so - # LessThan shouldn't be chosen with Char columns. The tendency to prune will - # shift as the "funcs" module is adjusted to add/remove signatures. + # The weights below are "best effort" suggestions. Because QueryGenerator + # prefers to set column types first, and some functions are "supported" only + # by some types, it means functions can be pruned off from this dictionary, + # and that will shift the probabilities. A quick example if that if a Char + # column is chosen: LessThan may not have a pre-defined signature for Char + # comparison, so LessThan shouldn't be chosen with Char columns. The + # tendency to prune will shift as the "funcs" module is adjusted to + # add/remove signatures. And: 2, Coalesce: 2, Equals: 40, @@ -115,19 +121,18 @@ class DefaultProfile(object): LessThanOrEquals: 2, NotEquals: 2, NotIn: 2, - Or: 2, - }, + Or: 2}, 'CONJUNCT_DISJUNCTS': { - # And and Or appear both under RELATIONAL_FUNCS and CONJUNCT_DISJUNCTS for the - # following reasons: - # 1. And and Or are considered "relational" by virtue of taking two arguments - # and returning a Boolean. The crude signature selection means they could be - # selected, so we describe weights there. - # 2. They are set here explicitly as well so that - # QueryGenerator._create_bool_func_tree() can create a "more realistic" - # expression that has a Boolean operator at the top of the tree by explicitly - # asking for an And or Or. - # IMPALA-3896 tracks a better way to do this. + # And and Or appear both under RELATIONAL_FUNCS and CONJUNCT_DISJUNCTS for the + # following reasons: + # 1. And and Or are considered "relational" by virtue of taking two arguments + # and returning a Boolean. The crude signature selection means they could be + # selected, so we describe weights there. + # 2. They are set here explicitly as well so that + # QueryGenerator._create_bool_func_tree() can create a "more realistic" + # expression that has a Boolean operator at the top of the tree by explicitly + # asking for an And or Or. + # IMPALA-3896 tracks a better way to do this. And: 5, Or: 1}, 'ANALYTIC_WINDOW': { @@ -197,7 +202,16 @@ class DefaultProfile(object): StatementExecutionMode.SELECT_STATEMENT: 10}, 'STATEMENT': { # TODO: Eventually make this a mix of DML and SELECT (IMPALA-4601) - Query: 1}} + Query: 1}, + 'INSERT_SOURCE_CLAUSE': { + Query: 3, + ValuesClause: 1}, + 'INSERT_COLUMN_LIST': { + 'partial': 3, + 'none': 1}, + 'VALUES_ITEM_EXPR': { + 'constant': 1, + 'function': 2}} # On/off switches self._flags = { @@ -290,8 +304,8 @@ class DefaultProfile(object): weights = self.weights(*weights) else: weights = weights[0] - return self._choose_from_weights(dict( - (choice_, weight) for choice_, weight in weights.iteritems() if filter(choice_))) + return self._choose_from_weights(dict((choice_, weight) for choice_, weight + in weights.iteritems() if filter(choice_))) def _decide_from_probability(self, *keys): return random() < self.probability(*keys) @@ -373,9 +387,9 @@ class DefaultProfile(object): allow_correlated = False weights = dict(((name, use_agg, use_correlated), weight) for (name, use_agg, use_correlated), weight in weights.iteritems() - if name == func_name \ - and (allow_agg or use_agg == 'NON_AGG') \ - and weight) + if name == func_name and + (allow_agg or use_agg == 'NON_AGG') and + weight) if weights: return self._choose_from_weights(weights) @@ -534,8 +548,8 @@ class DefaultProfile(object): signature_length += 1 if not signature_weight: continue - if signature.func not in func_weights \ - or signature_weight > func_weights[signature.func]: + if (signature.func not in func_weights or + signature_weight > func_weights[signature.func]): func_weights[signature.func] = signature_weight signature_length_by_func[signature.func] = signature_length if not func_weights: @@ -615,6 +629,46 @@ class DefaultProfile(object): def choose_statement(self): return self._choose_from_weights('STATEMENT') + def choose_insert_source_clause(self): + """ + Returns whether we generate an INSERT SELECT or an INSERT VALUES + """ + return self._choose_from_weights('INSERT_SOURCE_CLAUSE') + + def choose_insert_column_list(self, table): + """ + Decide whether or not an INSERT will be in the form of: + INSERT INTO table SELECT|VALUES ... + or + INSERT INTO table (col1, col2, ...) SELECT|VALUES ... + If the second form, the column list is shuffled. The column list will always contain + the primary key columns and between 0 and all additional columns. + """ + if 'partial' == self._choose_from_weights('INSERT_COLUMN_LIST'): + columns_to_insert = list(table.primary_keys) + min_additional_insert_cols = 0 if columns_to_insert else 1 + remaining_columns = [col for col in table.cols if not col.is_primary_key] + shuffle(remaining_columns) + additional_column_count = randint(min_additional_insert_cols, len(remaining_columns)) + columns_to_insert.extend(remaining_columns[:additional_column_count]) + shuffle(columns_to_insert) + return columns_to_insert + else: + return None + + def choose_insert_values_row_count(self): + """ + Choose the number of rows to insert in an INSERT VALUES + """ + return self._choose_from_bounds('INSERT_VALUES_ROWS') + + def choose_values_item_expr(self): + """ + For a VALUES clause, Choose whether a particular item in a particular row will be a + constant or a function. + """ + return self._choose_from_weights('VALUES_ITEM_EXPR') + class ImpalaNestedTypesProfile(DefaultProfile): http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/db7facde/tests/comparison/statement_generator.py ---------------------------------------------------------------------- diff --git a/tests/comparison/statement_generator.py b/tests/comparison/statement_generator.py index 5c37458..55496ac 100644 --- a/tests/comparison/statement_generator.py +++ b/tests/comparison/statement_generator.py @@ -15,9 +15,10 @@ # specific language governing permissions and limitations # under the License. +from copy import deepcopy + from tests.comparison.common import Table -from tests.comparison.db_types import Char, Float, Int -from tests.comparison.model_translator import SqlWriter +from tests.comparison.funcs import CastFunc from tests.comparison.query import ( InsertClause, InsertStatement, @@ -26,15 +27,16 @@ from tests.comparison.query import ( ValuesClause, ValuesRow) from tests.comparison.query_generator import QueryGenerator -from tests.comparison.query_profile import DefaultProfile class InsertStatementGenerator(object): def __init__(self, profile): # QueryProfile-like object self.profile = profile - # used to generate SELECT queries for INSERT ... SELECT statements - self.query_generator = QueryGenerator(profile) + # used to generate SELECT queries for INSERT ... SELECT statements; + # to ensure state is completely reset, this is created anew with each call to + # generate_statement() + self.select_stmt_generator = None def generate_statement(self, tables, dml_table): """ @@ -45,34 +47,103 @@ class InsertStatementGenerator(object): "sources" of the INSERT's WITH and FROM/WHERE clauses. dml_table is a required Table object. The INSERT will be into this table. - - This is just a stub, good enough to generatea valid INSERT INTO ... VALUES - statement. Actual implementation is tracked IMPALA-4353. """ if not (isinstance(tables, list) and len(tables) > 0 and all((isinstance(t, Table) for t in tables))): - raise Exception("tables must be a not-empty list of Table objects") + raise Exception('tables must be a not-empty list of Table objects') if not isinstance(dml_table, Table): raise Exception('dml_table must be a Table') + self.select_stmt_generator = QueryGenerator(self.profile) + if dml_table.primary_keys: - conflict_action = InsertStatement.CONFLICT_ACTION_IGNORE + insert_statement = InsertStatement( + conflict_action=InsertStatement.CONFLICT_ACTION_IGNORE) + else: + insert_statement = InsertStatement( + conflict_action=InsertStatement.CONFLICT_ACTION_DEFAULT) + + insert_statement.execution = StatementExecutionMode.DML_TEST + + # Choose whether this is a + # INSERT INTO table SELECT/VALUES + # or + # INSERT INTO table (col1, col2, ...) SELECT/VALUES + # If the method returns None, it's the former. + insert_column_list = self.profile.choose_insert_column_list(dml_table) + insert_statement.insert_clause = InsertClause( + dml_table, column_list=insert_column_list) + # We still need to internally track the columns we're inserting. Keep in mind None + # means "all" without an explicit column list. Since we've already created the + # InsertClause object though, we can fill this in for ourselves. + if insert_column_list is None: + insert_column_list = dml_table.cols + insert_item_data_types = [col.type for col in insert_column_list] + + # Decide whether this is INSERT VALUES or INSERT SELECT + insert_source_clause = self.profile.choose_insert_source_clause() + + if issubclass(insert_source_clause, Query): + # Use QueryGenerator()'s public interface to generate the SELECT. + select_query = self.select_stmt_generator.generate_statement( + tables, select_item_data_types=insert_item_data_types) + # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The + # generator's type system is not near sophisticated enough to know how random + # expressions will be implicitly casted in the databases. This requires less work + # to implement. IMPALA-4693 considers alternative approaches. + self._cast_select_items(select_query, insert_column_list) + insert_statement.with_clause = deepcopy(select_query.with_clause) + select_query.with_clause = None + insert_statement.select_query = select_query + elif issubclass(insert_source_clause, ValuesClause): + insert_statement.values_clause = self._generate_values_clause(insert_column_list) else: - conflict_action = InsertStatement.CONFLICT_ACTION_DEFAULT - - return InsertStatement( - insert_clause=InsertClause(dml_table), - values_clause=self.generate_values_clause(dml_table.cols), - conflict_action=conflict_action, execution=StatementExecutionMode.DML_TEST) - - def generate_values_clause(self, table_columns): - constants = [] - for col in table_columns: - val = self.profile.choose_constant(return_type=col.exact_type, - allow_null=(not col.is_primary_key)) - constants.append(val) - return ValuesClause([ValuesRow(constants)]) + raise Exception('unsupported INSERT source clause: {0}'.format( + insert_source_clause)) + return insert_statement + + def _generate_values_clause(self, columns): + """ + Return a VALUES clause containing a variable number of rows. + + The values corresponding to primary keys will be non-null constants. Any other + columns could be null, constants, or function trees that may or may not evaluate to + null. + """ + values_rows = [] + for _ in xrange(self.profile.choose_insert_values_row_count()): + values_row = [] + for col in columns: + if col.is_primary_key: + val = self.profile.choose_constant(return_type=col.exact_type, allow_null=False) + elif 'constant' == self.profile.choose_values_item_expr(): + val = self.profile.choose_constant(return_type=col.exact_type, allow_null=True) + else: + func_tree = self.select_stmt_generator.create_func_tree( + col.type, allow_subquery=False) + val = self.select_stmt_generator.populate_func_with_vals(func_tree) + # Only the generic type, not the exact type, of the value will be known. To + # avoid a lot of failed queries due to precision errors, we cast the val to + # the exact type of the column. This will still not prevent "out of range" + # conditions, as we don't try to evaluate the random expressions. + val = CastFunc(val, col.exact_type) + values_row.append(val) + values_rows.append(ValuesRow(values_row)) + return ValuesClause(values_rows) + + def _cast_select_items(self, select_query, column_list): + """ + For a given Query select_query and a column_list (list of Columns), cast each select + item in select_query to the exact type of the column. + + A Query may have a UNION, recursively do this down the line. + """ + for col_idx, select_item in enumerate(select_query.select_clause.items): + cast_val_expr = CastFunc(select_item.val_expr, column_list[col_idx].exact_type) + select_item.val_expr = cast_val_expr + if select_query.union_clause: + self._cast_select_items(select_query.union_clause.query, column_list) def get_generator(statement_type): http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/db7facde/tests/comparison/tests/query_object_testdata.py ---------------------------------------------------------------------- diff --git a/tests/comparison/tests/query_object_testdata.py b/tests/comparison/tests/query_object_testdata.py index dc2f839..4861781 100644 --- a/tests/comparison/tests/query_object_testdata.py +++ b/tests/comparison/tests/query_object_testdata.py @@ -286,11 +286,15 @@ INSERT_QUERY_TEST_CASES = [ ), impala_query_string=( 'INSERT INTO kudu_table\n' - "VALUES (1, 'a'), (2, 'b')" + 'VALUES\n' + "(1, 'a'),\n" + "(2, 'b')" ), postgres_query_string=( 'INSERT INTO kudu_table\n' - "VALUES (1, 'a' || ''), (2, 'b' || '')" + 'VALUES\n' + "(1, 'a' || ''),\n" + "(2, 'b' || '')" ), ), InsertStatementTest( @@ -303,11 +307,13 @@ INSERT_QUERY_TEST_CASES = [ ), impala_query_string=( 'INSERT INTO kudu_table\n' - 'VALUES (1)' + 'VALUES\n' + '(1)' ), postgres_query_string=( 'INSERT INTO kudu_table\n' - 'VALUES (1)' + 'VALUES\n' + '(1)' ), ), InsertStatementTest( @@ -320,11 +326,13 @@ INSERT_QUERY_TEST_CASES = [ ), impala_query_string=( 'INSERT INTO kudu_table (int_col, char_col)\n' - "VALUES (1, 'a')" + 'VALUES\n' + "(1, 'a')" ), postgres_query_string=( 'INSERT INTO kudu_table (int_col, char_col)\n' - "VALUES (1, 'a' || '')" + 'VALUES\n' + "(1, 'a' || '')" ), ), InsertStatementTest( @@ -338,11 +346,13 @@ INSERT_QUERY_TEST_CASES = [ ), impala_query_string=( 'INSERT INTO kudu_table (int_col)\n' - "VALUES (1)" + 'VALUES\n' + '(1)' ), postgres_query_string=( 'INSERT INTO kudu_table (int_col)\n' - "VALUES (1)" + 'VALUES\n' + '(1)' ), ), InsertStatementTest( @@ -413,11 +423,15 @@ INSERT_QUERY_TEST_CASES = [ ), impala_query_string=( 'INSERT INTO kudu_table\n' - "VALUES (1, 'a'), (2, 'b')" + 'VALUES\n' + "(1, 'a'),\n" + "(2, 'b')" ), postgres_query_string=( 'INSERT INTO kudu_table\n' - "VALUES (1, 'a' || ''), (2, 'b' || '')\n" + 'VALUES\n' + "(1, 'a' || ''),\n" + "(2, 'b' || '')\n" 'ON CONFLICT DO NOTHING' ), ),
