Hi! Here is part 1 of the review; part 2 should come later today.
>>>>> "Sergey" == Sergey Petrunya <[email protected]> writes: Sergey> On Tue, Feb 22, 2011 at 05:24:37PM +0300, Sergey Petrunya wrote: >> Hi Monty, >> >> Please find attached the complete patch of MWL#90 for review. The tree with >> the code is on launchpad/buildbot: >> https://code.launchpad.net/~maria-captains/maria/5.3-subqueries-mwl90 >> http://buildbot.askmonty.org/buildbot/grid?branch=5.3-subqueries-mwl90 <cut> > +++ maria-5.3-subqueries-r36-noc/mysql-test/include/mix1.inc 2011-02-16 > 14:42:52.000000000 +0300 > @@ -1532,6 +1532,12 @@ > > SELECT 1 FROM (SELECT COUNT(DISTINCT c1) > FROM t1 WHERE c2 IN (1, 1) AND c3 = 2 GROUP BY c2) x; > +--echo # MariaDB note: > +--echo # This will show 2 for table which has 5 rows. > +--echo # This is because the access method employed is actually range access > +--echo # which scans 2 records (yes, EXPLAIN displays it incorrectly). > +--echo # our correct printing is an artifact of changing in > select_describe() > +--echo # from printing table->starts.records() to tab->records. Please remove comment as this is not an issue anymore. > diff -urN --exclude='.*' 5.3-noc/sql/item_subselect.cc > maria-5.3-subqueries-r36-noc/sql/item_subselect.cc > > @@ -4067,6 +4078,17 @@ > } > > > +int subselect_hash_sj_engine::optimize() > +{ > + int res= 0; Don't reset res > + SELECT_LEX *save_select= thd->lex->current_select; > + thd->lex->current_select= materialize_join->select_lex; > + res= materialize_join->optimize(); > + thd->lex->current_select= save_select; > + > + return res; > +} > + > diff -urN --exclude='.*' 5.3-noc/sql/item_subselect.h > maria-5.3-subqueries-r36-noc/sql/item_subselect.h > --- 5.3-noc/sql/item_subselect.h 2011-01-27 21:39:33.000000000 +0300 > +++ maria-5.3-subqueries-r36-noc/sql/item_subselect.h 2011-02-16 > 14:42:52.000000000 +0300 > @@ -46,16 +46,17 @@ > < child_join->prepare > < engine->prepare > *ref= substitution; > + substitution= NULL; > < Item_subselect::fix_fields > */ > - Item *substitution; > public: > + Item *substitution; > /* unit of subquery */ > st_select_lex_unit *unit; > -protected: > Item *expr_cache; > /* engine that perform execution of subselect (single select or union) */ > subselect_engine *engine; > +protected: > /* old engine if engine was changed */ > subselect_engine *old_engine; > /* cache of used external tables */ > @@ -148,6 +149,7 @@ > bool mark_as_dependent(THD *thd, st_select_lex *select, Item *item); > void fix_after_pullout(st_select_lex *new_parent, Item **ref); > void recalc_used_tables(st_select_lex *new_parent, bool after_pullout); > + virtual int optimize(); > virtual bool exec(); > virtual void fix_length_and_dec(); > table_map used_tables() const; > @@ -351,7 +353,9 @@ > all JOIN in UNION > */ > Item *expr; > +public: > Item_in_optimizer *optimizer; > +protected: Move the above to after Item *substitions; No reason to have many :public :protected blocks > @@ -397,6 +401,16 @@ > }; > enum_exec_method exec_method; > > + /* > + TRUE<=>this is a flattenable semi-join, false overwise. > + */ > + bool is_flattenable_semijoin; You should set this to zero in Item_subselect::Item_subselect() ! > + > + /* > + Cost to populate the temporary table (set on if-needed basis). > + */ > + //double startup_cost; > + Remove startup_cost as it's not used <cut> > @@ -752,7 +767,7 @@ > > class subselect_hash_sj_engine : public subselect_engine > { > -protected: > +public: > /* The table into which the subquery is materialized. */ > TABLE *tmp_table; > /* TRUE if the subquery was materialized into a temp table. */ > @@ -764,14 +779,16 @@ > of subselect_single_select_engine::[prepare | cols]. > */ > subselect_single_select_engine *materialize_engine; > +protected: > /* The engine used to compute the IN predicate. */ > subselect_engine *lookup_engine; > /* > QEP to execute the subquery and materialize its result into a > temporary table. Created during the first call to exec(). > */ > +public: > JOIN *materialize_join; > - > +protected: > /* Keyparts of the only non-NULL composite index in a rowid merge. */ > MY_BITMAP non_null_key_parts; > /* Keyparts of the single column indexes with NULL, one keypart per index. > */ > @@ -784,7 +801,9 @@ > IN results because index lookups sometimes match values that are actually > not equal to the search key in SQL terms. > */ > +public: > Item_cond_and *semi_join_conds; > +protected: > /* Possible execution strategies that can be used to compute hash > semi-join.*/ Please move all public variables to one block. > +++ maria-5.3-subqueries-r36-noc/sql/opt_subselect.cc 2011-02-16 > 14:42:52.000000000 +0300 > @@ -166,6 +328,7 @@ > (void)subquery_types_allow_materialization(in_subs); > > in_subs->emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest; > + in_subs->is_flattenable_semijoin= TRUE; > > /* Register the subquery for further processing in > flatten_subqueries() */ > select_lex-> > @@ -220,10 +383,24 @@ > (in_subs->is_top_level_item() || > optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) || > optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN)) > &&//4 > - !in_subs->is_correlated && // 5 > - in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) // 6 > + !in_subs->is_correlated) // 5 > { > + if (in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) > in_subs->exec_method= Item_in_subselect::MATERIALIZATION; You removed //6 comment without updating the comment for the block that describes 6. Please update comment! > + > + /* > + If the subquery is an AND-part of WHERE register for being > processed > + with jtbm strategy > + */ > + if (in_subs->exec_method == Item_in_subselect::MATERIALIZATION && > + thd->thd_marker.emb_on_expr_nest == (TABLE_LIST*)0x1 && > + optimizer_flag(thd, OPTIMIZER_SWITCH_SEMIJOIN)) Instead of doing: thd->thd_marker.emb_on_expr_nest == (TABLE_LIST*)0x1 please create an inline function with a meningfull name to test test or at least replace (TABLE_LIST*) 0x with a define or both. > + { > + in_subs->emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest; Here you could make things cleared and set in_subs->emb_on_expr_nest to a define. <cut> > @@ -339,6 +516,69 @@ > > > /* > + Finalize IN->EXISTS conversion in case we couldn't use materialization. > + > + DESCRIPTION Invoke the IN->EXISTS converter > + Replace the Item_in_subselect with its wrapper Item_in_optimizer in > WHERE. > + > + RETURN > + FALSE - Ok > + TRUE - Fatal error > +*/ > + > +static > +bool make_in_exists_conversion(THD *thd, JOIN *join, Item_in_subselect *item) > +{ > + DBUG_ENTER("make_in_exists_conversion"); > + JOIN *child_join= item->unit->first_select()->join; > + Item_subselect::trans_res res; > + item->changed= 0; Please add a comment why you set item->fixed to 0 as this is not obvious > + item->fixed= 0; > + > + SELECT_LEX *save_select_lex= thd->lex->current_select; > + thd->lex->current_select= item->unit->first_select(); > + > + res= item->select_transformer(child_join); > + > + thd->lex->current_select= save_select_lex; > + > + if (res == Item_subselect::RES_ERROR) > + DBUG_RETURN(TRUE); > + > + item->changed= 1; > + item->fixed= 1; > + > + Item *substitute= item->substitution; > + bool do_fix_fields= !item->substitution->fixed; item->substitution -> substitute > + /* > + The Item_subselect has already been wrapped with Item_in_optimizer, so we > + should search for item->optimizer, not 'item'. > + */ > + Item *replace_me= item->optimizer; > + DBUG_ASSERT(replace_me==substitute); > + > + Item **tree= (item->emb_on_expr_nest == (TABLE_LIST*)1)? > + &join->conds : &(item->emb_on_expr_nest->on_expr); > + if (replace_where_subcondition(join, tree, replace_me, substitute, > + do_fix_fields)) > + DBUG_RETURN(TRUE); > + item->substitution= NULL; > + The code belove makes a permanent change to the prepared statement tree. Please add a comment about this! Also please add a test where you do a prepared statement that hits his code and excecute it twice with different parameters. > + if (!thd->stmt_arena->is_conventional()) > + { > + tree= (item->emb_on_expr_nest == (TABLE_LIST*)1)? > + &join->select_lex->prep_where : > + &(item->emb_on_expr_nest->prep_on_expr); > + > + if (replace_where_subcondition(join, tree, replace_me, substitute, > + FALSE)) > + DBUG_RETURN(TRUE); > + } > + DBUG_RETURN(FALSE); > +} > + > + > +/* > Convert semi-join subquery predicates into semi-join join nests > > SYNOPSIS > @@ -445,25 +685,41 @@ > // #tables-in-parent-query + #tables-in-subquery < MAX_TABLES > /* Replace all subqueries to be flattened with Item_int(1) */ > arena= thd->activate_stmt_arena_if_needed(&backup); > - for (in_subq= join->sj_subselects.front(); > - in_subq != in_subq_end && > - join->tables + (*in_subq)->unit->first_select()->join->tables < > MAX_TABLES; > - in_subq++) > - { > - Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)? > - &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr); > - if (replace_where_subcondition(join, tree, *in_subq, new Item_int(1), > - FALSE)) > - DBUG_RETURN(TRUE); /* purecov: inspected */ > - } > > for (in_subq= join->sj_subselects.front(); > in_subq != in_subq_end && > join->tables + (*in_subq)->unit->first_select()->join->tables < > MAX_TABLES; > in_subq++) > { > - if (convert_subq_to_sj(join, *in_subq)) > - DBUG_RETURN(TRUE); > + bool remove_item= TRUE; > + if ((*in_subq)->is_flattenable_semijoin) > + { > + if (convert_subq_to_sj(join, *in_subq)) > + DBUG_RETURN(TRUE); > + } > + else > + { > + if (convert_subq_to_jtbm(join, *in_subq, &remove_item)) > + DBUG_RETURN(TRUE); > + } > + if (remove_item) > + { > + Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)? > + &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr); > + Item *replace_me= *in_subq; > + /* > + JTBM: the subquery was already mapped with Item_in_optimizer, so we > + should search for that, not for original Item_in_subselect. > + TODO: what about delaying that rewrite until here? > + */ > + if (!(*in_subq)->is_flattenable_semijoin) > + { > + replace_me= (*in_subq)->optimizer; > + } Consider doing a function: Item_in_subselect::original_item() { return is_flattenable_semijoin ? self : item->optimizer; } This would make the above code a bit easier to understand by doing: Item *replace_me= (*in_subq)->original_item() This would thus be a bit like real_item() > + if (replace_where_subcondition(join, tree, replace_me, new Item_int(1), > + FALSE)) > + DBUG_RETURN(TRUE); /* purecov: inspected */ > + } > } > skip_conversion: > /* > @@ -494,7 +750,19 @@ > bool do_fix_fields= !(*in_subq)->substitution->fixed; > Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)? > &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr); > - if (replace_where_subcondition(join, tree, *in_subq, substitute, > + > + Item *replace_me= *in_subq; > + /* > + JTBM: the subquery was already mapped with Item_in_optimizer, so we > + should search for that, not for original Item_in_subselect. > + TODO: what about delaying that rewrite until here? > + */ > + if (!(*in_subq)->is_flattenable_semijoin) > + { > + replace_me= (*in_subq)->optimizer; > + } The above code can also be made easier with (*in_subq)->original_item(). The above comment is a good one for original_item()! > + > +/* > + Get #output_rows and scan_time estimates for a "delayed" table. > + > + SYNOPSIS > + get_delayed_table_estimates() > + table IN Table to get estimates for > + out_rows OUT E(#rows in the table) > + scan_time OUT E(scan_time). > + startup_cost OUT cost to populate the table. > + > + DESCRIPTION > + Get #output_rows and scan_time estimates for a "delayed" table. By > + "delayed" here we mean that the table is filled at the start of query > + execution. This means that the optimizer can't use table statistics to > + get #rows estimate for it, it has to call this function instead. > + > + This function is expected to make different actions depending on the > nature > + of the table. At the moment there is only one kind of delayed tables, > + non-flattenable semi-joins. > +*/ > + > +void get_delayed_table_estimates(TABLE *table, > + ha_rows *out_rows, > + double *scan_time, > + double *startup_cost) > +{ > + Item_in_subselect *item= table->pos_in_table_list->jtbm_subselect; > + item->optimize(); > + > + DBUG_ASSERT(item->engine->engine_type() == > + subselect_engine::HASH_SJ_ENGINE); > + > + subselect_hash_sj_engine *hash_sj_engine= > + ((subselect_hash_sj_engine*)item->engine); > + JOIN *join= hash_sj_engine->materialize_join; > + > + double rows= 1; > + double read_time= 0.0; > + > + /* Calculate #rows and cost of join execution */ > + for (uint i= join->const_tables; i < join->tables; i++) > + { > + rows *= join->best_positions[i].records_read; > + read_time += join->best_positions[i].read_time; > + } > + *out_rows= (ha_rows)rows; > + *startup_cost= read_time; The above loop is identical to doing: get_partial_join_cost(join, join->table, startup_cost, &rows); *out_rows= (ha_rows) rows; > + /* Calculate cost of scanning the temptable */ > + double data_size= rows * hash_sj_engine->tmp_table->s->reclength; > + /* Do like in handler::read_time */ > + *scan_time= data_size/IO_SIZE + 2; > +} > + > +/* > + Convert subquery predicate into non-mergeable semi-join nest. > + > + TODO: > + why does this do IN-EXISTS conversion? Can't we unify it with mergeable > + semi-joins? currently, convert_subq_to_sj() cannot fail to convert > (unless > + fatal errors) > + > + > + RETURN > + FALSE - Ok > + TRUE - Fatal error > +*/ > + > +static bool convert_subq_to_jtbm(JOIN *parent_join, > + Item_in_subselect *subq_pred, > + bool *remove_item) > +{ > + SELECT_LEX *parent_lex= parent_join->select_lex; > + List<TABLE_LIST> *emb_join_list= &parent_lex->top_join_list; > + TABLE_LIST *emb_tbl_nest= NULL; // will change when we learn to handle > outer joins > + TABLE_LIST *tl; > + DBUG_ENTER("convert_subq_to_jtbm"); > + > + if (subq_pred->setup_engine(TRUE)) > + DBUG_RETURN(TRUE); > + /* Please ensure that the code below is covered with some test case !*/ > + if (subq_pred->engine->engine_type() != subselect_engine::HASH_SJ_ENGINE) > + { > + *remove_item= FALSE; > + make_in_exists_conversion(parent_join->thd, parent_join, subq_pred); You should return value of make_in_exists_conversion() as this may fail... > + DBUG_RETURN(FALSE); > + } > + *remove_item= TRUE; > + > + TABLE_LIST *jtbm; > + char *tbl_alias; > + const char alias_mask[]="<subquery%d>"; > + if (!(tbl_alias= (char*)parent_join->thd->calloc(sizeof(alias_mask)+5)) || > + !(jtbm= alloc_join_nest(parent_join->thd))) //todo: this is not a join > nest! > + { > + DBUG_RETURN(TRUE); > + } > + > + jtbm->join_list= emb_join_list; > + jtbm->embedding= emb_tbl_nest; > + jtbm->jtbm_subselect= subq_pred; > + jtbm->nested_join= NULL; > + > + /* Nests do not participate in those 'chains', so: */ > + /* jtbm->next_leaf= jtbm->next_local= jtbm->next_global == NULL*/ > + emb_join_list->push_back(jtbm); > + > + /* > + Inject the jtbm table into TABLE_LIST::next_leaf list, so that > + make_join_statistics() and co. can find it. > + */ > + for (tl= parent_lex->leaf_tables; tl->next_leaf; tl= tl->next_leaf) ; Add an explicit {} on an empty row, as otherwise you may get compiler warnings from some compilers > + tl->next_leaf= jtbm; > + > + /* > + Same as above for TABLE_LIST::next_local chain > + (a theory: a next_local chain always starts with ::leaf_tables > + because view's tables are inserted after the view) > + */ > + for (tl= parent_lex->leaf_tables; tl->next_local; tl= tl->next_local) ; Add an explicit {} on an empty row... > + tl->next_local= jtbm; Do you really have to put the table last ? It would be much esier to put it first... > + > + /* A theory: no need to re-connect the next_global chain */ > + > + subselect_hash_sj_engine *hash_sj_engine= > + ((subselect_hash_sj_engine*)subq_pred->engine); > + jtbm->table= hash_sj_engine->tmp_table; > + > + jtbm->table->tablenr= parent_join->tables; > + jtbm->table->map= table_map(1) << (parent_join->tables); > + > + parent_join->tables++; > + Where is it checked that this doesn't exceed MAX_TABLES? Add a comment for the function that guarantees this and an DBUG_ASSERT > + Item *conds= hash_sj_engine->semi_join_conds; > + conds->fix_after_pullout(parent_lex, &conds); > + > + DBUG_EXECUTE("where", print_where(conds,"SJ-EXPR", QT_ORDINARY);); > + > + my_snprintf(tbl_alias, sizeof(alias_mask)+5, alias_mask, > + hash_sj_engine->materialize_join->select_lex->select_number); A fast way to generate the name would be to use the following function: #define subquery_table_name_length= sizeof("<subquery9999>"); void create_temporary_subquery_table_name(char *to, uint number) { DBUG_ASSERT(number < 10000); to= strmov(to, "<subquery"); to= int10_to_str((int) number, to, 10); to[0]= '>'; to[1]= 0; } This is significantly faster than my_snprintf(). This would also simplify the other code you have when you generate the name. > + jtbm->alias= tbl_alias; > + > + /* Inject sj_on_expr into the parent's WHERE or ON */ > + if (emb_tbl_nest) > + { > + DBUG_ASSERT(0); > + /*emb_tbl_nest->on_expr= and_items(emb_tbl_nest->on_expr, > + sj_nest->sj_on_expr); > + emb_tbl_nest->on_expr->fix_fields(parent_join->thd, > &emb_tbl_nest->on_expr); > + */ Please remove the comment and change the above to: DBUG_ASSERT(!emb_tbl_nest); > > +enum_nested_loop_state join_tab_execution_startup(JOIN_TAB *tab) > +{ <cut> > + else if (tab->bush_children) > + { > + /* It's a merged SJM nest */ > + enum_nested_loop_state rc; > + JOIN *join= tab->join; > + SJ_MATERIALIZATION_INFO *sjm= > tab->bush_children->start->emb_sj_nest->sj_mat_info; > + JOIN_TAB *join_tab= tab->bush_children->start; > + JOIN_TAB *save_return_tab= join->return_tab; move setting of the last 2 variables inside the loop belpw > --- 5.3-noc/sql/sql_base.cc 2011-01-27 21:39:33.000000000 +0300 > +++ maria-5.3-subqueries-r36-noc/sql/sql_base.cc 2011-02-21 > 18:15:06.000000000 +0300 > @@ -7778,6 +7778,18 @@ > if (res) > DBUG_RETURN(1); > } > + if (table_list->jtbm_subselect) > + { > + Item *item= table_list->jtbm_subselect; > + if (item->fix_fields(thd, &item)) > + { > + my_error(ER_TOO_MANY_TABLES,MYF(0),MAX_TABLES); > + DBUG_RETURN(1); > + } > + DBUG_ASSERT(item == table_list->jtbm_subselect); > + if (table_list->jtbm_subselect->setup_engine(FALSE)) > + DBUG_RETURN(1); > + } > } Regards, Monty _______________________________________________ Mailing list: https://launchpad.net/~maria-developers Post to : [email protected] Unsubscribe : https://launchpad.net/~maria-developers More help : https://help.launchpad.net/ListHelp

