HIVE-16811 : Estimate statistics in absence of stats (Vineet Garg, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/8f7c5788 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/8f7c5788 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/8f7c5788 Branch: refs/heads/master Commit: 8f7c5788938f3706a42e5ea8350ca6d3500eb15d Parents: d155565 Author: Vineet Garg <vg...@apache.com> Authored: Fri Sep 1 22:07:15 2017 -0700 Committer: Vineet Garg <vg...@apache.com> Committed: Fri Sep 1 22:07:15 2017 -0700 ---------------------------------------------------------------------- .../org/apache/hadoop/hive/conf/HiveConf.java | 6 + .../test/resources/testconfiguration.properties | 1 + .../ql/optimizer/calcite/RelOptHiveTable.java | 93 +- .../stats/annotation/StatsRulesProcFactory.java | 12 +- .../hadoop/hive/ql/plan/ColStatistics.java | 12 + .../apache/hadoop/hive/ql/stats/StatsUtils.java | 206 +- .../clientpositive/join_reordering_no_stats.q | 63 + .../clientpositive/annotate_stats_filter.q.out | 12 +- .../clientpositive/annotate_stats_groupby.q.out | 4 +- .../clientpositive/annotate_stats_part.q.out | 42 +- .../clientpositive/annotate_stats_select.q.out | 4 +- .../clientpositive/annotate_stats_table.q.out | 12 +- .../auto_join_reordering_values.q.out | 8 +- .../clientpositive/auto_join_stats.q.out | 30 +- .../clientpositive/auto_join_stats2.q.out | 91 +- .../clientpositive/auto_sortmerge_join_12.q.out | 12 +- .../cbo_rp_annotate_stats_groupby.q.out | 4 +- .../columnStatsUpdateForStatsOptimizer_2.q.out | 24 +- .../clientpositive/explain_rearrange.q.out | 24 +- ql/src/test/results/clientpositive/join19.q.out | 190 +- ql/src/test/results/clientpositive/join42.q.out | 154 +- ql/src/test/results/clientpositive/join43.q.out | 136 +- .../join_cond_pushdown_unqual1.q.out | 284 +-- .../join_cond_pushdown_unqual2.q.out | 162 +- .../join_cond_pushdown_unqual3.q.out | 284 +-- .../join_cond_pushdown_unqual4.q.out | 162 +- .../results/clientpositive/join_hive_626.q.out | 60 +- .../test/results/clientpositive/join_star.q.out | 124 +- .../llap/acid_bucket_pruning.q.out | 8 +- .../llap/auto_smb_mapjoin_14.q.out | 260 +-- .../llap/auto_sortmerge_join_1.q.out | 48 +- .../llap/auto_sortmerge_join_10.q.out | 42 +- .../llap/auto_sortmerge_join_11.q.out | 64 +- .../llap/auto_sortmerge_join_12.q.out | 26 +- .../llap/auto_sortmerge_join_13.q.out | 134 +- .../llap/auto_sortmerge_join_14.q.out | 24 +- .../llap/auto_sortmerge_join_15.q.out | 24 +- .../llap/auto_sortmerge_join_2.q.out | 32 +- .../llap/auto_sortmerge_join_3.q.out | 48 +- .../llap/auto_sortmerge_join_4.q.out | 48 +- .../llap/auto_sortmerge_join_5.q.out | 44 +- .../llap/auto_sortmerge_join_6.q.out | 682 ++++--- .../llap/auto_sortmerge_join_7.q.out | 48 +- .../llap/auto_sortmerge_join_8.q.out | 48 +- .../llap/auto_sortmerge_join_9.q.out | 488 ++--- .../clientpositive/llap/bucket_groupby.q.out | 198 +- .../llap/bucket_map_join_tez1.q.out | 1121 ++++++----- .../llap/bucket_map_join_tez2.q.out | 322 +-- .../clientpositive/llap/bucketmapjoin1.q.out | 88 +- .../clientpositive/llap/bucketmapjoin2.q.out | 66 +- .../clientpositive/llap/bucketmapjoin3.q.out | 44 +- .../clientpositive/llap/bucketmapjoin4.q.out | 44 +- .../clientpositive/llap/bucketmapjoin7.q.out | 24 +- .../clientpositive/llap/bucketpruning1.q.out | 208 +- .../llap/bucketsortoptimize_insert_2.q.out | 144 +- .../llap/bucketsortoptimize_insert_6.q.out | 168 +- .../llap/bucketsortoptimize_insert_7.q.out | 72 +- .../columnStatsUpdateForStatsOptimizer_1.q.out | 24 +- .../llap/column_access_stats.q.out | 138 +- .../llap/column_table_stats.q.out | 24 +- .../llap/column_table_stats_orc.q.out | 20 +- .../clientpositive/llap/constprog_dpp.q.out | 20 +- .../llap/constprog_semijoin.q.out | 150 +- .../llap/correlationoptimizer4.q.out | 324 +-- .../results/clientpositive/llap/count.q.out | 144 +- .../llap/cross_product_check_1.q.out | 150 +- .../llap/cross_product_check_2.q.out | 132 +- .../results/clientpositive/llap/cte_3.q.out | 8 +- .../results/clientpositive/llap/cte_5.q.out | 10 +- .../results/clientpositive/llap/cte_mat_3.q.out | 18 +- .../results/clientpositive/llap/cte_mat_4.q.out | 36 +- .../results/clientpositive/llap/cte_mat_5.q.out | 16 +- .../llap/disable_merge_for_bucketing.q.out | 12 +- .../llap/dynamic_partition_pruning.q.out | 655 +++--- .../llap/dynamic_partition_pruning_2.q.out | 201 +- .../llap/dynamic_semijoin_reduction.q.out | 100 +- .../llap/dynamic_semijoin_reduction_2.q.out | 269 ++- .../llap/dynamic_semijoin_reduction_3.q.out | 186 +- .../llap/dynamic_semijoin_reduction_sw.q.out | 34 +- .../llap/dynpart_sort_opt_vectorization.q.out | 208 +- .../llap/dynpart_sort_optimization.q.out | 288 +-- .../llap/dynpart_sort_optimization2.q.out | 68 +- .../llap/dynpart_sort_optimization_acid.q.out | 144 +- .../clientpositive/llap/empty_join.q.out | 22 +- .../clientpositive/llap/except_distinct.q.out | 48 +- .../clientpositive/llap/explainuser_1.q.out | 384 ++-- .../clientpositive/llap/explainuser_2.q.out | 116 +- .../llap/filter_join_breaktask.q.out | 36 +- .../llap/hybridgrace_hashjoin_1.q.out | 197 +- .../results/clientpositive/llap/insert1.q.out | 46 +- .../clientpositive/llap/insert_into1.q.out | 6 +- .../clientpositive/llap/intersect_all.q.out | 44 +- .../llap/intersect_distinct.q.out | 40 +- .../clientpositive/llap/intersect_merge.q.out | 540 ++--- .../clientpositive/llap/jdbc_handler.q.out | 14 +- .../results/clientpositive/llap/join46.q.out | 326 +-- .../llap/join_emit_interval.q.out | 32 +- .../llap/join_is_not_distinct_from.q.out | 102 +- .../clientpositive/llap/join_nullsafe.q.out | 102 +- .../llap/join_reordering_no_stats.q.out | 708 +++++++ .../clientpositive/llap/lateral_view.q.out | 32 +- .../clientpositive/llap/llap_nullscan.q.out | 64 +- .../clientpositive/llap/llap_partitioned.q.out | 8 +- .../results/clientpositive/llap/llap_smb.q.out | 18 +- .../clientpositive/llap/llap_stats.q.out | 4 +- .../results/clientpositive/llap/llap_udf.q.out | 36 +- .../clientpositive/llap/llapdecider.q.out | 32 +- .../clientpositive/llap/lvj_mapjoin.q.out | 60 +- .../results/clientpositive/llap/mapjoin3.q.out | 16 +- .../results/clientpositive/llap/mapjoin46.q.out | 280 +-- .../clientpositive/llap/mapjoin_decimal.q.out | 22 +- .../llap/mapjoin_emit_interval.q.out | 28 +- .../results/clientpositive/llap/merge1.q.out | 12 +- .../results/clientpositive/llap/merge2.q.out | 12 +- .../results/clientpositive/llap/mergejoin.q.out | 370 ++-- .../llap/metadata_only_queries.q.out | 68 +- .../clientpositive/llap/multiMapJoin1.q.out | 503 ++--- .../clientpositive/llap/multiMapJoin2.q.out | 8 +- .../llap/multi_count_distinct_null.q.out | 42 +- .../llap/multi_insert_lateral_view.q.out | 288 +-- .../clientpositive/llap/optimize_nullscan.q.out | 20 +- .../llap/orc_llap_nonvector.q.out | 16 +- .../clientpositive/llap/orc_merge3.q.out | 6 +- .../clientpositive/llap/orc_merge4.q.out | 6 +- .../clientpositive/llap/orc_merge5.q.out | 16 +- .../clientpositive/llap/orc_merge6.q.out | 16 +- .../clientpositive/llap/orc_merge7.q.out | 12 +- .../llap/orc_merge_incompat1.q.out | 8 +- .../llap/orc_merge_incompat2.q.out | 6 +- .../llap/orc_predicate_pushdown.q.out | 144 +- .../llap/parquet_predicate_pushdown.q.out | 136 +- .../llap/parquet_types_vectorization.q.out | 112 +- .../llap/partition_shared_scan.q.out | 116 +- .../clientpositive/llap/ppd_union_view.q.out | 66 +- .../clientpositive/llap/ptf_matchpath.q.out | 42 +- .../clientpositive/llap/rcfile_createas1.q.out | 6 +- .../clientpositive/llap/rcfile_merge3.q.out | 6 +- .../clientpositive/llap/rcfile_merge4.q.out | 6 +- .../llap/reduce_deduplicate.q.out | 10 +- .../llap/reduce_deduplicate_distinct.q.out | 84 +- .../results/clientpositive/llap/sample10.q.out | 6 +- .../llap/schema_evol_orc_nonvec_part.q.out | 54 +- ...chema_evol_orc_nonvec_part_all_complex.q.out | 18 +- ...ema_evol_orc_nonvec_part_all_primitive.q.out | 30 +- .../llap/schema_evol_orc_nonvec_table.q.out | 30 +- .../llap/schema_evol_orc_vec_part.q.out | 54 +- .../schema_evol_orc_vec_part_all_complex.q.out | 18 +- ...schema_evol_orc_vec_part_all_primitive.q.out | 30 +- .../llap/schema_evol_orc_vec_table.q.out | 30 +- .../llap/schema_evol_text_nonvec_part.q.out | 54 +- ...hema_evol_text_nonvec_part_all_complex.q.out | 18 +- ...ma_evol_text_nonvec_part_all_primitive.q.out | 30 +- .../llap/schema_evol_text_nonvec_table.q.out | 30 +- .../llap/schema_evol_text_vec_part.q.out | 54 +- .../schema_evol_text_vec_part_all_complex.q.out | 18 +- ...chema_evol_text_vec_part_all_primitive.q.out | 30 +- .../llap/schema_evol_text_vec_table.q.out | 30 +- .../llap/schema_evol_text_vecrow_part.q.out | 54 +- ...hema_evol_text_vecrow_part_all_complex.q.out | 18 +- ...ma_evol_text_vecrow_part_all_primitive.q.out | 30 +- .../llap/schema_evol_text_vecrow_table.q.out | 30 +- .../results/clientpositive/llap/semijoin.q.out | 534 ++--- .../results/clientpositive/llap/skewjoin.q.out | 76 +- .../clientpositive/llap/skewjoinopt15.q.out | 36 +- .../results/clientpositive/llap/smb_cache.q.out | 28 +- .../clientpositive/llap/smb_mapjoin_14.q.out | 244 +-- .../clientpositive/llap/smb_mapjoin_15.q.out | 104 +- .../clientpositive/llap/smb_mapjoin_17.q.out | 218 +- .../clientpositive/llap/smb_mapjoin_18.q.out | 24 +- .../clientpositive/llap/smb_mapjoin_19.q.out | 6 +- .../clientpositive/llap/smb_mapjoin_4.q.out | 310 +-- .../clientpositive/llap/smb_mapjoin_5.q.out | 310 +-- .../clientpositive/llap/smb_mapjoin_6.q.out | 108 +- .../results/clientpositive/llap/sqlmerge.q.out | 76 +- .../results/clientpositive/llap/stats11.q.out | 44 +- .../clientpositive/llap/stats_only_null.q.out | 24 +- .../clientpositive/llap/subquery_exists.q.out | 10 +- .../clientpositive/llap/subquery_in.q.out | 36 +- .../clientpositive/llap/subquery_multi.q.out | 416 ++-- .../clientpositive/llap/subquery_notin.q.out | 366 ++-- .../clientpositive/llap/subquery_scalar.q.out | 530 ++--- .../clientpositive/llap/subquery_select.q.out | 48 +- .../results/clientpositive/llap/sysdb.q.out | 12 +- .../llap/table_access_keys_stats.q.out | 8 +- .../clientpositive/llap/temp_table.q.out | 30 +- .../llap/tez_bmj_schema_evolution.q.out | 22 +- .../results/clientpositive/llap/tez_dml.q.out | 6 +- .../results/clientpositive/llap/tez_join.q.out | 24 +- .../clientpositive/llap/tez_join_hash.q.out | 8 +- .../llap/tez_join_result_complex.q.out | 40 +- .../clientpositive/llap/tez_nway_join.q.out | 90 +- .../clientpositive/llap/tez_self_join.q.out | 36 +- .../results/clientpositive/llap/tez_smb_1.q.out | 106 +- .../clientpositive/llap/tez_smb_empty.q.out | 82 +- .../clientpositive/llap/tez_smb_main.q.out | 470 +++-- .../llap/tez_union_group_by.q.out | 66 +- .../clientpositive/llap/unionDistinct_1.q.out | 994 ++++----- .../clientpositive/llap/union_remove_26.q.out | 8 +- .../llap/vector_adaptor_usage_mode.q.out | 100 +- .../llap/vector_aggregate_9.q.out | 36 +- .../llap/vector_aggregate_without_gby.q.out | 10 +- .../llap/vector_auto_smb_mapjoin_14.q.out | 260 +-- .../llap/vector_between_columns.q.out | 34 +- .../clientpositive/llap/vector_between_in.q.out | 166 +- .../llap/vector_binary_join_groupby.q.out | 56 +- .../clientpositive/llap/vector_bucket.q.out | 10 +- .../llap/vector_cast_constant.q.out | 18 +- .../clientpositive/llap/vector_char_2.q.out | 36 +- .../clientpositive/llap/vector_char_4.q.out | 6 +- .../llap/vector_char_mapjoin1.q.out | 78 +- .../clientpositive/llap/vector_coalesce_2.q.out | 40 +- .../llap/vector_complex_all.q.out | 86 +- .../llap/vector_complex_join.q.out | 8 +- .../clientpositive/llap/vector_count.q.out | 48 +- .../llap/vector_count_distinct.q.out | 18 +- .../clientpositive/llap/vector_data_types.q.out | 24 +- .../clientpositive/llap/vector_date_1.q.out | 68 +- .../clientpositive/llap/vector_decimal_1.q.out | 90 +- .../llap/vector_decimal_10_0.q.out | 10 +- .../llap/vector_decimal_aggregate.q.out | 32 +- .../llap/vector_decimal_expressions.q.out | 14 +- .../llap/vector_decimal_mapjoin.q.out | 18 +- .../llap/vector_decimal_math_funcs.q.out | 8 +- .../llap/vector_decimal_precision.q.out | 12 +- .../llap/vector_decimal_round.q.out | 40 +- .../llap/vector_decimal_udf.q.out | 258 +-- .../llap/vector_decimal_udf2.q.out | 16 +- .../clientpositive/llap/vector_distinct_2.q.out | 14 +- .../clientpositive/llap/vector_groupby4.q.out | 14 +- .../clientpositive/llap/vector_groupby6.q.out | 14 +- .../clientpositive/llap/vector_groupby_3.q.out | 14 +- .../llap/vector_groupby_cube1.q.out | 126 +- .../llap/vector_groupby_grouping_id1.q.out | 84 +- .../llap/vector_groupby_grouping_id2.q.out | 220 +- .../llap/vector_groupby_grouping_id3.q.out | 32 +- .../llap/vector_groupby_grouping_sets1.q.out | 96 +- .../llap/vector_groupby_grouping_sets2.q.out | 72 +- .../llap/vector_groupby_grouping_sets3.q.out | 46 +- .../llap/vector_groupby_grouping_sets4.q.out | 98 +- .../llap/vector_groupby_grouping_sets5.q.out | 64 +- .../llap/vector_groupby_grouping_sets6.q.out | 24 +- .../vector_groupby_grouping_sets_grouping.q.out | 182 +- .../vector_groupby_grouping_sets_limit.q.out | 118 +- .../llap/vector_groupby_reduce.q.out | 84 +- .../llap/vector_groupby_rollup1.q.out | 98 +- .../llap/vector_grouping_sets.q.out | 40 +- .../llap/vector_include_no_sel.q.out | 12 +- .../clientpositive/llap/vector_inner_join.q.out | 160 +- .../clientpositive/llap/vector_interval_1.q.out | 80 +- .../clientpositive/llap/vector_interval_2.q.out | 112 +- .../llap/vector_interval_arithmetic.q.out | 60 +- .../llap/vector_interval_mapjoin.q.out | 20 +- .../clientpositive/llap/vector_join30.q.out | 318 ++- .../llap/vector_left_outer_join2.q.out | 96 +- .../llap/vector_leftsemi_mapjoin.q.out | 1904 +++++++++--------- .../llap/vector_mr_diff_schema_alias.q.out | 55 +- .../llap/vector_multi_insert.q.out | 6 +- .../llap/vector_nullsafe_join.q.out | 176 +- .../llap/vector_number_compare_projection.q.out | 24 +- .../clientpositive/llap/vector_orderby_5.q.out | 16 +- .../llap/vector_outer_join0.q.out | 28 +- .../llap/vector_partition_diff_num_cols.q.out | 60 +- .../llap/vector_partitioned_date_time.q.out | 156 +- .../llap/vector_ptf_part_simple.q.out | 312 +-- .../clientpositive/llap/vector_reduce1.q.out | 10 +- .../clientpositive/llap/vector_reduce2.q.out | 10 +- .../clientpositive/llap/vector_reduce3.q.out | 10 +- .../llap/vector_reduce_groupby_decimal.q.out | 18 +- .../llap/vector_string_concat.q.out | 26 +- .../clientpositive/llap/vector_struct_in.q.out | 48 +- .../clientpositive/llap/vector_udf1.q.out | 220 +- .../llap/vector_udf_character_length.q.out | 12 +- .../llap/vector_udf_octet_length.q.out | 6 +- .../clientpositive/llap/vector_varchar_4.q.out | 6 +- .../llap/vector_varchar_mapjoin1.q.out | 154 +- .../llap/vector_varchar_simple.q.out | 24 +- .../llap/vector_when_case_null.q.out | 12 +- .../llap/vector_windowing_navfn.q.out | 132 +- .../llap/vectorization_decimal_date.q.out | 10 +- .../llap/vectorization_part_project.q.out | 8 +- .../llap/vectorization_short_regress.q.out | 72 +- .../llap/vectorized_bucketmapjoin1.q.out | 54 +- .../llap/vectorized_context.q.out | 109 +- .../llap/vectorized_date_funcs.q.out | 42 +- .../llap/vectorized_distinct_gby.q.out | 14 +- .../vectorized_dynamic_partition_pruning.q.out | 649 +++--- .../vectorized_dynamic_semijoin_reduction.q.out | 238 +-- .../clientpositive/llap/vectorized_join46.q.out | 250 +-- .../llap/vectorized_parquet.q.out | 12 +- .../llap/vectorized_parquet_types.q.out | 16 +- .../clientpositive/llap/vectorized_ptf.q.out | 538 ++--- .../llap/vectorized_timestamp.q.out | 34 +- .../llap/vectorized_timestamp_funcs.q.out | 80 +- .../results/clientpositive/merge_join_1.q.out | 68 +- .../test/results/clientpositive/mergejoin.q.out | 132 +- .../clientpositive/mergejoins_mixed.q.out | 237 ++- .../results/clientpositive/perf/query14.q.out | 2 +- .../test/results/clientpositive/ppd_join5.q.out | 122 +- .../clientpositive/ppd_outer_join5.q.out | 64 +- .../results/clientpositive/smb_mapjoin_47.q.out | 120 +- .../spark/auto_join_reordering_values.q.out | 10 +- .../clientpositive/spark/auto_join_stats.q.out | 34 +- .../clientpositive/spark/auto_join_stats2.q.out | 95 +- .../spark/auto_smb_mapjoin_14.q.out | 2 +- .../spark/auto_sortmerge_join_12.q.out | 8 +- .../spark/auto_sortmerge_join_6.q.out | 464 +++-- .../spark/auto_sortmerge_join_9.q.out | 4 +- .../spark/bucket_map_join_tez1.q.out | 214 +- .../spark/bucket_map_join_tez2.q.out | 28 +- .../spark/column_access_stats.q.out | 38 +- .../results/clientpositive/spark/join19.q.out | 136 +- .../spark/join_cond_pushdown_unqual1.q.out | 274 +-- .../spark/join_cond_pushdown_unqual2.q.out | 152 +- .../spark/join_cond_pushdown_unqual3.q.out | 274 +-- .../spark/join_cond_pushdown_unqual4.q.out | 152 +- .../clientpositive/spark/join_hive_626.q.out | 64 +- .../clientpositive/spark/join_star.q.out | 137 +- .../clientpositive/spark/mergejoins_mixed.q.out | 171 +- .../clientpositive/spark/ppd_join5.q.out | 130 +- .../clientpositive/spark/ppd_outer_join5.q.out | 64 +- .../spark/spark_dynamic_partition_pruning.q.out | 96 +- .../spark_dynamic_partition_pruning_3.q.out | 118 +- ...dynamic_partition_pruning_mapjoin_only.q.out | 192 +- .../spark/spark_explainuser_1.q.out | 352 ++-- .../spark/spark_use_op_stats.q.out | 92 +- .../clientpositive/spark/stats_only_null.q.out | 24 +- .../spark/table_access_keys_stats.q.out | 8 +- .../clientpositive/stats_only_null.q.out | 24 +- .../clientpositive/stats_partial_size.q.out | 10 +- .../results/clientpositive/stats_ppr_all.q.out | 6 +- .../clientpositive/tez/explainanalyze_2.q.out | 208 +- .../clientpositive/tez/explainanalyze_3.q.out | 47 +- .../clientpositive/tez/explainanalyze_5.q.out | 14 +- .../clientpositive/tez/explainuser_3.q.out | 43 +- .../tez/hybridgrace_hashjoin_1.q.out | 195 +- .../tez/multi_count_distinct.q.out | 30 +- .../results/clientpositive/tez/tez-tag.q.out | 59 +- .../tez/vector_join_part_col_char.q.out | 16 +- .../tez/vector_non_string_partition.q.out | 28 +- .../vector_mr_diff_schema_alias.q.out | 39 +- .../clientpositive/vector_outer_join6.q.out | 4 +- .../clientpositive/vectorized_context.q.out | 66 +- 342 files changed, 19045 insertions(+), 17149 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 17af16b..6de07d2 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1667,6 +1667,12 @@ public class HiveConf extends Configuration { "Whether or not to use a binary search to find the entries in an index table that match the filter, where possible"), // Statistics + HIVE_STATS_ESTIMATE_STATS("hive.stats.estimate", true, + "Estimate statistics in absence of statistics."), + HIVE_STATS_NDV_ESTIMATE_PERC("hive.stats.ndv.estimate.percent", (float)20, + "This many percentage of rows will be estimated as count distinct in absence of statistics."), + HIVE_STATS_NUM_NULLS_ESTIMATE_PERC("hive.stats.num.nulls.estimate.percent", (float)5, + "This many percentage of rows will be estimated as number of nulls in absence of statistics."), HIVESTATSAUTOGATHER("hive.stats.autogather", true, "A flag to gather statistics (only basic) automatically during the INSERT OVERWRITE command."), HIVESTATSCOLAUTOGATHER("hive.stats.column.autogather", false, http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 6504250..7385df6 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -531,6 +531,7 @@ minillaplocal.query.files=\ join_nulls.q,\ join_nullsafe.q,\ join_is_not_distinct_from.q,\ + join_reordering_no_stats.q,\ leftsemijoin_mr.q,\ limit_join_transpose.q,\ lineage2.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java index 22790de..85aa9b3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java @@ -211,19 +211,10 @@ public class RelOptHiveTable extends RelOptAbstractTable { // predicates computePartitionList(hiveConf, null, new HashSet<Integer>()); } - if (hiveTblMetadata.isPartitioned()) { - List<Long> rowCounts = StatsUtils.getBasicStatForPartitions(hiveTblMetadata, - partitionList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT); - rowCount = StatsUtils.getSumIgnoreNegatives(rowCounts); - - } else { - rowCount = StatsUtils.getNumRows(hiveTblMetadata); - } + rowCount = StatsUtils.getNumRows(hiveConf, getNonPartColumns(), hiveTblMetadata, + partitionList, noColsMissingStats); } - if (rowCount == -1) - noColsMissingStats.getAndIncrement(); - return rowCount; } @@ -308,44 +299,59 @@ public class RelOptHiveTable extends RelOptAbstractTable { // 2. Obtain Col Stats for Non Partition Cols if (nonPartColNamesThatRqrStats.size() > 0) { - List<ColStatistics> hiveColStats; + List<ColStatistics> hiveColStats = new ArrayList<ColStatistics>(); if (!hiveTblMetadata.isPartitioned()) { // 2.1 Handle the case for unpartitioned table. - hiveColStats = StatsUtils.getTableColumnStats(hiveTblMetadata, hiveNonPartitionCols, - nonPartColNamesThatRqrStats, colStatsCached); - - // 2.1.1 Record Column Names that we needed stats for but couldn't - if (hiveColStats == null) { - colNamesFailedStats.addAll(nonPartColNamesThatRqrStats); - colStatsCached.updateState(State.NONE); - } else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) { - Set<String> setOfFiledCols = new HashSet<String>(nonPartColNamesThatRqrStats); - - Set<String> setOfObtainedColStats = new HashSet<String>(); - for (ColStatistics cs : hiveColStats) { - setOfObtainedColStats.add(cs.getColumnName()); + try { + Statistics stats = StatsUtils.collectStatistics(hiveConf, null, + hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, + colStatsCached, nonPartColNamesThatRqrStats, true, true); + rowCount = stats.getNumRows(); + for (String c : nonPartColNamesThatRqrStats) { + ColStatistics cs = stats.getColumnStatisticsFromColName(c); + if (cs != null) { + hiveColStats.add(cs); + } } - setOfFiledCols.removeAll(setOfObtainedColStats); + colStatsCached.updateState(stats.getColumnStatsState()); - colNamesFailedStats.addAll(setOfFiledCols); + // 2.1.1 Record Column Names that we needed stats for but couldn't + if (hiveColStats.isEmpty()) { + colNamesFailedStats.addAll(nonPartColNamesThatRqrStats); + } else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) { + Set<String> setOfFiledCols = new HashSet<String>(nonPartColNamesThatRqrStats); - colStatsCached.updateState(State.PARTIAL); - } else { - // Column stats in hiveColStats might not be in the same order as the columns in - // nonPartColNamesThatRqrStats. reorder hiveColStats so we can build hiveColStatsMap - // using nonPartColIndxsThatRqrStats as below - Map<String, ColStatistics> columnStatsMap = - new HashMap<String, ColStatistics>(hiveColStats.size()); - for (ColStatistics cs : hiveColStats) { - columnStatsMap.put(cs.getColumnName(), cs); - } - hiveColStats.clear(); - for (String colName : nonPartColNamesThatRqrStats) { - hiveColStats.add(columnStatsMap.get(colName)); - } + Set<String> setOfObtainedColStats = new HashSet<String>(); + for (ColStatistics cs : hiveColStats) { + setOfObtainedColStats.add(cs.getColumnName()); + } + setOfFiledCols.removeAll(setOfObtainedColStats); - colStatsCached.updateState(State.COMPLETE); + colNamesFailedStats.addAll(setOfFiledCols); + } else { + // Column stats in hiveColStats might not be in the same order as the columns in + // nonPartColNamesThatRqrStats. reorder hiveColStats so we can build hiveColStatsMap + // using nonPartColIndxsThatRqrStats as below + Map<String, ColStatistics> columnStatsMap = + new HashMap<String, ColStatistics>(hiveColStats.size()); + for (ColStatistics cs : hiveColStats) { + columnStatsMap.put(cs.getColumnName(), cs); + // even though the stats were estimated we need to warn user that + // stats are not available + if(cs.isEstimated()) { + colNamesFailedStats.add(cs.getColumnName()); + } + } + hiveColStats.clear(); + for (String colName : nonPartColNamesThatRqrStats) { + hiveColStats.add(columnStatsMap.get(colName)); + } + } + } catch (HiveException e) { + String logMsg = "Collecting stats for table: " + hiveTblMetadata.getTableName() + " failed."; + LOG.error(logMsg, e); + throw new RuntimeException(logMsg, e); } } else { // 2.2 Obtain col stats for partitioned table. @@ -373,6 +379,9 @@ public class RelOptHiveTable extends RelOptAbstractTable { ColStatistics cs = stats.getColumnStatisticsFromColName(c); if (cs != null) { hiveColStats.add(cs); + if(cs.isEstimated()) { + colNamesFailedStats.add(c); + } } else { colNamesFailedStats.add(c); } http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index ad29d65..423913b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -1258,7 +1258,6 @@ public class StatsRulesProcFactory { // be full aggregation query like count(*) in which case number of // rows will be 1 if (colExprMap.isEmpty()) { - stats.setNumRows(1); updateStats(stats, 1, true, gop); } } @@ -1435,6 +1434,17 @@ public class StatsRulesProcFactory { break; } } + // there could be case where join operators input are not RS e.g. + // map join with Spark. Since following estimation of statistics relies on join operators having it inputs as + // reduced sink it will not work for such cases. So we should not try to estimate stats + if(allSatisfyPreCondition) { + for (int pos = 0; pos < parents.size(); pos++) { + if (!(jop.getParentOperators().get(pos) instanceof ReduceSinkOperator)) { + allSatisfyPreCondition = false; + break; + } + } + } if (allSatisfyPreCondition) { http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java index f2d2e2d..1aafa9e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java @@ -29,11 +29,13 @@ public class ColStatistics { private long numFalses; private Range range; private boolean isPrimaryKey; + private boolean isEstimated; public ColStatistics(String colName, String colType) { this.setColumnName(colName); this.setColumnType(colType); this.setPrimaryKey(false); + this.setIsEstimated(false); } public ColStatistics() { @@ -131,6 +133,9 @@ public class ColStatistics { } sb.append(" isPrimaryKey: "); sb.append(isPrimaryKey); + + sb.append(" isEstimated: "); + sb.append(isEstimated); return sb.toString(); } @@ -143,6 +148,7 @@ public class ColStatistics { clone.setNumTrues(numTrues); clone.setNumFalses(numFalses); clone.setPrimaryKey(isPrimaryKey); + clone.setIsEstimated(isEstimated); if (range != null ) { clone.setRange(range.clone()); } @@ -157,6 +163,12 @@ public class ColStatistics { this.isPrimaryKey = isPrimaryKey; } + public void setIsEstimated(boolean isEstimated) { + this.isEstimated= isEstimated; + } + + public boolean isEstimated() { return isEstimated; } + public static class Range { public final Number minValue; public final Number maxValue; http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 487a823..3041968 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -33,6 +33,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -177,6 +178,105 @@ public class StatsUtils { return ds; } + /** + * Returns number of rows if it exists. Otherwise it estimates number of rows + * based on estimated data size for both partition and non-partitioned table + * RelOptHiveTable's getRowCount uses this. + * + * @param conf + * @param schema + * @param table + * @return + */ + public static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table table, + PrunedPartitionList partitionList, AtomicInteger noColsMissingStats) { + //for non-partitioned table + List<String> neededColumns = new ArrayList<>(); + for(ColumnInfo ci:schema) { + neededColumns.add(ci.getInternalName()); + } + + boolean shouldEstimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS); + + if(!table.isPartitioned()) { + //get actual number of rows from metastore + long nr = getNumRows(table); + + // log warning if row count is missing + if(nr <= 0) { + noColsMissingStats.getAndIncrement(); + } + + // if row count exists or stats aren't to be estimated return + // whatever we have + if(nr > 0 || !shouldEstimateStats) { + return nr; + } + // go ahead with the estimation + long ds = getDataSize(conf, table); + return getNumRows(conf, schema, neededColumns, table, ds); + } + else { // partitioned table + long nr = 0; + List<Long> rowCounts = Lists.newArrayList(); + rowCounts = getBasicStatForPartitions( + table, partitionList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT); + nr = getSumIgnoreNegatives(rowCounts); + + // log warning if row count is missing + if(nr <= 0) { + noColsMissingStats.getAndIncrement(); + } + + // if row count exists or stats aren't to be estimated return + // whatever we have + if(nr > 0 || !shouldEstimateStats) { + return nr; + } + + // estimate row count + long ds = 0; + List<Long> dataSizes = Lists.newArrayList(); + + dataSizes = getBasicStatForPartitions( + table, partitionList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE); + + ds = getSumIgnoreNegatives(dataSizes); + + if (ds <= 0) { + dataSizes = getBasicStatForPartitions( + table, partitionList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE); + ds = getSumIgnoreNegatives(dataSizes); + } + + // if data size still could not be determined, then fall back to filesytem to get file + // sizes + if (ds <= 0 && shouldEstimateStats) { + dataSizes = getFileSizeForPartitions(conf, partitionList.getNotDeniedPartns()); + } + ds = getSumIgnoreNegatives(dataSizes); + float deserFactor = + HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR); + ds = (long) (ds * deserFactor); + + int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns); + if (avgRowSize > 0) { + setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize); + nr = getSumIgnoreNegatives(rowCounts); + ds = getSumIgnoreNegatives(dataSizes); + + // number of rows -1 means that statistics from metastore is not reliable + if (nr <= 0) { + nr = ds / avgRowSize; + } + } + if (nr == 0) { + nr = 1; + } + return nr; + } + } + private static long getNumRows(HiveConf conf, List<ColumnInfo> schema, List<String> neededColumns, Table table, long ds) { long nr = getNumRows(table); // number of rows -1 means that statistics from metastore is not reliable @@ -210,15 +310,21 @@ public class StatsUtils { float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR); + boolean shouldEstimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS); if (!table.isPartitioned()) { - long ds = getDataSize(conf, table); + //getDataSize tries to estimate stats if it doesn't exist using file size + // we would like to avoid file system calls if it too expensive + long ds = shouldEstimateStats? getDataSize(conf, table): getRawDataSize(table); long nr = getNumRows(conf, schema, neededColumns, table, ds); stats.setNumRows(nr); List<ColStatistics> colStats = Lists.newArrayList(); if (fetchColStats) { colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache); + if(colStats == null || colStats.size() < 1) { + colStats = estimateStats(table,schema,neededColumns, conf, nr); + } long betterDS = getDataSizeFromColumnStats(nr, colStats); ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS; } @@ -254,7 +360,7 @@ public class StatsUtils { // if data size still could not be determined, then fall back to filesytem to get file // sizes - if (ds <= 0) { + if (ds <= 0 && shouldEstimateStats) { dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns()); } ds = getSumIgnoreNegatives(dataSizes); @@ -354,10 +460,19 @@ public class StatsUtils { // There are some partitions with no state (or we didn't fetch any state). // Update the stats with empty list to reflect that in the // state/initialize structures. + + if(columnStats.isEmpty()) { + // estimate stats + columnStats = estimateStats(table, schema, neededColumns, conf, nr); + } + + // add partition column stats addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats); - stats.addToColumnStats(columnStats); + stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats)); stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns)); + + stats.addToColumnStats(columnStats); } else { if (statsRetrieved) { columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName())); @@ -765,7 +880,8 @@ public class StatsUtils { hasNull = (colStats == null) || (colStats.size() < neededColumns.size()); if (colStats != null) { for (ColStatistics cs : colStats) { - boolean isNull = cs == null; + // either colstats is null or is estimated + boolean isNull = (cs == null) ? true: (cs.isEstimated()); hasStats |= !isNull; hasNull |= isNull; if (hasNull && hasStats) break; @@ -869,6 +985,78 @@ public class StatsUtils { return cs; } + private static ColStatistics estimateColStats(long numRows, String colName, HiveConf conf, + List<ColumnInfo> schema) { + ColumnInfo cinfo = getColumnInfoForColumn(colName, schema); + ColStatistics cs = new ColStatistics(colName, cinfo.getTypeName()); + cs.setIsEstimated(true); + + String colTypeLowerCase = cinfo.getTypeName().toLowerCase(); + + float ndvPercent = Math.min(100L, HiveConf.getFloatVar(conf, ConfVars.HIVE_STATS_NDV_ESTIMATE_PERC)); + float nullPercent = Math.min(100L, HiveConf.getFloatVar(conf, ConfVars.HIVE_STATS_NUM_NULLS_ESTIMATE_PERC)); + + cs.setCountDistint(Math.max(1, (long)(numRows * ndvPercent/100.00))); + cs.setNumNulls(Math.min(numRows, (long)(numRows * nullPercent/100.00))); + + if (colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME)){ + cs.setAvgColLen(JavaDataModel.get().primitive1()); + cs.setRange(-128,127); + } + else if(colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME)){ + cs.setAvgColLen(JavaDataModel.get().primitive1()); + cs.setRange(-32768, 32767); + } else if(colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME)) { + cs.setAvgColLen(JavaDataModel.get().primitive1()); + cs.setRange(Long.MIN_VALUE, Long.MAX_VALUE); + } else if (colTypeLowerCase.equals(serdeConstants.BIGINT_TYPE_NAME)) { + cs.setAvgColLen(JavaDataModel.get().primitive2()); + cs.setRange(Integer.MIN_VALUE, Integer.MAX_VALUE); + } else if (colTypeLowerCase.equals(serdeConstants.FLOAT_TYPE_NAME)) { + cs.setAvgColLen(JavaDataModel.get().primitive1()); + cs.setRange(Float.MIN_VALUE, Float.MAX_VALUE); + } else if (colTypeLowerCase.equals(serdeConstants.DOUBLE_TYPE_NAME)) { + cs.setAvgColLen(JavaDataModel.get().primitive2()); + cs.setRange(Double.MIN_VALUE, Double.MAX_VALUE); + } else if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME) + || colTypeLowerCase.startsWith(serdeConstants.BINARY_TYPE_NAME) + || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME) + || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) { + cs.setAvgColLen(getAvgColLenOf(conf,cinfo.getObjectInspector(), cinfo.getTypeName())); + } else if (colTypeLowerCase.equals(serdeConstants.BOOLEAN_TYPE_NAME)) { + cs.setCountDistint(2); + cs.setNumTrues(Math.max(1, (long)numRows/2)); + cs.setNumFalses(Math.max(1, (long)numRows/2)); + cs.setAvgColLen(JavaDataModel.get().primitive1()); + } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME) || + colTypeLowerCase.equals(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)) { + cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp()); + } else if (colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) { + cs.setAvgColLen(JavaDataModel.get().lengthOfDecimal()); + cs.setRange(Float.MIN_VALUE, Float.MAX_VALUE); + } else if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) { + cs.setAvgColLen(JavaDataModel.get().lengthOfDate()); + // epoch, days since epoch + cs.setRange(0, 25201); + } else { + // Columns statistics for complex datatypes are not supported yet + return null; + } + return cs; + } + + private static List<ColStatistics> estimateStats(Table table, List<ColumnInfo> schema, + List<String> neededColumns, HiveConf conf, long nr) { + + List<ColStatistics> stats = new ArrayList<ColStatistics>(neededColumns.size()); + + for (int i = 0; i < neededColumns.size(); i++) { + ColStatistics cs = estimateColStats(nr, neededColumns.get(i), conf, schema); + stats.add(cs); + } + return stats; + } + /** * Get table level column statistics from metastore for needed columns * @param table @@ -912,10 +1100,10 @@ public class StatsUtils { } // Merge stats from cache with metastore cache if (colStatsCache != null) { - for (int i = 0; i < neededColumns.size(); i++) { - ColStatistics cs = colStatsCache.getColStats().get(neededColumns.get(i)); + for(String col:neededColumns) { + ColStatistics cs = colStatsCache.getColStats().get(col); if (cs != null) { - stats.add(i, cs); + stats.add(cs); if (LOG.isDebugEnabled()) { LOG.debug("Stats for column " + cs.getColumnName() + " in table " + table.getCompleteName() + " retrieved from cache"); @@ -1153,7 +1341,9 @@ public class StatsUtils { } else if (colTypeLowerCase.equals(serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME)) { return JavaDataModel.JAVA32_META; } else { - throw new IllegalArgumentException("Size requested for unknown type: " + colType); + //TODO: support complex types + // for complex type we simply return 0 + return 0; } } http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/queries/clientpositive/join_reordering_no_stats.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/join_reordering_no_stats.q b/ql/src/test/queries/clientpositive/join_reordering_no_stats.q new file mode 100644 index 0000000..3ea9f0c --- /dev/null +++ b/ql/src/test/queries/clientpositive/join_reordering_no_stats.q @@ -0,0 +1,63 @@ +set hive.stats.autogather=false; + +create table supplier_nostats (S_SUPPKEY INT, S_NAME STRING, S_ADDRESS STRING, S_NATIONKEY INT, +S_PHONE STRING, S_ACCTBAL DOUBLE, S_COMMENT STRING); + +CREATE TABLE lineitem_nostats (L_ORDERKEY INT, + L_PARTKEY INT, + L_SUPPKEY INT, + L_LINENUMBER INT, + L_QUANTITY DOUBLE, + L_EXTENDEDPRICE DOUBLE, + L_DISCOUNT DOUBLE, + L_TAX DOUBLE, + L_RETURNFLAG STRING, + L_LINESTATUS STRING, + l_shipdate STRING, + L_COMMITDATE STRING, + L_RECEIPTDATE STRING, + L_SHIPINSTRUCT STRING, + L_SHIPMODE STRING, + L_COMMENT STRING) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|'; + +CREATE TABLE part_nostats( + p_partkey INT, + p_name STRING, + p_mfgr STRING, + p_brand STRING, + p_type STRING, + p_size INT, + p_container STRING, + p_retailprice DOUBLE, + p_comment STRING +); + +-- should not have cross join +explain select count(1) from part_nostats,supplier_nostats,lineitem_nostats where p_partkey = l_partkey and s_suppkey = l_suppkey; + +set hive.stats.estimate=false; +explain select count(1) from part_nostats,supplier_nostats,lineitem_nostats where p_partkey = l_partkey and s_suppkey = l_suppkey; + +CREATE TABLE Employee_Part(employeeID int, employeeName String) partitioned by (employeeSalary double, country string) +row format delimited fields terminated by '|' stored as textfile; + +LOAD DATA LOCAL INPATH "../../data/files/employee.dat" INTO TABLE Employee_Part partition(employeeSalary='2000.0', country='USA'); +LOAD DATA LOCAL INPATH "../../data/files/employee2.dat" INTO TABLE Employee_Part partition(employeeSalary='2000.0', country='UK'); +LOAD DATA LOCAL INPATH "../../data/files/employee2.dat" INTO TABLE Employee_Part partition(employeeSalary='3000.0', country='USA'); +LOAD DATA LOCAL INPATH "../../data/files/employee2.dat" INTO TABLE Employee_Part partition(employeeSalary='4000.0', country='USA'); +LOAD DATA LOCAL INPATH "../../data/files/employee2.dat" INTO TABLE Employee_Part partition(employeeSalary='3500.0', country='UK'); +LOAD DATA LOCAL INPATH "../../data/files/employee.dat" INTO TABLE Employee_Part partition(employeeSalary='3000.0', country='UK'); + +-- partitioned table +set hive.stats.estimate=true; +explain select count(1) from Employee_Part,supplier_nostats,lineitem_nostats where employeeID= l_partkey and s_suppkey = l_suppkey; + +set hive.stats.estimate=false; +explain select count(1) from Employee_Part,supplier_nostats,lineitem_nostats where employeeID= l_partkey and s_suppkey = l_suppkey; + +drop table Employee_Part; +drop table supplier_nostats; +drop table lineitem_nostats; +drop table part_nostats; http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/annotate_stats_filter.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/annotate_stats_filter.q.out b/ql/src/test/results/clientpositive/annotate_stats_filter.q.out index e22c3ef..b2f9836 100644 --- a/ql/src/test/results/clientpositive/annotate_stats_filter.q.out +++ b/ql/src/test/results/clientpositive/annotate_stats_filter.q.out @@ -66,11 +66,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: explain select * from loc_orc where state='OH' @@ -87,17 +87,17 @@ STAGE PLANS: Map Operator Tree: TableScan alias: loc_orc - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: (state = 'OH') (type: boolean) - Statistics: Num rows: 4 Data size: 398 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: 'OH' (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 4 Data size: 398 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 4 Data size: 398 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out b/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out index fccfabd..f9a1eb8 100644 --- a/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out +++ b/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out @@ -66,11 +66,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: analyze table loc_orc compute statistics for columns state http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/annotate_stats_part.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/annotate_stats_part.q.out b/ql/src/test/results/clientpositive/annotate_stats_part.q.out index 866d30a..def4d4f 100644 --- a/ql/src/test/results/clientpositive/annotate_stats_part.q.out +++ b/ql/src/test/results/clientpositive/annotate_stats_part.q.out @@ -54,11 +54,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 1 Data size: 380 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 1 Data size: 380 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: insert overwrite table loc_orc partition(year) select * from loc_staging @@ -90,11 +90,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 6 Data size: 1884 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 6 Data size: 3060 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 6 Data size: 1104 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 6 Data size: 2280 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: analyze table loc_orc partition(year='2001') compute statistics @@ -121,11 +121,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 3 Data size: 372 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 936 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), '__HIVE_DEFAULT_PARTITION__' (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 3 Data size: 372 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 936 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: explain select * from loc_orc @@ -142,11 +142,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 7 Data size: 1966 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 7 Data size: 3338 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 7 Data size: 1288 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 7 Data size: 2660 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: explain select * from loc_orc where year='2001' @@ -163,11 +163,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 7 Data size: 734 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 7 Data size: 2050 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), '2001' (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 7 Data size: 734 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 7 Data size: 2050 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: analyze table loc_orc partition(year) compute statistics @@ -196,11 +196,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 292 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), '__HIVE_DEFAULT_PARTITION__' (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 292 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: explain select * from loc_orc @@ -217,11 +217,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 8 Data size: 2246 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 3814 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 1472 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 3040 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: explain select * from loc_orc where year='2001' or year='__HIVE_DEFAULT_PARTITION__' @@ -238,11 +238,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 8 Data size: 2246 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 3814 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 1472 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 8 Data size: 3040 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: explain select * from loc_orc where year='2001' and year='__HIVE_DEFAULT_PARTITION__' @@ -259,14 +259,14 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 1 Data size: 380 Basic stats: COMPLETE Column stats: PARTIAL Filter Operator predicate: false (type: boolean) - Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 1 Data size: 380 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 1 Data size: 380 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: analyze table loc_orc partition(year='2001') compute statistics for columns state,locid @@ -398,11 +398,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 284 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: state (type: string), locid (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 284 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: explain select * from loc_orc http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/annotate_stats_select.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/annotate_stats_select.q.out b/ql/src/test/results/clientpositive/annotate_stats_select.q.out index e3f08ea..dec7f40 100644 --- a/ql/src/test/results/clientpositive/annotate_stats_select.q.out +++ b/ql/src/test/results/clientpositive/annotate_stats_select.q.out @@ -103,11 +103,11 @@ STAGE PLANS: Processor Tree: TableScan alias: alltypes_orc - Statistics: Num rows: 2 Data size: 1686 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2 Data size: 1002 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: bo1 (type: boolean), ti1 (type: tinyint), si1 (type: smallint), i1 (type: int), bi1 (type: bigint), f1 (type: float), d1 (type: double), de1 (type: decimal(10,0)), ts1 (type: timestamp), da1 (type: timestamp), s1 (type: string), vc1 (type: varchar(5)), m1 (type: map<string,string>), l1 (type: array<int>), st1 (type: struct<c1:int,c2:string>) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 - Statistics: Num rows: 2 Data size: 1686 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2 Data size: 1002 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: analyze table alltypes_orc compute statistics for columns bo1, ti1, si1, i1, bi1, f1, d1, s1, vc1 http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/annotate_stats_table.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/annotate_stats_table.q.out b/ql/src/test/results/clientpositive/annotate_stats_table.q.out index efc3c1f..5d443f1 100644 --- a/ql/src/test/results/clientpositive/annotate_stats_table.q.out +++ b/ql/src/test/results/clientpositive/annotate_stats_table.q.out @@ -42,11 +42,11 @@ STAGE PLANS: Processor Tree: TableScan alias: emp_orc - Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: lastname (type: string), deptid (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging @@ -81,11 +81,11 @@ STAGE PLANS: Processor Tree: TableScan alias: emp_orc - Statistics: Num rows: 3 Data size: 394 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 564 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: lastname (type: string), deptid (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 3 Data size: 394 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 564 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: analyze table emp_orc compute statistics @@ -110,11 +110,11 @@ STAGE PLANS: Processor Tree: TableScan alias: emp_orc - Statistics: Num rows: 48 Data size: 4512 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 48 Data size: 8836 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: lastname (type: string), deptid (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 48 Data size: 4512 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 48 Data size: 8836 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: analyze table emp_orc compute statistics for columns deptid http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/auto_join_reordering_values.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/auto_join_reordering_values.q.out b/ql/src/test/results/clientpositive/auto_join_reordering_values.q.out index 156be41..ba8d16c 100644 --- a/ql/src/test/results/clientpositive/auto_join_reordering_values.q.out +++ b/ql/src/test/results/clientpositive/auto_join_reordering_values.q.out @@ -182,7 +182,7 @@ STAGE PLANS: name: default.orderpayment_small name: default.orderpayment_small Truncated Path -> Alias: - /orderpayment_small [$hdt$_0:orderpayment, $hdt$_1:dim_pay_date] + /orderpayment_small [$hdt$_1:orderpayment, $hdt$_2:dim_pay_date] Needs Tagging: true Reduce Operator Tree: Join Operator @@ -318,7 +318,7 @@ STAGE PLANS: name: default.orderpayment_small name: default.orderpayment_small Truncated Path -> Alias: - /orderpayment_small [$hdt$_2:deal] + /orderpayment_small [$hdt$_3:deal] #### A masked pattern was here #### Needs Tagging: true Reduce Operator Tree: @@ -455,7 +455,7 @@ STAGE PLANS: name: default.orderpayment_small name: default.orderpayment_small Truncated Path -> Alias: - /orderpayment_small [$hdt$_3:order_city] + /orderpayment_small [$hdt$_4:order_city] #### A masked pattern was here #### Needs Tagging: true Reduce Operator Tree: @@ -592,7 +592,7 @@ STAGE PLANS: name: default.user_small name: default.user_small Truncated Path -> Alias: - /user_small [$hdt$_4:user] + /user_small [$hdt$_0:user] #### A masked pattern was here #### Needs Tagging: true Reduce Operator Tree: http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/auto_join_stats.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/auto_join_stats.q.out b/ql/src/test/results/clientpositive/auto_join_stats.q.out index e80af96..cb21718 100644 --- a/ql/src/test/results/clientpositive/auto_join_stats.q.out +++ b/ql/src/test/results/clientpositive/auto_join_stats.q.out @@ -305,11 +305,11 @@ STAGE PLANS: Stage: Stage-11 Map Reduce Local Work Alias -> Map Local Tables: - $hdt$_1:src2 + $hdt$_2:src2 Fetch Operator limit: -1 Alias -> Map Local Operator Tree: - $hdt$_1:src2 + $hdt$_2:src2 TableScan alias: src2 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE @@ -358,14 +358,14 @@ STAGE PLANS: Stage: Stage-10 Map Reduce Local Work Alias -> Map Local Tables: - $hdt$_2:smalltable + $hdt$_0:smalltable Fetch Operator limit: -1 $hdt$_3:smalltable2 Fetch Operator limit: -1 Alias -> Map Local Operator Tree: - $hdt$_2:smalltable + $hdt$_0:smalltable TableScan alias: smalltable Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE @@ -410,26 +410,30 @@ STAGE PLANS: 0 (UDFToDouble(_col0) + UDFToDouble(_col1)) (type: double) 1 UDFToDouble(_col0) (type: double) 2 UDFToDouble(_col0) (type: double) - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1, _col3 Statistics: Num rows: 1210 Data size: 12854 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1210 Data size: 12854 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 1210 Data size: 12854 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Local Work: Map Reduce Local Work Stage: Stage-12 Map Reduce Local Work Alias -> Map Local Tables: - $hdt$_0:src1 + $hdt$_1:src1 Fetch Operator limit: -1 Alias -> Map Local Operator Tree: - $hdt$_0:src1 + $hdt$_1:src1 TableScan alias: src1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/auto_join_stats2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/auto_join_stats2.q.out b/ql/src/test/results/clientpositive/auto_join_stats2.q.out index 6ea5afa..1a3caa6 100644 --- a/ql/src/test/results/clientpositive/auto_join_stats2.q.out +++ b/ql/src/test/results/clientpositive/auto_join_stats2.q.out @@ -14,6 +14,7 @@ POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table sma POSTHOOK: type: LOAD #### A masked pattern was here #### POSTHOOK: Output: default@smalltable +Warning: Map Join MAPJOIN[24][bigTable=?] in task 'Stage-5:MAPRED' is a cross product PREHOOK: query: explain select src1.key, src2.key, smalltable.key from src src1 JOIN src src2 ON (src1.key = src2.key) JOIN smalltable ON (src1.key + src2.key = smalltable.key) PREHOOK: type: QUERY POSTHOOK: query: explain select src1.key, src2.key, smalltable.key from src src1 JOIN src src2 ON (src1.key = src2.key) JOIN smalltable ON (src1.key + src2.key = smalltable.key) @@ -62,8 +63,8 @@ STAGE PLANS: Statistics: Num rows: 1 Data size: 30 Basic stats: COMPLETE Column stats: NONE HashTable Sink Operator keys: - 0 (UDFToDouble(_col0) + UDFToDouble(_col1)) (type: double) - 1 UDFToDouble(_col0) (type: double) + 0 + 1 Stage: Stage-5 Map Reduce @@ -82,25 +83,32 @@ STAGE PLANS: condition map: Inner Join 0 to 1 keys: - 0 _col0 (type: string) - 1 _col0 (type: string) + 0 + 1 outputColumnNames: _col0, _col1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 500 Data size: 20812 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Inner Join 0 to 1 keys: - 0 (UDFToDouble(_col0) + UDFToDouble(_col1)) (type: double) - 1 UDFToDouble(_col0) (type: double) + 0 _col0 (type: string) + 1 _col0 (type: string) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Statistics: Num rows: 550 Data size: 22893 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((UDFToDouble(_col2) + UDFToDouble(_col0)) = UDFToDouble(_col1)) (type: boolean) + Statistics: Num rows: 275 Data size: 11446 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: string), _col0 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 275 Data size: 11446 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 11446 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Local Work: Map Reduce Local Work @@ -110,6 +118,7 @@ STAGE PLANS: Processor Tree: ListSink +Warning: Map Join MAPJOIN[24][bigTable=?] in task 'Stage-5:MAPRED' is a cross product PREHOOK: query: select src1.key, src2.key, smalltable.key from src src1 JOIN src src2 ON (src1.key = src2.key) JOIN smalltable ON (src1.key + src2.key = smalltable.key) PREHOOK: type: QUERY PREHOOK: Input: default@smalltable @@ -159,47 +168,47 @@ STAGE PLANS: Stage: Stage-8 Map Reduce Local Work Alias -> Map Local Tables: - $hdt$_0:src1 + $hdt$_0:smalltable Fetch Operator limit: -1 - $hdt$_2:smalltable + $hdt$_1:src1 Fetch Operator limit: -1 $hdt$_3:smalltable2 Fetch Operator limit: -1 Alias -> Map Local Operator Tree: - $hdt$_0:src1 + $hdt$_0:smalltable TableScan - alias: src1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + alias: smalltable + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: string) outputColumnNames: _col0 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE HashTable Sink Operator keys: - 0 _col0 (type: string) - 1 _col0 (type: string) - $hdt$_2:smalltable + 0 (UDFToDouble(_col0) + UDFToDouble(_col1)) (type: double) + 1 UDFToDouble(_col0) (type: double) + 2 UDFToDouble(_col0) (type: double) + $hdt$_1:src1 TableScan - alias: smalltable - Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + alias: src1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) - Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: string) outputColumnNames: _col0 - Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE HashTable Sink Operator keys: - 0 (UDFToDouble(_col0) + UDFToDouble(_col1)) (type: double) - 1 UDFToDouble(_col0) (type: double) - 2 UDFToDouble(_col0) (type: double) + 0 _col0 (type: string) + 1 _col0 (type: string) $hdt$_3:smalltable2 TableScan alias: smalltable2 @@ -246,15 +255,19 @@ STAGE PLANS: 0 (UDFToDouble(_col0) + UDFToDouble(_col1)) (type: double) 1 UDFToDouble(_col0) (type: double) 2 UDFToDouble(_col0) (type: double) - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1, _col3 Statistics: Num rows: 1210 Data size: 12854 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1210 Data size: 12854 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 1210 Data size: 12854 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Local Work: Map Reduce Local Work http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/auto_sortmerge_join_12.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/auto_sortmerge_join_12.q.out b/ql/src/test/results/clientpositive/auto_sortmerge_join_12.q.out index d129807..7875e96 100644 --- a/ql/src/test/results/clientpositive/auto_sortmerge_join_12.q.out +++ b/ql/src/test/results/clientpositive/auto_sortmerge_join_12.q.out @@ -148,7 +148,7 @@ STAGE PLANS: Stage: Stage-9 Map Reduce Local Work Alias -> Map Local Tables: - $hdt$_0:a + $hdt$_1:a Fetch Operator limit: -1 Partition Description: @@ -200,7 +200,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.bucket_small name: default.bucket_small - $hdt$_1:b + $hdt$_2:b Fetch Operator limit: -1 Partition Description: @@ -305,7 +305,7 @@ STAGE PLANS: name: default.bucket_medium name: default.bucket_medium Alias -> Map Local Operator Tree: - $hdt$_0:a + $hdt$_1:a TableScan alias: a Statistics: Num rows: 1 Data size: 114 Basic stats: COMPLETE Column stats: NONE @@ -324,7 +324,7 @@ STAGE PLANS: 1 _col0 (type: string) 2 _col0 (type: string) Position of Big Table: 2 - $hdt$_1:b + $hdt$_2:b TableScan alias: b Statistics: Num rows: 1 Data size: 170 Basic stats: COMPLETE Column stats: NONE @@ -603,8 +603,8 @@ STAGE PLANS: name: default.bucket_small name: default.bucket_small Truncated Path -> Alias: - /bucket_big/ds=2008-04-08 [$hdt$_2:c] - /bucket_big/ds=2008-04-09 [$hdt$_2:c] + /bucket_big/ds=2008-04-08 [$hdt$_0:c] + /bucket_big/ds=2008-04-09 [$hdt$_0:c] Needs Tagging: false Reduce Operator Tree: Group By Operator http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/cbo_rp_annotate_stats_groupby.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/cbo_rp_annotate_stats_groupby.q.out b/ql/src/test/results/clientpositive/cbo_rp_annotate_stats_groupby.q.out index 23f5fcf..88b5d84 100644 --- a/ql/src/test/results/clientpositive/cbo_rp_annotate_stats_groupby.q.out +++ b/ql/src/test/results/clientpositive/cbo_rp_annotate_stats_groupby.q.out @@ -66,11 +66,11 @@ STAGE PLANS: Processor Tree: TableScan alias: loc_orc - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: state, locid, zip, year - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8 Data size: 1600 Basic stats: COMPLETE Column stats: NONE ListSink PREHOOK: query: analyze table loc_orc compute statistics for columns state http://git-wip-us.apache.org/repos/asf/hive/blob/8f7c5788/ql/src/test/results/clientpositive/columnStatsUpdateForStatsOptimizer_2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/columnStatsUpdateForStatsOptimizer_2.q.out b/ql/src/test/results/clientpositive/columnStatsUpdateForStatsOptimizer_2.q.out index a7c9b3f..4e430b3 100644 --- a/ql/src/test/results/clientpositive/columnStatsUpdateForStatsOptimizer_2.q.out +++ b/ql/src/test/results/clientpositive/columnStatsUpdateForStatsOptimizer_2.q.out @@ -200,29 +200,29 @@ STAGE PLANS: Map Operator Tree: TableScan alias: calendar - Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: month (type: int) outputColumnNames: month - Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE Group By Operator aggregations: max(month) mode: hash outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator sort order: - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: int) Reduce Operator Tree: Group By Operator aggregations: max(VALUE._col0) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -322,29 +322,29 @@ STAGE PLANS: Map Operator Tree: TableScan alias: calendar - Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: month (type: int) outputColumnNames: month - Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE Group By Operator aggregations: max(month) mode: hash outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator sort order: - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: int) Reduce Operator Tree: Group By Operator aggregations: max(VALUE._col0) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat