Dear Wiki user, You have subscribed to a wiki page or wiki category on "Hadoop Wiki" for change notification.
The following page has been changed by ZhengShao: http://wiki.apache.org/hadoop/Hive/HiveQL/Transform ------------------------------------------------------------------------------ Hive runs the reduce script in the reduce task (instead of the map task) because of the ''clusterBy''/''distributeBy''/''sortBy'' clause in the inner query. {{{ - clusterBy: CLUSTER BY colName (, colName)* + clusterBy: CLUSTER BY colName (',' colName)* - distributeBy: DISTRIBUTE BY colName (, colName)* + distributeBy: DISTRIBUTE BY colName (',' colName)* - sortBy: SORT BY colName (, colName)* + sortBy: SORT BY colName (',' colName)* query: FROM ( FROM src - MAP expression (, expression)* + MAP '(' expression (',' expression)* ')' USING 'my_map_script' - ( AS colName (, colName)* )? + ( AS colName (',' colName)* )? ( clusterBy? | distributeBy? sortBy? ) src_alias ) - REDUCE expression (, expression)* + REDUCE '(' expression (',' expression)* ')' USING 'my_reduce_script' - ( AS colName (, colName)* )? + ( AS colName (',' colName)* )? 
}}} Example: {{{ FROM ( FROM pv_users - MAP pv_users.userid, pv_users.date + MAP ( pv_users.userid, pv_users.date ) USING 'map_script' AS dt, uid CLUSTER BY dt) map_output INSERT OVERWRITE TABLE pv_users_reduced - REDUCE map_output.dt, map_output.uid + REDUCE ( map_output.dt, map_output.uid ) USING 'reduce_script' AS date, count; }}} @@ -51, +51 @@ {{{ FROM ( FROM pv_users - MAP pv_users.userid, pv_users.date + MAP ( pv_users.userid, pv_users.date ) USING 'map_script' CLUSTER BY key) map_output INSERT OVERWRITE TABLE pv_users_reduced - REDUCE map_output.key, map_output.value + REDUCE ( map_output.key, map_output.value ) USING 'reduce_script' AS date, count; }}} @@ -73, +73 @@ {{{ FROM ( FROM pv_users - MAP pv_users.userid, pv_users.date + MAP ( pv_users.userid, pv_users.date ) USING 'map_script' AS c1, c2, c3 DISTRIBUTE BY c2 SORT BY c2, c1) map_output INSERT OVERWRITE TABLE pv_users_reduced - REDUCE map_output.c1, map_output.c2, map_output.c3 + REDUCE ( map_output.c1, map_output.c2, map_output.c3 ) USING 'reduce_script' AS date, count; }}}
