Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Pig Wiki" for change notification.
The "PigMix" page has been changed by daijy. http://wiki.apache.org/pig/PigMix?action=diff&rev1=15&rev2=16 -------------------------------------------------- {{{ A = load 'page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); - B = order A by user parallel $mappers; + B = order A by user $parallelfactor; store B into 'page_views_sorted' using PigStorage('\u0001'); alpha = load 'users' using PigStorage('\u0001') as (name, phone, address, city, state, zip); - a1 = order alpha by name parallel $mappers; + a1 = order alpha by name $parallelfactor; store a1 into 'users_sorted' using PigStorage('\u0001'); a = load 'power_users' using PigStorage('\u0001') as (name, phone, address, city, state, zip); @@ -287, +287 @@ This script tests reading from a map, flattening a bag of maps, and use of bincond (features 2, 3, and 4). {{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user, (int)action as action, (map[])page_info as page_info, @@ -304, +304 @@ This script tests using a join small enough to do in fragment and replicate (feature 7). {{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user, estimated_revenue; @@ -321, +321 @@ something that pig could potentially optimize by not regrouping. 
{{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user, (double)estimated_revenue; @@ -340, +340 @@ This script covers foreach generate with a nested distinct (feature 10). {{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user, action; @@ -359, +359 @@ This script does an anti-join. This is useful because it is a use of cogroup that is not a regular join (feature 9). {{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user; @@ -377, +377 @@ This script covers the case where the group by key is a significant percentage of the row (feature 12). {{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user, action, (int)timespent as timespent, query_term, ip_addr, timestamp; @@ -392, +392 @@ This script covers having a nested plan with splits (feature 11). 
{{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() as (user, action, timespent, query_term, + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user, timestamp; C = group B by user $parallelfactor; @@ -409, +409 @@ This script covers group all (feature 13). {{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user, (int)timespent as timespent, (double)estimated_revenue as estimated_revenue; @@ -423, +423 @@ This script covers order by of a single value (feature 15). {{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = order A by query_term $parallelfactor; @@ -435, +435 @@ This script covers order by of multiple values (feature 15). {{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent:int, query_term, ip_addr, timestamp, estimated_revenue:double, page_info, page_links); B = order A by query_term, estimated_revenue desc, timespent $parallelfactor; @@ -448, +448 @@ This script covers distinct and union and reading from a wide row but using only one field (features: 1, 14). 
{{{ register pigperf.jar; - A = load '$page_views' using org.apache.pig.test.utils.datagen.PigPerformanceLoader() + A = load '$page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user; @@ -520, +520 @@ A = load 'page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user, action, estimated_revenue, timespent; - C = group B by user parallel 40; + C = group B by user $parallelfactor; D = foreach C { beth = distinct B.action; rev = distinct B.estimated_revenue; @@ -538, +538 @@ A = load 'page_views' using org.apache.pig.test.udf.storefunc.PigPerformanceLoader() as (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, page_info, page_links); B = foreach A generate user, estimated_revenue; - C = group B by user parallel 40; + C = group B by user $parallelfactor; D = foreach C { E = order B by estimated_revenue; F = E.estimated_revenue; @@ -560, +560 @@ B = group A by (user, action, timespent, query_term, ip_addr, timestamp, estimated_revenue, user_1, action_1, timespent_1, query_term_1, ip_addr_1, timestamp_1, estimated_revenue_1, user_2, action_2, timespent_2, query_term_2, ip_addr_2, timestamp_2, - estimated_revenue_2) parallel 40; + estimated_revenue_2) $parallelfactor; C = foreach B generate SUM(A.timespent), SUM(A.timespent_1), SUM(A.timespent_2), AVG(A.estimated_revenue), AVG(A.estimated_revenue_1), AVG(A.estimated_revenue_2); store C into '$out'; }}}