[ 
https://issues.apache.org/jira/browse/PIG-1263?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Olga Natkovich reassigned PIG-1263:
-----------------------------------

    Assignee: Daniel Dai

> Script producing varying number of records when COGROUPing value of map data 
> type with and without types
> --------------------------------------------------------------------------------------------------------
>
>                 Key: PIG-1263
>                 URL: https://issues.apache.org/jira/browse/PIG-1263
>             Project: Pig
>          Issue Type: Bug
>          Components: impl
>    Affects Versions: 0.6.0
>            Reporter: Viraj Bhat
>            Assignee: Daniel Dai
>             Fix For: 0.7.0
>
>
> I have a Pig script which I am experimenting with. [[Admittedly, it is not 
> optimized and could be written in a variety of ways.]] I get different record 
> counts depending on where I place load/store pairs in the script.
> Case 1: Returns 424329 records
> Case 2: Returns 5859 records
> Case 3: Returns 5859 records
> Case 4: Returns 5578 records
> Which of these is the correct result?
> Here are the scripts.
> Case 1: 
> {code}
> register udf.jar
> A = LOAD '/user/viraj/data/20100203' USING MapLoader() AS (s, m, l);
> B = FOREACH A GENERATE
>         s#'key1' as key1,
>         s#'key2' as key2;
> C = FOREACH B generate key2;
> D = filter C by (key2 IS NOT null);
> E = distinct D;
> store E into 'unique_key_list' using PigStorage('\u0001');
> F = Foreach E generate key2, MapGenerate(key2) as m;
> G = FILTER F by (m IS NOT null);
> H = foreach G generate key2, m#'id1' as id1, m#'id2' as id2, m#'id3' as id3, 
> m#'id4' as id4, m#'id5' as id5, m#'id6' as id6, m#'id7' as id7, m#'id8' as 
> id8, m#'id9' as id9, m#'id10' as id10, m#'id11' as id11, m#'id12' as id12;
> I = GROUP H BY (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12);
> J = Foreach I generate group.id1 as id1, group.id2 as id2, group.id3 as id3, 
> group.id4 as id4,group.id5 as id5, group.id6 as id6, group.id7 as id7, 
> group.id8 as id8, group.id9 as id9, group.id10 as id10, group.id11 as id11, 
> group.id12 as id12;
> --load previous days data
> K = LOAD '/user/viraj/data/20100202' USING PigStorage('\u0001') as (id1, id2, 
> id3, id4, id5, id6, id7, id8, id9, id10, id11, id12);
> L = COGROUP  K by (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12) OUTER,
>              J by (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12) OUTER;
> M = filter L by IsEmpty(K);
> store M into 'cogroupNoTypes' using PigStorage();
> {code}
> Case 2:  Storing and loading intermediate results in J 
> {code}
> register udf.jar
> A = LOAD '/user/viraj/data/20100203' USING MapLoader() AS (s, m, l);
> B = FOREACH A GENERATE
>         s#'key1' as key1,
>         s#'key2' as key2;
> C = FOREACH B generate key2;
> D = filter C by (key2 IS NOT null);
> E = distinct D;
> store E into 'unique_key_list' using PigStorage('\u0001');
> F = Foreach E generate key2, MapGenerate(key2) as m;
> G = FILTER F by (m IS NOT null);
> H = foreach G generate key2, m#'id1' as id1, m#'id2' as id2, m#'id3' as id3, 
> m#'id4' as id4, m#'id5' as id5, m#'id6' as id6, m#'id7' as id7, m#'id8' as 
> id8, m#'id9' as id9, m#'id10' as id10, m#'id11' as id11, m#'id12' as id12;
> I = GROUP H BY (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12);
> J = Foreach I generate group.id1 as id1, group.id2 as id2, group.id3 as id3, 
> group.id4 as id4,group.id5 as id5, group.id6 as id6, group.id7 as id7, 
> group.id8 as id8, group.id9 as id9, group.id10 as id10, group.id11 as id11, 
> group.id12 as id12;
> --store intermediate data to HDFS and re-read
> store J into 'output/20100203/J' using PigStorage('\u0001');
> --load previous days data
> K = LOAD '/user/viraj/data/20100202' USING PigStorage('\u0001') as (id1, id2, 
> id3, id4, id5, id6, id7, id8, id9, id10, id11, id12);
> --read J into K1
> K1 = LOAD 'output/20100203/J' using PigStorage('\u0001') as (id1, id2, id3, 
> id4, id5, id6, id7, id8, id9, id10, id11, id12);
> L = COGROUP  K by (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12) OUTER,
>              K1 by (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12) OUTER;
> M = filter L by IsEmpty(K);
> store M into 'cogroupNoTypesIntStore' using PigStorage();
> {code}
> Case 3: Type information specified; J is used directly rather than re-loaded from an intermediate store
> {code}
> register udf.jar
> A = LOAD '/user/viraj/data/20100203' USING MapLoader() AS (s, m, l);
> B = FOREACH A GENERATE
>         s#'key1' as key1,
>         s#'key2' as key2;
> C = FOREACH B generate key2;
> D = filter C by (key2 IS NOT null);
> E = distinct D;
> store E into 'unique_key_list' using PigStorage('\u0001');
> F = Foreach E generate key2, MapGenerate(key2) as m;
> G = FILTER F by (m IS NOT null);
> H = foreach G generate key2, (long)m#'id1' as id1, (long)m#'id2' as id2, 
> (long)m#'id3' as id3, (long)m#'id4' as id4, (long)m#'id5' as id5, 
> (long)m#'id6' as id6, (long)m#'id7' as id7, (chararray)m#'id8' as id8, 
> (chararray)m#'id9' as id9, (chararray)m#'id10' as id10, (chararray)m#'id11' 
> as id11, (chararray)m#'id12' as id12;
> I = GROUP H BY (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12);
> J = Foreach I generate group.id1 as id1, group.id2 as id2, group.id3 as id3, 
> group.id4 as id4,group.id5 as id5, group.id6 as id6, group.id7 as id7, 
> group.id8 as id8, group.id9 as id9, group.id10 as id10, group.id11 as id11, 
> group.id12 as id12;
> store J into 'output/20100203/J' using PigStorage('\u0001');
> --load previous days data with type information
> K = LOAD '/user/viraj/data/20100202' USING PigStorage('\u0001') as  
> (id1:chararray, id2:long, id3:long, id4:long, id5:long, id6:long, id7:long, 
> id8:long, id9:chararray, id10:chararray, 
> id11:chararray,id12:chararray,id13:chararray);
> L = COGROUP  K by (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12) OUTER,
>              J by (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12) OUTER;
> M = filter L by IsEmpty(K);
> store M into 'cogroupTypesStore' using PigStorage();
> {code}
> Case 4: Split the script into two parts: one which stores alias G and 
> the other which loads G. The two scripts are run separately.
> Script 1
> {code}
> register udf.jar
> A = LOAD '/user/viraj/data/20100203' USING MapLoader() AS (s, m, l);
> B = FOREACH A GENERATE
>         s#'key1' as key1,
>         s#'key2' as key2;
> C = FOREACH B generate key2;
> D = filter C by (key2 IS NOT null);
> E = distinct D;
> store E into 'unique_key_list' using PigStorage('\u0001');
> F = Foreach E generate key2, MapGenerate(key2) as m;
> G = FILTER F by (m IS NOT null);
> store G into 'output/20100203/G' using PigStorage('\u0001');
> {code}
> Script 2:
> {code}
> G = load 'output/20100203/G' using PigStorage('\u0001') as (ip, m:map[]);
> H = foreach G generate key2, (long)m#'id1' as id1, (long)m#'id2' as id2, 
> (long)m#'id3' as id3, (long)m#'id4' as id4, (long)m#'id5' as id5, 
> (long)m#'id6' as id6, (long)m#'id7' as id7, (chararray)m#'id8' as id8, 
> (chararray)m#'id9' as id9, (chararray)m#'id10' as id10, (chararray)m#'id11' 
> as id11, (chararray)m#'id12' as id12;
> I = GROUP H BY (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12);
> J = Foreach I generate group.id1 as id1, group.id2 as id2, group.id3 as id3, 
> group.id4 as id4,group.id5 as id5, group.id6 as id6, group.id7 as id7, 
> group.id8 as id8, group.id9 as id9, group.id10 as id10, group.id11 as id11, 
> group.id12 as id12;
> store J into 'output/20100203/J' using PigStorage('\u0001');
> K = LOAD '/user/viraj/data/20100202' USING PigStorage('\u0001') as  
> (id1:chararray, id2:long, id3:long, id4:long, id5:long, id6:long, id7:long, 
> id8:long, id9:chararray, id10:chararray, 
> id11:chararray,id12:chararray,id13:chararray);
> L = COGROUP  K by (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12) OUTER,
>              J by (id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, 
> id12) OUTER;
> M = filter L by IsEmpty(K);
> store M into 'cogroupTypesIntStore' using PigStorage();
> {code}

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

Reply via email to