Hi Guys,

I am running following Pig script in Pig 0.8 version

page_events = LOAD '/user/sgehlot/day=2011-05-10' as
(event_dt_ht:chararray,event_dt_ut:chararray,event_rec_num:int,event_type:int,
client_ip_addr:long,hub_id:int,is_cookied_user:int,local_ontology_node_id:int,
page_type_id:int,content_id:int,product_id:int,referrer_edition_id:int,page_number:int,is_iab_robot:int,browser_id:int,os_id:int,dw_pubsys_id:int,refresh:int,asset_id:int,asset_type_id:int,content_type_id:int,product_type_id:int,outbound_email_id:long,gbal_clc:int,mtype:int,user_action_id:int,referring_partner_id:int,ontology_node_id:int,content_namespace_id:int,product_namespace_id:int,transparent_edition_id:int,default_edition_id:int,event_seq_num:int,is_last_page:int,is_new_user:int,page_duration:int,page_seq_num:int,session_id:long,time_since_sess_start:int,reg_cookie:chararray,urs_app_id:int,is_reg_user:int,edition_id:int,user_agent_id:int,page_type_key:int,referrer_id:int,channel_id:int,level2_id:int,level3_id:int,brand_id:int,content_key:int,product_key:int,edition_key:int,partner_key:int,business_unit_id:int,anon_cookie:chararray,machine_name:chararray,pagehost:chararray,filenameextension:chararray,referrerpath:chararray,referrerhost:chararray,referring_oid:chararray,referring_legacy_oid:chararray,ctype:chararray,cval:chararray,link_tag:chararray,link_type:chararray,sticky_tag:chararray,page_url:chararray,search_category:chararray,partner_subject:chararray,referring_partner_name:chararray,robot_pattern:chararray,browser:chararray,browser_major_version:chararray,browser_minor_version:chararray,os:chararray,os_family:chararray,ttag:chararray,dest_oid:chararray,global_id:chararray,hostname:chararray,path:chararray,filename:chararray,extension:chararray,query:chararray,user_agent:chararray,xrq:chararray,xref:chararray,page_guid:chararray,test_name:chararray,test_group:chararray,test_version:chararray,page_version:chararray,o_sticky_tag:chararray,new_referring_oid:chararray,day:chararray,network_ip:int,site_id:int,search_phrase:chararray,search_attributes:chararray,web_search_phrase:chararray,ip_address:chararray,is_pattern_match_robot:int,protocol:chararray,skc_title:chararray,skc_url:chararray,has_site_search_phrase:int,has_site_search_attribs:int,has_web_search_phrase:int,title_id:chararray,url_id:chararray,network_rev:int);

referrer_group_map = LOAD '/user/sgehlot/oozie/db_data/referrer_group_map'
as
(referrer_id:int, has_web_search_phrase:int, hostname:chararray,
referral_type_id:int,
referral_type_name:chararray,
referrer_group_id:int,referrer_group_name:chararray,referrer_group_cat_id:int,referrer_group_cat:chararray);

filter_pe = FILTER page_events BY is_iab_robot == 0 AND
is_pattern_match_robot == 0 AND day == '2011-05-10';

select_pe_col = FOREACH filter_pe GENERATE day, is_cookied_user,
anon_cookie, reg_cookie, referrer_id, has_web_search_phrase,
business_unit_id;

select_ref_col = FOREACH referrer_group_map GENERATE referrer_id,
has_web_search_phrase, referral_type_id;

jn = JOIN select_ref_col BY (referrer_id, has_web_search_phrase),
select_pe_col BY (referrer_id, has_web_search_phrase);


logic = FOREACH jn GENERATE
select_pe_col::day,
select_ref_col::referral_type_id,
 select_pe_col::business_unit_id,
((select_pe_col::is_cookied_user == 1) ? select_pe_col::anon_cookie : null)
as c_users,
 ((select_pe_col::is_cookied_user == 0) ? select_pe_col::anon_cookie : null)
as nc_users,
((select_pe_col::reg_cookie == '-1') ? null : select_pe_col::reg_cookie) as
registered_users;
 group_it = GROUP logic BY (select_pe_col::day,
   select_ref_col::referral_type_id,
   select_pe_col::business_unit_id);

agg_results = FOREACH group_it{
 dst_c_users = DISTINCT logic.c_users;
dst_nc_users = DISTINCT logic.nc_users;
 dst_registered_users = DISTINCT logic.registered_users;
      GENERATE
       group.select_pe_col::day,
        group.select_ref_col::referral_type_id,
group.select_pe_col::business_unit_id,
 COUNT(dst_c_users) as c_users,
COUNT(dst_nc_users) as nc_users,
COUNT(dst_registered_users) as registered_users;
 };

STORE agg_results INTO '/user/sgehlot/pt_users_referral_type_bu_day' USING
PigStorage('\t');


But I am keep on failing because of Java out of memory error. Data set in
"page_events" is huge but in "referrer_group_map" is relatively very less.

Here is error message:

Error: java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.ArrayList.(ArrayList.java:112)
 at java.util.ArrayList.(ArrayList.java:119)
at org.apache.pig.data.DefaultTuple.(DefaultTuple.java:59)
 at org.apache.pig.data.BinSedesTuple.(BinSedesTuple.java:73)
at
org.apache.pig.data.BinSedesTupleFactory.newTuple(BinSedesTupleFactory.java:33)
 at
org.apache.pig.data.InternalCachedBag$CachedBagIterator.hasNext(InternalCachedBag.java:236)
at
org.apache.pig.builtin.Distinct.getDistinctFromNestedBags(Distinct.java:136)
 at org.apache.pig.builtin.Distinct.access$200(Distinct.java:38)
at org.apache.pig.builtin.Distinct$Intermediate.exec(Distinct.java:101)
 at org.apache.pig.builtin.Distinct$Intermediate.exec(Distinct.java:94)
at
org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc.getNext(POUserFunc.java:216)
 at
org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc.getNext(POUserFunc.java:253)
at
org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.getNext(PhysicalOperator.java:334)
 at
org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.processPlan(POForEach.java:332)
at
org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.getNext(POForEach.java:284)
 at
org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.processInput(PhysicalOperator.java:290)
at
org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange.getNext(POLocalRearrange.java:256)
 at
org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.processOnePackageOutput(PigCombiner.java:184)
at
org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:162)
 at
org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:51)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
 at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1222)
at
org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1265)
 at
org.apache.hadoop.mapred.MapTask$MapOutputBuffer.access$1800(MapTask.java:686)
at
org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run(MapTask.java:1173)

Error: java.lang.OutOfMemoryError: Java heap space
at java.util.ArrayList.(ArrayList.java:112)
 at java.util.ArrayList.(ArrayList.java:119)
at org.apache.pig.data.DefaultDataBag.(DefaultDataBag.java:54)
 at
org.apache.pig.data.DefaultBagFactory.newDefaultBag(DefaultBagFactory.java:33)
at org.apache.pig.data.BinInterSedes.readBag(BinInterSedes.java:143)
 at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:275)
at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:251)
 at org.apache.pig.data.BinInterSedes.readTuple(BinInterSedes.java:111)
at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:270)
 at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:251)
at org.apache.pig.data.BinInterSedes.addColsToTuple(BinInterSedes.java:555)
 at org.apache.pig.data.BinSedesTuple.readFields(BinSedesTuple.java:64)
at
org.apache.pig.impl.io.PigNullableWritable.readFields(PigNullableWritable.java:114)
 at
org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:67)
at
org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:40)
 at
org.apache.hadoop.mapreduce.ReduceContext.nextKeyValue(ReduceContext.java:116)
at
org.apache.hadoop.mapreduce.ReduceContext$ValueIterator.next(ReduceContext.java:163)
 at
org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCombinerPackage.getNext(POCombinerPackage.java:141)
at
org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.processOnePackageOutput(PigCombiner.java:171)
 at
org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:162)
at
org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:51)
 at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1222)
 at
org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1265)
at
org.apache.hadoop.mapred.MapTask$MapOutputBuffer.access$1800(MapTask.java:686)
 at
org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run(MapTask.java:1173)

Any idea and suggestion what could be cause of error message?

Thanks for any help,
Sonia

Reply via email to