Thanks, Zheng, and thanks for your great support on this list. I took
your idea and wrote the following code, which worked for me. I'm no
Java whiz, so it's probably fairly inefficient. I do get to talk to
the Amazon folks from time to time, so I'll definitely mention my
interest in upgrading the Hive version. Thanks again.

Matt
package com.company.hadoop.hive.udaf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.util.Arrays;

public class UDAFGroupConcat extends UDAF {

    public static class GroupConcatStringEvaluator implements UDAFEvaluator {

        // Accumulated "N value" entries, tab-separated, kept in sorted order.
        private Text mOutput;
        private boolean mEmpty;

        public GroupConcatStringEvaluator() {
            super();
            init();
        }

        public void init() {
            mOutput = null;
            mEmpty = true;
        }

        public boolean iterate(Text o, IntWritable N) {
            if (o != null && N != null) {
                if (mEmpty) {
                    mOutput = new Text(N + " " + o.toString());
                    mEmpty = false;
                } else {
                    // Append the new "N value" pair, then re-sort all entries.
                    String[] split =
                        (mOutput.toString() + "\t" + N + " " + o.toString()).split("\t");
                    Arrays.sort(split);
                    StringBuilder sorted = new StringBuilder(split[0]);
                    for (int i = 1; i < split.length; i++) {
                        sorted.append("\t").append(split[i]);
                    }
                    mOutput.set(sorted.toString());
                }
            }
            return true;
        }

        public Text terminatePartial() {
            return mEmpty ? null : mOutput;
        }

        // Merge a partial aggregation: same append-and-sort as iterate().
        public boolean merge(Text o) {
            if (o != null) {
                if (mEmpty) {
                    mOutput = new Text(o.toString());
                    mEmpty = false;
                } else {
                    String[] split = (mOutput.toString() + "\t" + o.toString()).split("\t");
                    Arrays.sort(split);
                    StringBuilder sorted = new StringBuilder(split[0]);
                    for (int i = 1; i < split.length; i++) {
                        sorted.append("\t").append(split[i]);
                    }
                    mOutput.set(sorted.toString());
                }
            }
            return true;
        }

        public Text terminate() {
            return mEmpty ? null : mOutput;
        }
    }
}
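One caveat I noticed while testing: Arrays.sort() compares the "N value" strings lexicographically, so once the index reaches 10 the ordering goes wrong ("10 x" sorts before "2 y"). Zero-padding the index to a fixed width keeps string order and numeric order aligned. Here's a minimal plain-Java sketch of that append-and-sort step (no Hive dependencies; the class and method names are just for illustration):

```java
import java.util.Arrays;

public class PaddedGroupConcat {

    // Format one entry as "0000...N value" so lexicographic sort matches numeric order.
    static String entry(int n, String value) {
        return String.format("%07d %s", n, value);
    }

    // Append a new entry to the tab-separated accumulator and re-sort.
    static String appendSorted(String accumulated, int n, String value) {
        String[] split = (accumulated + "\t" + entry(n, value)).split("\t");
        Arrays.sort(split);
        return String.join("\t", split);
    }

    public static void main(String[] args) {
        String acc = entry(2, "b");
        acc = appendSorted(acc, 10, "c");
        acc = appendSorted(acc, 1, "a");
        // Entries come back in numeric order 1, 2, 10, not 1, 10, 2.
        System.out.println(acc.replace("\t", " | "));
        // prints: 0000001 a | 0000002 b | 0000010 c
    }
}
```

The same padding could be dropped into iterate() above if the detail_id values can exceed 9.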
On Fri, Apr 2, 2010 at 4:11 PM, Matthew Bryan <[email protected]> wrote:
> I'm writing a basic group_concat UDAF for the Amazon version of
> Hive, and it's working fine for unordered groupings. But I can't
> seem to get an ordered version working (filling an array based on an
> IntWritable passed alongside). When I move from using a Text return
> type on terminatePartial() to either Text[] or a State class, I start
> getting errors:
>
> FAILED: Error in semantic analysis:
> org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
> return type class [Lorg.apache.hadoop.io.Text; from public
> org.apache.hadoop.io.Text[]
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial()
>
> or
>
> FAILED: Error in semantic analysis:
> org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
> return type class
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConcatNState
> from public
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConcatNState
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial()
>
> What limits are there on the return type of
> terminatePartial()? Shouldn't it just have to match the argument of
> merge() and nothing more? Keep in mind this is the Amazon version of
> Hive (0.4, I think).
>
> I put both versions of the UDAF below, ordered and unordered.
>
> Thanks for your time.
>
> Matt
>
>
> ######### Working Unordered ############
> /*QUERY: select user, event, group_concat(details) from datatable
> group by user,event;*/
>
> package com.company.hadoop.hive.udaf;
>
> import org.apache.hadoop.hive.ql.exec.UDAF;
> import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
> import org.apache.hadoop.io.Text;
>
> public class UDAFGroupConcat extends UDAF{
>
> public static class GroupConcatStringEvaluator implements
> UDAFEvaluator {
> private Text mOutput;
> private boolean mEmpty;
>
> public GroupConcatStringEvaluator() {
> super();
> init();
> }
>
> public void init() {
> mOutput = null;
> mEmpty = true;
> }
>
> public boolean iterate(Text o) {
> if (o!=null) {
> if(mEmpty) {
> mOutput = new Text(o);
> mEmpty = false;
> } else {
> mOutput.set(mOutput.toString() + " " + o.toString());
> }
> }
> return true;
> }
> public Text terminatePartial() {return mEmpty ? null : mOutput;}
> public boolean merge(Text o) {return iterate(o);}
> public Text terminate() {return mEmpty ? null : mOutput;}
> }
> }
>
> ############ Not Working Ordered #############
> /*QUERY: select user, event, group_concatN(details, detail_id) from
> datatable group by user,event;*/
>
> package com.company.hadoop.hive.udaf;
>
> import org.apache.hadoop.hive.ql.exec.UDAF;
> import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.io.IntWritable;
>
> public class UDAFGroupConcatN extends UDAF{
>
> public static class GroupConcatNStringEvaluator implements
> UDAFEvaluator {
>
> private Text[] mArray;
> private boolean mEmpty;
>
> public GroupConcatNStringEvaluator() {
> super();
> init();
> }
>
> public void init() {
> mArray = new Text[5];
> mEmpty = true;
> }
>
> public boolean iterate(Text o, IntWritable N) {
> if (o != null && N != null && N.get() < mArray.length) {
> // elements start out null, so assign a new Text rather than calling set()
> mArray[N.get()] = new Text(o);
> mEmpty = false;
> }
> return true;
> }
> public Text[] terminatePartial() {return mEmpty ? null : mArray;}
> public boolean merge(Text[] o) {
> if (o != null) {
> for (int i = 0; i < mArray.length; i++) {
> if (mArray[i] == null && o[i] != null) {
> mArray[i] = new Text(o[i]);
> }
> }
> }
> return true;
> }
>
> public Text[] terminate() {return mEmpty ? null : mArray;}
> }
> }
>