done. PIG-3057 <https://issues.apache.org/jira/browse/PIG-3057>



On Mon, Nov 19, 2012 at 6:32 PM, Jonathan Coveney <[email protected]>wrote:

> Make a JIRA and attach the patch, please.
>
>
> 2012/11/19 pablomar <[email protected]>
>
> > hi all,
> >
> > I did it as simple as I could. What about this changes ?
> >
> >
> > PigStorage.java
> > original:
> >     private void readField(byte[] buf, int start, int end) {
> >         if (start == end) {
> >             // NULL value
> >             mProtoTuple.add(null);
> >         } else {
> >             mProtoTuple.add(new DataByteArray(buf, start, end));
> >         }
> >     }
> >
> >
> > new:
> >     protected void addToCurrentTuple(DataByteArray data) {
> >             mProtoTuple.add(data);
> >     }
> >
> >     protected void readField(byte[] buf, int start, int end) {
> >         if (start == end) {
> >             // NULL value
> >             addToCurrentTuple(null);
> >         } else {
> >             addToCurrentTuple(new DataByteArray(buf, start, end));
> >         }
> >     }
> >
> >
> >
> > so, what's the advantage ?
> > with the new version, if I want to extend PigStorage just to override
> > readField, I do something like:
> >
> > import org.apache.pig.builtin.PigStorage;
> > import org.apache.pig.data.DataByteArray;
> >
> > public class MyLoader extends PigStorage {
> >   public MyLoader() {
> >     this("\t");
> >   }
> >
> >   public MyLoader(String delimiter) {
> >     super(delimiter);
> >   }
> >
> >   @Override
> >   protected void readField(byte[] buf, int start, int end) {
> >     // remove the field name (example: min=, max=)
> >     for (int i=start; i<=end;i++) {
> >       if (buf[i] == '=') {
> >         start = i + 1;
> >         break;
> >       }
> >     }
> >
> >     if (start == end) {
> >       // NULL value
> >       addToCurrentTuple(null);
> >     } else {
> >       addToCurrentTuple(new DataByteArray(buf, start, end));
> >     }
> >   }
> > }
> >
> > currently, just to override readField, I also need to copy/paste a lot
> from
> > PigStorage:
> >
> > import java.io.IOException;
> > import java.util.ArrayList;
> > import java.util.Properties;
> > import org.apache.hadoop.io.Text;
> > import org.apache.pig.backend.executionengine.ExecException;
> > import org.apache.pig.builtin.PigStorage;
> > import org.apache.pig.PigException;
> > import org.apache.pig.data.DataByteArray;
> > import org.apache.pig.data.Tuple;
> > import org.apache.pig.data.TupleFactory;
> > import org.apache.pig.impl.util.ObjectSerializer;
> > import org.apache.pig.impl.util.StorageUtil;
> > import org.apache.pig.impl.util.UDFContext;
> >
> > public class MyLoaderextends PigStorage {
> >   private byte fieldDel = '\t';
> >   private ArrayList<Object> mProtoTuple = null;
> >   private TupleFactory mTupleFactory = TupleFactory.getInstance();
> >   private boolean mRequiredColumnsInitialized = false;
> >
> >   public MyLoader() {
> >     this("\t");
> >   }
> >
> >   public MyLoader(String delimiter) {
> >     super(delimiter);
> >     fieldDel = StorageUtil.parseFieldDel(delimiter);
> >   }
> >
> >   // exactly the same than in PigStorage, but I needed to modify
> readField
> > which is private, so I have to copy all of it
> >     @Override
> >     public Tuple getNext() throws IOException {
> >         mProtoTuple = new ArrayList<Object>();
> >         if (!mRequiredColumnsInitialized) {
> >             if (signature!=null) {
> >                 Properties p =
> > UDFContext.getUDFContext().getUDFProperties(this.getClass());
> >                 mRequiredColumns =
> > (boolean[])ObjectSerializer.deserialize(p.getProperty(signature));
> >             }
> >             mRequiredColumnsInitialized = true;
> >         }
> >         try {
> >             boolean notDone = in.nextKeyValue();
> >             if (!notDone) {
> >                 return null;
> >
> > }
> >
> >             Text value = (Text) in.getCurrentValue();
> >             byte[] buf = value.getBytes();
> >             int len = value.getLength();
> >             int start = 0;
> >             int fieldID = 0;
> >             for (int i = 0; i < len; i++) {
> >                 if (buf[i] == fieldDel) {
> >                     if (mRequiredColumns==null ||
> > (mRequiredColumns.length>fieldID && mRequiredColumns[fieldID]))
> >                         readField(buf, start, i);
> >                     start = i + 1;
> >                     fieldID++;
> >                 }
> >             }
> >             // pick up the last field
> >             if (start <= len && (mRequiredColumns==null ||
> > (mRequiredColumns.length>fieldID && mRequiredColumns[fieldID]))) {
> >                 readField(buf, start, len);
> >             }
> >             Tuple t =  mTupleFactory.newTupleNoCopy(mProtoTuple);
> >             return t;
> >         } catch (InterruptedException e) {
> >             int errCode = 6018;
> >             String errMsg = "Error while reading input";
> >             throw new ExecException(errMsg, errCode,
> >                     PigException.REMOTE_ENVIRONMENT, e);
> >         }
> >
> >     }
> >
> >   private void readField(byte[] buf, int start, int end) {
> >     // remove the field name (example: min=, max=)
> >     for (int i=start; i<=end;i++) {
> >       if (buf[i] == '=') {
> >         start = i + 1;
> >         break;
> >       }
> >     }
> >
> >     if (start == end) {
> >       // NULL value
> >       mProtoTuple.add(null);
> >     } else {
> >       mProtoTuple.add(new DataByteArray(buf, start, end));
> >     }
> >   }
> > }
> >
> >
> > On Mon, Nov 19, 2012 at 12:24 PM, pablomar
> > <[email protected]>wrote:
> >
> > > sure. My initial (and dirty) idea changed only 2 lines. I completely
> > agree
> > > with you
> > >
> > >
> > >
> > >
> > >
> > > On Mon, Nov 19, 2012 at 12:16 PM, Bill Graham <[email protected]
> > >wrote:
> > >
> > >> +1 as well, but I'd suggest we do the following:
> > >>
> > >> - Keep mProtoTuple private and add protected getters/setters instead
> > with
> > >> javadocs describing expected usage.
> > >> - Rename mProtoTuple and the getters/setters to something more
> > descriptive
> > >> than mProtoTuple.
> > >>
> > >>
> > >> On Fri, Nov 16, 2012 at 2:15 PM, Dmitriy Ryaboy <[email protected]>
> > >> wrote:
> > >>
> > >> > That sounds reasonable, I've run into the same problem. Do you mind
> > >> > submitting a patch?
> > >> >
> > >> > On Fri, Nov 16, 2012 at 12:48 PM, pablomar
> > >> > <[email protected]> wrote:
> > >> > > hi all,
> > >> > >
> > >> > > I'm using Pig 0.9.2 (Apache Pig version 0.9.2-cdh4.0.1, precisely)
> > >> > > I got a case today on which I needed to clean up some fields
> before
> > >> > > processing. I will need to do the same for all my scripts. So
> > instead
> > >> of
> > >> > > doing it inside the scripts, I thought in extending PigStorage and
> > do
> > >> it
> > >> > > inside my own Loader. My scripts will be shorter and cleaner
> > >> > >
> > >> > > in fact, the only method that I needed to overwrite was :
> > >> > > void *readField*(byte[] buf, int start, int end)
> > >> > >
> > >> > >
> > >> > > Everything was ok and it is working. Problem was that I had to
> > >> > copy/paste a
> > >> > > lot just because private declarations
> > >> > > for example:
> > >> > >   private byte fieldDel = '\t';
> > >> > >   private ArrayList<Object> mProtoTuple = null;
> > >> > >   private TupleFactory mTupleFactory = TupleFactory.getInstance();
> > >> > >   private boolean mRequiredColumnsInitialized = false;
> > >> > >
> > >> > > and of course:
> > >> > > *private *void readField(byte[] buf, int start, int end)
> > >> > >
> > >> > > so I had to copy/paste:
> > >> > > public Tuple getNext() and all the aforementioned variables just
> to
> > be
> > >> > able
> > >> > > to write my own *readField*
> > >> > >
> > >> > >
> > >> > > would it be possible in next versions of Pig to have *readField
> > >> > *protected
> > >> > > as well as *mProtoTuple *? I think it could be useful in some
> cases
> > >> like
> > >> > > mine
> > >> > > I'm asking because I don't know the reasoning after the decisions
> of
> > >> made
> > >> > > them private
> > >> > >
> > >> > > thanks a lot,
> > >> >
> > >>
> > >>
> > >>
> > >> --
> > >> *Note that I'm no longer using my Yahoo! email address. Please email
> me
> > at
> > >> [email protected] going forward.*
> > >>
> > >
> > >
> >
>

Reply via email to