Hello,

REGEX_EXTRACT uses Matcher.find() instead of Matcher.matches(), and so it
does not work with some non-greedy regular expressions.

Is this the intended behavior?

Thanks,

Romain


http://docs.oracle.com/javase/1.4.2/docs/api/java/util/regex/Matcher.html



   - The matches
     <http://docs.oracle.com/javase/1.4.2/docs/api/java/util/regex/Matcher.html#matches()>
     method attempts to match the entire input sequence against the pattern.
   - The find
     <http://docs.oracle.com/javase/1.4.2/docs/api/java/util/regex/Matcher.html#find()>
     method scans the input sequence looking for the next subsequence that
     matches the pattern.




    // Case 1: Matcher.find(), the call REGEX_EXTRACT uses.
    System.out.println("Pig's way with m.find()");
    String a = "hdfs://mygrid.com/projects/";
    // Reluctant group (.+?) followed by an optional trailing slash.
    Matcher m = Pattern.compile("(.+?)/?").matcher(a);
    // find() accepts the first matching subsequence; per the output below,
    // group 1 is only "h" and the match spans [0, 1).
    System.out.println(m.find());
    System.out.println(m.group(1));
    System.out.println(m.start());
    System.out.println(m.end());

    // Case 2: same input and pattern, but matches() must consume the whole input.
    System.out.println("\nm.matches()");
    a = "hdfs://mygrid.com/projects/";
    m = Pattern.compile("(.+?)/?").matcher(a);
    // Per the output below, group 1 is now the URL without its trailing slash
    // and the match spans [0, 27).
    System.out.println(m.matches());
    System.out.println(m.group(1));
    System.out.println(m.start());
    System.out.println(m.end());

    // Case 3: the copied REGEX_EXTRACT, called with (input string, regex, group index).
    System.out.println("\nREGEX_EXTRACT m.find()");
    Tuple t = TupleFactory.getInstance().newTuple();
    t.append(a);
    t.append("(.+?)/?");
    t.append(1);
    // Prints "h" (see output below), the same result as the find() case.
    System.out.println(new TestPigExtractAll().new REGEX_EXTRACT().exec(t));


Output:

Pig's way with m.find()
true
h
0
1

m.matches()
true
hdfs://mygrid.com/projects
0
27

REGEX_EXTRACT m.find()
h
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

/**
 * Demonstrates how {@link Matcher#find()} and {@link Matcher#matches()} differ for the
 * reluctant pattern {@code (.+?)/?}, and shows that the copied {@code REGEX_EXTRACT} logic,
 * which relies on {@code find()}, returns only the first character of the input.
 *
 * <p>Output:
 *
 * <pre>
 * Pig's way with m.find()
 * true
 * h
 * 0
 * 1
 *
 * m.matches()
 * true
 * hdfs://mygrid.com/projects
 * 0
 * 27
 *
 * REGEX_EXTRACT m.find()
 * h
 * </pre>
 */
public class TestPigExtractAll {

  /**
   * Tries to remove the / at the end of the URL with the pattern {@code (.+?)/?}, first with
   * {@code find()}, then with {@code matches()}, and finally through {@link REGEX_EXTRACT},
   * printing group 1 (and, for the two direct cases, the match bounds) each time.
   *
   * @param args unused
   * @throws IOException if {@link REGEX_EXTRACT#exec(Tuple)} rejects its input
   */
  public static void main(String[] args) throws IOException {
    final String url = "hdfs://mygrid.com/projects/";
    final String regex = "(.+?)/?";

    System.out.println("Pig's way with m.find()");
    Matcher m = Pattern.compile(regex).matcher(url);
    System.out.println(m.find());
    System.out.println(m.group(1));
    System.out.println(m.start());
    System.out.println(m.end());

    System.out.println("\nm.matches()");
    m = Pattern.compile(regex).matcher(url);
    System.out.println(m.matches());
    System.out.println(m.group(1));
    System.out.println(m.start());
    System.out.println(m.end());

    System.out.println("\nREGEX_EXTRACT m.find()");
    Tuple t = TupleFactory.getInstance().newTuple();
    t.append(url);
    t.append(regex);
    t.append(1);
    System.out.println(new TestPigExtractAll().new REGEX_EXTRACT().exec(t));
  }

  /**
   * Copied from Pig builtin. Extracts one capture group from a string, caching the compiled
   * pattern between calls for as long as the regular expression does not change.
   */
  public class REGEX_EXTRACT {
    /** Source text of the regular expression currently held in {@link #mPattern}. */
    String mExpression = null;

    /** Compiled form of {@link #mExpression}; both fields are always updated together. */
    Pattern mPattern = null;

    /**
     * Returns the requested capture group of the first match of the regex in the input string.
     *
     * @param input a 3-field tuple: field 0 is the string to search, field 1 is the regular
     *     expression, field 2 is the capture group index (an {@code Integer})
     * @return the text of the requested group, or {@code null} when field 0 is {@code null},
     *     when the pattern is not found, or when the pattern has fewer groups than the index
     * @throws IOException if the tuple does not have exactly 3 fields, if the regular
     *     expression is {@code null}, or if it cannot be compiled
     */
    public String exec(Tuple input) throws IOException {
      if (input.size() != 3) {
        throw new IOException("RegexExtract : Only 3 parameters are allowed.");
      }
      if (input.get(0) == null) {
        return null;
      }

      // Explicit null check instead of catching the NullPointerException from equals().
      Object regex = input.get(1);
      if (regex == null) {
        throw new IOException("RegexExtract : Regular expression is null");
      }

      if (!regex.equals(mExpression)) {
        try {
          String expression = (String) regex;
          Pattern compiled = Pattern.compile(expression);
          // Update the cache only after a successful compile, so a bad regex can never be
          // left paired with a stale (or null) pattern on the next call.
          mExpression = expression;
          mPattern = compiled;
        } catch (RuntimeException e) {
          // Covers PatternSyntaxException and a non-String regex field; keep the cause.
          throw new IOException("RegexExtract : Mal-Formed Regular expression : " + regex, e);
        }
      }

      int groupIndex = (Integer) input.get(2);
      Matcher m = mPattern.matcher((String) input.get(0));

      // find() (not matches()) is intentional here: this mirrors the Pig builtin being studied.
      if (m.find() && m.groupCount() >= groupIndex) {
        return m.group(groupIndex);
      }
      return null;
    }
  }
}

Reply via email to