Hello,
REGEX_EXTRACT uses Matcher.find() instead of Matcher.matches(), and so
it does not work as expected with some non-greedy regular expressions.
Is this the intended behavior?
Thanks,
Romain
http://docs.oracle.com/javase/1.4.2/docs/api/java/util/regex/Matcher.html
- The matches() method <http://docs.oracle.com/javase/1.4.2/docs/api/java/util/regex/Matcher.html#matches()>
  attempts to match the entire input sequence against the pattern.
- The find() method <http://docs.oracle.com/javase/1.4.2/docs/api/java/util/regex/Matcher.html#find()>
  scans the input sequence looking for the next subsequence that matches the
  pattern.
// The same reluctant pattern and input are used for all three experiments.
final String regex = "(.+?)/?";
final String url = "hdfs://mygrid.com/projects/";
final Pattern lazyPattern = Pattern.compile(regex);

// 1) find(): scans for the next subsequence that matches the pattern.
System.out.println("Pig's way with m.find()");
final Matcher finder = lazyPattern.matcher(url);
System.out.println(finder.find());
System.out.println(finder.group(1));
System.out.println(finder.start());
System.out.println(finder.end());

// 2) matches(): attempts to match the entire input sequence against the pattern.
System.out.println("\nm.matches()");
final Matcher wholeMatcher = lazyPattern.matcher(url);
System.out.println(wholeMatcher.matches());
System.out.println(wholeMatcher.group(1));
System.out.println(wholeMatcher.start());
System.out.println(wholeMatcher.end());

// 3) REGEX_EXTRACT with the arguments (input, regex, group index).
System.out.println("\nREGEX_EXTRACT m.find()");
final Tuple extractArgs = TupleFactory.getInstance().newTuple();
extractArgs.append(url);
extractArgs.append(regex);
extractArgs.append(1);
System.out.println(new TestPigExtractAll().new REGEX_EXTRACT().exec(extractArgs));
Output:
Pig's way with m.find()
true
h
0
1
m.matches()
true
hdfs://mygrid.com/projects
0
27
REGEX_EXTRACT m.find()
h
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
/**
 * Demonstrates the difference between {@link Matcher#find()} and
 * {@link Matcher#matches()} for a pattern with a reluctant (non-greedy)
 * quantifier, and shows that the copy of REGEX_EXTRACT below returns the same
 * group as {@code find()}.
 *
 * <p>Output (the blank lines produced by the leading {@code \n} are omitted):
 * <pre>
 * Pig's way with m.find()
 * true
 * h
 * 0
 * 1
 * m.matches()
 * true
 * hdfs://mygrid.com/projects
 * 0
 * 27
 * REGEX_EXTRACT m.find()
 * h
 * </pre>
 */
public class TestPigExtractAll {

    /**
     * Runs the pattern {@code (.+?)/?} against a URL that ends with a slash,
     * first with {@code find()}, then with {@code matches()}, and finally
     * through {@link REGEX_EXTRACT}, printing the result of each attempt.
     * The pattern is meant to capture the URL without its trailing slash.
     *
     * @param avirgs command-line arguments (unused)
     * @throws IOException if {@link REGEX_EXTRACT#exec(Tuple)} rejects its input
     */
    public static void main(String[] avirgs) throws IOException {
        System.out.println("Pig's way with m.find()");
        String a = "hdfs://mygrid.com/projects/";
        Matcher m = Pattern.compile("(.+?)/?").matcher(a);
        System.out.println(m.find());
        System.out.println(m.group(1));
        System.out.println(m.start());
        System.out.println(m.end());

        System.out.println("\nm.matches()");
        a = "hdfs://mygrid.com/projects/";
        m = Pattern.compile("(.+?)/?").matcher(a);
        System.out.println(m.matches());
        System.out.println(m.group(1));
        System.out.println(m.start());
        System.out.println(m.end());

        System.out.println("\nREGEX_EXTRACT m.find()");
        Tuple t = TupleFactory.getInstance().newTuple();
        t.append(a);
        t.append("(.+?)/?");
        t.append(1);
        System.out.println(new TestPigExtractAll().new REGEX_EXTRACT().exec(t));
    }

    /**
     * Copied from the Pig builtin. Extracts one capturing group from the first
     * subsequence of the input that matches a regular expression.
     *
     * <p>The most recently compiled pattern is cached in {@link #mPattern} and
     * reused while the same expression keeps being passed.
     */
    public class REGEX_EXTRACT {
        /** Source text of the pattern currently held in {@link #mPattern}. */
        String mExpression = null;
        /** Compiled form of {@link #mExpression}; null until the first successful compile. */
        Pattern mPattern = null;

        /**
         * Applies the regular expression to the input string and returns the
         * requested capturing group.
         *
         * @param input a tuple of exactly three fields: the string to search,
         *              the regular expression, and the index of the group to return
         * @return the text of the requested group, or {@code null} when the
         *         input string is null, nothing matches, or the group index is
         *         outside the pattern's group range
         * @throws IOException if the tuple does not have three fields, or the
         *                     regular expression is null or cannot be compiled
         */
        public String exec(Tuple input) throws IOException {
            if (input.size() != 3) {
                String msg = "RegexExtract : Only 3 parameters are allowed.";
                throw new IOException(msg);
            }
            if (input.get(0) == null) {
                return null;
            }

            // Explicit null check instead of catching NullPointerException.
            Object expression = input.get(1);
            if (expression == null) {
                String msg = "RegexExtract : Regular expression is null";
                throw new IOException(msg);
            }

            if (!expression.equals(mExpression)) {
                Pattern compiled;
                try {
                    compiled = Pattern.compile((String) expression);
                } catch (RuntimeException e) {
                    String msg = "RegexExtract : Mal-Formed Regular expression : " + expression;
                    throw new IOException(msg, e);
                }
                // Update the cache only after a successful compile, so a bad
                // expression can never be paired with a stale or null pattern.
                mExpression = (String) expression;
                mPattern = compiled;
            }

            int mIndex = (Integer) input.get(2);
            Matcher m = mPattern.matcher((String) input.get(0));
            // A negative index is treated like an index that is too large: no result.
            if (mIndex >= 0 && m.find() && m.groupCount() >= mIndex) {
                return m.group(mIndex);
            }
            return null;
        }
    }
}