Author: olga
Date: Mon Oct 12 18:22:12 2009
New Revision: 824446

URL: http://svn.apache.org/viewvc?rev=824446&view=rev
Log:
PIG-1015: [piggybank] DateExtractor should take into account timezones (dryaboy via olgan)

Modified: hadoop/pig/trunk/contrib/CHANGES.txt hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java Modified: hadoop/pig/trunk/contrib/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/CHANGES.txt?rev=824446&r1=824445&r2=824446&view=diff ============================================================================== --- hadoop/pig/trunk/contrib/CHANGES.txt (original) +++ hadoop/pig/trunk/contrib/CHANGES.txt Mon Oct 12 18:22:12 2009 @@ -1,3 +1,5 @@ +PIG-1015: [piggybank] DateExtractor should take into account timezones +(dryaboy via olgan) PIG-911: Added SequenceFileLoader (dryaboy via gates) PIG-885: New UDFs for piggybank (Bin, Decode, LookupInFiles, RegexExtract, RegexMatch, HashFVN, DiffDate) (daijy) PIG-868: added strin manipulation functions (bennies via olgan) Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java?rev=824446&r1=824445&r2=824446&view=diff ============================================================================== --- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java (original) +++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java Mon Oct 12 18:22:12 2009 @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; +import java.util.TimeZone; import org.apache.pig.EvalFunc; import org.apache.pig.FuncSpec; @@ -29,9 +30,9 @@ import org.apache.pig.impl.util.WrappedIOException; /** 
- * DateExtractor has three different constructors which each allow for different functionality. The - * incomingDateFormat (yyyy-MM-dd by default) is used to match the date string that gets passed in from the - * log. The outgoingDateFormat (dd/MMM/yyyy:HH:mm:ss Z by default) is used to format the returned string. + * DateExtractor has four different constructors which each allow for different functionality. The + * incomingDateFormat ("dd/MMM/yyyy:HH:mm:ss Z" by default) is used to match the date string that gets passed in from the + * log. The outgoingDateFormat ("yyyy-MM-dd" by default) is used to format the returned string. * * Different constructors exist for each combination; please use the appropriate respective constructor. * @@ -46,12 +47,14 @@ * A = FOREACH row GENERATE DateExtractor(dayTime); * * If a string cannot be parsed, null will be returned and an error message printed to stderr. - * + * + * By default, the DateExtractor uses the GMT timezone. You can use the three-parameter constructor to override the + * timezone. 
*/ public class DateExtractor extends EvalFunc<String> { - private static SimpleDateFormat DEFAULT_INCOMING_DATE_FORMAT = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z"); - private static SimpleDateFormat DEFAULT_OUTGOING_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd"); - + private static String DEFAULT_INCOMING_DATE_FORMAT = "dd/MMM/yyyy:HH:mm:ss Z"; + private static String DEFAULT_OUTGOING_DATE_FORMAT = "yyyy-MM-dd"; + private static String DEFAULT_TZ_ID="GMT"; private SimpleDateFormat incomingDateFormat; private SimpleDateFormat outgoingDateFormat; @@ -61,8 +64,7 @@ * @param outgoingDateString outgoingDateFormat is based on outgoingDateString */ public DateExtractor() { - incomingDateFormat = DEFAULT_INCOMING_DATE_FORMAT; - outgoingDateFormat = DEFAULT_OUTGOING_DATE_FORMAT; + this(DEFAULT_INCOMING_DATE_FORMAT, DEFAULT_OUTGOING_DATE_FORMAT, DEFAULT_TZ_ID); } /** @@ -71,8 +73,7 @@ * @param outgoingDateString outgoingDateFormat is based on outgoingDateString */ public DateExtractor(String outgoingDateString) { - incomingDateFormat = DEFAULT_INCOMING_DATE_FORMAT; - outgoingDateFormat = new SimpleDateFormat(outgoingDateString); + this(DEFAULT_INCOMING_DATE_FORMAT, outgoingDateString, "GMT"); } /** @@ -83,10 +84,25 @@ * */ public DateExtractor(String incomingDateString, String outgoingDateString) { + this(incomingDateString, outgoingDateString, DEFAULT_TZ_ID); + } + + /** + * forms the formats based on passed incomingDateString and outgoingDateString + * + * @param incomingDateString incomingDateFormat is based on incomingDateString + * @param outgoingDateString outgoingDateFormat is based on outgoingDateString + * @param timeZoneID time zone id in which dates should be expressed. 
+ * + */ + public DateExtractor(String incomingDateString, String outgoingDateString, String timeZoneID) { + TimeZone tz = TimeZone.getTimeZone(timeZoneID); incomingDateFormat = new SimpleDateFormat(incomingDateString); outgoingDateFormat = new SimpleDateFormat(outgoingDateString); + incomingDateFormat.setTimeZone(tz); + outgoingDateFormat.setTimeZone(tz); } - + @Override public String exec(Tuple input) throws IOException { if (input == null || input.size() == 0) @@ -96,6 +112,7 @@ str = (String)input.get(0); Date date = incomingDateFormat.parse(str); return outgoingDateFormat.format(date); + } catch (ParseException pe) { System.err.println("piggybank.evaluation.util.apachelogparser.DateExtractor: unable to parse date "+str); return null; Modified: hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java?rev=824446&r1=824445&r2=824446&view=diff ============================================================================== --- hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java (original) +++ hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java Mon Oct 12 18:22:12 2009 @@ -33,23 +33,35 @@ @Test public void testDefaultFormatters() throws Exception { DateExtractor dayExtractor = new DateExtractor(); + // test that GMT conversion moves the day input.set(0, "20/Sep/2008:23:53:04 -0600"); + assertEquals("2008-09-21", dayExtractor.exec(input)); + + // test that if the string is already in GMT, nothing moves + input.set(0, "20/Sep/2008:23:53:04 -0000"); assertEquals("2008-09-20", dayExtractor.exec(input)); } @Test + public void testMZFormatters() throws 
Exception { + DateExtractor extractor = new DateExtractor("dd/MMM/yyyy:HH:mm:ss Z", "yyyy-MM-dd", "PST"); + input.set(0, "20/Sep/2008:23:53:04 -0700"); + assertEquals("2008-09-20", extractor.exec(input)); + } + + @Test public void testFailureThenSuccess() throws Exception { DateExtractor dayExtractor = new DateExtractor(); input.set(0,"dud"); assertEquals(null, dayExtractor.exec(input)); - input.set(0,"20/Sep/2008:23:53:04 -0600"); + input.set(0,"20/Sep/2008:23:53:04 -0000"); assertEquals("2008-09-20", dayExtractor.exec(input)); } @Test public void testPassedOutputFormatter() throws Exception { DateExtractor dayExtractor = new DateExtractor("MM-dd-yyyy"); - input.set(0,"20/Sep/2008:23:53:04 -0600"); + input.set(0,"20/Sep/2008:23:53:04 -0000"); assertEquals("09-20-2008", dayExtractor.exec(input)); }