Author: gates Date: Tue Oct 14 10:28:16 2008 New Revision: 704589 URL: http://svn.apache.org/viewvc?rev=704589&view=rev Log: PIG-476: Added DateExtractor, a piggybank eval func that extracts a date from a string.
Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java Modified: incubator/pig/trunk/CHANGES.txt Modified: incubator/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704589&r1=704588&r2=704589&view=diff ============================================================================== --- incubator/pig/trunk/CHANGES.txt (original) +++ incubator/pig/trunk/CHANGES.txt Tue Oct 14 10:28:16 2008 @@ -371,3 +371,6 @@ PIG-488: Added SearchTermExtractor, a piggybank eval func that, for many search engines, recognizes the search term in the URL returns it to the caller (spackest via gates). + + PIG-476: Added DateExtractor, a piggybank eval func that extracts a date + from a string (spackest via gates). Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java?rev=704589&view=auto ============================================================================== --- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java (added) +++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java Tue Oct 14 10:28:16 2008 @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. 
The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. + */ + +package org.apache.pig.piggybank.evaluation.util.apachelogparser; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; + +import org.apache.pig.EvalFunc; +import org.apache.pig.data.DataAtom; +import org.apache.pig.data.Tuple; + +/** + * DateExtractor has three different constructors which each allow for different functionality. The + * incomingDateFormat (dd/MMM/yyyy:HH:mm:ss Z by default) is used to match the date string that gets passed in from the + * log. The outgoingDateFormat (MM-dd-yyyy by default) is used to format the returned string. + * + * Different constructors exist for each combination; please use the appropriate respective constructor. + * + * Note that any data that exists in the SimpleDateFormat schema can be supported. For example, if you were + * starting with the default incoming format and wanted to extract just the year, you would use the single + * string constructor DateExtractor("yyyy"). 
+ * + * From pig latin you will need to use aliases to use a non-default format, like + * + * define MyDateExtractor org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor("MM-yyyy"); + * + * A = FOREACH row GENERATE MyDateExtractor(dayTime); + */ +public class DateExtractor extends EvalFunc<DataAtom> { + private static SimpleDateFormat DEFAULT_INCOMING_DATE_FORMAT = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z"); + private static SimpleDateFormat DEFAULT_OUTGOING_DATE_FORMAT = new SimpleDateFormat("MM-dd-yyyy"); + + private SimpleDateFormat incomingDateFormat; + private SimpleDateFormat outgoingDateFormat; + + /** + * forms the formats based on default incomingDateFormat and default outgoingDateFormat + * + * Takes no arguments; use one of the other constructors to override either format. + */ + public DateExtractor() { + incomingDateFormat = DEFAULT_INCOMING_DATE_FORMAT; + outgoingDateFormat = DEFAULT_OUTGOING_DATE_FORMAT; + } + + /** + * forms the formats based on passed outgoingDateString and the default incomingDateFormat + * + * @param outgoingDateString outgoingDateFormat is based on outgoingDateString + */ + public DateExtractor(String outgoingDateString) { + incomingDateFormat = DEFAULT_INCOMING_DATE_FORMAT; + outgoingDateFormat = new SimpleDateFormat(outgoingDateString); + } + + /** + * forms the formats based on passed incomingDateString and outgoingDateString + * + * @param incomingDateString incomingDateFormat is based on incomingDateString + * @param outgoingDateString outgoingDateFormat is based on outgoingDateString + * + */ + public DateExtractor(String incomingDateString, String outgoingDateString) { + incomingDateFormat = new SimpleDateFormat(incomingDateString); + outgoingDateFormat = new SimpleDateFormat(outgoingDateString); + } + + @Override + public void exec(Tuple input, DataAtom output) { + String incomingDateString = input.getAtomField(0).strval(); + + Date date = null; + try { + date = 
incomingDateFormat.parse(incomingDateString); + } catch (ParseException e) { + System.err.println("Unable to parse incoming date string " + + incomingDateString + ", " + e.getMessage()); + return; + } + + String outgoingDateString = null; + if (date != null) + outgoingDateString = outgoingDateFormat.format(date); + + if (outgoingDateString != null) + output.setValue(outgoingDateString); + } +} Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java?rev=704589&view=auto ============================================================================== --- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java (added) +++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java Tue Oct 14 10:28:16 2008 @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. 
+ */ + +package org.apache.pig.piggybank.test.evaluation.util.apachelogparser; + +import java.util.ArrayList; + +import junit.framework.TestCase; + +import org.apache.pig.data.DataAtom; +import org.apache.pig.data.Datum; +import org.apache.pig.data.Tuple; +import org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor; +import org.junit.Test; + +public class TestDateExtractor extends TestCase { + @Test + public void testInstantiation() { + assertNotNull(new DateExtractor()); + } + + @Test + public void testDefaultFormatters() { + DateExtractor dayExtractor = new DateExtractor(); + + Tuple input = new Tuple(new DataAtom("20/Sep/2008:23:53:04 -0600")); + DataAtom output = new DataAtom(); + dayExtractor.exec(input, output); + assertEquals("09-20-2008", output.toString()); + } + + @Test + public void testFailureThenSuccess() { + DateExtractor dayExtractor = new DateExtractor(); + + Tuple input = new Tuple(new DataAtom("dud")); + DataAtom output = new DataAtom(); + dayExtractor.exec(input, output); + assertEquals("", output.toString()); + + input = new Tuple(new DataAtom("20/Sep/2008:23:53:04 -0600")); + output = new DataAtom(); + dayExtractor.exec(input, output); + assertEquals("09-20-2008", output.toString()); + } + + @Test + public void testPassedOutputFormatter() { + DateExtractor dayExtractor = new DateExtractor("MM-dd-yyyy"); + + ArrayList<Datum> input = new ArrayList<Datum>(); + input.add(new DataAtom("20/Sep/2008:23:53:04 -0600")); + + DataAtom output = new DataAtom(); + dayExtractor.exec(new Tuple(input), output); + assertEquals("09-20-2008", output.toString()); + } + + @Test + public void testPassedInputOutputFormatter() { + DateExtractor dayExtractor = new DateExtractor("dd/MMM/yyyy:HH:mm:ss", "MM~dd~yyyy"); + + ArrayList<Datum> input = new ArrayList<Datum>(); + input.add(new DataAtom("20/Sep/2008:23:53:04")); + + DataAtom output = new DataAtom(); + dayExtractor.exec(new Tuple(input), output); + assertEquals("09~20~2008", output.toString()); + 
} + + @Test + public void testPassedOutputInputFormatterYear() { + DateExtractor dayExtractor = new DateExtractor("dd/MMM/yyyy:HH:mm:ss", "yyyy"); + + ArrayList<Datum> input = new ArrayList<Datum>(); + input.add(new DataAtom("20/Sep/2008:23:53:04")); + + DataAtom output = new DataAtom(); + dayExtractor.exec(new Tuple(input), output); + assertEquals("2008", output.toString()); + } + + @Test + public void testPassedOutputFormatterYear() { + DateExtractor dayExtractor = new DateExtractor("yyyy"); + + ArrayList<Datum> input = new ArrayList<Datum>(); + input.add(new DataAtom("20/Sep/2008:23:53:04 -0600")); + + DataAtom output = new DataAtom(); + dayExtractor.exec(new Tuple(input), output); + assertEquals("2008", output.toString()); + } +}