Author: gates
Date: Tue Oct 14 10:28:16 2008
New Revision: 704589

URL: http://svn.apache.org/viewvc?rev=704589&view=rev
Log:
PIG-476: Added DateExtractor, a piggybank eval func that extracts a date from a 
string.


Added:
    
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java
    
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java
Modified:
    incubator/pig/trunk/CHANGES.txt

Modified: incubator/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704589&r1=704588&r2=704589&view=diff
==============================================================================
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Tue Oct 14 10:28:16 2008
@@ -371,3 +371,6 @@
        PIG-488: Added SearchTermExtractor, a piggybank eval func that, for many
        search engines, recognizes the search term in the URL returns it to the
        caller (spackest via gates).
+
+       PIG-476: Added DateExtractor, a piggybank eval func that extracts a date
+       from a string (spackest via gates).

Added: 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java?rev=704589&view=auto
==============================================================================
--- 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java
 (added)
+++ 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/DateExtractor.java
 Tue Oct 14 10:28:16 2008
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.evaluation.util.apachelogparser;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+/**
+ * DateExtractor has three different constructors which each allow for 
different functionality. The
+ * incomingDateFormat (dd/MMM/yyyy:HH:mm:ss Z by default) is used to match the 
date string that gets passed in from the
+ * log. The outgoingDateFormat (MM-dd-yyyy by default) is used to 
format the returned string.
+ * 
+ * Different constructors exist for each combination; please use the 
appropriate respective constructor.
+ * 
+ * Note that any data that exists in the SimpleDateFormat schema can be 
supported. For example, if you were
+ * starting with the default incoming format and wanted to extract just the 
year, you would use the single
+ * string constructor DateExtractor("yyyy").
+ * 
+ * From pig latin you will need to use aliases to use a non-default format, 
like
+ * 
+ * define MyDateExtractor 
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor("MM-yyyy");
+ * 
+ * A = FOREACH row GENERATE MyDateExtractor(dayTime);
+ */
+public class DateExtractor extends EvalFunc<DataAtom> {
+    private static SimpleDateFormat DEFAULT_INCOMING_DATE_FORMAT = new 
SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z");
+    private static SimpleDateFormat DEFAULT_OUTGOING_DATE_FORMAT = new 
SimpleDateFormat("MM-dd-yyyy");
+
+    private SimpleDateFormat incomingDateFormat;
+    private SimpleDateFormat outgoingDateFormat;
+
+    /**
+     * forms the formats based on default incomingDateFormat and default 
outgoingDateFormat
+     * (takes no arguments; both fields reference the shared static DEFAULT_* instances, not copies).
+     * NOTE(review): SimpleDateFormat is not synchronized, so sharing these is unsafe if exec 
ever runs on several threads - confirm against Pig's threading model.
+     */
+    public DateExtractor() {
+        incomingDateFormat = DEFAULT_INCOMING_DATE_FORMAT;
+        outgoingDateFormat = DEFAULT_OUTGOING_DATE_FORMAT;
+    }
+
+    /**
+     * forms the formats based on passed outgoingDateString and the default 
incomingDateFormat
+     * 
+     * @param outgoingDateString outgoingDateFormat is based on 
outgoingDateString
+     */
+    public DateExtractor(String outgoingDateString) {
+        incomingDateFormat = DEFAULT_INCOMING_DATE_FORMAT;
+        outgoingDateFormat = new SimpleDateFormat(outgoingDateString);
+    }
+
+    /**
+     * forms the formats based on passed incomingDateString and 
outgoingDateString
+     * 
+     * @param incomingDateString incomingDateFormat is based on 
incomingDateString
+     * @param outgoingDateString outgoingDateFormat is based on 
outgoingDateString
+     * 
+     */
+    public DateExtractor(String incomingDateString, String outgoingDateString) 
{
+        incomingDateFormat = new SimpleDateFormat(incomingDateString);
+        outgoingDateFormat = new SimpleDateFormat(outgoingDateString);
+    }
+
+    @Override
+    public void exec(Tuple input, DataAtom output) { // parses field 0 of input with incomingDateFormat and stores the reformatted date in output
+        String incomingDateString = input.getAtomField(0).strval();
+
+        Date date = null;
+        try {
+            date = incomingDateFormat.parse(incomingDateString);
+        } catch (ParseException e) {
+            System.err.println("Unable to parse incoming date string " +
+                incomingDateString + ", " + e.getMessage());
+            return; // unparseable input: the problem is reported on stderr and output is never written to
+        }
+
+        String outgoingDateString = null;
+        if (date != null)
+            outgoingDateString = outgoingDateFormat.format(date); // NOTE(review): rendered in the formatter's time zone (JVM default unless set), not the offset found in the input - confirm intended
+
+        if (outgoingDateString != null)
+            output.setValue(outgoingDateString);
+    }
+}

Added: 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java?rev=704589&view=auto
==============================================================================
--- 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java
 (added)
+++ 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestDateExtractor.java
 Tue Oct 14 10:28:16 2008
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.evaluation.util.apachelogparser;
+
+import java.util.ArrayList;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Datum;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor;
+import org.junit.Test;
+
+public class TestDateExtractor extends TestCase { // exercises each DateExtractor constructor through exec(Tuple, DataAtom); NOTE(review): extends JUnit 3 TestCase while also using JUnit 4 @Test - presumably the annotations are redundant, confirm
+    @Test
+    public void testInstantiation() { // the no-arg constructor must yield a usable instance
+        assertNotNull(new DateExtractor());
+    }
+
+    @Test
+    public void testDefaultFormatters() { // no-arg constructor: input carries a zone offset, expected output is MM-dd-yyyy
+        DateExtractor dayExtractor = new DateExtractor();
+
+        Tuple input = new Tuple(new DataAtom("20/Sep/2008:23:53:04 -0600"));
+        DataAtom output = new DataAtom();
+        dayExtractor.exec(input, output);
+        assertEquals("09-20-2008", output.toString()); // NOTE(review): 23:53 -0600 falls on 21 Sep in zones east of -0600, so this expectation looks dependent on the JVM default time zone - confirm
+    }
+
+    @Test
+    public void testFailureThenSuccess() { // an unparseable value must leave output empty and must not affect the next call
+        DateExtractor dayExtractor = new DateExtractor();
+
+        Tuple input = new Tuple(new DataAtom("dud"));
+        DataAtom output = new DataAtom();
+        dayExtractor.exec(input, output);
+        assertEquals("", output.toString());
+
+        input = new Tuple(new DataAtom("20/Sep/2008:23:53:04 -0600"));
+        output = new DataAtom();
+        dayExtractor.exec(input, output);
+        assertEquals("09-20-2008", output.toString()); // same time-zone caveat as in testDefaultFormatters
+    }
+
+    @Test
+    public void testPassedOutputFormatter() { // single-string constructor: the argument is the outgoing pattern
+        DateExtractor dayExtractor = new DateExtractor("MM-dd-yyyy");
+
+        ArrayList<Datum> input = new ArrayList<Datum>();
+        input.add(new DataAtom("20/Sep/2008:23:53:04 -0600"));
+
+        DataAtom output = new DataAtom();
+        dayExtractor.exec(new Tuple(input), output);
+        assertEquals("09-20-2008", output.toString());
+    }
+
+    @Test
+    public void testPassedInputOutputFormatter() { // two-string constructor: incoming pattern first, outgoing pattern second; input has no zone offset
+        DateExtractor dayExtractor = new DateExtractor("dd/MMM/yyyy:HH:mm:ss", 
"MM~dd~yyyy");
+
+        ArrayList<Datum> input = new ArrayList<Datum>();
+        input.add(new DataAtom("20/Sep/2008:23:53:04"));
+
+        DataAtom output = new DataAtom();
+        dayExtractor.exec(new Tuple(input), output);
+        assertEquals("09~20~2008", output.toString());
+    }
+
+    @Test
+    public void testPassedOutputInputFormatterYear() { // two-string constructor extracting only the year
+        DateExtractor dayExtractor = new DateExtractor("dd/MMM/yyyy:HH:mm:ss", 
"yyyy");
+
+        ArrayList<Datum> input = new ArrayList<Datum>();
+        input.add(new DataAtom("20/Sep/2008:23:53:04"));
+
+        DataAtom output = new DataAtom();
+        dayExtractor.exec(new Tuple(input), output);
+        assertEquals("2008", output.toString());
+    }
+
+    @Test
+    public void testPassedOutputFormatterYear() { // single-string constructor extracting only the year from the default incoming layout
+        DateExtractor dayExtractor = new DateExtractor("yyyy");
+
+        ArrayList<Datum> input = new ArrayList<Datum>();
+        input.add(new DataAtom("20/Sep/2008:23:53:04 -0600"));
+
+        DataAtom output = new DataAtom();
+        dayExtractor.exec(new Tuple(input), output);
+        assertEquals("2008", output.toString());
+    }
+}


Reply via email to