svn commit: r709206 - in /hadoop/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/ contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/stora

2008-10-30 Thread gates
Author: gates
Date: Thu Oct 30 09:56:48 2008
New Revision: 709206

URL: http://svn.apache.org/viewvc?rev=709206&view=rev
Log:
PIG-509: Added CombinedLogLoader, loads logs that were created using Apache's 
combined log format.

Added:

hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java

hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java
Modified:
hadoop/pig/trunk/CHANGES.txt

Modified: hadoop/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=709206&r1=709205&r2=709206&view=diff
==
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Thu Oct 30 09:56:48 2008
@@ -379,3 +379,6 @@
gates).
 
 move to hadoop
+
+   PIG-509: Added CombinedLogLoader, loads logs that were created using
+   Apache's combined log format (spackest via gates).

Added: 
hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java?rev=709206&view=auto
==
--- 
hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java
 (added)
+++ 
hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java
 Thu Oct 30 09:56:48 2008
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
License); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.storage.apachelog;
+
+import java.util.regex.Pattern;
+
+import org.apache.pig.piggybank.storage.RegExLoader;
+
+/**
+ * CombinedLogLoader is used to load logs based on Apache's combined log 
format, based on a format like
+ * 
+ * LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" 
combined
+ * 
+ * The log filename ends up being access_log from a line like
+ * 
+ * CustomLog logs/combined_log combined
+ * 
+ * Example:
+ * 
+ * raw = LOAD 'combined_log' USING 
org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader AS
+ * (remoteAddr, remoteLogname, user, time, method, uri, proto, status, bytes, 
referer, userAgent);
+ * 
+ */
+
+public class CombinedLogLoader extends RegExLoader {
+// 1.2.3.4 - - [30/Sep/2008:15:07:53 -0400] GET / HTTP/1.1 200 3190 -
+// Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) 
AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1
+private final static Pattern combinedLogPattern = Pattern
+
.compile(^(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+.(\\S+\\s+\\S+).\\s+\(\\S+)\\s+(.+?)\\s+(HTTP[^\]+)\\\s+(\\S+)\\s+(\\S+)\\s+\([^\]*)\\\s+\(.*)\$);
+
+public Pattern getPattern() {
+return combinedLogPattern;
+}
+}

Added: 
hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java?rev=709206&view=auto
==
--- 
hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java
 (added)
+++ 
hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java
 Thu Oct 30 09:56:48 2008
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
License); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in 

[Pig Wiki] Trivial Update of StorageFunction by CorinneC

2008-10-30 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Pig Wiki for change 
notification.

The following page has been changed by CorinneC:
http://wiki.apache.org/pig/StorageFunction

--
  
  If the !LoadFunc interface is implemented, the function can be used to load 
tuples. If the !StoreFunc interface is implemented, the function can be used to 
store tuples. Since loading and storing are usually tied to each other, most 
functions will implement both interfaces, e.g., !PigStorage and !BinStorage do. 
However, occasionally, we may write a function only for loading.
  
- For examples of how to implement the following interfaces, look at 
[http://svn.apache.org/repos/asf/incubator/pig/trunk/src/org/apache/pig/builtin/PigStorage.java
 PigStorage], or 
[http://svn.apache.org/repos/asf/incubator/pig/trunk/src/org/apache/pig/builtin/BinStorage.java
 BinStorage].
+ For examples of how to implement the following interfaces, look at 
[http://svn.apache.org/repos/asf/hadoop/pig/trunk/src/org/apache/pig/builtin/PigStorage.java
 PigStorage], or 
[http://svn.apache.org/repos/asf/hadoop/pig/trunk/src/org/apache/pig/builtin/BinStorage.java
 BinStorage].
  
  {{{
  public interface LoadFunc {


svn commit: r709222 - in /hadoop/pig/branches/types: src/org/apache/pig/builtin/DIFF.java test/org/apache/pig/test/TestBuiltin.java

2008-10-30 Thread gates
Author: gates
Date: Thu Oct 30 11:33:43 2008
New Revision: 709222

URL: http://svn.apache.org/viewvc?rev=709222&view=rev
Log:
PIG-511 Fixed flaws in builtin UDF diff pointed out by Crisitan Ivascu.


Modified:
hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java
hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java

Modified: hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java?rev=709222&r1=709221&r2=709222&view=diff
==
--- hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java (original)
+++ hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java Thu Oct 30 
11:33:43 2008
@@ -18,7 +18,9 @@
 package org.apache.pig.builtin;
 
 import java.io.IOException;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Set;
 
 import org.apache.pig.EvalFunc;
 import org.apache.pig.backend.executionengine.ExecException;
@@ -33,8 +35,6 @@
  * will emit any Tuples that are in on of the DataBags but not the other. If 
the
  * fields are values, it will emit tuples with values that do not match.
  * 
- * @author breed
- *
  */
 public class DIFF extends EvalFuncDataBag {
 TupleFactory mTupleFactory = TupleFactory.getInstance();
@@ -78,48 +78,20 @@
 DataBag bag1,
 DataBag bag2,
 DataBag emitTo) {
-// Create two distinct versions of the bag.  This will speed up
-// comparison, and provide us a sorted order so we don't have to do
-// an n^2 lookup.
-DataBag d1 = mBagFactory.newDistinctBag();
-DataBag d2 = mBagFactory.newDistinctBag();
-IteratorTuple i1 = d1.iterator();
-IteratorTuple i2 = d2.iterator();
-while (i1.hasNext()) d1.add(i1.next());
-while (i2.hasNext()) d2.add(i2.next());
-
-i1 = d1.iterator();
-i2 = d2.iterator();
-
-Tuple t1 = i1.next();
-Tuple t2 = i2.next();
-
-while (i1.hasNext()  i2.hasNext()) {
-int c = t1.compareTo(t2);
-
-if (c  0) {
-// place t1 in the result bag and advance i1
-emitTo.add(t1);
-t1 = i1.next();
-} else if (c  0) {
-// place t2 in the result bag and advance i2
-emitTo.add(t2);
-t2 = i2.next();
-} else if (c == 0) {
-// put neither in the result bag, advance both iterators
-t1 = i1.next();
-t2 = i2.next();
-}
-}
+// Build two hash tables and probe with first one, then the other.
+// This does make the assumption that the distinct set of keys from
+// each bag will fit in memory.
+SetTuple s1 = new HashSetTuple();
+IteratorTuple i1 = bag1.iterator();
+while (i1.hasNext()) s1.add(i1.next());
+
+SetTuple s2 = new HashSetTuple();
+IteratorTuple i2 = bag2.iterator();
+while (i2.hasNext()) s2.add(i2.next());
+
+for (Tuple t : s1) if (!s2.contains(t)) emitTo.add(t);
+for (Tuple t : s2) if (!s1.contains(t)) emitTo.add(t);
 
-// One ran out, put all the rest of the other (if there are any) in
-// the result bag.
-while (i1.hasNext()) {
-emitTo.add(i1.next());
-}
-while (i2.hasNext()) {
-emitTo.add(i2.next());
-}
 }
 
 

Modified: hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java?rev=709222&r1=709221&r2=709222&view=diff
==
--- hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java 
(original)
+++ hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java Thu Oct 
30 11:33:43 2008
@@ -19,6 +19,7 @@
 
 import java.io.File;
 import java.io.PrintWriter;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
@@ -890,6 +891,44 @@
 
 assertTrue(f1.equals(f2));
 }
+
+@Test
+public void testDIFF() throws Exception {
+// Test it in the case with two bags.
+BagFactory bf = BagFactory.getInstance();
+TupleFactory tf = TupleFactory.getInstance();
+
+DataBag b1 = bf.newDefaultBag();
+DataBag b2 = bf.newDefaultBag();
+for (int i = 0; i  10; i++) b1.add(tf.newTuple(new Integer(i)));
+for (int i = 0; i  10; i += 2) b2.add(tf.newTuple(new Integer(i)));
+Tuple t = tf.newTuple(2);
+t.set(0, b1);
+t.set(1, b2);
+DIFF d = new DIFF();
+DataBag result = d.exec(t);
+
+assertEquals(5, result.size());
+IteratorTuple i = result.iterator();
+int[]