svn commit: r709206 - in /hadoop/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/ contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/stora
Author: gates Date: Thu Oct 30 09:56:48 2008 New Revision: 709206 URL: http://svn.apache.org/viewvc?rev=709206view=rev Log: PIG-509: Added CombinedLogLoader, loads logs that were created using Apache's combined log format. Added: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java Modified: hadoop/pig/trunk/CHANGES.txt Modified: hadoop/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=709206r1=709205r2=709206view=diff == --- hadoop/pig/trunk/CHANGES.txt (original) +++ hadoop/pig/trunk/CHANGES.txt Thu Oct 30 09:56:48 2008 @@ -379,3 +379,6 @@ gates). move to hadoop + + PIG-509: Added CombinedLogLoader, loads logs that were created using + Apache's combined log format (spackest via gates). Added: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java?rev=709206view=auto == --- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java (added) +++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java Thu Oct 30 09:56:48 2008 @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the License); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. + */ + +package org.apache.pig.piggybank.storage.apachelog; + +import java.util.regex.Pattern; + +import org.apache.pig.piggybank.storage.RegExLoader; + +/** + * CombinedLogLoader is used to load logs based on Apache's combined log format, based on a format like + * + * LogFormat %h %l %u %t \%r\ %s %b \%{Referer}i\ \%{User-Agent}i\ combined + * + * The log filename ends up being access_log from a line like + * + * CustomLog logs/combined_log combined + * + * Example: + * + * raw = LOAD 'combined_log' USING org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader AS + * (remoteAddr, remoteLogname, user, time, method, uri, proto, status, bytes, referer, userAgent); + * + */ + +public class CombinedLogLoader extends RegExLoader { +// 1.2.3.4 - - [30/Sep/2008:15:07:53 -0400] GET / HTTP/1.1 200 3190 - +// Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1 +private final static Pattern combinedLogPattern = Pattern + .compile(^(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+.(\\S+\\s+\\S+).\\s+\(\\S+)\\s+(.+?)\\s+(HTTP[^\]+)\\\s+(\\S+)\\s+(\\S+)\\s+\([^\]*)\\\s+\(.*)\$); + +public Pattern getPattern() { +return combinedLogPattern; +} +} Added: hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java?rev=709206view=auto == --- 
hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java (added) +++ hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java Thu Oct 30 09:56:48 2008 @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the License); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in
[Pig Wiki] Trivial Update of StorageFunction by CorinneC
Dear Wiki user, You have subscribed to a wiki page or wiki category on Pig Wiki for change notification. The following page has been changed by CorinneC: http://wiki.apache.org/pig/StorageFunction -- If the !LoadFunc interface is implemented, the function can be used to load tuples. If the !StoreFunc interface is implemented, the function can be used to store tuples. Since loading and storing are usually tied to each other, most functions will implement both interfaces, e.g., !PigStorage and !BinStorage do. However, occasionally, we may write a function only for loading. - For examples of how to implement the following interfaces, look at [http://svn.apache.org/repos/asf/incubator/pig/trunk/src/org/apache/pig/builtin/PigStorage.java PigStorage], or [http://svn.apache.org/repos/asf/incubator/pig/trunk/src/org/apache/pig/builtin/BinStorage.java BinStorage]. + For examples of how to implement the following interfaces, look at [http://svn.apache.org/repos/asf/hadoop/pig/trunk/src/org/apache/pig/builtin/PigStorage.java PigStorage], or [http://svn.apache.org/repos/asf/hadoop/pig/trunk/src/org/apache/pig/builtin/BinStorage.java BinStorage]. {{{ public interface LoadFunc {
svn commit: r709222 - in /hadoop/pig/branches/types: src/org/apache/pig/builtin/DIFF.java test/org/apache/pig/test/TestBuiltin.java
Author: gates Date: Thu Oct 30 11:33:43 2008 New Revision: 709222 URL: http://svn.apache.org/viewvc?rev=709222&view=rev Log: PIG-511 Fixed flaws in builtin UDF diff pointed out by Crisitan Ivascu. Modified: hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java Modified: hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java?rev=709222&r1=709221&r2=709222&view=diff == --- hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java (original) +++ hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java Thu Oct 30 11:33:43 2008 @@ -18,7 +18,9 @@ package org.apache.pig.builtin; import java.io.IOException; +import java.util.HashSet; import java.util.Iterator; +import java.util.Set; import org.apache.pig.EvalFunc; import org.apache.pig.backend.executionengine.ExecException; @@ -33,8 +35,6 @@ * will emit any Tuples that are in on of the DataBags but not the other. If the * fields are values, it will emit tuples with values that do not match. * - * @author breed - * */ public class DIFF extends EvalFunc<DataBag> { TupleFactory mTupleFactory = TupleFactory.getInstance(); @@ -78,48 +78,20 @@ DataBag bag1, DataBag bag2, DataBag emitTo) { -// Create two distinct versions of the bag. This will speed up -// comparison, and provide us a sorted order so we don't have to do -// an n^2 lookup. 
-DataBag d1 = mBagFactory.newDistinctBag(); -DataBag d2 = mBagFactory.newDistinctBag(); -Iterator<Tuple> i1 = d1.iterator(); -Iterator<Tuple> i2 = d2.iterator(); -while (i1.hasNext()) d1.add(i1.next()); -while (i2.hasNext()) d2.add(i2.next()); - -i1 = d1.iterator(); -i2 = d2.iterator(); - -Tuple t1 = i1.next(); -Tuple t2 = i2.next(); - -while (i1.hasNext() && i2.hasNext()) { -int c = t1.compareTo(t2); - -if (c < 0) { -// place t1 in the result bag and advance i1 -emitTo.add(t1); -t1 = i1.next(); -} else if (c > 0) { -// place t2 in the result bag and advance i2 -emitTo.add(t2); -t2 = i2.next(); -} else if (c == 0) { -// put neither in the result bag, advance both iterators -t1 = i1.next(); -t2 = i2.next(); -} -} +// Build two hash tables and probe with first one, then the other. +// This does make the assumption that the distinct set of keys from +// each bag will fit in memory. +Set<Tuple> s1 = new HashSet<Tuple>(); +Iterator<Tuple> i1 = bag1.iterator(); +while (i1.hasNext()) s1.add(i1.next()); + +Set<Tuple> s2 = new HashSet<Tuple>(); +Iterator<Tuple> i2 = bag2.iterator(); +while (i2.hasNext()) s2.add(i2.next()); + +for (Tuple t : s1) if (!s2.contains(t)) emitTo.add(t); +for (Tuple t : s2) if (!s1.contains(t)) emitTo.add(t); -// One ran out, put all the rest of the other (if there are any) in -// the result bag. 
-while (i1.hasNext()) { -emitTo.add(i1.next()); -} -while (i2.hasNext()) { -emitTo.add(i2.next()); -} } Modified: hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java?rev=709222&r1=709221&r2=709222&view=diff == --- hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java (original) +++ hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java Thu Oct 30 11:33:43 2008 @@ -19,6 +19,7 @@ import java.io.File; import java.io.PrintWriter; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -890,6 +891,44 @@ assertTrue(f1.equals(f2)); } + +@Test +public void testDIFF() throws Exception { +// Test it in the case with two bags. +BagFactory bf = BagFactory.getInstance(); +TupleFactory tf = TupleFactory.getInstance(); + +DataBag b1 = bf.newDefaultBag(); +DataBag b2 = bf.newDefaultBag(); +for (int i = 0; i < 10; i++) b1.add(tf.newTuple(new Integer(i))); +for (int i = 0; i < 10; i += 2) b2.add(tf.newTuple(new Integer(i))); +Tuple t = tf.newTuple(2); +t.set(0, b1); +t.set(1, b2); +DIFF d = new DIFF(); +DataBag result = d.exec(t); + +assertEquals(5, result.size()); +Iterator<Tuple> i = result.iterator(); +int[]