Author: gates Date: Thu Oct 30 09:56:48 2008 New Revision: 709206 URL: http://svn.apache.org/viewvc?rev=709206&view=rev Log: PIG-509: Added CombinedLogLoader, loads logs that were created using Apache's combined log format.
Added: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java Modified: hadoop/pig/trunk/CHANGES.txt Modified: hadoop/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=709206&r1=709205&r2=709206&view=diff ============================================================================== --- hadoop/pig/trunk/CHANGES.txt (original) +++ hadoop/pig/trunk/CHANGES.txt Thu Oct 30 09:56:48 2008 @@ -379,3 +379,6 @@ gates). move to hadoop + + PIG-509: Added CombinedLogLoader, loads logs that were created using + Apache's combined log format (spackest via gates). Added: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java?rev=709206&view=auto ============================================================================== --- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java (added) +++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/CombinedLogLoader.java Thu Oct 30 09:56:48 2008 @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. + */ + +package org.apache.pig.piggybank.storage.apachelog; + +import java.util.regex.Pattern; + +import org.apache.pig.piggybank.storage.RegExLoader; + +/** + * CombinedLogLoader is used to load logs written in Apache's combined log format, as defined by a directive like + * + * LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined + * + * The log filename ends up being combined_log from a line like + * + * CustomLog logs/combined_log combined + * + * Example: + * + * raw = LOAD 'combined_log' USING org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader AS + * (remoteAddr, remoteLogname, user, time, method, uri, proto, status, bytes, referer, userAgent); + * + */ + +public class CombinedLogLoader extends RegExLoader { + // Sample input line (the pattern below has one capture group per field named in the example above): 1.2.3.4 - - [30/Sep/2008:15:07:53 -0400] "GET / HTTP/1.1" 200 3190 "-" + // "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1" + private final static Pattern combinedLogPattern = Pattern + .compile("^(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+.(\\S+\\s+\\S+).\\s+\"(\\S+)\\s+(.+?)\\s+(HTTP[^\"]+)\"\\s+(\\S+)\\s+(\\S+)\\s+\"([^\"]*)\"\\s+\"(.*)\"$"); + + public Pattern getPattern() { + return combinedLogPattern; + } +} Added: hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java?rev=709206&view=auto 
============================================================================== --- hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java (added) +++ hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCombinedLogLoader.java Thu Oct 30 09:56:48 2008 @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. 
+ */ + +package org.apache.pig.piggybank.test.storage; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Properties; + +import junit.framework.TestCase; + +import org.apache.pig.PigServer; +import org.apache.pig.PigServer.ExecType; +import org.apache.pig.data.Tuple; +import org.apache.pig.impl.PigContext; +import org.apache.pig.impl.io.BufferedPositionedInputStream; +import org.apache.pig.impl.io.FileLocalizer; +import org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader; +import org.junit.Test; + +/** Tests CombinedLogLoader on three sample records, once bound directly to an input stream and once through a local-mode PigServer query. */ public class TestCombinedLogLoader extends TestCase { + /* Raw sample records, one String[] of nine already-bracketed or already-quoted fields per record; the tests hand this list to TestHelper.createTempFile with a single-space delimiter argument. */ public static ArrayList<String[]> data = new ArrayList<String[]>(); + static { + data.add(new String[] { "1.2.3.4", "-", "-", "[01/Jan/2008:23:27:45 -0600]", "\"GET /zero.html HTTP/1.0\"", "200", "100", "\"-\"", + "\"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1\"" }); + data.add(new String[] { "1.2.3.4", "-", "-", "[01/Jan/2008:23:27:45 -0600]", "\"GET /zero.html HTTP/1.0\"", "200", "100", + "\"http://myreferringsite.com\"", + "\"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1\"" }); + data.add(new String[] { "1.2.3.4", "-", "-", "[01/Jan/2008:23:27:45 -0600]", "\"GET /zero.html HTTP/1.0\"", "200", "100", "\"-\"", + "\"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; en-us) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1\"" }); + } + + /* Expected field values per record, derived from data: entries 0-2 copied as-is, square brackets removed from entry 3, entry 4 split on spaces with double quotes removed from each piece, entries 5-6 copied as-is, double quotes removed from entries 7-8. */ public static ArrayList<String[]> EXPECTED = new ArrayList<String[]>(); + static { + + for (int i = 0; i < data.size(); i++) { + ArrayList<String> thisExpected = new ArrayList<String>(); + for (int j = 0; j <= 2; j++) { + thisExpected.add(data.get(i)[j]); + } + String temp = data.get(i)[3]; + temp = temp.replace("[", ""); + temp = temp.replace("]", ""); + thisExpected.add(temp); + + /* NOTE(review): the value assigned here is never read; the loop that follows re-reads data.get(i)[4] directly. */ temp = data.get(i)[4]; + + for (String thisOne : 
data.get(i)[4].split(" ")) { + thisOne = thisOne.replace("\"", ""); + thisExpected.add(thisOne); + } + for (int j = 5; j <= 6; j++) { + thisExpected.add(data.get(i)[j]); + } + for (int j = 7; j <= 8; j++) { + String thisOne = data.get(i)[j]; + thisOne = thisOne.replace("\"", ""); + thisExpected.add(thisOne); + } + + String[] toAdd = new String[0]; + toAdd = (String[]) (thisExpected.toArray(toAdd)); + EXPECTED.add(toAdd); + } + } + + /* Checks only that the loader can be constructed. */ @Test + public void testInstantiation() { + CombinedLogLoader combinedLogLoader = new CombinedLogLoader(); + assertNotNull(combinedLogLoader); + } + + /* Binds the loader directly to the temp file's stream, passes every returned tuple to TestHelper.examineTuple together with EXPECTED, and finally compares the tuple count with data.size(). NOTE(review): inputStream is never closed in this method. */ @Test + public void testLoadFromBindTo() throws Exception { + String filename = TestHelper.createTempFile(data, " "); + CombinedLogLoader combinedLogLoader = new CombinedLogLoader(); + PigContext pigContext = new PigContext(ExecType.LOCAL, new Properties()); + InputStream inputStream = FileLocalizer.open(filename, pigContext); + combinedLogLoader.bindTo(filename, new BufferedPositionedInputStream(inputStream), 0, Long.MAX_VALUE); + + int tupleCount = 0; + + while (true) { + Tuple tuple = combinedLogLoader.getNext(); + if (tuple == null) + break; + else { + TestHelper.examineTuple(EXPECTED, tuple, tupleCount); + tupleCount++; + } + } + assertEquals(data.size(), tupleCount); + } + + /* Runs the loader through a LOAD query registered on a local-mode PigServer and checks the resulting tuples the same way as the bindTo test. Backslashes in the file name are doubled before it is embedded in the query string. NOTE(review): unlike the two methods above, this one carries no @Test annotation. */ public void testLoadFromPigServer() throws Exception { + String filename = TestHelper.createTempFile(data, " "); + PigServer pig = new PigServer(ExecType.LOCAL); + filename = filename.replace("\\", "\\\\"); + pig.registerQuery("A = LOAD 'file:" + filename + "' USING org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader();"); + Iterator<?> it = pig.openIterator("A"); + + int tupleCount = 0; + + while (it.hasNext()) { + Tuple tuple = (Tuple) it.next(); + if (tuple == null) + break; + else { + TestHelper.examineTuple(EXPECTED, tuple, tupleCount); + tupleCount++; + } + } + assertEquals(data.size(), tupleCount); + } +}