Author: daijy
Date: Wed Aug  5 17:51:51 2015
New Revision: 1694274

URL: http://svn.apache.org/r1694274
Log:
PIG-4639: Add better parser for Apache HTTPD access log

Added:
    
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java
    
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java
Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/build.xml
    pig/trunk/ivy.xml
    pig/trunk/ivy/libraries.properties
    pig/trunk/ivy/piggybank-template.xml

Modified: pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Wed Aug  5 17:51:51 2015
@@ -34,6 +34,8 @@ PIG-4405: Adding 'map[]' support to mock
 
 PIG-4638: Allow TOMAP to accept dynamically sized input (nielsbasjes via daijy)
 
+PIG-4639: Add better parser for Apache HTTPD access log (nielsbasjes via daijy)
+
 BUG FIXES
 
 PIG-4636: Occurred spelled incorrectly in error message for Launcher and POMergeCogroup (stevenmz via daijy)

Modified: pig/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/pig/trunk/build.xml?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/build.xml (original)
+++ pig/trunk/build.xml Wed Aug  5 17:51:51 2015
@@ -729,6 +729,8 @@
             <fileset dir="${ivy.lib.dir}" includes="accumulo-*.jar" excludes="accumulo-minicluster*.jar"/>
             <fileset dir="${ivy.lib.dir}" includes="json-simple-*.jar"/>
             <fileset dir="${ivy.lib.dir}" includes="kryo-*.jar"/>
+            <fileset dir="${ivy.lib.dir}" includes="httpdlog-*-${basjes-httpdlog-pigloader.version}.jar"/>
+            <fileset dir="${ivy.lib.dir}" includes="parser-core-${basjes-httpdlog-pigloader.version}.jar"/>
         </copy>
     </target>
 

Added: 
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java
URL: 
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java?rev=1694274&view=auto
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java (added)
+++ pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java Wed Aug  5 17:51:51 2015
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.piggybank.storage.apachelog;
+
+import nl.basjes.pig.input.apachehttpdlog.Loader;
+
+/**
+ * This is a pig loader that can load Apache HTTPD access logs written in (almost) any
+ * Apache HTTPD LogFormat.<br/>
+ * Basic usage: <br/>
+ * Simply feed the loader your (custom) logformat specification and it will tell you which fields
+ * can be extracted from this logformat.<br/>
+ * For example:
+ * <pre>
+ * -- Specify any existing file as long as it exists.
+ * -- It won't be read by the loader when no fields are requested.
+ * Example =
+ *     LOAD 'test.pig'
+ *     USING org.apache.pig.piggybank.storage.apachelog.LogFormatLoader(
+ *       '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i"'
+ *     );
+ * DUMP Example;
+ * </pre>
+ *
+ * The output of this command is a (huge) example (yes actual pig code) which demonstrates
+ * how all possible fields can be extracted. In normal use cases this example will be trimmed
+ * down to request only the fields your application really needs.
+ * This loader implements pushdown projection so there is no need to worry too much about the
+ * fields you leave in.
+ * This loader supports extracting things like an individual cookie or query string parameter
+ * regardless of the position it has in the actual log line.
+ *
+ * In addition to the logformat specification used in your custom config this parser also
+ * understands the standard formats:<pre>
+ *    common
+ *    combined
+ *    combinedio
+ *    referer
+ *    agent
+ * </pre>
+ *
+ * So this works also:
+ * <pre>
+ * Example =
+ *     LOAD 'test.pig'
+ *     USING org.apache.pig.piggybank.storage.apachelog.LogFormatLoader('common');
+ * DUMP Example;
+ * </pre>
+ *
+ * This class is simply a wrapper around <a href="https://github.com/nielsbasjes/logparser"
+ * >https://github.com/nielsbasjes/logparser</a> so more detailed documentation can be found there.
+ */
+public class LogFormatLoader extends Loader {
+    public LogFormatLoader(String... parameters) {
+        super(parameters);
+    }
+}

Added: 
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java
URL: 
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java?rev=1694274&view=auto
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java (added)
+++ pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java Wed Aug  5 17:51:51 2015
@@ -0,0 +1,131 @@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.storage;
+
+import org.apache.pig.ExecType;
+import org.apache.pig.PigServer;
+import org.apache.pig.builtin.mock.Storage;
+import org.apache.pig.data.Tuple;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.apache.pig.builtin.mock.Storage.resetData;
+import static org.apache.pig.builtin.mock.Storage.tuple;
+import static org.apache.pig.builtin.mock.Storage.map;
+import static org.junit.Assert.assertEquals;
+
+public class TestLogFormatLoader {
+    @Test
+    public void testLogFormatLoader() throws Exception {
+        final String logformat = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{Cookie}i\"";
+        final String testLine = "2001:980:91c0:1:8d31:a232:25e5:85d - - [05/Sep/2010:11:27:50 +0200] " +
+                "\"GET /b/ss/advbolprod2/1/H.22.1/s73176445413647?AQB=1&pccr=true&vidn=27F07A1B85012045-403" +
+                "&&ndh=1&t=19%2F5%2F2012%2023%3A51%3A27%202%20-120&ce=UTF-8&ns=bol&pageName=%2Fnl%2Fp%2Ffissler-" +
+                "speciaal-pannen-grillpan-28-x-28-cm%2F9200000002876066%2F&g=http%3A%2F%2Fwww.bol.com%2Fnl%2Fp%2F" +
+                "fissler-speciaal-pannen-grillpan-28-x-28-cm%2F9200000002876066%2F%3Fpromo%3Dkoken-pannen_303_hs-" +
+                "koken-pannen-afj-120601_B3_product_1_9200000002876066%26bltg.pg_nm%3Dkoken-pannen%26bltg.slt_id%3D" +
+                "303%26bltg.slt_nm%3Dhs-koken-pannen-afj-120601%26bltg.slt_p&r=http%3A%2F%2Fwww.bol.com%2Fnl%2Fm%2F" +
+                "koken-tafelen%2Fkoken-pannen%2FN%2F11766%2Findex.html%3Fblabla%3Dblablawashere&cc=EUR&ch=D%3Dv3&" +
+                "server=ps316&events=prodView%2Cevent1%2Cevent2%2Cevent31&products=%3B9200000002876066%3B%3B%3B%3B" +
+                "evar3%3Dkth%7Cevar8%3D9200000002876066_Fissler%20Speciaal%20Pannen%20-%20Grillpan%20-%2028%20x%2028" +
+                "%20cm%7Cevar35%3D170%7Cevar47%3DKTH%7Cevar9%3DNew%7Cevar40%3Dno%20reviews%2C%3B%3B%3B%3Bevent31%3D423" +
+                "&c1=catalog%3Akth%3Aproduct-detail&v1=D%3Dc1&h1=catalog%2Fkth%2Fproduct-detail&h2=D%3DpageName&v3=kth" +
+                "&l3=endeca_001-mensen_default%2Cendeca_exact-boeken_default%2Cendeca_verschijningsjaar_default%2C" +
+                "endeca_hardgoodscategoriesyn_default%2Cendeca_searchrank-hadoop_default%2Cendeca_genre_default%2C" +
+                "endeca_uitvoering_default&v4=ps316&v6=koken-pannen_303_hs-koken-pannen-afj-120601_B3_product_1_" +
+                "9200000002876066&v10=Tu%2023%3A30&v12=logged%20in&v13=New&c25=niet%20ssl&c26=3631&" +
+                "c30=1.2.3.4.1323208998208762&v31=2000285551&c45=20120619235127&c46=20120501%204.3.4.1&" +
+                "c47=D%3Ds_vi&c49=%2Fnl%2Fcatalog%2Fproduct-detail.jsp&c50=%2Fnl%2Fcatalog%2Fproduct-detail.jsp&" +
+                "v51=www.bol.com&s=1280x800&c=24&j=1.7&v=N&k=Y&bw=1280&bh=272&p=Shockwave%20Flash%3B&AQE=1 " +
+                "HTTP/1.1\" 200 23617 \"http://www.google.nl/imgres?imgurl=http://daniel_en_sander.basjes.nl/" +
+                "fotos/geboorte-kaartje/geboortekaartje-binnenkant.jpg&imgrefurl=http://daniel_en_sander.basjes.nl/" +
+                "fotos/geboorte-kaartje&usg=__LDxRMkacRs6yLluLcIrwoFsXY6o=&h=521&w=1024&sz=41&hl=nl&start=13&zoom=1" +
+                "&um=1&itbs=1&tbnid=Sqml3uGbjoyBYM:&tbnh=76&tbnw=150&prev=/images%3Fq%3Dbinnenkant%2Bgeboortekaartje" +
+                "%26um%3D1%26hl%3Dnl%26sa%3DN%26biw%3D1882%26bih%3D1014%26tbs%3Disch:1\" " +
+                "\"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; nl-nl) AppleWebKit/533.17.8 (KHTML, like Gecko) " +
+                "Version/5.0.1 Safari/533.17.8\" \"jquery-ui-theme=Eggplant; BuI=SomeThing; " +
+                "Apache=127.0.0.1.1351111543699529\"";
+
+        PigServer pigServer = new PigServer(ExecType.LOCAL);
+        Storage.Data data = resetData(pigServer);
+
+        ArrayList<String[]> input = new ArrayList<String[]>();
+        input.add(new String[] { testLine });
+        String filename = TestHelper.createTempFile(input, " ");
+        filename = filename.replace("\\", "\\\\");
+
+        pigServer.registerQuery(
+            "Clicks = " +
+            "    LOAD '" + filename + "' " +
+            "    USING org.apache.pig.piggybank.storage.apachelog.LogFormatLoader(" +
+            "            '"+logformat+"'," +
+            "            'IP:connection.client.host'," +
+            "            'TIME.STAMP:request.receive.time'," +
+            "       '-map:request.firstline.uri.query.g:HTTP.URI'," +
+            "            'STRING:request.firstline.uri.query.g.query.promo'," +
+            "            'STRING:request.firstline.uri.query.g.query.*'," +
+            "            'STRING:request.firstline.uri.query.s'," +
+            "       '-map:request.firstline.uri.query.r:HTTP.URI'," +
+            "            'STRING:request.firstline.uri.query.r.query.blabla'," +
+            "            'HTTP.COOKIE:request.cookies.bui'," +
+            "            'HTTP.USERAGENT:request.user-agent'" +
+            "            )" +
+            "         AS (" +
+            "            ConnectionClientHost," +
+            "            RequestReceiveTime," +
+            "            Promo," +
+            "            QueryParams:map[]," +
+            "            ScreenResolution," +
+            "            GoogleQuery," +
+            "            BUI," +
+            "            RequestUseragent" +
+            "            );"
+            );
+
+        pigServer.registerQuery("STORE Clicks INTO 'Clicks' USING mock.Storage();");
+
+        List<Tuple> out = data.get("Clicks");
+
+        assertEquals(1, out.size());
+
+        Tuple actual = out.get(0);
+        Tuple expected = tuple(
+            "2001:980:91c0:1:8d31:a232:25e5:85d",
+            "[05/Sep/2010:11:27:50 +0200]",
+            "koken-pannen_303_hs-koken-pannen-afj-120601_B3_product_1_9200000002876066",
+            map(
+                "promo"       , "koken-pannen_303_hs-koken-pannen-afj-120601_B3_product_1_9200000002876066",
+                "bltg.pg_nm"  , "koken-pannen",
+                "bltg.slt_nm" , "hs-koken-pannen-afj-120601",
+                "bltg.slt_id" , "303",
+                "bltg.slt_p"  , ""
+            ),
+            "1280x800",
+            "blablawashere",
+            "SomeThing",
+            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; nl-nl) AppleWebKit/533.17.8 " +
+                "(KHTML, like Gecko) Version/5.0.1 Safari/533.17.8"
+        );
+
+        assertEquals(expected, actual);
+    }
+
+}

Modified: pig/trunk/ivy.xml
URL: 
http://svn.apache.org/viewvc/pig/trunk/ivy.xml?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/ivy.xml (original)
+++ pig/trunk/ivy.xml Wed Aug  5 17:51:51 2015
@@ -79,6 +79,14 @@
       conf="compile->master"/>
     <dependency org="org.apache.httpcomponents" name="httpcore" rev="${httpcomponents.version}"
       conf="compile->master"/>
+    <dependency org="nl.basjes.parse.httpdlog" name="httpdlog-pigloader" rev="${basjes-httpdlog-pigloader.version}"
+      conf="compile->master"/>
+    <dependency org="nl.basjes.parse.httpdlog" name="httpdlog-inputformat" rev="${basjes-httpdlog-pigloader.version}"
+      conf="compile->master"/>
+    <dependency org="nl.basjes.parse.httpdlog" name="httpdlog-parser" rev="${basjes-httpdlog-pigloader.version}"
+      conf="compile->master"/>
+    <dependency org="nl.basjes.parse" name="parser-core" rev="${basjes-httpdlog-pigloader.version}"
+      conf="compile->master"/>
     <dependency org="commons-configuration" name="commons-configuration" rev="${commons-configuration.version}"
       conf="hadoop23->master"/>
     <dependency org="commons-collections" name="commons-collections" rev="${commons-collections.version}"

Modified: pig/trunk/ivy/libraries.properties
URL: 
http://svn.apache.org/viewvc/pig/trunk/ivy/libraries.properties?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/ivy/libraries.properties (original)
+++ pig/trunk/ivy/libraries.properties Wed Aug  5 17:51:51 2015
@@ -19,6 +19,7 @@ apacheant.version=1.7.1
 apacherat.version=0.8
 automaton.version=1.11-8
 avro.version=1.7.5
+basjes-httpdlog-pigloader.version=2.1.1
 commons-beanutils.version=1.7.0
 commons-cli.version=1.2
 commons-codec.version=1.4

Modified: pig/trunk/ivy/piggybank-template.xml
URL: 
http://svn.apache.org/viewvc/pig/trunk/ivy/piggybank-template.xml?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/ivy/piggybank-template.xml (original)
+++ pig/trunk/ivy/piggybank-template.xml Wed Aug  5 17:51:51 2015
@@ -76,6 +76,11 @@
       </exclusions>
     </dependency>
     <dependency>
+      <groupId>nl.basjes.parse.httpdlog</groupId>
+      <artifactId>httpdlog-pigloader</artifactId>
+      <version>2.1.1</version>
+    </dependency>
+    <dependency>
       <groupId>org.apache.pig</groupId>
       <artifactId>pig</artifactId>
       <version>@version</version>


Reply via email to