Author: daijy
Date: Wed Aug 5 17:51:51 2015
New Revision: 1694274
URL: http://svn.apache.org/r1694274
Log:
PIG-4639: Add better parser for Apache HTTPD access log
Added:
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java
Modified:
pig/trunk/CHANGES.txt
pig/trunk/build.xml
pig/trunk/ivy.xml
pig/trunk/ivy/libraries.properties
pig/trunk/ivy/piggybank-template.xml
Modified: pig/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Wed Aug 5 17:51:51 2015
@@ -34,6 +34,8 @@ PIG-4405: Adding 'map[]' support to mock
PIG-4638: Allow TOMAP to accept dynamically sized input (nielsbasjes via daijy)
+PIG-4639: Add better parser for Apache HTTPD access log (nielsbasjes via daijy)
+
BUG FIXES
PIG-4636: Occurred spelled incorrectly in error message for Launcher and
POMergeCogroup (stevenmz via daijy)
Modified: pig/trunk/build.xml
URL:
http://svn.apache.org/viewvc/pig/trunk/build.xml?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/build.xml (original)
+++ pig/trunk/build.xml Wed Aug 5 17:51:51 2015
@@ -729,6 +729,8 @@
<fileset dir="${ivy.lib.dir}" includes="accumulo-*.jar"
excludes="accumulo-minicluster*.jar"/>
<fileset dir="${ivy.lib.dir}" includes="json-simple-*.jar"/>
<fileset dir="${ivy.lib.dir}" includes="kryo-*.jar"/>
+ <fileset dir="${ivy.lib.dir}"
includes="httpdlog-*-${basjes-httpdlog-pigloader.version}.jar"/>
+ <fileset dir="${ivy.lib.dir}"
includes="parser-core-${basjes-httpdlog-pigloader.version}.jar"/>
</copy>
</target>
Added:
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java
URL:
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java?rev=1694274&view=auto
==============================================================================
---
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java
(added)
+++
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/apachelog/LogFormatLoader.java
Wed Aug 5 17:51:51 2015
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.piggybank.storage.apachelog;
+
+import nl.basjes.pig.input.apachehttpdlog.Loader;
+
+/**
+ * This is a Pig loader that can load Apache HTTPD access logs written in (almost) any
+ * Apache HTTPD LogFormat.<br>
+ * Basic usage:<br>
+ * Simply feed the loader your (custom) logformat specification and it will tell you which fields
+ * can be extracted from this logformat.<br>
+ * For example:
+ * <pre>
+ * -- Specify any file, as long as it exists.
+ * -- It won't be read by the loader when no fields are requested.
+ * Example =
+ * LOAD 'test.pig'
+ * USING org.apache.pig.piggybank.storage.apachelog.LogFormatLoader(
+ * '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i"'
+ * );
+ * DUMP Example;
+ * </pre>
+ * <p>
+ * The output of this command is a (huge) example (yes, actual Pig code) which demonstrates
+ * how all possible fields can be extracted. In normal use cases this example will be trimmed
+ * down to request only the fields your application really needs.
+ * This loader implements pushdown projection so there is no need to worry too much about the
+ * fields you leave in.
+ * This loader supports extracting things like an individual cookie or query string parameter
+ * regardless of the position it has in the actual log line.
+ * <p>
+ * In addition to the logformat specification used in your custom config this parser also
+ * understands the standard formats:<pre>
+ * common
+ * combined
+ * combinedio
+ * referer
+ * agent
+ * </pre>
+ * <p>
+ * So this also works:
+ * <pre>
+ * Example =
+ * LOAD 'test.pig'
+ * USING org.apache.pig.piggybank.storage.apachelog.LogFormatLoader('common');
+ * DUMP Example;
+ * </pre>
+ * <p>
+ * This class is simply a wrapper around <a href="https://github.com/nielsbasjes/logparser"
+ * >https://github.com/nielsbasjes/logparser</a> so more detailed documentation can be found there.
+ */
+public class LogFormatLoader extends Loader { // adds no behaviour of its own: only a constructor that delegates to Loader
+ public LogFormatLoader(String... parameters) { // e.g. the logformat string shown in the examples above
+ super(parameters); // arguments are handed to the parent Loader unchanged
+ }
+}
Added:
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java
URL:
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java?rev=1694274&view=auto
==============================================================================
---
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java
(added)
+++
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestLogFormatLoader.java
Wed Aug 5 17:51:51 2015
@@ -0,0 +1,131 @@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.storage;
+
+import org.apache.pig.ExecType;
+import org.apache.pig.PigServer;
+import org.apache.pig.builtin.mock.Storage;
+import org.apache.pig.data.Tuple;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.apache.pig.builtin.mock.Storage.resetData;
+import static org.apache.pig.builtin.mock.Storage.tuple;
+import static org.apache.pig.builtin.mock.Storage.map;
+import static org.junit.Assert.assertEquals;
+
+public class TestLogFormatLoader { // loads one access-log line through LogFormatLoader and compares the resulting tuple
+ @Test
+ public void testLogFormatLoader() throws Exception {
+ final String logformat = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\"
\"%{User-Agent}i\" \"%{Cookie}i\"";
+ final String testLine = "2001:980:91c0:1:8d31:a232:25e5:85d - -
[05/Sep/2010:11:27:50 +0200] " +
+ "\"GET
/b/ss/advbolprod2/1/H.22.1/s73176445413647?AQB=1&pccr=true&vidn=27F07A1B85012045-403"
+
+
"&&ndh=1&t=19%2F5%2F2012%2023%3A51%3A27%202%20-120&ce=UTF-8&ns=bol&pageName=%2Fnl%2Fp%2Ffissler-"
+
+
"speciaal-pannen-grillpan-28-x-28-cm%2F9200000002876066%2F&g=http%3A%2F%2Fwww.bol.com%2Fnl%2Fp%2F"
+
+
"fissler-speciaal-pannen-grillpan-28-x-28-cm%2F9200000002876066%2F%3Fpromo%3Dkoken-pannen_303_hs-"
+
+
"koken-pannen-afj-120601_B3_product_1_9200000002876066%26bltg.pg_nm%3Dkoken-pannen%26bltg.slt_id%3D"
+
+
"303%26bltg.slt_nm%3Dhs-koken-pannen-afj-120601%26bltg.slt_p&r=http%3A%2F%2Fwww.bol.com%2Fnl%2Fm%2F"
+
+
"koken-tafelen%2Fkoken-pannen%2FN%2F11766%2Findex.html%3Fblabla%3Dblablawashere&cc=EUR&ch=D%3Dv3&"
+
+
"server=ps316&events=prodView%2Cevent1%2Cevent2%2Cevent31&products=%3B9200000002876066%3B%3B%3B%3B"
+
+
"evar3%3Dkth%7Cevar8%3D9200000002876066_Fissler%20Speciaal%20Pannen%20-%20Grillpan%20-%2028%20x%2028"
+
+
"%20cm%7Cevar35%3D170%7Cevar47%3DKTH%7Cevar9%3DNew%7Cevar40%3Dno%20reviews%2C%3B%3B%3B%3Bevent31%3D423"
+
+
"&c1=catalog%3Akth%3Aproduct-detail&v1=D%3Dc1&h1=catalog%2Fkth%2Fproduct-detail&h2=D%3DpageName&v3=kth"
+
+
"&l3=endeca_001-mensen_default%2Cendeca_exact-boeken_default%2Cendeca_verschijningsjaar_default%2C"
+
+
"endeca_hardgoodscategoriesyn_default%2Cendeca_searchrank-hadoop_default%2Cendeca_genre_default%2C"
+
+
"endeca_uitvoering_default&v4=ps316&v6=koken-pannen_303_hs-koken-pannen-afj-120601_B3_product_1_"
+
+
"9200000002876066&v10=Tu%2023%3A30&v12=logged%20in&v13=New&c25=niet%20ssl&c26=3631&"
+
+
"c30=1.2.3.4.1323208998208762&v31=2000285551&c45=20120619235127&c46=20120501%204.3.4.1&"
+
+
"c47=D%3Ds_vi&c49=%2Fnl%2Fcatalog%2Fproduct-detail.jsp&c50=%2Fnl%2Fcatalog%2Fproduct-detail.jsp&"
+
+
"v51=www.bol.com&s=1280x800&c=24&j=1.7&v=N&k=Y&bw=1280&bh=272&p=Shockwave%20Flash%3B&AQE=1
" +
+ "HTTP/1.1\" 200 23617
\"http://www.google.nl/imgres?imgurl=http://daniel_en_sander.basjes.nl/" +
+
"fotos/geboorte-kaartje/geboortekaartje-binnenkant.jpg&imgrefurl=http://daniel_en_sander.basjes.nl/"
+
+
"fotos/geboorte-kaartje&usg=__LDxRMkacRs6yLluLcIrwoFsXY6o=&h=521&w=1024&sz=41&hl=nl&start=13&zoom=1"
+
+
"&um=1&itbs=1&tbnid=Sqml3uGbjoyBYM:&tbnh=76&tbnw=150&prev=/images%3Fq%3Dbinnenkant%2Bgeboortekaartje"
+
+
"%26um%3D1%26hl%3Dnl%26sa%3DN%26biw%3D1882%26bih%3D1014%26tbs%3Disch:1\" " +
+ "\"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; nl-nl)
AppleWebKit/533.17.8 (KHTML, like Gecko) " +
+ "Version/5.0.1 Safari/533.17.8\" \"jquery-ui-theme=Eggplant;
BuI=SomeThing; " +
+ "Apache=127.0.0.1.1351111543699529\"";
+
+ PigServer pigServer = new PigServer(ExecType.LOCAL);
+ Storage.Data data = resetData(pigServer); // mock.Storage handle; the stored Clicks relation is read back from it below
+
+ ArrayList<String[]> input = new ArrayList<String[]>();
+ input.add(new String[] { testLine });
+ String filename = TestHelper.createTempFile(input, " "); // TestHelper is not shown here; presumably writes the line to a temp file and returns its path
+ filename = filename.replace("\\", "\\\\"); // double every backslash before the path is spliced into the quoted LOAD string
+
+ pigServer.registerQuery(
+ "Clicks = " +
+ " LOAD '" + filename + "' " +
+ " USING
org.apache.pig.piggybank.storage.apachelog.LogFormatLoader(" +
+ " '"+logformat+"'," +
+ " 'IP:connection.client.host'," +
+ " 'TIME.STAMP:request.receive.time'," +
+ " '-map:request.firstline.uri.query.g:HTTP.URI'," +
+ " 'STRING:request.firstline.uri.query.g.query.promo'," +
+ " 'STRING:request.firstline.uri.query.g.query.*'," +
+ " 'STRING:request.firstline.uri.query.s'," +
+ " '-map:request.firstline.uri.query.r:HTTP.URI'," +
+ " 'STRING:request.firstline.uri.query.r.query.blabla',"
+
+ " 'HTTP.COOKIE:request.cookies.bui'," +
+ " 'HTTP.USERAGENT:request.user-agent'" +
+ " )" +
+ " AS (" +
+ " ConnectionClientHost," +
+ " RequestReceiveTime," +
+ " Promo," +
+ " QueryParams:map[]," +
+ " ScreenResolution," +
+ " GoogleQuery," +
+ " BUI," +
+ " RequestUseragent" +
+ " );"
+ );
+
+ pigServer.registerQuery("STORE Clicks INTO 'Clicks' USING
mock.Storage();");
+
+ List<Tuple> out = data.get("Clicks");
+
+ assertEquals(1, out.size()); // one input line is expected to yield exactly one tuple
+
+ Tuple actual = out.get(0);
+ Tuple expected = tuple(
+ "2001:980:91c0:1:8d31:a232:25e5:85d",
+ "[05/Sep/2010:11:27:50 +0200]",
+
"koken-pannen_303_hs-koken-pannen-afj-120601_B3_product_1_9200000002876066",
+ map(
+ "promo" ,
"koken-pannen_303_hs-koken-pannen-afj-120601_B3_product_1_9200000002876066",
+ "bltg.pg_nm" , "koken-pannen",
+ "bltg.slt_nm" , "hs-koken-pannen-afj-120601",
+ "bltg.slt_id" , "303",
+ "bltg.slt_p" , ""
+ ),
+ "1280x800",
+ "blablawashere",
+ "SomeThing",
+ "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; nl-nl)
AppleWebKit/533.17.8 " +
+ "(KHTML, like Gecko) Version/5.0.1 Safari/533.17.8"
+ );
+
+ assertEquals(expected, actual);
+ }
+
+}
Modified: pig/trunk/ivy.xml
URL:
http://svn.apache.org/viewvc/pig/trunk/ivy.xml?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/ivy.xml (original)
+++ pig/trunk/ivy.xml Wed Aug 5 17:51:51 2015
@@ -79,6 +79,14 @@
conf="compile->master"/>
<dependency org="org.apache.httpcomponents" name="httpcore"
rev="${httpcomponents.version}"
conf="compile->master"/>
+ <dependency org="nl.basjes.parse.httpdlog" name="httpdlog-pigloader"
rev="${basjes-httpdlog-pigloader.version}"
+ conf="compile->master"/>
+ <dependency org="nl.basjes.parse.httpdlog" name="httpdlog-inputformat"
rev="${basjes-httpdlog-pigloader.version}"
+ conf="compile->master"/>
+ <dependency org="nl.basjes.parse.httpdlog" name="httpdlog-parser"
rev="${basjes-httpdlog-pigloader.version}"
+ conf="compile->master"/>
+ <dependency org="nl.basjes.parse" name="parser-core"
rev="${basjes-httpdlog-pigloader.version}"
+ conf="compile->master"/>
<dependency org="commons-configuration" name="commons-configuration"
rev="${commons-configuration.version}"
conf="hadoop23->master"/>
<dependency org="commons-collections" name="commons-collections"
rev="${commons-collections.version}"
Modified: pig/trunk/ivy/libraries.properties
URL:
http://svn.apache.org/viewvc/pig/trunk/ivy/libraries.properties?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/ivy/libraries.properties (original)
+++ pig/trunk/ivy/libraries.properties Wed Aug 5 17:51:51 2015
@@ -19,6 +19,7 @@ apacheant.version=1.7.1
apacherat.version=0.8
automaton.version=1.11-8
avro.version=1.7.5
+basjes-httpdlog-pigloader.version=2.1.1
commons-beanutils.version=1.7.0
commons-cli.version=1.2
commons-codec.version=1.4
Modified: pig/trunk/ivy/piggybank-template.xml
URL:
http://svn.apache.org/viewvc/pig/trunk/ivy/piggybank-template.xml?rev=1694274&r1=1694273&r2=1694274&view=diff
==============================================================================
--- pig/trunk/ivy/piggybank-template.xml (original)
+++ pig/trunk/ivy/piggybank-template.xml Wed Aug 5 17:51:51 2015
@@ -76,6 +76,11 @@
</exclusions>
</dependency>
<dependency>
+ <groupId>nl.basjes.parse.httpdlog</groupId>
+ <artifactId>httpdlog-pigloader</artifactId>
+ <version>2.1.1</version>
+ </dependency>
+ <dependency>
<groupId>org.apache.pig</groupId>
<artifactId>pig</artifactId>
<version>@version</version>