svn commit: r704149 - in /incubator/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/ contrib/piggybank/java/src/test/java/org/apache/pig/pig

2008-10-13 Thread gates
Author: gates
Date: Mon Oct 13 09:18:01 2008
New Revision: 704149

URL: http://svn.apache.org/viewvc?rev=704149&view=rev
Log:
PIG-486: Added SearchEngineExtractor, a piggybank eval func that recognizes a 
set of the most common search engines in a URL and extracts the name of the 
search engine.

Added:

incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/

incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java

incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/

incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/

incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchEngineExtractor.java
Modified:
incubator/pig/trunk/CHANGES.txt

Modified: incubator/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704149&r1=704148&r2=704149&view=diff
==
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:18:01 2008
@@ -361,3 +361,6 @@
 
PIG-474: Added MyRegexLoader, a subclass of RegExLoader, to piggybank 
(spackest via gates)
 
+PIG-486: Added SearchEngineExtractor, a piggybank eval func that
+   recognizes a set of the most common search engines in a URL and extracts
+   the name of the search engine (spackest via gates).

Added: 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java?rev=704149&view=auto
==
--- 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java
 (added)
+++ 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java
 Mon Oct 13 09:18:01 2008
@@ -0,0 +1,405 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.evaluation.util.apachelogparser;
+
+import java.net.URL;
+import java.util.HashMap;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+/**
+ * SearchEngineExtractor takes a url string and extracts the search engine. 
For example, given
+ * 
+ * 
http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search
+ * 
+ * then
+ * 
+ * Google
+ * 
+ * would be extracted.
+ * 
+ * From pig latin, usage looks something like
+ * 
+ * searchEngine = FOREACH row GENERATE
+ * 
org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor(referer);
+ * 
+ * Supported search engines include abacho.com, alice.it, alltheweb.com, 
altavista.com, aolsearch.aol.com,
+ * as.starware.com, ask.com, blogs.icerocket.com, blogsearch.google.com, 
blueyonder.co.uk, busca.orange.es,
+ * buscador.lycos.es, buscador.terra.es, buscar.ozu.es, categorico.it, 
cuil.com, excite.com, excite.it,
+ * fastweb.it, feedster.com, godado.com, godado.it, google.ad, google.ae, 
google.af, google.ag, google.am,
+ * google.as, google.at, google.az, google.ba, google.be, google.bg, 
google.bi, google.biz, google.bo,
+ * google.bs, google.bz, google.ca, google.cc, google.cd, google.cg, 
google.ch, google.ci, google.cl,
+ * google.cn, google.co.at , google.co.bi, google.co.bw, google.co.ci, 
google.co.ck, google.co.cr,
+ * google.co.gg, google.co.gl, google.co.gy, google.co.hu, google.co.id, 
google.co.il, google.co.im,
+ * google.co.in, google.co.it, google.co.je, google.co.jp, google.co.ke, 
google.co.kr, google.co.ls,
+ * google.co.ma, google.co.mu, google.co.mw, google.co.nz, google.co.pn, 
google.co.th, google.co.tt,

svn commit: r704151 - in /incubator/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/ contrib/piggybank/java/src/test/java/org/apache/pig/pig

2008-10-13 Thread gates
Author: gates
Date: Mon Oct 13 09:20:49 2008
New Revision: 704151

URL: http://svn.apache.org/viewvc?rev=704151&view=rev
Log:
 PIG-487: Added HostExtractor, a piggybank eval func that, given a URL, 
determines the host.

Added:

incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java

incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
Modified:
incubator/pig/trunk/CHANGES.txt

Modified: incubator/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704151&r1=704150&r2=704151&view=diff
==
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:20:49 2008
@@ -364,3 +364,6 @@
 PIG-486: Added SearchEngineExtractor, a piggybank eval func that
recognizes a set of the most common search engines in a URL and extracts
the name of the search engine (spackest via gates).
+
+PIG-487: Added HostExtractor, a piggybank eval func that, given a URL,
+   determines the host (spackest via gates).

Added: 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java?rev=704151&view=auto
==
--- 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
 (added)
+++ 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
 Mon Oct 13 09:20:49 2008
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+/*
+ * HostExtractor takes a url and returns the host. For example,
+ * 
+ * http://sports.espn.go.com/mlb/recap?gameId=281009122
+ * 
+ * leads to
+ * 
+ * sports.espn.go.com
+ * 
+ * Pig latin usage looks like
+ * 
+ * host = FOREACH row GENERATE
+ * 
org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor(referer);
+ */
+
+package org.apache.pig.piggybank.evaluation.util.apachelogparser;
+
+
+import java.net.URL;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+/**
+ * Eval function that extracts the host portion of a URL string.
+ *
+ * The first field of the input tuple is read as a string and parsed with
+ * {@link java.net.URL}. When parsing succeeds, the lower-cased host is
+ * written to the output atom. When the field's string value is null or
+ * the string cannot be parsed as a URL, the output atom is left untouched.
+ */
+public class HostExtractor extends EvalFunc<DataAtom> {
+    /**
+     * Writes the lower-cased host of the URL held in field 0 of
+     * {@code input} into {@code output}.
+     *
+     * @param input  tuple whose field 0 is read as the URL string
+     * @param output atom that receives the host; not modified when the
+     *               URL string is null or cannot be parsed
+     */
+    @Override
+    public void exec(Tuple input, DataAtom output) {
+        String url = input.getAtomField(0).strval();
+
+        if (url == null) {
+            return;
+        }
+
+        String host = null;
+        try {
+            // Host names are ASCII and case-insensitive, so fold case with a
+            // fixed locale; the default locale can mis-map letters (for
+            // example 'I' under a Turkish locale).
+            host = new URL(url).getHost().toLowerCase(java.util.Locale.ENGLISH);
+        } catch (Exception ignored) {
+            // Deliberately best-effort: a string that does not parse as a
+            // URL simply produces no host, leaving output unset.
+        }
+
+        if (host != null) {
+            output.setValue(host);
+        }
+    }
+}

Added: 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java?rev=704151&view=auto
==
--- 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
 (added)
+++ 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
 Mon Oct 13 09:20:49 2008
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ *