Author: gates
Date: Mon Oct 13 09:18:01 2008
New Revision: 704149

URL: http://svn.apache.org/viewvc?rev=704149&view=rev
Log:
PIG-486: Added SearchEngineExtractor, a piggybank eval func that recognizes a 
set of the most common search engines in a URL and extracts the name of the 
search engine.

Added:
    
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/
    
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java
    
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/
    
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/
    
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchEngineExtractor.java
Modified:
    incubator/pig/trunk/CHANGES.txt

Modified: incubator/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704149&r1=704148&r2=704149&view=diff
==============================================================================
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:18:01 2008
@@ -361,3 +361,6 @@
 
        PIG-474: Added MyRegexLoader, a subclass of RegExLoader, to piggybank 
(spackest via gates)
 
+    PIG-486: Added SearchEngineExtractor, a piggybank eval func that
+       recognizes a set of the most common search engines in a URL and extracts
+       the name of the search engine (spackest via gates).

Added: 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java?rev=704149&view=auto
==============================================================================
--- 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java
 (added)
+++ 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java
 Mon Oct 13 09:18:01 2008
@@ -0,0 +1,405 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.evaluation.util.apachelogparser;
+
+import java.net.URL;
+import java.util.HashMap;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+/**
+ * SearchEngineExtractor takes a url string and extracts the search engine. 
For example, given
+ * 
+ * 
http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search
+ * 
+ * then
+ * 
+ * Google
+ * 
+ * would be extracted.
+ * 
+ * From pig latin, usage looks something like
+ * 
+ * searchEngine = FOREACH row GENERATE
+ * 
org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor(referer);
+ * 
+ * Supported search engines include abacho.com, alice.it, alltheweb.com, 
altavista.com, aolsearch.aol.com,
+ * as.starware.com, ask.com, blogs.icerocket.com, blogsearch.google.com, 
blueyonder.co.uk, busca.orange.es,
+ * buscador.lycos.es, buscador.terra.es, buscar.ozu.es, categorico.it, 
cuil.com, excite.com, excite.it,
+ * fastweb.it, feedster.com, godado.com, godado.it, google.ad, google.ae, 
google.af, google.ag, google.am,
+ * google.as, google.at, google.az, google.ba, google.be, google.bg, 
google.bi, google.biz, google.bo,
+ * google.bs, google.bz, google.ca, google.cc, google.cd, google.cg, 
google.ch, google.ci, google.cl,
+ * google.cn, google.co.at, google.co.bi, google.co.bw, google.co.ci, 
google.co.ck, google.co.cr,
+ * google.co.gg, google.co.gl, google.co.gy, google.co.hu, google.co.id, 
google.co.il, google.co.im,
+ * google.co.in, google.co.it, google.co.je, google.co.jp, google.co.ke, 
google.co.kr, google.co.ls,
+ * google.co.ma, google.co.mu, google.co.mw, google.co.nz, google.co.pn, 
google.co.th, google.co.tt,
+ * google.co.ug, google.co.uk, google.co.uz, google.co.ve, google.co.vi, 
google.co.za, google.co.zm,
+ * google.co.zw, google.com, google.com.af, google.com.ag, google.com.ai, 
google.com.ar, google.com.au,
+ * google.com.az, google.com.bd, google.com.bh, google.com.bi, google.com.bn, 
google.com.bo, google.com.br,
+ * google.com.bs, google.com.bz, google.com.cn, google.com.co, google.com.cu, 
google.com.do, google.com.ec,
+ * google.com.eg, google.com.et, google.com.fj, google.com.ge, google.com.gh, 
google.com.gi, google.com.gl,
+ * google.com.gp, google.com.gr, google.com.gt, google.com.gy, google.com.hk, 
google.com.hn, google.com.hr,
+ * google.com.jm, google.com.jo, google.com.kg, google.com.kh, google.com.ki, 
google.com.kz, google.com.lk,
+ * google.com.lv, google.com.ly, google.com.mt, google.com.mu, google.com.mw, 
google.com.mx, google.com.my,
+ * google.com.na, google.com.nf, google.com.ng, google.com.ni, google.com.np, 
google.com.nr, google.com.om,
+ * google.com.pa, google.com.pe, google.com.ph, google.com.pk, google.com.pl, 
google.com.pr, google.com.pt,
+ * google.com.py, google.com.qa, google.com.ru, google.com.sa, google.com.sb, 
google.com.sc, google.com.sg,
+ * google.com.sv, google.com.tj, google.com.tr, google.com.tt, google.com.tw, 
google.com.uy, google.com.uz,
+ * google.com.ve, google.com.vi, google.com.vn, google.com.ws, google.cz, 
google.de, google.dj, google.dk,
+ * google.dm, google.ec, google.ee, google.es, google.fi, google.fm, 
google.fr, google.gd, google.ge,
+ * google.gf, google.gg, google.gl, google.gm, google.gp, google.gr, 
google.gy, google.hk, google.hn,
+ * google.hr, google.ht, google.hu, google.ie, google.im, google.in, 
google.info, google.is, google.it,
+ * google.je, google.jo, google.jobs, google.jp, google.kg, google.ki, 
google.kz, google.la, google.li,
+ * google.lk, google.lt, google.lu, google.lv, google.ma, google.md, 
google.mn, google.mobi, google.ms,
+ * google.mu, google.mv, google.mw, google.net, google.nf, google.nl, 
google.no, google.nr, google.nu,
+ * google.off.ai, google.ph, google.pk, google.pl, google.pn, google.pr, 
google.pt, google.ro, google.ru,
+ * google.rw, google.sc, google.se, google.sg, google.sh, google.si, 
google.sk, google.sm, google.sn,
+ * google.sr, google.st, google.tk, google.tm, google.to, google.tp, 
google.tt, google.tv, google.tw,
+ * google.ug, google.us, google.uz, google.vg, google.vn, google.vu, 
google.ws, gps.virgin.net, hotbot.com,
+ * ilmotore.com, ithaki.net, kataweb.it, libero.it, lycos.it, mamma.com, 
megasearching.net, mirago.co.uk,
+ * netscape.com, search.aol.co.uk, search.arabia.msn.com, search.bbc.co.uk, 
search.conduit.com,
+ * search.icq.com, search.live.com, search.lycos.co.uk, search.lycos.com, 
search.msn.co.uk, search.msn.com,
+ * search.myway.com, search.mywebsearch.com, search.ntlworld.com, 
search.orange.co.uk, search.prodigy.msn.com,
+ * search.sweetim.com, search.virginmedia.com, search.yahoo.co.jp, 
search.yahoo.com, search.yahoo.jp,
+ * simpatico.ws, soso.com, suche.fireball.de, suche.t-online.de, suche.web.de, 
technorati.com, tesco.net,
+ * thespider.it, tiscali.co.uk, uk.altavista.com, uk.ask.com, 
uk.search.yahoo.com
+ * 
+ * Thanks to Spiros Denaxas for his URI::ParseSearchString, which is the basis 
for the lookups.
+ */
+
+public class SearchEngineExtractor extends EvalFunc<DataAtom> {
+    private static HashMap<String, String> searchEngines = new HashMap<String, 
String>();
+    static {
+        searchEngines.put("abacho.com", "Abacho");
+        searchEngines.put("alice.it", "Alice.it");
+        searchEngines.put("alltheweb.com", "AllTheWeb");
+        searchEngines.put("altavista.com", "Altavista");
+        searchEngines.put("aolsearch.aol.com", "AOL Search");
+        searchEngines.put("as.starware.com", "Starware");
+        searchEngines.put("ask.com", "Ask dot com");
+        searchEngines.put("blogs.icerocket.com", "IceRocket");
+        searchEngines.put("blogsearch.google.com", "Google Blogsearch");
+        searchEngines.put("blueyonder.co.uk", "Blueyonder");
+        searchEngines.put("busca.orange.es", "Orange ES");
+        searchEngines.put("buscador.lycos.es", "Lycos ES");
+        searchEngines.put("buscador.terra.es", "Terra ES");
+        searchEngines.put("buscar.ozu.es", "Ozu ES");
+        searchEngines.put("categorico.it", "Categorico IT");
+        searchEngines.put("cuil.com", "Cuil");
+        searchEngines.put("excite.com", "Excite");
+        searchEngines.put("excite.it", "Excite IT");
+        searchEngines.put("fastweb.it", "Fastweb IT");
+        searchEngines.put("feedster.com", "Feedster");
+        searchEngines.put("godado.com", "Godado");
+        searchEngines.put("godado.it", "Godado (IT)");
+        searchEngines.put("google.ad", "Google Andorra");
+        searchEngines.put("google.ae", "Google United Arab Emirates");
+        searchEngines.put("google.af", "Google Afghanistan");
+        searchEngines.put("google.ag", "Google Antiqua and Barbuda");
+        searchEngines.put("google.am", "Google Armenia");
+        searchEngines.put("google.as", "Google American Samoa");
+        searchEngines.put("google.at", "Google Austria");
+        searchEngines.put("google.az", "Google Azerbaijan");
+        searchEngines.put("google.ba", "Google Bosnia and Herzegovina");
+        searchEngines.put("google.be", "Google Belgium");
+        searchEngines.put("google.bg", "Google Bulgaria");
+        searchEngines.put("google.bi", "Google Burundi");
+        searchEngines.put("google.biz", "Google dot biz");
+        searchEngines.put("google.bo", "Google Bolivia");
+        searchEngines.put("google.bs", "Google Bahamas");
+        searchEngines.put("google.bz", "Google Belize");
+        searchEngines.put("google.ca", "Google Canada");
+        searchEngines.put("google.cc", "Google Cocos Islands");
+        searchEngines.put("google.cd", "Google Dem Rep of Congo");
+        searchEngines.put("google.cg", "Google Rep of Congo");
+        searchEngines.put("google.ch", "Google Switzerland");
+        searchEngines.put("google.ci", "Google Cote dIvoire");
+        searchEngines.put("google.cl", "Google Chile");
+        searchEngines.put("google.cn", "Google China");
+        searchEngines.put("google.co.at ", "Google Austria");
+        searchEngines.put("google.co.bi", "Google Burundi");
+        searchEngines.put("google.co.bw", "Google Botswana");
+        searchEngines.put("google.co.ci", "Google Ivory Coast");
+        searchEngines.put("google.co.ck", "Google Cook Islands");
+        searchEngines.put("google.co.cr", "Google Costa Rica");
+        searchEngines.put("google.co.gg", "Google Guernsey");
+        searchEngines.put("google.co.gl", "Google Greenland");
+        searchEngines.put("google.co.gy", "Google Guyana");
+        searchEngines.put("google.co.hu", "Google Hungary ");
+        searchEngines.put("google.co.id", "Google Indonesia");
+        searchEngines.put("google.co.il", "Google Israel");
+        searchEngines.put("google.co.im", "Google Isle of Man");
+        searchEngines.put("google.co.in", "Google India");
+        searchEngines.put("google.co.it", "Google Italy");
+        searchEngines.put("google.co.je", "Google Jersey");
+        searchEngines.put("google.co.jp", "Google Japan");
+        searchEngines.put("google.co.ke", "Google Kenya");
+        searchEngines.put("google.co.kr", "Google South Korea");
+        searchEngines.put("google.co.ls", "Google Lesotho");
+        searchEngines.put("google.co.ma", "Google Morocco");
+        searchEngines.put("google.co.mu", "Google Mauritius");
+        searchEngines.put("google.co.mw", "Google Malawi");
+        searchEngines.put("google.co.nz", "Google New Zeland");
+        searchEngines.put("google.co.pn", "Google Pitcairn Islands");
+        searchEngines.put("google.co.th", "Google Thailand");
+        searchEngines.put("google.co.tt", "Google Trinidad and Tobago");
+        searchEngines.put("google.co.ug", "Google Uganda");
+        searchEngines.put("google.co.uk", "Google UK");
+        searchEngines.put("google.co.uz", "Google Uzbekistan");
+        searchEngines.put("google.co.ve", "Google Venezuela");
+        searchEngines.put("google.co.vi", "Google US Virgin Islands");
+        searchEngines.put("google.co.za", "Google   South Africa ");
+        searchEngines.put("google.co.zm", "Google Zambia");
+        searchEngines.put("google.co.zw", "Google Zimbabwe");
+        searchEngines.put("google.com", "Google");
+        searchEngines.put("google.com.af", "Google Afghanistan");
+        searchEngines.put("google.com.ag", "Google Antiqua and Barbuda");
+        searchEngines.put("google.com.ai", "Google Anguilla");
+        searchEngines.put("google.com.ar", "Google Argentina");
+        searchEngines.put("google.com.au", "Google Australia");
+        searchEngines.put("google.com.az", "Google Azerbaijan ");
+        searchEngines.put("google.com.bd", "Google Bangladesh");
+        searchEngines.put("google.com.bh", "Google Bahrain");
+        searchEngines.put("google.com.bi", "Google Burundi");
+        searchEngines.put("google.com.bn", "Google Brunei Darussalam");
+        searchEngines.put("google.com.bo", "Google Bolivia ");
+        searchEngines.put("google.com.br", "Google Brazil");
+        searchEngines.put("google.com.bs", "Google Bahamas");
+        searchEngines.put("google.com.bz", "Google Belize");
+        searchEngines.put("google.com.cn", "Google China");
+        searchEngines.put("google.com.co", "Google ");
+        searchEngines.put("google.com.cu", "Google Cuba");
+        searchEngines.put("google.com.do", "Google Dominican Rep");
+        searchEngines.put("google.com.ec", "Google Ecuador");
+        searchEngines.put("google.com.eg", "Google Egypt");
+        searchEngines.put("google.com.et", "Google Ethiopia");
+        searchEngines.put("google.com.fj", "Google Fiji");
+        searchEngines.put("google.com.ge", "Google Georgia");
+        searchEngines.put("google.com.gh", "Google Ghana");
+        searchEngines.put("google.com.gi", "Google Gibraltar");
+        searchEngines.put("google.com.gl", "Google Greenland");
+        searchEngines.put("google.com.gp", "Google Guadeloupe");
+        searchEngines.put("google.com.gr", "Google Greece");
+        searchEngines.put("google.com.gt", "Google Guatemala");
+        searchEngines.put("google.com.gy", "Google Guyana");
+        searchEngines.put("google.com.hk", "Google Hong Kong");
+        searchEngines.put("google.com.hn", "Google Honduras");
+        searchEngines.put("google.com.hr", "Google Croatia");
+        searchEngines.put("google.com.jm", "Google Jamaica");
+        searchEngines.put("google.com.jo", "Google Jordan");
+        searchEngines.put("google.com.kg", "Google Kyrgyzstan");
+        searchEngines.put("google.com.kh", "Google Cambodia");
+        searchEngines.put("google.com.ki", "Google Kiribati");
+        searchEngines.put("google.com.kz", "Google Kazakhstan");
+        searchEngines.put("google.com.lk", "Google Sri Lanka");
+        searchEngines.put("google.com.lv", "Google Latvia");
+        searchEngines.put("google.com.ly", "Google Libya");
+        searchEngines.put("google.com.mt", "Google Malta");
+        searchEngines.put("google.com.mu", "Google Mauritius");
+        searchEngines.put("google.com.mw", "Google Malawi");
+        searchEngines.put("google.com.mx", "Google Mexico");
+        searchEngines.put("google.com.my", "Google Malaysia");
+        searchEngines.put("google.com.na", "Google Namibia");
+        searchEngines.put("google.com.nf", "Google Norfolk Island");
+        searchEngines.put("google.com.ng", "Google Nigeria");
+        searchEngines.put("google.com.ni", "Google Nicaragua");
+        searchEngines.put("google.com.np", "Google Nepal");
+        searchEngines.put("google.com.nr", "Google Nauru");
+        searchEngines.put("google.com.om", "Google Oman");
+        searchEngines.put("google.com.pa", "Google Panama");
+        searchEngines.put("google.com.pe", "Google Peru");
+        searchEngines.put("google.com.ph", "Google Philipines");
+        searchEngines.put("google.com.pk", "Google Pakistan");
+        searchEngines.put("google.com.pl", "Google Poland");
+        searchEngines.put("google.com.pr", "Google Puerto Rico");
+        searchEngines.put("google.com.pt", "Google Portugal");
+        searchEngines.put("google.com.py", "Google Paraguay");
+        searchEngines.put("google.com.qa", "Google ");
+        searchEngines.put("google.com.ru", "Google Russia");
+        searchEngines.put("google.com.sa", "Google Saudi Arabia");
+        searchEngines.put("google.com.sb", "Google Solomon Islands");
+        searchEngines.put("google.com.sc", "Google Seychelles");
+        searchEngines.put("google.com.sg", "Google Singapore");
+        searchEngines.put("google.com.sv", "Google El Savador");
+        searchEngines.put("google.com.tj", "Google Tajikistan");
+        searchEngines.put("google.com.tr", "Google Turkey");
+        searchEngines.put("google.com.tt", "Google Trinidad and Tobago");
+        searchEngines.put("google.com.tw", "Google Taiwan");
+        searchEngines.put("google.com.uy", "Google Uruguay");
+        searchEngines.put("google.com.uz", "Google Uzbekistan ");
+        searchEngines.put("google.com.ve", "Google Venezuela");
+        searchEngines.put("google.com.vi", "Google US Virgin Islands");
+        searchEngines.put("google.com.vn", "Google Vietnam");
+        searchEngines.put("google.com.ws", "Google Samoa");
+        searchEngines.put("google.cz", "Google Czech Rep");
+        searchEngines.put("google.de", "Google Germany");
+        searchEngines.put("google.dj", "Google Djubouti");
+        searchEngines.put("google.dk ", "Google Denmark");
+        searchEngines.put("google.dm ", "Google Dominica");
+        searchEngines.put("google.ec", "Google Ecuador");
+        searchEngines.put("google.ee", "Google Estonia");
+        searchEngines.put("google.es", "Google Spain");
+        searchEngines.put("google.fi", "Google Finland");
+        searchEngines.put("google.fm", "Google Micronesia");
+        searchEngines.put("google.fr", "Google France");
+        searchEngines.put("google.gd", "Google Grenada");
+        searchEngines.put("google.ge", "Google Georgia");
+        searchEngines.put("google.gf", "Google French Guiana");
+        searchEngines.put("google.gg", "Google Guernsey");
+        searchEngines.put("google.gl", "Google Greenland");
+        searchEngines.put("google.gm", "Google Gambia");
+        searchEngines.put("google.gp", "Google Guadeloupe");
+        searchEngines.put("google.gr", "Google Greece");
+        searchEngines.put("google.gy", "Google Guyana");
+        searchEngines.put("google.hk", "Google Hong Kong");
+        searchEngines.put("google.hn", "Google Honduras");
+        searchEngines.put("google.hr", "Google Croatia");
+        searchEngines.put("google.ht", "Google Haiti");
+        searchEngines.put("google.hu", "Google Hungary");
+        searchEngines.put("google.ie", "Google Ireland");
+        searchEngines.put("google.im", "Google Isle of Man");
+        searchEngines.put("google.in", "Google India");
+        searchEngines.put("google.info", "Google dot info");
+        searchEngines.put("google.is", "Google Iceland");
+        searchEngines.put("google.it", "Google Italy");
+        searchEngines.put("google.je", "Google Jersey");
+        searchEngines.put("google.jo", "Google Jordan");
+        searchEngines.put("google.jobs", "Google dot jobs");
+        searchEngines.put("google.jp", "Google Japan");
+        searchEngines.put("google.kg", "Google Kyrgyzstan");
+        searchEngines.put("google.ki", "Google Kiribati");
+        searchEngines.put("google.kz", "Google Kazakhstan");
+        searchEngines.put("google.la", "Google Laos");
+        searchEngines.put("google.li", "Google Liechtenstein");
+        searchEngines.put("google.lk", "Google Sri Lanka");
+        searchEngines.put("google.lt", "Google Lithuania");
+        searchEngines.put("google.lu", "Google Luxembourg");
+        searchEngines.put("google.lv", "Google Latvia");
+        searchEngines.put("google.ma", "Google Morocco");
+        searchEngines.put("google.md", "Google Moldova");
+        searchEngines.put("google.mn", "Google Mongolia");
+        searchEngines.put("google.mobi", "Google dot mobi");
+        searchEngines.put("google.ms", "Google Montserrat");
+        searchEngines.put("google.mu", "Google Mauritius");
+        searchEngines.put("google.mv", "Google Maldives");
+        searchEngines.put("google.mw", "Google Malawi");
+        searchEngines.put("google.net", "Google dot net");
+        searchEngines.put("google.nf", "Google Norfolk Island");
+        searchEngines.put("google.nl", "Google Netherlands");
+        searchEngines.put("google.no", "Google Norway");
+        searchEngines.put("google.nr", "Google Nauru");
+        searchEngines.put("google.nu", "Google Niue");
+        searchEngines.put("google.off.ai", "Google Anguilla");
+        searchEngines.put("google.ph", "Google Philipines");
+        searchEngines.put("google.pk", "Google Pakistan");
+        searchEngines.put("google.pl", "Google Poland");
+        searchEngines.put("google.pn", "Google Pitcairn Islands");
+        searchEngines.put("google.pr", "Google Puerto Rico");
+        searchEngines.put("google.pt", "Google Portugal");
+        searchEngines.put("google.ro", "Google Romania");
+        searchEngines.put("google.ru", "Google Russia");
+        searchEngines.put("google.rw", "Google Rwanda");
+        searchEngines.put("google.sc", "Google Seychelles");
+        searchEngines.put("google.se", "Google Sweden");
+        searchEngines.put("google.sg", "Google Singapore");
+        searchEngines.put("google.sh", "Google Saint Helena");
+        searchEngines.put("google.si", "Google Slovenia");
+        searchEngines.put("google.sk", "Google Slovakia");
+        searchEngines.put("google.sm", "Google San Marino");
+        searchEngines.put("google.sn", "Google Senegal");
+        searchEngines.put("google.sr", "Google Suriname");
+        searchEngines.put("google.st", "Google Sao Tome ");
+        searchEngines.put("google.tk", "Google Tokelau");
+        searchEngines.put("google.tm", "Google Turkmenistan");
+        searchEngines.put("google.to", "Google Tonga");
+        searchEngines.put("google.tp", "Google East Timor");
+        searchEngines.put("google.tt", "Google Trinidad and Tobago");
+        searchEngines.put("google.tv", "Google Tuvalu");
+        searchEngines.put("google.tw", "Google Taiwan");
+        searchEngines.put("google.ug", "Google Uganda");
+        searchEngines.put("google.us", "Google US");
+        searchEngines.put("google.uz", "Google Uzbekistan");
+        searchEngines.put("google.vg", "Google British Virgin Islands");
+        searchEngines.put("google.vn", "Google Vietnam");
+        searchEngines.put("google.vu", "Google Vanuatu");
+        searchEngines.put("google.ws", "Google Samoa");
+        searchEngines.put("gps.virgin.net", "Virgin Search");
+        searchEngines.put("hotbot.com", "HotBot");
+        searchEngines.put("ilmotore.com", "ilMotore");
+        searchEngines.put("ithaki.net", "Ithaki");
+        searchEngines.put("kataweb.it", "Kataweb IT");
+        searchEngines.put("libero.it", "Libero IT");
+        searchEngines.put("lycos.it", "Lycos IT");
+        searchEngines.put("mamma.com", "Mamma");
+        searchEngines.put("megasearching.net", "Megasearching");
+        searchEngines.put("mirago.co.uk", "Mirago UK");
+        searchEngines.put("netscape.com", "Netscape");
+        searchEngines.put("search.aol.co.uk", "AOL UK");
+        searchEngines.put("search.arabia.msn.com", "MSN Arabia");
+        searchEngines.put("search.bbc.co.uk", "BBC Search");
+        searchEngines.put("search.conduit.com", "Conduit");
+        searchEngines.put("search.icq.com", "ICQ dot com");
+        searchEngines.put("search.live.com", "Live.com");
+        searchEngines.put("search.lycos.co.uk", "Lycos UK");
+        searchEngines.put("search.lycos.com", "Lycos");
+        searchEngines.put("search.msn.co.uk", "MSN UK");
+        searchEngines.put("search.msn.com", "MSN");
+        searchEngines.put("search.myway.com", "MyWay");
+        searchEngines.put("search.mywebsearch.com", "My Web Search");
+        searchEngines.put("search.ntlworld.com", "NTLWorld");
+        searchEngines.put("search.orange.co.uk", "Orange Search");
+        searchEngines.put("search.prodigy.msn.com", "MSN Prodigy");
+        searchEngines.put("search.sweetim.com", "Sweetim");
+        searchEngines.put("search.virginmedia.com", "VirginMedia");
+        searchEngines.put("search.yahoo.co.jp", "Yahoo Japan");
+        searchEngines.put("search.yahoo.com", "Yahoo!");
+        searchEngines.put("search.yahoo.jp", "Yahoo! Japan");
+        searchEngines.put("simpatico.ws", "Simpatico IT");
+        searchEngines.put("soso.com", "Soso");
+        searchEngines.put("suche.fireball.de", "Fireball DE");
+        searchEngines.put("suche.t-online.de", "T-Online");
+        searchEngines.put("suche.web.de", "Suche DE");
+        searchEngines.put("technorati.com", "Technorati");
+        searchEngines.put("tesco.net", "Tesco Search");
+        searchEngines.put("thespider.it", "TheSpider IT");
+        searchEngines.put("tiscali.co.uk", "Tiscali UK");
+        searchEngines.put("uk.altavista.com", "Altavista UK");
+        searchEngines.put("uk.ask.com", "Ask UK");
+        searchEngines.put("uk.search.yahoo.com", "Yahoo! UK");
+    }
+
+    @Override
+    public void exec(Tuple input, DataAtom output) {
+        String referer = input.getAtomField(0).strval();
+
+        if (referer == null || referer.length() < 4)
+            return;
+
+        String searchEngine = null;
+
+        String host = null;
+        try {
+            host = new 
URL(referer).getHost().toLowerCase().replaceFirst("^www.", "");
+        } catch (Exception e) {
+        }
+
+        if (host != null)
+            searchEngine = searchEngines.containsKey(host) ? 
searchEngines.get(host) : null;
+
+        if (searchEngine != null)
+            output.setValue(searchEngine);
+    }
+}

Added: 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchEngineExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchEngineExtractor.java?rev=704149&view=auto
==============================================================================
--- 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchEngineExtractor.java
 (added)
+++ 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchEngineExtractor.java
 Mon Oct 13 09:18:01 2008
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.evaluation.util.apachelogparser;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Datum;
+import org.apache.pig.data.Tuple;
+import 
org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor;
+import org.junit.Test;
+
+/**
+ * Checks that {@link SearchEngineExtractor} maps referer URLs to the expected
+ * search engine names.
+ * <p>
+ * {@code tests} is a table of referer URL to expected name; {@link #testTests()}
+ * feeds every URL through the extractor and compares the result.
+ */
+public class TestSearchEngineExtractor extends TestCase {
+    /** Referer URL mapped to the search engine name the extractor must report. */
+    private static final HashMap<String, String> tests = new HashMap<String, String>();
+    static {
+        // Simple query ("a simple test") against each supported engine.
+        tests.put("http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search", "Google");
+        tests.put("http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search", "Google");
+        tests.put("http://www.google.co.uk/search?hl=en&q=a+simple+test&btnG=Google+Search&meta=", "Google UK");
+        tests.put("http://www.google.co.jp/search?hl=ja&q=a+simple+test&btnG=Google+%E6%A4%9C%E7%B4%A2&lr=", "Google Japan");
+        tests.put("http://search.msn.co.uk/results.aspx?q=a+simple+test&geovar=56&FORM=REDIR", "MSN UK");
+        tests.put("http://search.msn.com/results.aspx?q=a+simple+test&geovar=56&FORM=REDIR", "MSN");
+        tests.put("http://www.altavista.com/web/results?itag=ody&q=a+simple+test&kgs=1&kls=0", "Altavista");
+        tests.put("http://uk.altavista.com/web/results?itag=ody&q=a+simple+test&kgs=1&kls=0", "Altavista UK");
+        tests.put("http://www.blueyonder.co.uk/blueyonder/searches/search.jsp?q=a+simple+test&cr=&sitesearch=&x=0&y=0", "Blueyonder");
+        tests.put("http://www.alltheweb.com/search?cat=web&cs=iso88591&q=a+simple+test&rys=0&itag=crv&_sb_lang=pref", "AllTheWeb");
+        tests.put("http://search.lycos.com/?query=a+simple+test&x=0&y=0", "Lycos");
+        tests.put("http://search.lycos.co.uk/cgi-bin/pursuit?query=a+simple+test&enc=utf-8&cat=slim_loc&sc=blue", "Lycos UK");
+        tests.put("http://www.hotbot.com/index.php?query=a+simple+test&ps=&loc=searchbox&tab=web&mode=search&currProv=msn", "HotBot");
+        tests.put("http://search.yahoo.com/search?p=a+simple+test&fr=FP-tab-web-t400&toggle=1&cop=&ei=UTF-8", "Yahoo!");
+        tests.put("http://uk.search.yahoo.com/search?p=a+simple+test&fr=FP-tab-web-t340&ei=UTF-8&meta=vc%3D", "Yahoo! UK");
+        tests.put("http://uk.ask.com/web?q=a+simple+test&qsrc=0&o=0&l=dir&dm=all", "Ask UK");
+        tests.put("http://www.mirago.co.uk/scripts/qhandler.aspx?qry=a+simple+test&x=0&y=0", "Mirago UK");
+        tests.put("http://www.netscape.com/search/?s=a+simple+test", "Netscape");
+        tests.put("http://search.aol.co.uk/web?invocationType=ns_uk&query=a%20simple%20test", "AOL UK");
+        tests.put("http://www.tiscali.co.uk/search/results.php?section=&from=&query=a+simple+test", "Tiscali UK");
+        tests.put("http://www.mamma.com/Mamma?utfout=1&qtype=0&query=a+simple+test&Submit=%C2%A0%C2%A0Search%C2%A0%C2%A0", "Mamma");
+        tests.put("http://blogs.icerocket.com/search?q=a+simple+test", "IceRocket");
+        tests.put("http://blogsearch.google.com/blogsearch?hl=en&ie=UTF-8&q=a+simple+test&btnG=Search+Blogs", "Google Blogsearch");
+        tests.put("http://suche.fireball.de/cgi-bin/pursuit?query=a+simple+test&x=0&y=0&cat=fb_loc&enc=utf-8", "Fireball DE");
+        tests.put("http://suche.web.de/search/web/?allparams=&smode=&su=a+simple+test&webRb=de", "Suche DE");
+        tests.put("http://www.technorati.com/search/a%20simple%20test", "Technorati");
+        tests.put("http://www.feedster.com/search/a%20simple%20test", "Feedster");
+        tests.put("http://www.tesco.net/google/searchresults.asp?q=a+simple+test&cr=", "Tesco Search");
+        tests.put("http://gps.virgin.net/search/sitesearch?submit.x=1&start=0&format=1&num=10&restrict=site&sitefilter=site%2Fsite_filter.hts&siteresults=site%2Fsite_results.hts&sitescorethreshold=28&q=a+simple+test&scope=UK&x=0&y=0", "Virgin Search");
+        tests.put("http://search.bbc.co.uk/cgi-bin/search/results.pl?tab=web&go=homepage&q=a+simple+test&Search.x=0&Search.y=0&Search=Search&scope=all", "BBC Search");
+        tests.put("http://search.live.com/results.aspx?q=a+simple+test&mkt=en-us&FORM=LVSP&go.x=0&go.y=0&go=Search", "Live.com");
+        tests.put("http://search.mywebsearch.com/mywebsearch/AJmain.jhtml?searchfor=a+simple+test", "My Web Search");
+        tests.put("http://www.megasearching.net/m/search.aspx?s=a+simple+test&mkt=&orig=1", "Megasearching");
+        tests.put("http://www.blueyonder.co.uk/blueyonder/searches/search.jsp?q=a+simple+test&cr=&sitesearch=&x=0&y=0", "Blueyonder");
+        tests.put("http://search.ntlworld.com/ntlworld/search.php?q=a+simple+test&cr=&x=0&y=0", "NTLWorld");
+        tests.put("http://search.orange.co.uk/all?p=_searchbox&pt=resultgo&brand=ouk&tab=web&q=a+simple+test", "Orange Search");
+        tests.put("http://search.virginmedia.com/results/index.php?channel=other&q=a+simple+test&cr=&x=0&y=0", "VirginMedia");
+        tests.put("http://as.starware.com/dp/search?src_id=305&product=unknown&qry=a+simple+test&z=Find+It", "Starware");
+        tests.put("http://aolsearch.aol.com/aol/search?invocationType=topsearchbox.webhome&query=a+simple+test", "AOL Search");
+        tests.put("http://www.ask.com/web?q=a+simple+test&qsrc=0&o=0&l=dir", "Ask dot com");
+        tests.put("http://buscador.terra.es/Default.aspx?source=Search&ca=s&query=a%20simple%20test", "Terra ES");
+        tests.put("http://busca.orange.es/search?origen=home&destino=web&buscar=a+simple+test", "Orange ES");
+        tests.put("http://search.sweetim.com/search.asp?ln=en&q=a%20simple%20test", "Sweetim");
+        tests.put("http://search.conduit.com/Results.aspx?q=a+simple+test&hl=en&SelfSearch=1&SearchSourceOrigin=1&ctid=WEBSITE", "Conduit");
+        tests.put("http://buscar.ozu.es/index.php?etq=web&q=a+simple+test", "Ozu ES");
+        tests.put("http://buscador.lycos.es/cgi-bin/pursuit?query=a+simple+test&websearchCat=loc&cat=loc&SITE=de&enc=utf-8&ref=sboxlink", "Lycos ES");
+        tests.put("http://search.icq.com/search/results.php?q=a+simple+test&ch_id=st&search_mode=web", "ICQ dot com");
+        tests.put("http://search.yahoo.co.jp/search?ei=UTF-8&fr=sfp_as&p=a+simple+test&meta=vc%3D", "Yahoo Japan");
+        tests.put("http://www.soso.com/q?pid=s.idx&w=a+simple+test", "Soso");
+        tests.put("http://search.myway.com/search/AJmain.jhtml?searchfor=a+simple+test", "MyWay");
+        tests.put("http://www.ilmotore.com/newsearch/?query=a+simple+test&where=web", "ilMotore");
+        tests.put("http://www.ithaki.net/ricerca.cgi?where=italia&query=a+simple+test", "Ithaki");
+        tests.put("http://www.excite.it/search/web/results?l=&q=a+simple+test", "Excite IT");
+        tests.put("http://www.thespider.it/dir/index.php?q=a+simple+test&search-btn.x=0&search-btn.y=0", "TheSpider IT");
+        tests.put("http://godado.it/engine.php?l=it&key=a+simple+test&x=0&y=0", "Godado (IT)");
+        tests.put("http://www.simpatico.ws/cgi-bin/links/search.cgi?query=a+simple+test&Vai=Go", "Simpatico IT");
+        tests.put("http://www.categorico.it/ricerca.html?domains=Categorico.it&q=a+simple+test&sa=Cerca+con+Google&sitesearch=&client=pub-0499722654836507&forid=1&channel=7983145815&ie=ISO-8859-1&oe=ISO-8859-1&cof=GALT%3A%23008000%3BGL%3A1%3BDIV%3A%23336699%3BVLC%3A663399%3BAH%3Acenter%3BBGC%3AFFFFFF%3BLBGC%3A336699%3BALC%3A0000FF%3BLC%3A0000FF%3BT%3A000000%3BGFNT%3A0000FF%3BGIMP%3A0000FF%3BFORID%3A11&hl=it", "Categorico IT");
+        tests.put("http://www.cuil.com/search?q=a+simple+test", "Cuil");
+
+        // Query containing percent-encoded punctuation ("a more! complex_ search$").
+        tests.put("http://www.google.com/search?hl=en&lr=&q=a+more%21+complex_+search%24&btnG=Search", "Google");
+        tests.put("http://www.google.co.uk/search?hl=en&q=a+more%21+complex_+search%24&btnG=Google+Search&meta=", "Google UK");
+        tests.put("http://www.google.co.jp/search?hl=ja&q=a+more%21+complex_+search%24&btnG=Google+%E6%A4%9C%E7%B4%A2&lr=", "Google Japan");
+        tests.put("http://search.msn.com/results.aspx?q=a+more%21+complex_+search%24&FORM=QBHP", "MSN");
+        tests.put("http://search.msn.co.uk/results.aspx?q=a+more%21+complex_+search%24&FORM=MSNH&srch_type=0&cp=65001", "MSN UK");
+        tests.put("http://www.altavista.com/web/results?itag=ody&q=a+more%21+complex_+search%24&kgs=1&kls=0", "Altavista");
+        tests.put("http://uk.altavista.com/web/results?itag=ody&q=a+more%21+complex_+search%24&kgs=1&kls=0", "Altavista UK");
+        tests.put("http://www.blueyonder.co.uk/blueyonder/searches/search.jsp?q=a+more%21+complex_+search%24&cr=&sitesearch=&x=0&y=0", "Blueyonder");
+        tests.put("http://www.alltheweb.com/search?cat=web&cs=iso88591&q=a+more%21+complex_+search%24&rys=0&itag=crv&_sb_lang=pref", "AllTheWeb");
+        tests.put("http://search.lycos.com/?query=a+more%21+complex_+search%24&x=0&y=0", "Lycos");
+        tests.put("http://search.lycos.co.uk/cgi-bin/pursuit?query=a+more%21+complex_+search%24&enc=utf-8&cat=slim_loc&sc=blue", "Lycos UK");
+        tests.put("http://www.hotbot.com/index.php?query=a+more%21+complex_+search%24&ps=&loc=searchbox&tab=web&mode=search&currProv=msn", "HotBot");
+        tests.put("http://search.yahoo.com/search?p=a+more%21+complex_+search%24&fr=FP-tab-web-t400&toggle=1&cop=&ei=UTF-8", "Yahoo!");
+        tests.put("http://uk.search.yahoo.com/search?p=a+more%21+complex_+search%24&fr=FP-tab-web-t340&ei=UTF-8&meta=vc%3D", "Yahoo! UK");
+        tests.put("http://uk.ask.com/web?q=a+more%21+complex_+search%24&qsrc=0&o=0&l=dir&dm=all", "Ask UK");
+        tests.put("http://www.mirago.co.uk/scripts/qhandler.aspx?qry=a+more%21+complex_+search%24&x=0&y=0", "Mirago UK");
+        tests.put("http://www.netscape.com/search/?s=a+more%21+complex_+search%24", "Netscape");
+        tests.put("http://search.aol.co.uk/web?query=a+more%21+complex_+search%24&x=0&y=0&isinit=true&restrict=wholeweb", "AOL UK");
+        tests.put("http://www.tiscali.co.uk/search/results.php?section=&from=&query=a+more%21+complex_+search%24", "Tiscali UK");
+        tests.put("http://www.mamma.com/Mamma?utfout=1&qtype=0&query=a+more%21+complex_+search%24&Submit=%C2%A0%C2%A0Search%C2%A0%C2%A0", "Mamma");
+    }
+
+    /** The extractor can be constructed without arguments. */
+    @Test
+    public void testInstantiation() {
+        assertNotNull(new SearchEngineExtractor());
+    }
+
+    /**
+     * Runs every referer in {@code tests} through the extractor and checks
+     * the reported name. Each assertion names the offending URL so a failure
+     * can be traced to a single table entry.
+     */
+    @Test
+    public void testTests() {
+        // An empty table would make the loop below pass without testing anything.
+        assertFalse("test table must not be empty", tests.isEmpty());
+
+        SearchEngineExtractor searchEngineExtractor = new SearchEngineExtractor();
+        for (java.util.Map.Entry<String, String> test : tests.entrySet()) {
+            String referer = test.getKey();
+            String expected = test.getValue();
+
+            assertNotNull("no expected name for " + referer, expected);
+            assertTrue("empty expected name for " + referer, expected.length() > 0);
+
+            ArrayList<Datum> input = new ArrayList<Datum>();
+            input.add(new DataAtom(referer));
+
+            DataAtom output = new DataAtom();
+            searchEngineExtractor.exec(new Tuple(input), output);
+            assertEquals("wrong search engine for " + referer, expected, output.toString());
+        }
+    }
+}


Reply via email to