svn commit: r704149 - in /incubator/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/ contrib/piggybank/java/src/test/java/org/apache/pig/pig
Author: gates Date: Mon Oct 13 09:18:01 2008 New Revision: 704149 URL: http://svn.apache.org/viewvc?rev=704149&view=rev Log: PIG-486: Added SearchEngineExtractor, a piggybank eval func that recognizes a set of the most common search engines in a URL and extracts the name of the search engine. Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchEngineExtractor.java Modified: incubator/pig/trunk/CHANGES.txt Modified: incubator/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704149&r1=704148&r2=704149&view=diff == --- incubator/pig/trunk/CHANGES.txt (original) +++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:18:01 2008 @@ -361,3 +361,6 @@ PIG-474: Added MyRegexLoader, a subclass of RegExLoader, to piggybank (spackest via gates) +PIG-486: Added SearchEngineExtractor, a piggybank eval func that + recognizes a set of the most common search engines in a URL and extracts + the name of the search engine (spackest via gates). 
Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java?rev=704149&view=auto == --- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java (added) +++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchEngineExtractor.java Mon Oct 13 09:18:01 2008 @@ -0,0 +1,405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. + */ + +package org.apache.pig.piggybank.evaluation.util.apachelogparser; + +import java.net.URL; +import java.util.HashMap; + +import org.apache.pig.EvalFunc; +import org.apache.pig.data.DataAtom; +import org.apache.pig.data.Tuple; + +/** + * SearchEngineExtractor takes a url string and extracts the search engine. For example, given + * + * http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search + * + * then + * + * Google + * + * would be extracted. 
+ * + * From pig latin, usage looks something like + * + * searchEngine = FOREACH row GENERATE + * org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor(referer); + * + * Supported search engines include abacho.com, alice.it, alltheweb.com, altavista.com, aolsearch.aol.com, + * as.starware.com, ask.com, blogs.icerocket.com, blogsearch.google.com, blueyonder.co.uk, busca.orange.es, + * buscador.lycos.es, buscador.terra.es, buscar.ozu.es, categorico.it, cuil.com, excite.com, excite.it, + * fastweb.it, feedster.com, godado.com, godado.it, google.ad, google.ae, google.af, google.ag, google.am, + * google.as, google.at, google.az, google.ba, google.be, google.bg, google.bi, google.biz, google.bo, + * google.bs, google.bz, google.ca, google.cc, google.cd, google.cg, google.ch, google.ci, google.cl, + * google.cn, google.co.at , google.co.bi, google.co.bw, google.co.ci, google.co.ck, google.co.cr, + * google.co.gg, google.co.gl, google.co.gy, google.co.hu, google.co.id, google.co.il, google.co.im, + * google.co.in, google.co.it, google.co.je, google.co.jp, google.co.ke, google.co.kr, google.co.ls, + * google.co.ma, google.co.mu, google.co.mw, google.co.nz, google.co.pn, google.co.th, google.co.tt,
svn commit: r704151 - in /incubator/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/ contrib/piggybank/java/src/test/java/org/apache/pig/pig
Author: gates Date: Mon Oct 13 09:20:49 2008 New Revision: 704151 URL: http://svn.apache.org/viewvc?rev=704151&view=rev Log: PIG-487: Added HostExtractor, a piggybank eval func that, given a URL, determines the host. Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java Modified: incubator/pig/trunk/CHANGES.txt Modified: incubator/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704151&r1=704150&r2=704151&view=diff == --- incubator/pig/trunk/CHANGES.txt (original) +++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:20:49 2008 @@ -364,3 +364,6 @@ PIG-486: Added SearchEngineExtractor, a piggybank eval func that recognizes a set of the most common search engines in a URL and extracts the name of the search engine (spackest via gates). + +PIG-487: Added HostExtractor, a piggybank eval func that, given a URL, + determines the host (spackest via gates). Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java?rev=704151&view=auto == --- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java (added) +++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java Mon Oct 13 09:20:49 2008 @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. 
The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. + */ + +/* + * HostExtractor takes a url and returns the host. For example, + * + * http://sports.espn.go.com/mlb/recap?gameId=281009122 + * + * leads to + * + * sports.espn.go.com + * + * Pig latin usage looks like + * + * host = FOREACH row GENERATE + * org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor(referer); + */ + +package org.apache.pig.piggybank.evaluation.util.apachelogparser; + + +import java.net.URL; + +import org.apache.pig.EvalFunc; +import org.apache.pig.data.DataAtom; +import org.apache.pig.data.Tuple; + +public class HostExtractor extends EvalFunc<DataAtom> { +@Override +public void exec(Tuple input, DataAtom output) { +String string = input.getAtomField(0).strval(); + +if (string == null) +return; + +String host = null; +try { +host = new URL(string).getHost().toLowerCase(); +} catch (Exception e) { +} +if (host != null) +output.setValue(host); +} +} Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java?rev=704151&view=auto == --- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java (added) +++ 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java Mon Oct 13 09:20:49 2008 @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + *