Author: gates
Date: Mon Oct 13 09:20:49 2008
New Revision: 704151

URL: http://svn.apache.org/viewvc?rev=704151&view=rev
Log:
 PIG-487: Added HostExtractor, a piggybank eval func that, given a URL, 
determines the host.

Added:
    
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
    
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
Modified:
    incubator/pig/trunk/CHANGES.txt

Modified: incubator/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704151&r1=704150&r2=704151&view=diff
==============================================================================
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:20:49 2008
@@ -364,3 +364,6 @@
     PIG-486: Added SearchEngineExtractor, a piggybank eval func that
        recognizes a set of the most common search engines in a URL and extracts
        the name of the search engine (spackest via gates).
+
+    PIG-487: Added HostExtractor, a piggybank eval func that, given a URL,
+       determines the host (spackest via gates).

Added: 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java?rev=704151&view=auto
==============================================================================
--- 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
 (added)
+++ 
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
 Mon Oct 13 09:20:49 2008
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+/*
+ * HostExtractor takes a url and returns the host. For example,
+ * 
+ * http://sports.espn.go.com/mlb/recap?gameId=281009122
+ * 
+ * leads to
+ * 
+ * sports.espn.go.com
+ * 
+ * Pig latin usage looks like
+ * 
+ * host = FOREACH row GENERATE
+ *     org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor(referer);
+ */
+
+package org.apache.pig.piggybank.evaluation.util.apachelogparser;
+
+
+import java.net.URL;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+/**
+ * Eval function that extracts the host name from a URL string.
+ */
+public class HostExtractor extends EvalFunc<DataAtom> {
+    /**
+     * Reads field 0 of {@code input} as a URL and, if it parses, stores the
+     * lower-cased host in {@code output}.
+     *
+     * {@code output} is left untouched when the field's string value is null,
+     * when the string is not a well-formed URL, or when the URL has no host.
+     *
+     * @param input  tuple whose first field holds the URL text
+     * @param output atom that receives the lower-cased host name
+     */
+    @Override
+    public void exec(Tuple input, DataAtom output) {
+        String url = input.getAtomField(0).strval();
+        if (url == null) {
+            return;
+        }
+
+        String host;
+        try {
+            host = new URL(url).getHost();
+        } catch (java.net.MalformedURLException ignored) {
+            // Best effort: input that is not a URL simply produces no output.
+            return;
+        }
+
+        if (host != null) {
+            // Use a fixed locale so the result does not depend on the JVM default
+            // (e.g. a Turkish default locale maps 'I' to a dotless 'i').
+            output.setValue(host.toLowerCase(java.util.Locale.ENGLISH));
+        }
+    }
+}

Added: 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java?rev=704151&view=auto
==============================================================================
--- 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
 (added)
+++ 
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
 Mon Oct 13 09:20:49 2008
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.evaluation.util.apachelogparser;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Datum;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor;
+import 
org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor;
+import org.junit.Test;
+
+/**
+ * Unit tests for HostExtractor: each entry in {@code tests} maps an input
+ * string to the host expected in the output atom, or to null when the
+ * extractor is expected to leave the output empty.
+ */
+public class TestHostExtractor extends TestCase {
+    // Input string -> expected host (null means "no output expected").
+    private static HashMap<String, String> tests = new HashMap<String, String>();
+    static {
+        tests.put("http://sports.espn.go.com/mlb/recap?gameId=281009122", "sports.espn.go.com");
+        tests.put("http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search",
+                "www.google.com");
+        tests.put("http://search.msn.com/results.aspx?q=a+simple+test&geovar=56&FORM=REDIR",
+                "search.msn.com");
+        tests.put("http://www.altavista.com/web/results?itag=ody&q=a+simple+test&kgs=1&kls=0",
+                "www.altavista.com");
+        // The extractor lower-cases the host it returns.
+        tests.put("http://WWW.Example.COM/index.html", "www.example.com");
+        // Not a URL at all: the output atom must stay empty.
+        tests.put("dud", null);
+    }
+
+    /** The class under test can be constructed. */
+    @Test
+    public void testInstantiation() {
+        assertNotNull(new HostExtractor());
+    }
+
+    /** Runs every entry of {@code tests} through the extractor and checks the output. */
+    @Test
+    public void testTests() {
+        HostExtractor hostExtractor = new HostExtractor();
+        int testCount = 0;
+        for (String key : tests.keySet()) {
+            String expected = tests.get(key);
+
+            ArrayList<Datum> input = new ArrayList<Datum>();
+            input.add(new DataAtom(key));
+
+            DataAtom output = new DataAtom();
+            hostExtractor.exec(new Tuple(input), output);
+            if (expected == null)
+                assertEquals(0, output.toString().length());
+            else
+                assertEquals(expected, output.toString());
+            testCount++;
+        }
+        // Guards against the loop silently skipping entries.
+        assertEquals(tests.size(), testCount);
+    }
+}


Reply via email to