svn commit: r713854 [3/3] - in /hadoop/pig/branches/types/tutorial: ./ data/ scripts/ src/ src/org/ src/org/apache/ src/org/apache/pig/ src/org/apache/pig/tutorial/

olga Thu, 13 Nov 2008 15:09:17 -0800

Propchange: hadoop/pig/branches/types/tutorial/data/excite-small.log
------------------------------------------------------------------------------
    svn:executable = *


Added: hadoop/pig/branches/types/tutorial/data/excite.log.bz2
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/data/excite.log.bz2?rev=713854&view=auto
==============================================================================
Binary file - no diff available.

Propchange: hadoop/pig/branches/types/tutorial/data/excite.log.bz2
------------------------------------------------------------------------------
    svn:executable = *

Propchange: hadoop/pig/branches/types/tutorial/data/excite.log.bz2
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig (added)
+++ hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig Thu Nov 13 
15:08:21 2008
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Query Phrase Popularity (Hadoop cluster)
+
+-- This script processes a search query log file from the Excite search engine 
and finds search phrases that occur with particularly high frequency during 
certain times of the day. 
+
+
+-- Register the tutorial JAR file so that the included UDFs can be called in 
the script.
+REGISTER ./tutorial.jar;
+
+-- Use the  PigStorage function to load the excite log file into the raw bag 
as an array of records.
+-- Input: (user,time,query) 
+raw = LOAD 'excite.log.bz2' USING PigStorage('\t') AS (user, time, query);
+
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty 
or a URL. 
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase. 
+clean2 = FOREACH clean1 GENERATE user, time, 
org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only 
interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, 
org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, 
flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the  DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the  GROUP command to group records by n-gram and hour. 
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the  COUNT function to get the count (occurrences) of each n-gram. 
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as 
count;
+
+-- Use the  GROUP command to group records by n-gram only. 
+-- Each group now corresponds to a distinct n-gram and has the count for each 
hour.
+uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
+
+-- For each group, identify the hour in which this n-gram is used with a 
particularly high frequency.
+-- Call the ScoreGenerator UDF to calculate a "popularity" score for the 
n-gram.
+uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), 
flatten(org.apache.pig.tutorial.ScoreGenerator($1));
+
+-- Use the  FOREACH-GENERATE command to assign names to the fields. 
+uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 
as score, $3 as count, $4 as mean;
+
+-- Use the FILTER command to remove all records with a score less than or equal 
to 2.0.
+filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
+
+-- Use the  ORDER command to sort the remaining records by hour and score. 
+ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score;
+
+-- Use the  PigStorage function to store the results. 
+-- Output: (hour, n-gram, score, count, average_counts_among_all_hours)
+STORE ordered_uniq_frequency INTO 'script1-hadoop-results' USING PigStorage();

Propchange: hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig
------------------------------------------------------------------------------
    svn:executable = *

Added: hadoop/pig/branches/types/tutorial/scripts/script1-local.pig
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script1-local.pig?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/scripts/script1-local.pig (added)
+++ hadoop/pig/branches/types/tutorial/scripts/script1-local.pig Thu Nov 13 
15:08:21 2008
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Query Phrase Popularity (local mode)
+
+-- This script processes a search query log file from the Excite search engine 
and finds search phrases that occur with particularly high frequency during 
certain times of the day.
+
+-- Register the tutorial JAR file so that the included UDFs can be called in 
the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the raw bag 
as an array of records.
+-- Input: (user,time,query) 
+raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query);
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty 
or a URL. 
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase. 
+clean2 = FOREACH clean1 GENERATE user, time, 
org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only 
interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, 
org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, 
flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour. 
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram. 
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as 
count;
+
+-- Use the GROUP command to group records by n-gram only. 
+-- Each group now corresponds to a distinct n-gram and has the count for each 
hour.
+uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
+
+-- For each group, identify the hour in which this n-gram is used with a 
particularly high frequency.
+-- Call the ScoreGenerator UDF to calculate a "popularity" score for the 
n-gram.
+uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), 
flatten(org.apache.pig.tutorial.ScoreGenerator($1));
+
+-- Use the FOREACH-GENERATE command to assign names to the fields. 
+uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 
as score, $3 as count, $4 as mean;
+
+-- Use the FILTER command to remove all records with a score less than or equal 
to 2.0.
+filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
+
+-- Use the ORDER command to sort the remaining records by hour and score. 
+ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score;
+
+-- Use the PigStorage function to store the results. 
+-- Output: (hour, n-gram, score, count, average_counts_among_all_hours)
+STORE ordered_uniq_frequency INTO 'script1-local-results.txt' USING 
PigStorage();

Propchange: hadoop/pig/branches/types/tutorial/scripts/script1-local.pig
------------------------------------------------------------------------------
    svn:executable = *

Added: hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig (added)
+++ hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig Thu Nov 13 
15:08:21 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Temporal Query Phrase Popularity (Hadoop cluster)
+
+-- This script processes a search query log file from the Excite search engine 
and compares the frequency of occurrence of search phrases across two time 
periods separated by twelve hours. 
+
+-- Register the tutorial JAR file so that the included UDFs can be called in 
the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the raw bag as 
an array of records.
+-- Input: (user,time,query) 
+raw = LOAD 'excite.log.bz2' USING PigStorage('\t') AS (user, time, query);
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty 
or a URL.
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase.
+clean2 = FOREACH clean1 GENERATE user, time, 
org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only 
interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, 
org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, 
flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour.
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram.
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as 
count;
+
+-- Use the FOREACH-GENERATE command to assign names to the fields. 
+hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 
as count;
+
+-- Use the FILTER command to get the n-grams for hour 00 .
+hour00 = FILTER hour_frequency2 BY hour eq '00';
+
+-- Use the FILTER command to get the n-grams for hour 12
+hour12 = FILTER hour_frequency3 BY hour eq '12';
+
+-- Use the JOIN command to get the n-grams that appear in both hours.
+same = JOIN hour00 BY $0, hour12 BY $0;
+
+-- Use the FOREACH-GENERATE command to record their frequency.
+same1 = FOREACH same GENERATE hour00::group::ngram as ngram, $2 as count00, $5 
as count12;
+
+-- Use the PigStorage function to store the results. 
+-- Output: (n-gram, count_at_hour_00, count_at_hour_12)
+STORE same1 INTO 'script2-hadoop-results' USING PigStorage();

Propchange: hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig
------------------------------------------------------------------------------
    svn:executable = *

Added: hadoop/pig/branches/types/tutorial/scripts/script2-local.pig
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script2-local.pig?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/scripts/script2-local.pig (added)
+++ hadoop/pig/branches/types/tutorial/scripts/script2-local.pig Thu Nov 13 
15:08:21 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Temporal Query Phrase Popularity (local mode)
+
+-- This script processes a search query log file from the Excite search engine 
and compares the frequency of occurrence of search phrases across two time 
periods separated by twelve hours. 
+
+-- Register the tutorial JAR file so that the included UDFs can be called in 
the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the raw bag as 
an array of records.
+-- Input: (user,time,query) 
+raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query);
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty 
or a URL.
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase.
+clean2 = FOREACH clean1 GENERATE user, time, 
org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only 
interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, 
org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, 
flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour.
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram.
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as 
count;
+
+-- Use the FOREACH-GENERATE command to assign names to the fields. 
+hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 
as count;
+
+-- Use the FILTER command to get the n-grams for hour 00 .
+hour00 = FILTER hour_frequency2 BY hour eq '00';
+
+-- Use the FILTER command to get the n-grams for hour 12
+hour12 = FILTER hour_frequency3 BY hour eq '12';
+
+-- Use the JOIN command to get the n-grams that appear in both hours.
+same = JOIN hour00 BY $0, hour12 BY $0;
+
+-- Use the FOREACH-GENERATE command to record their frequency.
+same1 = FOREACH same GENERATE hour00::group::ngram as ngram, $2 as count00, $5 
as count12;
+
+-- Use the PigStorage function to store the results. 
+-- Output: (n-gram, count_at_hour_00, count_at_hour_12)
+STORE same1 INTO 'script2-local-results.txt' USING PigStorage();

Propchange: hadoop/pig/branches/types/tutorial/scripts/script2-local.pig
------------------------------------------------------------------------------
    svn:executable = *

Added: 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java?rev=713854&view=auto
==============================================================================
--- 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java 
(added)
+++ 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java 
Thu Nov 13 15:08:21 2008
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * The excite query log timestamp format is YYMMDDHHMMSS
+ * This function extracts the hour, HH
+ */
+public class ExtractHour extends EvalFunc<String> {
+    public String exec(Tuple input) throws IOException {
+        if (input == null || input.size() == 0)
+            return null;
+        try{
+            String timestamp = (String)input.get(0);
+            return timestamp.substring(6, 8);
+        }catch(Exception e){
+            System.err.println("ExtractHour: failed to proces input; error - " 
+ e.getMessage());
+            return null;
+        }
+    }
+
+    @Override
+    /**
+     * This method gives a name to the column.
+     * @param input - schema of the input data
+     * @return schema of the output data
+     */
+    public Schema outputSchema(Schema input) {
+        return new Schema(new 
Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), 
input), DataType.CHARARRAY));
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+     * This is needed to make sure that both bytearrays and chararrays can be 
passed as arguments
+     */
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+        List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+        funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new 
Schema.FieldSchema(null, DataType.CHARARRAY))));
+
+        return funcList;
+    }
+}

Added: 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java?rev=713854&view=auto
==============================================================================
--- 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java
 (added)
+++ 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java
 Thu Nov 13 15:08:21 2008
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DefaultBagFactory;
+import org.apache.pig.data.DefaultTupleFactory;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * This function divides a search query string into words and extracts
+ * n-grams with up to _ngramSizeLimit length.
+ * Example 1: if query = "a real nice query" and _ngramSizeLimit = 2,
+ * the query is split into: a, real, nice, query, a real, real nice, nice query
+ * Example 2: if record = (u1, h1, pig hadoop) and _ngramSizeLimit = 2,
+ * the record is split into: (u1, h1, pig), (u1, h1, hadoop), (u1, h1, pig 
hadoop)
+ */
+public class NGramGenerator extends EvalFunc<DataBag> {
+
+    private static final int _ngramSizeLimit = 2;
+  
+    public DataBag exec(Tuple input) throws IOException {
+        if (input == null || input.size() == 0)
+            return null;
+        try{
+            DataBag output = DefaultBagFactory.getInstance().newDefaultBag();
+            String query = (String)input.get(0);
+            String[] words = TutorialUtil.splitToWords(query);
+            Set<String> ngrams = new HashSet<String>();
+            TutorialUtil.makeNGram(words, ngrams, _ngramSizeLimit);
+            for (String ngram : ngrams) {
+                Tuple t = DefaultTupleFactory.getInstance().newTuple(1);
+                t.set(0, ngram);
+                output.add(t);
+            }
+            return output;
+        }catch(Exception e){
+            System.err.println("NGramGenerator: failed to process input; error 
- " + e.getMessage());
+            return null;
+        }
+    }
+
+    @Override
+    /**
+     * This method gives a name to the column.
+     * @param input - schema of the input data
+     * @return schema of the output data
+     */
+    public Schema outputSchema(Schema input) {
+         Schema bagSchema = new Schema();
+         bagSchema.add(new Schema.FieldSchema("ngram", DataType.CHARARRAY));
+         try{
+            return new Schema(new 
Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), 
input), 
+                                                    bagSchema, DataType.BAG));
+         }catch (FrontendException e){
+            return null;
+         }
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+     * This is needed to make sure that both bytearrays and chararrays can be 
passed as arguments
+     */
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+        List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+        funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new 
Schema.FieldSchema(null, DataType.CHARARRAY))));
+
+        return funcList;
+    }
+
+}

Added: 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java?rev=713854&view=auto
==============================================================================
--- 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java
 (added)
+++ 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java
 Thu Nov 13 15:08:21 2008
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.pig.FilterFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * This function removes search queries that are URLs (as defined by 
_urlPattern).
+ * This function also removes empty queries.
+ */
+public class NonURLDetector extends FilterFunc {
+
+  private Pattern _urlPattern = 
Pattern.compile("^[\"]?(http[:|;])|(https[:|;])|(www\\.)");
+  
+  public Boolean exec(Tuple arg0) throws IOException {
+      if (arg0 == null || arg0.size() == 0)
+          return false;
+
+    String query; 
+    try{
+        query = (String)arg0.get(0);
+        if(query == null)
+            return false;
+        query = query.trim();
+    }catch(Exception e){
+        System.err.println("NonURLDetector: failed to process input; error - " 
+ e.getMessage());
+        return false;
+    }
+
+    if (query.equals("")) {
+      return false;
+    }
+    Matcher m = _urlPattern.matcher(query);
+    if (m.find()) {
+      return false;
+    }
+    return true;
+  }
+  
+  /* (non-Javadoc)
+   * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+   * This is needed to make sure that both bytearrays and chararrays can be 
passed as arguments
+   */
+  @Override
+  public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+      List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+      funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new 
Schema.FieldSchema(null, DataType.CHARARRAY))));
+
+      return funcList;
+  }
+
+}

Added: 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java?rev=713854&view=auto
==============================================================================
--- 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java
 (added)
+++ 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java
 Thu Nov 13 15:08:21 2008
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DefaultBagFactory;
+import org.apache.pig.data.DefaultTupleFactory;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * For each n-gram, we have a set of (hour, count) pairs.
+ *
+ * This function reads the set and retains those hours whose count is
+ * above the mean count, and calculates the score of each retained hour
+ * as the distance of its count from the mean, measured in standard
+ * deviations: (count - mean) / standardDeviation.
+ *
+ * A score greater than 1.0 indicates the frequency of this n-gram
+ * in this particular hour is more than one standard deviation above
+ * the average frequency among all hours.
+ */
+public class ScoreGenerator extends EvalFunc<DataBag> {
+
+    /** Returns the mean of the counts (0.0 for an empty list). */
+    private static double computeMean(List<Long> counts) {
+        int numCounts = counts.size();
+
+        double mean = 0.0;
+        for (Long count : counts) {
+            mean += ((double) count) / ((double) numCounts);
+        }
+        return mean;
+    }
+
+    /**
+     * Returns the standard deviation of the counts around the given
+     * mean, dividing by the number of counts (0.0 for an empty list).
+     */
+    private static double computeSD(List<Long> counts, double mean) {
+        int numCounts = counts.size();
+
+        double deviation = 0.0;
+        for (Long count : counts) {
+            double d = ((double) count) - mean;
+            deviation += d * d / ((double) numCounts);
+        }
+        return Math.sqrt(deviation);
+    }
+
+    /**
+     * Scores the hours of one n-gram.
+     *
+     * @param input tuple whose first field is a bag of tuples carrying
+     *              the hour (String) at index 1 and the count (Long)
+     *              at index 2
+     * @return a bag of (hour, score, count, mean) tuples, one for each
+     *         hour whose count is above the mean; null if the input is
+     *         null or empty, or if it cannot be processed
+     * @throws IOException declared by the signature; processing
+     *         failures are caught, reported on stderr and mapped to
+     *         a null result
+     */
+    public DataBag exec(Tuple input) throws IOException {
+        if (input == null || input.size() == 0) {
+            return null;
+        }
+        try {
+            DataBag output =
+                DefaultBagFactory.getInstance().newDefaultBag();
+            DataBag in = (DataBag) input.get(0);
+
+            // The map keeps one count per hour; the list keeps every
+            // count seen and feeds the mean and standard deviation.
+            Map<String, Long> pairs = new HashMap<String, Long>();
+            List<Long> counts = new ArrayList<Long>();
+
+            Iterator<Tuple> it = in.iterator();
+            while (it.hasNext()) {
+                Tuple t = it.next();
+                String hour = (String) t.get(1);
+                Long count = (Long) t.get(2);
+                pairs.put(hour, count);
+                counts.add(count);
+            }
+
+            double mean = computeMean(counts);
+            double standardDeviation = computeSD(counts, mean);
+
+            for (Map.Entry<String, Long> pair : pairs.entrySet()) {
+                Long count = pair.getValue();
+                if (count > mean) {
+                    Tuple t =
+                        DefaultTupleFactory.getInstance().newTuple(4);
+                    t.set(0, pair.getKey());
+                    // the score
+                    t.set(1, ((double) count - mean) / standardDeviation);
+                    t.set(2, count);
+                    t.set(3, mean);
+                    output.add(t);
+                }
+            }
+            return output;
+        } catch (Exception e) {
+            System.err.println(
+                "ScoreGenerator: failed to process input; error - "
+                + e.getMessage());
+            return null;
+        }
+    }
+
+    /**
+     * This method gives a name to the column.
+     *
+     * @param input - schema of the input data
+     * @return schema of the output data, or null if it cannot be built
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        Schema bagSchema = new Schema();
+        bagSchema.add(new Schema.FieldSchema("hour", DataType.CHARARRAY));
+        bagSchema.add(new Schema.FieldSchema("score", DataType.DOUBLE));
+        bagSchema.add(new Schema.FieldSchema("count", DataType.LONG));
+        bagSchema.add(new Schema.FieldSchema("mean", DataType.DOUBLE));
+        // TODO
+        // Here the schema of the bag is the schema of the tuple inside
+        // the bag. We need to change this so that the bag has the tuple
+        // and the tuple has the schema.
+        try {
+            String name = getSchemaName(
+                this.getClass().getName().toLowerCase(), input);
+            return new Schema(
+                new Schema.FieldSchema(name, bagSchema, DataType.BAG));
+        } catch (FrontendException e) {
+            return null;
+        }
+    }
+
+}

Added: 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java 
(added)
+++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java 
Thu Nov 13 15:08:21 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * This function converts the input into lowercase and
+ * removes leading and trailing white spaces.
+ */
+public class ToLower extends EvalFunc<String> {
+
+    /**
+     * Lower-cases and trims the first field of the tuple.
+     *
+     * @param input tuple whose first field is the string to convert
+     * @return the lower-cased, trimmed string; null if the tuple is
+     *         null or empty, if the field is null, or if the field
+     *         cannot be read as a String
+     * @throws IOException declared by the signature; processing
+     *         failures are caught, reported on stderr and mapped to
+     *         a null result
+     */
+    public String exec(Tuple input) throws IOException {
+        if (input == null || input.size() == 0) {
+            return null;
+        }
+        try {
+            String query = (String) input.get(0);
+            if (query == null) {
+                // A null field is ordinary data, not a failure, so it
+                // is passed through without an error message.
+                return null;
+            }
+            return query.toLowerCase().trim();
+        } catch (Exception e) {
+            System.err.println(
+                "ToLower: failed to process input; error - "
+                + e.getMessage());
+            return null;
+        }
+    }
+
+    /**
+     * This method gives a name to the column.
+     *
+     * @param input - schema of the input data
+     * @return schema of the output data
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        String name = getSchemaName(
+            this.getClass().getName().toLowerCase(), input);
+        return new Schema(
+            new Schema.FieldSchema(name, DataType.CHARARRAY));
+    }
+
+    /**
+     * Declares the single supported argument signature (one chararray).
+     * This is needed to make sure that both bytearrays and chararrays
+     * can be passed as arguments.
+     *
+     * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+     */
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+        List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+        Schema chararrayArg =
+            new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        funcList.add(
+            new FuncSpec(this.getClass().getName(), chararrayArg));
+        return funcList;
+    }
+
+}

Added: 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java?rev=713854&view=auto
==============================================================================
--- 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java
 (added)
+++ 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java
 Thu Nov 13 15:08:21 2008
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FilterFunc;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DefaultBagFactory;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.DefaultTupleFactory;
+
+/**
+ * Command-line smoke test that runs the tutorial functions over a few
+ * hand-written inputs and prints what each one produced.
+ */
+public class TutorialTest {
+
+    /**
+     * Builds a tuple holding the given fields in order. A failure to
+     * populate the tuple aborts the run instead of being ignored, so
+     * that later output is never based on a half-filled tuple.
+     */
+    private static Tuple newTuple(Object... fields) {
+        Tuple tuple =
+            DefaultTupleFactory.getInstance().newTuple(fields.length);
+        try {
+            for (int i = 0; i < fields.length; i++) {
+                tuple.set(i, fields[i]);
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+        return tuple;
+    }
+
+    /** Wraps each query in its own single-field tuple. */
+    private static Tuple[] getTuples(String[] queries) {
+        Tuple[] tuples = new Tuple[queries.length];
+        for (int i = 0; i < tuples.length; i++) {
+            tuples[i] = newTuple(queries[i]);
+        }
+        return tuples;
+    }
+
+    /**
+     * Applies the function to every tuple, printing each conversion
+     * followed by a "===" separator, and collects the results.
+     * An IOException from the function aborts the run.
+     */
+    private static <T> List<T> applyEval(EvalFunc<T> eval,
+                                         Tuple[] tuples) {
+        List<T> res = new ArrayList<T>();
+        try {
+            for (Tuple t : tuples) {
+                T output = eval.exec(t);
+                System.out.println(
+                    "Converted: " + t + " to (" + output + ")");
+                res.add(output);
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+
+        System.out.println("===");
+        return res;
+    }
+
+    /**
+     * Runs a String-valued function over the tuples.
+     *
+     * @return the outputs, in input order
+     */
+    public static String[] testDataAtomEvals(EvalFunc<String> eval,
+                                             Tuple[] tuples) {
+        List<String> res = applyEval(eval, tuples);
+        return res.toArray(new String[res.size()]);
+    }
+
+    /**
+     * Runs a bag-valued function over the tuples.
+     *
+     * @return the outputs, in input order
+     */
+    public static DataBag[] testDataBagEvals(EvalFunc<DataBag> eval,
+                                             Tuple[] tuples) {
+        List<DataBag> res = applyEval(eval, tuples);
+        return res.toArray(new DataBag[res.size()]);
+    }
+
+    /**
+     * Runs a filter over the tuples, printing the verdict for each.
+     *
+     * @return the first field of every accepted tuple, in input order
+     */
+    public static String[] testFilters(FilterFunc filter,
+                                       Tuple[] tuples) {
+        List<String> res = new ArrayList<String>();
+        try {
+            for (Tuple t : tuples) {
+                if (filter.exec(t)) {
+                    System.out.println("accepted: " + t);
+                    res.add((String) t.get(0));
+                } else {
+                    System.out.println("rejected: " + t);
+                }
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+
+        System.out.println("===");
+        return res.toArray(new String[res.size()]);
+    }
+
+    /**
+     * Exercises NonURLDetector, ToLower, ExtractHour and
+     * ScoreGenerator in turn.
+     */
+    public static void main(String[] args) {
+        String[] queries = {
+            "http://www.yahoo.com/",
+            "\"http://www.yahoo.com/\"",
+            "   http;//www.yahoo.com/ ",
+            "https://www.yahoo.com/",
+            "www.yahoo.com/",
+            "\"www.yahoo.com/\"",
+            "a real nice query ",
+            "an UPPER CASE query",
+            "  ",
+            " nude picture",
+            " +XXX",
+            "\" +porno \"",
+        };
+
+        NonURLDetector filter1 = new NonURLDetector();
+        String[] q1 = testFilters(filter1, getTuples(queries));
+
+        ToLower eval1 = new ToLower();
+        testDataAtomEvals(eval1, getTuples(q1));
+
+        String[] timestamps = {
+            "970916072134",
+            "970916072311",
+            "970916123431",
+        };
+
+        ExtractHour eval2 = new ExtractHour();
+        testDataAtomEvals(eval2, getTuples(timestamps));
+
+        // Counts must be Long: ScoreGenerator casts the third field to
+        // Long, and an int literal would be stored as an Integer.
+        DataBag bag = DefaultBagFactory.getInstance().newDefaultBag();
+        bag.add(newTuple("word", "02", 2L));
+        bag.add(newTuple("word", "05", 2L));
+        bag.add(newTuple("word", "04", 3L));
+        bag.add(newTuple("word", "06", 4L));
+
+        Tuple[] t = new Tuple[] { newTuple(bag) };
+
+        ScoreGenerator eval4 = new ScoreGenerator();
+        testDataBagEvals(eval4, t);
+    }
+}

Added: 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java?rev=713854&view=auto
==============================================================================
--- 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java
 (added)
+++ 
hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java
 Thu Nov 13 15:08:21 2008
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+/** Helper functions shared by the tutorial's n-gram code. */
+public class TutorialUtil {
+
+  /**
+   * This function splits a search query string into its non-empty
+   * words, in order of appearance. Every non-word character (regex
+   * \W) acts as a separator.
+   *
+   * @param query the query string to split
+   * @return the non-empty words; duplicates are kept
+   */
+  protected static String[] splitToWords(String query) {
+    List<String> res = new LinkedList<String>();
+    for (String word : query.split("\\W")) {
+      if (!word.equals("")) {
+        res.add(word);
+      }
+    }
+    return res.toArray(new String[res.size()]);
+  }
+
+  /**
+   * This is a simple utility function that makes word-level n-grams
+   * from a sequence of words. Every run of 1 to size consecutive
+   * words is joined with single spaces and added to the set.
+   *
+   * @param words the words, in order
+   * @param ngrams the set that receives the generated n-grams
+   * @param size the longest n-gram to generate; nothing is added when
+   *             it is zero or negative
+   */
+  protected static void makeNGram(String[] words, Set<String> ngrams,
+      int size) {
+    // Longest n-grams first, then shorter ones. Starting the loop at
+    // a non-positive size does nothing, which avoids trimming the
+    // separator off an empty builder.
+    for (int n = size; n >= 1; n--) {
+      int stop = words.length - n + 1;
+      for (int i = 0; i < stop; i++) {
+        StringBuilder sb = new StringBuilder(words[i]);
+        for (int j = 1; j < n; j++) {
+          sb.append(" ").append(words[i + j]);
+        }
+        ngrams.add(sb.toString());
+      }
+    }
+  }
+
+}

svn commit: r713854 [3/3] - in /hadoop/pig/branches/types/tutorial: ./ data/ scripts/ src/ src/org/ src/org/apache/ src/org/apache/pig/ src/org/apache/pig/tutorial/

Reply via email to