Propchange: incubator/pig/trunk/tutorial/data/excite-small.log
------------------------------------------------------------------------------
    svn:executable = *

Added: incubator/pig/trunk/tutorial/data/excite.log.bz2
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/data/excite.log.bz2?rev=669974&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/pig/trunk/tutorial/data/excite.log.bz2
------------------------------------------------------------------------------
    svn:executable = *

Propchange: incubator/pig/trunk/tutorial/data/excite.log.bz2
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/pig/trunk/tutorial/scripts/script1-hadoop.pig
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/scripts/script1-hadoop.pig?rev=669974&view=auto
==============================================================================
--- incubator/pig/trunk/tutorial/scripts/script1-hadoop.pig (added)
+++ incubator/pig/trunk/tutorial/scripts/script1-hadoop.pig Fri Jun 20 09:27:05 
2008
@@ -0,0 +1,56 @@
+-- Query Phrase Popularity (Hadoop cluster)
+
+-- This script processes a search query log file from the Excite search engine 
and finds search phrases that occur with particularly high frequency during 
certain times of the day. 
+
+
+-- Register the tutorial JAR file so that the included UDFs can be called in 
the script.
+REGISTER ./tutorial.jar;
+
+-- Use the  PigStorage function to load the excite log file into the “raw” bag 
as an array of records.
+-- Input: (user,time,query) 
+raw = LOAD 'excite.log.bz2' USING PigStorage('\t') AS (user, time, query);
+
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty 
or a URL. 
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase. 
+clean2 = FOREACH clean1 GENERATE user, time, 
org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only 
interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, 
org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, 
flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the  DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the  GROUP command to group records by n-gram and hour. 
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the  COUNT function to get the count (occurrences) of each n-gram. 
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as 
count;
+
+-- Use the  GROUP command to group records by n-gram only. 
+-- Each group now corresponds to a distinct n-gram and has the count for each 
hour.
+uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
+
+-- For each group, identify the hour in which this n-gram is used with a 
particularly high frequency.
+-- Call the ScoreGenerator UDF to calculate a "popularity" score for the 
n-gram.
+uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), 
flatten(org.apache.pig.tutorial.ScoreGenerator($1));
+
+-- Use the  FOREACH-GENERATE command to assign names to the fields. 
+uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 
as score, $3 as count, $4 as mean;
+
+-- Use the  FILTER command to remove all records with a score less than or equal 
to 2.0.
+filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
+
+-- Use the  ORDER command to sort the remaining records by hour and score. 
+ordered_uniq_frequency = ORDER filtered_uniq_frequency BY (hour, score);
+
+-- Use the  PigStorage function to store the results. 
+-- Output: (hour, n-gram, score, count, average_counts_among_all_hours)
+STORE ordered_uniq_frequency INTO 'script1-hadoop-results' USING PigStorage();

Propchange: incubator/pig/trunk/tutorial/scripts/script1-hadoop.pig
------------------------------------------------------------------------------
    svn:executable = *

Added: incubator/pig/trunk/tutorial/scripts/script1-local.pig
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/scripts/script1-local.pig?rev=669974&view=auto
==============================================================================
--- incubator/pig/trunk/tutorial/scripts/script1-local.pig (added)
+++ incubator/pig/trunk/tutorial/scripts/script1-local.pig Fri Jun 20 09:27:05 
2008
@@ -0,0 +1,54 @@
+-- Query Phrase Popularity (local mode)
+
+-- This script processes a search query log file from the Excite search engine 
and finds search phrases that occur with particular high frequency during 
certain times of the day.
+
+-- Register the tutorial JAR file so that the included UDFs can be called in 
the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the “raw” bag 
as an array of records.
+-- Input: (user,time,query) 
+raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query);
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty 
or a URL. 
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase. 
+clean2 = FOREACH clean1 GENERATE user, time, 
org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only 
interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, 
org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, 
flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour. 
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram. 
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as 
count;
+
+-- Use the GROUP command to group records by n-gram only. 
+-- Each group now corresponds to a distinct n-gram and has the count for each 
hour.
+uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
+
+-- For each group, identify the hour in which this n-gram is used with a 
particularly high frequency.
+-- Call the ScoreGenerator UDF to calculate a "popularity" score for the 
n-gram.
+uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), 
flatten(org.apache.pig.tutorial.ScoreGenerator($1));
+
+-- Use the FOREACH-GENERATE command to assign names to the fields. 
+uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 
as score, $3 as count, $4 as mean;
+
+-- Use the FILTER command to remove all records with a score less than or equal 
to 2.0.
+filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
+
+-- Use the ORDER command to sort the remaining records by hour and score. 
+ordered_uniq_frequency = ORDER filtered_uniq_frequency BY (hour, score);
+
+-- Use the PigStorage function to store the results. 
+-- Output: (hour, n-gram, score, count, average_counts_among_all_hours)
+STORE ordered_uniq_frequency INTO 'script1-local-results.txt' USING 
PigStorage();

Propchange: incubator/pig/trunk/tutorial/scripts/script1-local.pig
------------------------------------------------------------------------------
    svn:executable = *

Added: incubator/pig/trunk/tutorial/scripts/script2-hadoop.pig
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/scripts/script2-hadoop.pig?rev=669974&view=auto
==============================================================================
--- incubator/pig/trunk/tutorial/scripts/script2-hadoop.pig (added)
+++ incubator/pig/trunk/tutorial/scripts/script2-hadoop.pig Fri Jun 20 09:27:05 
2008
@@ -0,0 +1,52 @@
+-- Temporal Query Phrase Popularity (Hadoop cluster)
+
+-- This script processes a search query log file from the Excite search engine 
and compares the frequency of occurrence of search phrases across two time 
periods separated by twelve hours. 
+
+-- Register the tutorial JAR file so that the included UDFs can be called in 
the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the “raw” bag 
as an array of records.
+-- Input: (user,time,query) 
+raw = LOAD 'excite.log.bz2' USING PigStorage('\t') AS (user, time, query);
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty 
or a URL.
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase.
+clean2 = FOREACH clean1 GENERATE user, time, 
org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only 
interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, 
org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, 
flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour.
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram.
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as 
count;
+
+-- Use the FOREACH-GENERATE command to assign names to the fields. 
+hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 
as count;
+
+-- Use the FILTER command to get the n-grams for hour ‘00’ .
+hour00 = FILTER hour_frequency2 BY hour eq '00';
+
+-- Use the FILTER command to get the n-grams for hour ‘12’
+hour12 = FILTER hour_frequency3 BY hour eq '12';
+
+-- Use the JOIN command to get the n-grams that appear in both hours.
+same = JOIN hour00 BY $0, hour12 BY $0;
+
+-- Use the FOREACH-GENERATE command to record their frequency.
+same1 = FOREACH same GENERATE hour_frequency2::hour00::group::ngram as ngram, 
$2 as count00, $5 as count12;
+
+-- Use the PigStorage function to store the results. 
+-- Output: (n-gram, count_at_hour_00, count_at_hour_12)
+STORE same1 INTO 'script2-hadoop-results' USING PigStorage();

Propchange: incubator/pig/trunk/tutorial/scripts/script2-hadoop.pig
------------------------------------------------------------------------------
    svn:executable = *

Added: incubator/pig/trunk/tutorial/scripts/script2-local.pig
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/scripts/script2-local.pig?rev=669974&view=auto
==============================================================================
--- incubator/pig/trunk/tutorial/scripts/script2-local.pig (added)
+++ incubator/pig/trunk/tutorial/scripts/script2-local.pig Fri Jun 20 09:27:05 
2008
@@ -0,0 +1,52 @@
+-- Temporal Query Phrase Popularity (local mode)
+
+-- This script processes a search query log file from the Excite search engine 
and finds search phrases that occur with particularly high frequency during 
certain times of the day. 
+
+-- Register the tutorial JAR file so that the included UDFs can be called in 
the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the “raw” bag 
as an array of records.
+-- Input: (user,time,query) 
+raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query);
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty 
or a URL.
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase.
+clean2 = FOREACH clean1 GENERATE user, time, 
org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only 
interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, 
org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, 
flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour.
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram.
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as 
count;
+
+-- Use the FOREACH-GENERATE command to assign names to the fields. 
+hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 
as count;
+
+-- Use the FILTER command to get the n-grams for hour ‘00’ .
+hour00 = FILTER hour_frequency2 BY hour eq '00';
+
+-- Use the FILTER command to get the n-grams for hour ‘12’
+hour12 = FILTER hour_frequency3 BY hour eq '12';
+
+-- Use the JOIN command to get the n-grams that appear in both hours.
+same = JOIN hour00 BY $0, hour12 BY $0;
+
+-- Use the FOREACH-GENERATE command to record their frequency.
+same1 = FOREACH same GENERATE hour_frequency2::hour00::group::ngram as ngram, 
$2 as count00, $5 as count12;
+
+-- Use the PigStorage function to store the results. 
+-- Output: (n-gram, count_at_hour_00, count_at_hour_12)
+STORE same1 INTO 'script2-local-results.txt' USING PigStorage();

Propchange: incubator/pig/trunk/tutorial/scripts/script2-local.pig
------------------------------------------------------------------------------
    svn:executable = *

Added: incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ExtractHour.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ExtractHour.java?rev=669974&view=auto
==============================================================================
--- incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ExtractHour.java 
(added)
+++ incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ExtractHour.java 
Fri Jun 20 09:27:05 2008
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+/**
+ * The excite query log timestamp format is YYMMDDHHMMSS
+ * This function extracts the hour, HH
+ */
+public class ExtractHour extends EvalFunc<DataAtom> {
+  public void exec(Tuple arg0, DataAtom arg1) throws IOException {
+    String timestamp = arg0.getAtomField(0).strval();
+    arg1.setValue(timestamp.substring(6, 8));
+  }
+}

Added: 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java?rev=669974&view=auto
==============================================================================
--- 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java 
(added)
+++ 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java 
Fri Jun 20 09:27:05 2008
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.Tuple;
+
+/**
+ * This function divides a search query string into words and extracts
+ * n-grams with up to _ngramSizeLimit length.
+ * Example 1: if query = "a real nice query" and _ngramSizeLimit = 2,
+ * the query is split into: a, real, nice, query, a real, real nice, nice query
+ * Example 2: if record = (u1, h1, pig hadoop) and _ngramSizeLimit = 2,
+ * the record is split into: (u1, h1, pig), (u1, h1, hadoop), (u1, h1, pig 
hadoop)
+ */
+public class NGramGenerator extends EvalFunc<DataBag> {
+
+  private static final int _ngramSizeLimit = 2;
+  
+  public void exec(Tuple arg0, DataBag arg1) throws IOException {
+    String query = arg0.getAtomField(0).strval();
+    String[] words = TutorialUtil.splitToWords(query);
+    Set<String> ngrams = new HashSet<String>();
+    TutorialUtil.makeNGram(words, ngrams, _ngramSizeLimit);
+    for (String ngram : ngrams) {
+      Tuple t = new Tuple();
+      t.appendField(new DataAtom(ngram));
+      arg1.add(t);
+    }
+  }
+}

Added: 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java?rev=669974&view=auto
==============================================================================
--- 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java 
(added)
+++ 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java 
Fri Jun 20 09:27:05 2008
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pig.FilterFunc;
+import org.apache.pig.data.Tuple;
+
+/**
+ * This function removes search queries that are URLs (as defined by 
_urlPattern).
+ * This function also removes empty queries.
+ */
+public class NonURLDetector extends FilterFunc {
+
+  private Pattern _urlPattern = 
Pattern.compile("^[\"]?(http[:|;])|(https[:|;])|(www\\.)");
+  
+  public boolean exec(Tuple arg0) throws IOException {
+    String query = arg0.getAtomField(0).strval().trim();
+    if (query.equals("")) {
+      return false;
+    }
+    Matcher m = _urlPattern.matcher(query);
+    if (m.find()) {
+      return false;
+    }
+    return true;
+  }
+  
+}

Added: 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java?rev=669974&view=auto
==============================================================================
--- 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java 
(added)
+++ 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java 
Fri Jun 20 09:27:05 2008
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.Tuple;
+
+/**
+ * For each n-gram, we have a set of (hour, count) pairs.
+ * 
+ * This function reads the set and retains those hours with
+ * above-mean count, and calculates the score of each retained hour as the 
+ * multiplier of the count of the hour over the standard deviation.
+ * 
+ * A score greater than 1.0 indicates the frequency of this n-gram 
+ * in this particular hour is at least one standard deviation away 
+ * from the average frequency among all hours
+ */
+
+public class ScoreGenerator extends EvalFunc<DataBag> {
+
+  private static double computeMean(List<Long> counts) {
+    int numCounts = counts.size();
+    
+    // compute mean
+    double mean = 0.0;
+    for (Long count : counts) {
+      mean += ((double) count) / ((double) numCounts);
+    }
+    
+    return mean;
+  }
+  
+  private static double computeSD(List<Long> counts, double mean) {
+    int numCounts = counts.size();
+    
+    // compute deviation
+    double deviation = 0.0;
+    for (Long count : counts) {
+      double d = ((double) count) - mean;
+      deviation += d * d / ((double) numCounts);
+    }
+    
+    return Math.sqrt(deviation);
+  }
+
+  public void exec(Tuple arg0, DataBag arg1) throws IOException {
+    DataBag input = arg0.getBagField(0);
+    
+    Map<String, Long> pairs = new HashMap<String, Long>();
+    List<Long> counts = new ArrayList<Long> ();
+
+    Iterator<Tuple> it = input.iterator();
+    while (it.hasNext()) {
+      Tuple t = it.next();
+      String hour = t.getAtomField(1).strval();
+      Long count = t.getAtomField(2).longVal();
+      pairs.put(hour, count);
+      counts.add(count);
+    }
+    
+    double mean = computeMean(counts);
+    double standardDeviation = computeSD(counts, mean);
+
+    Iterator<String> it2 = pairs.keySet().iterator();
+    while (it2.hasNext()) {
+      String hour = it2.next();
+      Long count = pairs.get(hour);
+      if ( count > mean ) {
+        Tuple t = new Tuple();
+        t.appendField(new DataAtom(hour));
+        t.appendField(new DataAtom( ((double) count - mean) / 
standardDeviation )); // the score
+        t.appendField(new DataAtom(count));
+        t.appendField(new DataAtom(mean));
+        arg1.add(t);
+      }
+    }
+    
+  }
+}

Added: incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ToLower.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ToLower.java?rev=669974&view=auto
==============================================================================
--- incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ToLower.java 
(added)
+++ incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/ToLower.java Fri 
Jun 20 09:27:05 2008
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+/**
+ * This function converts the input into lowercase and
+ * removes leading and trailing white spaces.
+ */
+public class ToLower extends EvalFunc<DataAtom> {
+  public void exec(Tuple arg0, DataAtom arg1) throws IOException {
+    String query = arg0.getAtomField(0).strval().toLowerCase().trim();
+    arg1.setValue(query);
+  }
+}

Added: 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/TutorialTest.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/TutorialTest.java?rev=669974&view=auto
==============================================================================
--- incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/TutorialTest.java 
(added)
+++ incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/TutorialTest.java 
Fri Jun 20 09:27:05 2008
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FilterFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DefaultDataBag;
+import org.apache.pig.data.Tuple;
+
+public class TutorialTest {
+
+  private static Tuple[] getTuples(String[] queries) {
+    Tuple[] tuples = new Tuple[queries.length];
+    for (int i = 0; i < tuples.length; i++) {
+      tuples[i] = new Tuple();
+      tuples[i].appendField(new DataAtom(queries[i]));
+    }
+    return tuples;
+  }
+  
+  public static String[] testDataAtomEvals(EvalFunc<DataAtom> eval, Tuple[] 
tuples) {
+    
+    List<String> res = new ArrayList<String>();
+    try {
+      for (Tuple t : tuples) {
+        DataAtom atom = new DataAtom();
+        eval.exec(t, atom);
+        System.out.println("Converted: " + t + " to (" + atom + ")");
+        res.add(atom.strval());
+      }
+    } catch (IOException e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+
+    System.out.println("===");
+    return res.toArray(new String[res.size()]);
+  }
+  
+  public static DataBag[] testDataBagEvals(EvalFunc<DataBag> eval, Tuple[] 
tuples) {
+    
+    List<DataBag> res = new ArrayList<DataBag>();
+    try {
+      for (Tuple t : tuples) {
+        DataBag bag = new DefaultDataBag();
+        eval.exec(t, bag);
+        System.out.println("Converted: " + t + " to (" + bag + ")");
+        res.add(bag);
+      }
+    } catch (IOException e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+
+    System.out.println("===");
+    return res.toArray(new DataBag[res.size()]);
+  }
+
+  public static String[] testFilters (FilterFunc filter, Tuple[] tuples) {
+    List<String> res = new ArrayList<String>();
+    try {
+      for (Tuple t : tuples) {
+        if (filter.exec(t)) {
+          System.out.println("accepted: " + t);
+          res.add(t.getAtomField(0).strval());
+        } else {
+          System.out.println("rejected: " + t);
+        }
+      }
+    } catch (IOException e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+
+    System.out.println("===");
+    return res.toArray(new String[res.size()]);
+  }
+
+  public static void main(String[] args) {
+    String[] queries = {
+        "http://www.yahoo.com/",
+        "\"http://www.yahoo.com/\"",
+        "   http;//www.yahoo.com/ ",
+        "https://www.yahoo.com/",
+        "www.yahoo.com/",
+        "\"www.yahoo.com/\"",
+        "a real nice query ",
+        "an UPPER CASE query",
+        "  ",
+        " nude picture",
+        " +XXX",
+        "\" +porno \"",
+    };
+    
+    NonURLDetector filter1 = new NonURLDetector();
+    String[] q1 = testFilters(filter1, getTuples(queries));
+
+    ToLower eval1 = new ToLower();
+    String[] q2 = testDataAtomEvals(eval1, getTuples(q1));
+    
+    String[] timestamps = {
+        "970916072134",
+        "970916072311",
+        "970916123431",
+    };
+    
+    ExtractHour eval2 = new ExtractHour();
+    testDataAtomEvals(eval2, getTuples(timestamps));
+
+    DataBag bag = new DefaultDataBag();
+    
+    Tuple t1 = new Tuple();
+    t1.appendField(new DataAtom("word"));
+    t1.appendField(new DataAtom("02"));
+    t1.appendField(new DataAtom(2));
+    bag.add(t1);
+    
+    Tuple t2 = new Tuple();
+    t2.appendField(new DataAtom("word"));
+    t2.appendField(new DataAtom("05"));
+    t2.appendField(new DataAtom(2));
+    bag.add(t2);
+
+    Tuple t3 = new Tuple();
+    t3.appendField(new DataAtom("word"));
+    t3.appendField(new DataAtom("04"));
+    t3.appendField(new DataAtom(3));
+    bag.add(t3);
+
+    Tuple t4 = new Tuple();
+    t4.appendField(new DataAtom("word"));
+    t4.appendField(new DataAtom("06"));
+    t4.appendField(new DataAtom(4));
+    bag.add(t4);
+
+    Tuple[] t = new Tuple[1];
+    t[0] = new Tuple();
+    t[0].appendField(bag);
+
+    ScoreGenerator eval4 = new ScoreGenerator();
+    testDataBagEvals(eval4, t);
+  }
+}

Added: 
incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java
URL: 
http://svn.apache.org/viewvc/incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java?rev=669974&view=auto
==============================================================================
--- incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java 
(added)
+++ incubator/pig/trunk/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java 
Fri Jun 20 09:27:05 2008
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+public class TutorialUtil {
+  
+  /**
+   * This function splits a search query string into a set 
+   * of non-empty words 
+   */
+  protected static String[] splitToWords(String query) {
+    List<String> res = new LinkedList<String>();
+    String[] words = query.split("\\W");
+    for (String word : words) {
+      if (!word.equals("")) {
+        res.add(word);
+      }
+    }
+    return res.toArray(new String[res.size()]);
+  }
+
+  /**
+   *   This is a simple utility function that make word-level
+   * ngrams from a set of words
+   * @param words
+   * @param ngrams
+   * @param size
+   */
+  protected static void makeNGram(String[] words, Set<String> ngrams, int 
size) {
+    int stop = words.length - size + 1;
+    for (int i = 0; i < stop; i++) {
+      StringBuilder sb = new StringBuilder();
+      for (int j = 0; j < size; j++) {
+        sb.append(words[i + j]).append(" ");
+      }
+      sb.deleteCharAt(sb.length() - 1);
+      ngrams.add(sb.toString());
+    }
+    if (size > 1) {
+      makeNGram(words, ngrams, size - 1);
+    }
+  }
+  
+}


Reply via email to