Propchange: hadoop/pig/branches/types/tutorial/data/excite-small.log ------------------------------------------------------------------------------ svn:executable = *
Added: hadoop/pig/branches/types/tutorial/data/excite.log.bz2 URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/data/excite.log.bz2?rev=713854&view=auto ============================================================================== Binary file - no diff available. Propchange: hadoop/pig/branches/types/tutorial/data/excite.log.bz2 ------------------------------------------------------------------------------ svn:executable = * Propchange: hadoop/pig/branches/types/tutorial/data/excite.log.bz2 ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig (added) +++ hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig Thu Nov 13 15:08:21 2008 @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +-- Query Phrase Popularity (Hadoop cluster) + +-- This script processes a search query log file from the Excite search engine and finds search phrases that occur with particular high frequency during certain times of the day. + + +-- Register the tutorial JAR file so that the included UDFs can be called in the script. +REGISTER ./tutorial.jar; + +-- Use the PigStorage function to load the excite log file into the raw bag as an array of records. +-- Input: (user,time,query) +raw = LOAD 'excite.log.bz2' USING PigStorage('\t') AS (user, time, query); + + +-- Call the NonURLDetector UDF to remove records if the query field is empty or a URL. +clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query); + +-- Call the ToLower UDF to change the query field to lowercase. +clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query; + +-- Because the log file only contains queries for a single day, we are only interested in the hour. +-- The excite query log timestamp format is YYMMDDHHMMSS. +-- Call the ExtractHour UDF to extract the hour (HH) from the time field. +houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query; + +-- Call the NGramGenerator UDF to compose the n-grams of the query. +ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram; + +-- Use the DISTINCT command to get the unique n-grams for all records. +ngramed2 = DISTINCT ngramed1; + +-- Use the GROUP command to group records by n-gram and hour. +hour_frequency1 = GROUP ngramed2 BY (ngram, hour); + +-- Use the COUNT function to get the count (occurrences) of each n-gram. +hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count; + +-- Use the GROUP command to group records by n-gram only. +-- Each group now corresponds to a distinct n-gram and has the count for each hour. 
+uniq_frequency1 = GROUP hour_frequency2 BY group::ngram; + +-- For each group, identify the hour in which this n-gram is used with a particularly high frequency. +-- Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram. +uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1)); + +-- Use the FOREACH-GENERATE command to assign names to the fields. +uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean; + +-- Use the FILTER command to remove all records with a score less than or equal to 2.0. +filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0; + +-- Use the ORDER command to sort the remaining records by hour and score. +ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score; + +-- Use the PigStorage function to store the results. +-- Output: (hour, n-gram, score, count, average_counts_among_all_hours) +STORE ordered_uniq_frequency INTO 'script1-hadoop-results' USING PigStorage(); Propchange: hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig ------------------------------------------------------------------------------ svn:executable = * Added: hadoop/pig/branches/types/tutorial/scripts/script1-local.pig URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script1-local.pig?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/scripts/script1-local.pig (added) +++ hadoop/pig/branches/types/tutorial/scripts/script1-local.pig Thu Nov 13 15:08:21 2008 @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +-- Query Phrase Popularity (local mode) + +-- This script processes a search query log file from the Excite search engine and finds search phrases that occur with particular high frequency during certain times of the day. + +-- Register the tutorial JAR file so that the included UDFs can be called in the script. +REGISTER ./tutorial.jar; + +-- Use the PigStorage function to load the excite log file into the raw bag as an array of records. +-- Input: (user,time,query) +raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query); + +-- Call the NonURLDetector UDF to remove records if the query field is empty or a URL. +clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query); + +-- Call the ToLower UDF to change the query field to lowercase. +clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query; + +-- Because the log file only contains queries for a single day, we are only interested in the hour. +-- The excite query log timestamp format is YYMMDDHHMMSS. +-- Call the ExtractHour UDF to extract the hour (HH) from the time field. +houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query; + +-- Call the NGramGenerator UDF to compose the n-grams of the query. 
+ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram; + +-- Use the DISTINCT command to get the unique n-grams for all records. +ngramed2 = DISTINCT ngramed1; + +-- Use the GROUP command to group records by n-gram and hour. +hour_frequency1 = GROUP ngramed2 BY (ngram, hour); + +-- Use the COUNT function to get the count (occurrences) of each n-gram. +hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count; + +-- Use the GROUP command to group records by n-gram only. +-- Each group now corresponds to a distinct n-gram and has the count for each hour. +uniq_frequency1 = GROUP hour_frequency2 BY group::ngram; + +-- For each group, identify the hour in which this n-gram is used with a particularly high frequency. +-- Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram. +uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1)); + +-- Use the FOREACH-GENERATE command to assign names to the fields. +uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean; + +-- Use the FILTER command to remove all records with a score less than or equal to 2.0. +filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0; + +-- Use the ORDER command to sort the remaining records by hour and score. +ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score; + +-- Use the PigStorage function to store the results. 
+-- Output: (hour, n-gram, score, count, average_counts_among_all_hours) +STORE ordered_uniq_frequency INTO 'script1-local-results.txt' USING PigStorage(); Propchange: hadoop/pig/branches/types/tutorial/scripts/script1-local.pig ------------------------------------------------------------------------------ svn:executable = * Added: hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig (added) +++ hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig Thu Nov 13 15:08:21 2008 @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +-- Temporal Query Phrase Popularity (Hadoop cluster) + +-- This script processes a search query log file from the Excite search engine and compares the frequency of occurrence of search phrases across two time periods separated by twelve hours. + +-- Register the tutorial JAR file so that the included UDFs can be called in the script. 
+REGISTER ./tutorial.jar; + +-- Use the PigStorage function to load the excite log file into the raw bag as an array of records. +-- Input: (user,time,query) +raw = LOAD 'excite.log.bz2' USING PigStorage('\t') AS (user, time, query); + +-- Call the NonURLDetector UDF to remove records if the query field is empty or a URL. +clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query); + +-- Call the ToLower UDF to change the query field to lowercase. +clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query; + +-- Because the log file only contains queries for a single day, we are only interested in the hour. +-- The excite query log timestamp format is YYMMDDHHMMSS. +-- Call the ExtractHour UDF to extract the hour (HH) from the time field. +houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query; + +-- Call the NGramGenerator UDF to compose the n-grams of the query. +ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram; + +-- Use the DISTINCT command to get the unique n-grams for all records. +ngramed2 = DISTINCT ngramed1; + +-- Use the GROUP command to group records by n-gram and hour. +hour_frequency1 = GROUP ngramed2 BY (ngram, hour); + +-- Use the COUNT function to get the count (occurrences) of each n-gram. +hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count; + +-- Use the FOREACH-GENERATE command to assign names to the fields. +hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 as count; + +-- Use the FILTER command to get the n-grams for hour 00 . +hour00 = FILTER hour_frequency2 BY hour eq '00'; + +-- Use the FILTER command to get the n-grams for hour 12 +hour12 = FILTER hour_frequency3 BY hour eq '12'; + +-- Use the JOIN command to get the n-grams that appear in both hours. 
+same = JOIN hour00 BY $0, hour12 BY $0; + +-- Use the FOREACH-GENERATE command to record their frequency. +same1 = FOREACH same GENERATE hour00::group::ngram as ngram, $2 as count00, $5 as count12; + +-- Use the PigStorage function to store the results. +-- Output: (n-gram, count_at_hour_00, count_at_hour_12) +STORE same1 INTO 'script2-hadoop-results' USING PigStorage(); Propchange: hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig ------------------------------------------------------------------------------ svn:executable = * Added: hadoop/pig/branches/types/tutorial/scripts/script2-local.pig URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script2-local.pig?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/scripts/script2-local.pig (added) +++ hadoop/pig/branches/types/tutorial/scripts/script2-local.pig Thu Nov 13 15:08:21 2008 @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +-- Temporal Query Phrase Popularity (local mode) + +-- This script processes a search query log file from the Excite search engine and compares the frequency of occurrence of search phrases across two time periods separated by twelve hours. + +-- Register the tutorial JAR file so that the included UDFs can be called in the script. +REGISTER ./tutorial.jar; + +-- Use the PigStorage function to load the excite log file into the raw bag as an array of records. +-- Input: (user,time,query) +raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query); + +-- Call the NonURLDetector UDF to remove records if the query field is empty or a URL. +clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query); + +-- Call the ToLower UDF to change the query field to lowercase. +clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query; + +-- Because the log file only contains queries for a single day, we are only interested in the hour. +-- The excite query log timestamp format is YYMMDDHHMMSS. +-- Call the ExtractHour UDF to extract the hour (HH) from the time field. +houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query; + +-- Call the NGramGenerator UDF to compose the n-grams of the query. +ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram; + +-- Use the DISTINCT command to get the unique n-grams for all records. +ngramed2 = DISTINCT ngramed1; + +-- Use the GROUP command to group records by n-gram and hour. +hour_frequency1 = GROUP ngramed2 BY (ngram, hour); + +-- Use the COUNT function to get the count (occurrences) of each n-gram. +hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count; + +-- Use the FOREACH-GENERATE command to assign names to the fields. 
+hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 as count; + +-- Use the FILTER command to get the n-grams for hour 00 . +hour00 = FILTER hour_frequency2 BY hour eq '00'; + +-- Use the FILTER command to get the n-grams for hour 12 +hour12 = FILTER hour_frequency3 BY hour eq '12'; + +-- Use the JOIN command to get the n-grams that appear in both hours. +same = JOIN hour00 BY $0, hour12 BY $0; + +-- Use the FOREACH-GENERATE command to record their frequency. +same1 = FOREACH same GENERATE hour00::group::ngram as ngram, $2 as count00, $5 as count12; + +-- Use the PigStorage function to store the results. +-- Output: (n-gram, count_at_hour_00, count_at_hour_12) +STORE same1 INTO 'script2-local-results.txt' USING PigStorage(); Propchange: hadoop/pig/branches/types/tutorial/scripts/script2-local.pig ------------------------------------------------------------------------------ svn:executable = * Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java (added) +++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java Thu Nov 13 15:08:21 2008 @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.pig.tutorial; + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.pig.EvalFunc; +import org.apache.pig.FuncSpec; +import org.apache.pig.data.Tuple; +import org.apache.pig.data.DataType; +import org.apache.pig.impl.logicalLayer.schema.Schema; +import org.apache.pig.impl.logicalLayer.FrontendException; + +/** + * The excite query log timestamp format is YYMMDDHHMMSS + * This function extracts the hour, HH + */ +public class ExtractHour extends EvalFunc<String> { + public String exec(Tuple input) throws IOException { + if (input == null || input.size() == 0) + return null; + try{ + String timestamp = (String)input.get(0); + return timestamp.substring(6, 8); + }catch(Exception e){ + System.err.println("ExtractHour: failed to proces input; error - " + e.getMessage()); + return null; + } + } + + @Override + /** + * This method gives a name to the column. 
+ * @param input - schema of the input data + * @return schema of the output data + */ + public Schema outputSchema(Schema input) { + return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY)); + } + + /* (non-Javadoc) + * @see org.apache.pig.EvalFunc#getArgToFuncMapping() + * This is needed to make sure that both bytearrays and chararrays can be passed as arguments + */ + @Override + public List<FuncSpec> getArgToFuncMapping() throws FrontendException { + List<FuncSpec> funcList = new ArrayList<FuncSpec>(); + funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY)))); + + return funcList; + } +} Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java (added) +++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java Thu Nov 13 15:08:21 2008 @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.pig.tutorial; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; +import java.util.List; +import java.util.ArrayList; + +import org.apache.pig.EvalFunc; +import org.apache.pig.FuncSpec; +import org.apache.pig.data.DataBag; +import org.apache.pig.data.DefaultBagFactory; +import org.apache.pig.data.DefaultTupleFactory; +import org.apache.pig.data.Tuple; +import org.apache.pig.impl.logicalLayer.schema.Schema; +import org.apache.pig.data.DataType; +import org.apache.pig.impl.logicalLayer.FrontendException; + +/** + * This function divides a search query string into words and extracts + * n-grams with up to _ngramSizeLimit length. + * Example 1: if query = "a real nice query" and _ngramSizeLimit = 2, + * the query is split into: a, real, nice, query, a real, real nice, nice query + * Example 2: if record = (u1, h1, pig hadoop) and _ngramSizeLimit = 2, + * the record is split into: (u1, h1, pig), (u1, h1, hadoop), (u1, h1, pig hadoop) + */ +public class NGramGenerator extends EvalFunc<DataBag> { + + private static final int _ngramSizeLimit = 2; + + public DataBag exec(Tuple input) throws IOException { + if (input == null || input.size() == 0) + return null; + try{ + DataBag output = DefaultBagFactory.getInstance().newDefaultBag(); + String query = (String)input.get(0); + String[] words = TutorialUtil.splitToWords(query); + Set<String> ngrams = new HashSet<String>(); + TutorialUtil.makeNGram(words, ngrams, _ngramSizeLimit); + for (String ngram : ngrams) { + Tuple t = DefaultTupleFactory.getInstance().newTuple(1); + t.set(0, ngram); + output.add(t); + } + return output; + }catch(Exception e){ + System.err.println("NGramGenerator: failed to process input; error - " + e.getMessage()); + return null; + } + } + + @Override + /** + * This method gives a name to the column. 
+ * @param input - schema of the input data + * @return schema of the output data + */ + public Schema outputSchema(Schema input) { + Schema bagSchema = new Schema(); + bagSchema.add(new Schema.FieldSchema("ngram", DataType.CHARARRAY)); + try{ + return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), + bagSchema, DataType.BAG)); + }catch (FrontendException e){ + return null; + } + } + + /* (non-Javadoc) + * @see org.apache.pig.EvalFunc#getArgToFuncMapping() + * This is needed to make sure that both bytearrays and chararrays can be passed as arguments + */ + @Override + public List<FuncSpec> getArgToFuncMapping() throws FrontendException { + List<FuncSpec> funcList = new ArrayList<FuncSpec>(); + funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY)))); + + return funcList; + } + +} Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java (added) +++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java Thu Nov 13 15:08:21 2008 @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.pig.tutorial; + +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.List; +import java.util.ArrayList; + +import org.apache.pig.FilterFunc; +import org.apache.pig.FuncSpec; +import org.apache.pig.data.Tuple; +import org.apache.pig.impl.logicalLayer.schema.Schema; +import org.apache.pig.data.DataType; +import org.apache.pig.impl.logicalLayer.FrontendException; + +/** + * This function removes search queries that are URLs (as defined by _urlPattern). + * This function also removes empty queries. 
+ */ +public class NonURLDetector extends FilterFunc { + + private Pattern _urlPattern = Pattern.compile("^[\"]?(http[:|;])|(https[:|;])|(www\\.)"); + + public Boolean exec(Tuple arg0) throws IOException { + if (arg0 == null || arg0.size() == 0) + return false; + + String query; + try{ + query = (String)arg0.get(0); + if(query == null) + return false; + query = query.trim(); + }catch(Exception e){ + System.err.println("NonURLDetector: failed to process input; error - " + e.getMessage()); + return false; + } + + if (query.equals("")) { + return false; + } + Matcher m = _urlPattern.matcher(query); + if (m.find()) { + return false; + } + return true; + } + + /* (non-Javadoc) + * @see org.apache.pig.EvalFunc#getArgToFuncMapping() + * This is needed to make sure that both bytearrays and chararrays can be passed as arguments + */ + @Override + public List<FuncSpec> getArgToFuncMapping() throws FrontendException { + List<FuncSpec> funcList = new ArrayList<FuncSpec>(); + funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY)))); + + return funcList; + } + +} Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java (added) +++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java Thu Nov 13 15:08:21 2008 @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.pig.tutorial; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.pig.EvalFunc; +import org.apache.pig.data.DataBag; +import org.apache.pig.data.DefaultBagFactory; +import org.apache.pig.data.DefaultTupleFactory; +import org.apache.pig.data.Tuple; +import org.apache.pig.data.DataType; +import org.apache.pig.impl.logicalLayer.schema.Schema; +import org.apache.pig.impl.logicalLayer.FrontendException; + +/** + * For each n-gram, we have a set of (hour, count) pairs. + * + * This function reads the set and retains those hours with an + * above-mean count, and calculates the score of each retained hour as the + * number of standard deviations by which its count exceeds the mean. 
+ * + * A score greater than 1.0 indicates the frequency of this n-gram + * in this particular hour is at least one standard deviation away + * from the average frequency among all hours + */ + +public class ScoreGenerator extends EvalFunc<DataBag> { + + private static double computeMean(List<Long> counts) { + int numCounts = counts.size(); + + // compute mean + double mean = 0.0; + for (Long count : counts) { + mean += ((double) count) / ((double) numCounts); + } + + return mean; + } + + private static double computeSD(List<Long> counts, double mean) { + int numCounts = counts.size(); + + // compute deviation + double deviation = 0.0; + for (Long count : counts) { + double d = ((double) count) - mean; + deviation += d * d / ((double) numCounts); + } + + return Math.sqrt(deviation); + } + + public DataBag exec(Tuple input) throws IOException { + if (input == null || input.size() == 0) + return null; + try{ + DataBag output = DefaultBagFactory.getInstance().newDefaultBag(); + DataBag in = (DataBag)input.get(0); + + Map<String, Long> pairs = new HashMap<String, Long>(); + List<Long> counts = new ArrayList<Long> (); + + Iterator<Tuple> it = in.iterator(); + while (it.hasNext()) { + Tuple t = it.next(); + String hour = (String)t.get(1); + Long count = (Long)t.get(2); + pairs.put(hour, count); + counts.add(count); + } + + double mean = computeMean(counts); + double standardDeviation = computeSD(counts, mean); + + Iterator<String> it2 = pairs.keySet().iterator(); + while (it2.hasNext()) { + String hour = it2.next(); + Long count = pairs.get(hour); + if ( count > mean ) { + Tuple t = DefaultTupleFactory.getInstance().newTuple(4); + t.set(0, hour); + t.set(1, ((double) count - mean) / standardDeviation ); // the score + t.set(2, count); + t.set(3, mean); + output.add(t); + } + } + return output; + }catch (Exception e){ + System.err.println("ScoreGenerator: failed to process input; error - " + e.getMessage()); + return null; + } + } + + @Override + /** + * This method gives 
a name to the column. + * @param input - schema of the input data + * @return schema of the output data + */ + public Schema outputSchema(Schema input) { + Schema bagSchema = new Schema(); + bagSchema.add(new Schema.FieldSchema("hour", DataType.CHARARRAY)); + bagSchema.add(new Schema.FieldSchema("score", DataType.DOUBLE)); + bagSchema.add(new Schema.FieldSchema("count", DataType.LONG)); + bagSchema.add(new Schema.FieldSchema("mean", DataType.DOUBLE)); + //TODO + //Here the schema of the bag is the schema of the tuple inside the bag + //We need to change this so that the bag has the tuple and the tuple has the schema + try{ + return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), bagSchema, DataType.BAG)); + }catch (FrontendException e){ + return null; + } + } + +} Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java (added) +++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java Thu Nov 13 15:08:21 2008 @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.pig.tutorial; + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.pig.EvalFunc; +import org.apache.pig.FuncSpec; +import org.apache.pig.data.Tuple; +import org.apache.pig.data.DataType; +import org.apache.pig.impl.logicalLayer.schema.Schema; +import org.apache.pig.impl.logicalLayer.FrontendException; + +/** + * This function converts the input into lowercase and + * removes leading and trailing white spaces. + */ +public class ToLower extends EvalFunc<String> { + public String exec(Tuple input) throws IOException { + if(input == null || input.size() == 0) + return null; + try{ + String query = (String)input.get(0); + return query.toLowerCase().trim(); + }catch(Exception e){ + System.err.println("ToLower: failed to process input; error - " + e.getMessage()); + return null; + } + } + + @Override + /** + * This method gives a name to the column. 
+ * @param input - schema of the input data + * @return schema of the output data + */ + public Schema outputSchema(Schema input) { + return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY)); + } + + /* (non-Javadoc) + * @see org.apache.pig.EvalFunc#getArgToFuncMapping() + * This is needed to make sure that both bytearrays and chararrays can be passed as arguments + */ + @Override + public List<FuncSpec> getArgToFuncMapping() throws FrontendException { + List<FuncSpec> funcList = new ArrayList<FuncSpec>(); + funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY)))); + + return funcList; + } + +} Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java (added) +++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java Thu Nov 13 15:08:21 2008 @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.pig.tutorial; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.pig.EvalFunc; +import org.apache.pig.FilterFunc; +import org.apache.pig.data.DataBag; +import org.apache.pig.data.DefaultBagFactory; +import org.apache.pig.data.Tuple; +import org.apache.pig.data.DefaultTupleFactory; + +public class TutorialTest { + + private static Tuple[] getTuples(String[] queries) { + Tuple[] tuples = new Tuple[queries.length]; + for (int i = 0; i < tuples.length; i++) { + tuples[i] = DefaultTupleFactory.getInstance().newTuple(1); + try{tuples[i].set(0, queries[i]);}catch(Exception e){} + } + return tuples; + } + + public static String[] testDataAtomEvals(EvalFunc<String> eval, Tuple[] tuples) { + List<String> res = new ArrayList<String>(); + try { + for (Tuple t : tuples) { + String output = eval.exec(t); + System.out.println("Converted: " + t + " to (" + output + ")"); + res.add(output); + } + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + + System.out.println("==="); + return res.toArray(new String[res.size()]); + } + + public static DataBag[] testDataBagEvals(EvalFunc<DataBag> eval, Tuple[] tuples) { + List<DataBag> res = new ArrayList<DataBag>(); + try { + for (Tuple t : tuples) { + DataBag output = eval.exec(t); + System.out.println("Converted: " + t + " to (" + output + ")"); + res.add(output); + } + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + + System.out.println("==="); + return res.toArray(new DataBag[res.size()]); + } + + public static String[] testFilters (FilterFunc filter, Tuple[] tuples) { + List<String> res = new ArrayList<String>(); + try { + for (Tuple t : tuples) { + if (filter.exec(t)) { + System.out.println("accepted: " + t); + res.add((String)t.get(0)); + } else { + System.out.println("rejected: " + t); + } + } + } catch 
(Exception e) { + e.printStackTrace(); + System.exit(1); + } + + System.out.println("==="); + return res.toArray(new String[res.size()]); + } + + public static void main(String[] args) { + String[] queries = { + "http://www.yahoo.com/", + "\"http://www.yahoo.com/\"", + " http;//www.yahoo.com/ ", + "https://www.yahoo.com/", + "www.yahoo.com/", + "\"www.yahoo.com/\"", + "a real nice query ", + "an UPPER CASE query", + " ", + " nude picture", + " +XXX", + "\" +porno \"", + }; + + NonURLDetector filter1 = new NonURLDetector(); + String[] q1 = testFilters(filter1, getTuples(queries)); + + ToLower eval1 = new ToLower(); + String[] q2 = testDataAtomEvals(eval1, getTuples(q1)); + + String[] timestamps = { + "970916072134", + "970916072311", + "970916123431", + }; + + ExtractHour eval2 = new ExtractHour(); + testDataAtomEvals(eval2, getTuples(timestamps)); + + DataBag bag = DefaultBagFactory.getInstance().newDefaultBag(); + + Tuple t1 = DefaultTupleFactory.getInstance().newTuple(3); + try{ + t1.set(0, "word"); + t1.set(1, "02"); + t1.set(2, 2); + }catch(Exception e){} + bag.add(t1); + + Tuple t2 = DefaultTupleFactory.getInstance().newTuple(3); + try{ + t2.set(0, "word"); + t2.set(1, "05"); + t2.set(2, 2); + }catch(Exception e){} + bag.add(t2); + + Tuple t3 = DefaultTupleFactory.getInstance().newTuple(3); + try{ + t3.set(0, "word"); + t3.set(1, "04"); + t3.set(2, 3); + }catch(Exception e){} + bag.add(t3); + + Tuple t4 = DefaultTupleFactory.getInstance().newTuple(3); + try{ + t4.set(0, "word"); + t4.set(1, "06"); + t4.set(2, 4); + }catch(Exception e){} + bag.add(t4); + + Tuple[] t = new Tuple[1]; + t[0] = DefaultTupleFactory.getInstance().newTuple(1); + try{ + t[0].set(0, bag); + }catch(Exception e){} + + ScoreGenerator eval4 = new ScoreGenerator(); + testDataBagEvals(eval4, t); + } +} Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java?rev=713854&view=auto ============================================================================== --- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java (added) +++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java Thu Nov 13 15:08:21 2008 @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * String helpers shared by the tutorial functions: splitting a query into
 * words and building word-level n-grams.
 */
public class TutorialUtil {

    /**
     * This function splits a search query string into a set
     * of non-empty words.
     *
     * <p>Every character matching the regular expression {@code \W} acts as a
     * separator; the empty strings produced by adjacent or leading separators
     * are dropped.
     *
     * @param query the query to split; {@code null} is treated as empty
     * @return the words in order of appearance, never {@code null}
     */
    protected static String[] splitToWords(String query) {
        if (query == null) {
            return new String[0];
        }
        List<String> res = new LinkedList<String>();
        for (String word : query.split("\\W")) {
            if (word.length() > 0) {
                res.add(word);
            }
        }
        return res.toArray(new String[res.size()]);
    }

    /**
     * This is a simple utility function that makes word-level
     * n-grams from a set of words.
     *
     * <p>All n-grams of length {@code size}, then {@code size - 1}, and so on
     * down to 1 are added; lengths greater than {@code words.length} contribute
     * nothing. Words inside an n-gram are joined by a single space.
     *
     * @param words  the words, in order
     * @param ngrams the set that receives the generated n-grams
     * @param size   the maximum n-gram length; values below 1 add nothing
     */
    protected static void makeNGram(String[] words, Set<String> ngrams, int size) {
        // Longest n-grams first, so insertion order matches the former recursive form.
        for (int n = size; n >= 1; n--) {
            int stop = words.length - n + 1;
            for (int i = 0; i < stop; i++) {
                StringBuilder sb = new StringBuilder();
                sb.append(words[i]);
                for (int j = 1; j < n; j++) {
                    sb.append(' ').append(words[i + j]);
                }
                ngrams.add(sb.toString());
            }
        }
    }

}