Modified: pig/branches/spark/src/pig-default.properties URL: http://svn.apache.org/viewvc/pig/branches/spark/src/pig-default.properties?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/src/pig-default.properties (original) +++ pig/branches/spark/src/pig-default.properties Fri Feb 24 08:19:42 2017 @@ -61,4 +61,8 @@ pig.stats.output.size.reader.unsupported pig.tez.opt.union.unsupported.storefuncs=org.apache.hcatalog.pig.HCatStorer,org.apache.hive.hcatalog.pig.HCatStorer,org.apache.pig.piggybank.storage.DBStorage,org.apache.pig.piggybank.storage.MultiStorage -pig.sort.readonce.loadfuncs=org.apache.pig.backend.hadoop.hbase.HBaseStorage,org.apache.pig.backend.hadoop.accumulo.AccumuloStorage \ No newline at end of file +pig.sort.readonce.loadfuncs=org.apache.pig.backend.hadoop.hbase.HBaseStorage,org.apache.pig.backend.hadoop.accumulo.AccumuloStorage + +pig.ats.enabled=true + +pig.tez.configure.am.memory=true
Added: pig/branches/spark/start-build-env.sh URL: http://svn.apache.org/viewvc/pig/branches/spark/start-build-env.sh?rev=1784237&view=auto ============================================================================== --- pig/branches/spark/start-build-env.sh (added) +++ pig/branches/spark/start-build-env.sh Fri Feb 24 08:19:42 2017 @@ -0,0 +1,63 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e # exit on error + +cd "$(dirname "$0")" # connect to root + +docker build -t pig-build dev-support/docker + +if [ "$(uname -s)" == "Linux" ]; then + USER_NAME=${SUDO_USER:=${USER}} + USER_ID=$(id -u "${USER_NAME}") + GROUP_ID=$(id -g "${USER_NAME}") +else # boot2docker uid and gid + USER_NAME=${USER} + USER_ID=1000 + GROUP_ID=50 +fi + +docker build -t "pig-build-${USER_NAME}" - <<UserSpecificDocker +FROM pig-build +RUN bash configure-for-user.sh ${USER_NAME} ${USER_ID} ${GROUP_ID} "$(fgrep vboxsf /etc/group)" +UserSpecificDocker + +# By mapping the .m2 directory you can do an mvn install from +# within the container and use the result on your normal +# system. This also is a significant speedup in subsequent +# builds because the dependencies are downloaded only once. 
+# Same with the .ivy2 directory + +DOCKER="docker run --rm=true -t -i" +DOCKER=${DOCKER}" -u ${USER_NAME}" + +# Work in the current directory +DOCKER=${DOCKER}" -v ${PWD}:/home/${USER_NAME}/pig" +DOCKER=${DOCKER}" -w /home/${USER_NAME}/pig" + +# Mount persistent caching of 'large' downloads +DOCKER=${DOCKER}" -v ${HOME}/.m2:/home/${USER_NAME}/.m2" +DOCKER=${DOCKER}" -v ${HOME}/.ivy2:/home/${USER_NAME}/.ivy2" + +# What do we run? +DOCKER=${DOCKER}" --name pig-build-${USER_NAME}-$$" +DOCKER=${DOCKER}" pig-build-${USER_NAME}" +DOCKER=${DOCKER}" bash" + +# Now actually start it +${DOCKER} + Modified: pig/branches/spark/test/e2e/pig/build.xml URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/build.xml?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/build.xml (original) +++ pig/branches/spark/test/e2e/pig/build.xml Fri Feb 24 08:19:42 2017 @@ -27,9 +27,8 @@ <property name="hive.lib.dir" value="${pig.base.dir}/build/ivy/lib/Pig"/> - <condition property="hive.hadoop.shims.version" value="0.23" else="0.20S"> - <equals arg1="${hadoopversion}" arg2="23" /> - </condition> + <property name="hadoopversion" value="2" /> + <property name="hive.hadoop.shims.version" value="0.23" /> <property name="mvnrepo" value="http://repo2.maven.org/maven2"/> @@ -61,6 +60,7 @@ <property name="harness.PH_LOCAL" value="."/> <property name="harness.PH_OUT" value="."/> <property name="harness.PERL5LIB" value="./libexec"/> + <property name="harness.user.home" value="/user/pig" /> <property name="test.location" value="${basedir}/testdist"/> <property name="benchmark.location" value="${test.location}/benchmarks"/> @@ -137,6 +137,7 @@ <path path="${test.location}/tests/multiquery.conf"/> <path path="${test.location}/tests/negative.conf"/> <path path="${test.location}/tests/nightly.conf"/> + <path path="${test.location}/tests/join.conf"/> <path 
path="${test.location}/tests/streaming.conf"/> <path path="${test.location}/tests/streaming_local.conf"/> <path path="${test.location}/tests/turing_jython.conf"/> @@ -309,6 +310,7 @@ <env key="PH_HIVE_LIB_DIR" value="${hive.lib.dir}"/> <env key="PH_HIVE_VERSION" value="${hive.version}"/> <env key="PH_HIVE_SHIMS_VERSION" value="${hive.hadoop.shims.version}"/> + <env key="PH_HDFS_BASE" value="${harness.user.home}" /> <env key="HARNESS_CONF" value="${harness.conf.file}"/> <env key="HADOOP_HOME" value="${harness.hadoop.home}"/> <env key="HADOOP_PREFIX" value="${HADOOP_PREFIX}"/> @@ -369,6 +371,7 @@ <env key="PH_CLUSTER_BIN" value="${harness.cluster.bin}"/> <env key="HARNESS_CONF" value="${harness.conf.file}"/> <env key="HADOOP_HOME" value="${harness.hadoop.home}"/> + <env key="PH_HDFS_BASE" value="${harness.user.home}" /> <arg value="./test_harness.pl"/> <arg value="-deploycfg"/> Modified: pig/branches/spark/test/e2e/pig/conf/spark.conf URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/conf/spark.conf?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/conf/spark.conf (original) +++ pig/branches/spark/test/e2e/pig/conf/spark.conf Fri Feb 24 08:19:42 2017 @@ -30,8 +30,8 @@ my $hdfsBase = $ENV{PH_HDFS_BASE} || "/u $cfg = { #HDFS - 'inpathbase' => "$ENV{PH_ROOT}/data" - , 'outpathbase' => "$ENV{PH_ROOT}/testout" + 'inpathbase' => "$hdfsBase/test/data" + , 'outpathbase' => "$hdfsBase/out" #LOCAL , 'localinpathbase' => "$ENV{PH_LOCAL}/in" @@ -55,7 +55,7 @@ $cfg = { , 'hcatbin' => "$ENV{HCAT_BIN}" , 'usePython' => "$ENV{PIG_USE_PYTHON}" , 'exectype' => 'spark' - , 'benchmark_exectype' => 'local' + , 'benchmark_exectype' => 'mapred' #HADOOP , 'mapredjars' => "$ENV{PH_ROOT}/lib" Modified: pig/branches/spark/test/e2e/pig/deployers/ExistingClusterDeployer.pm URL: 
http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/deployers/ExistingClusterDeployer.pm?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/deployers/ExistingClusterDeployer.pm (original) +++ pig/branches/spark/test/e2e/pig/deployers/ExistingClusterDeployer.pm Fri Feb 24 08:19:42 2017 @@ -231,11 +231,6 @@ sub generateData 'rows' => 5000, 'hdfs' => "types/numbers.txt", }, { - 'name' => "biggish", - 'filetype' => "biggish", - 'rows' => 1000000, - 'hdfs' => "singlefile/biggish", - }, { 'name' => "prerank", 'filetype' => "ranking", 'rows' => 30, Modified: pig/branches/spark/test/e2e/pig/deployers/LocalDeployer.pm URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/deployers/LocalDeployer.pm?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/deployers/LocalDeployer.pm (original) +++ pig/branches/spark/test/e2e/pig/deployers/LocalDeployer.pm Fri Feb 24 08:19:42 2017 @@ -209,11 +209,21 @@ sub generateData 'filetype' => "ranking", 'rows' => 30, 'outfile' => "singlefile/prerank", + }, { + 'name' => "utf8Voter", + 'filetype' => "utf8Voter", + 'rows' => 30, + 'outfile' => "utf8Data/选民/utf8Voter", + }, { + 'name' => "utf8Student", + 'filetype' => "utf8Student", + 'rows' => 300, + 'outfile' => "utf8Data/学生/utf8Student", } ); # Create the target directories - for my $dir ("singlefile", "dir", "types", "glob/star/somegood", + for my $dir ("singlefile", "utf8Data/选民", "utf8Data/学生", "dir", "types", "glob/star/somegood", "glob/star/moregood", "glob/star/bad") { my @cmd = ("mkdir", "-p", "$cfg->{'inpathbase'}/$dir"); $self->runCmd($log, \@cmd); Modified: pig/branches/spark/test/e2e/pig/drivers/TestDriverPig.pm URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/drivers/TestDriverPig.pm?rev=1784237&r1=1784236&r2=1784237&view=diff 
============================================================================== --- pig/branches/spark/test/e2e/pig/drivers/TestDriverPig.pm (original) +++ pig/branches/spark/test/e2e/pig/drivers/TestDriverPig.pm Fri Feb 24 08:19:42 2017 @@ -211,13 +211,6 @@ sub runTest $testCmd->{'pig'} = $testCmd->{'pig_win'}; } - if ( $testCmd->{'hadoopversion'} == '23' && $testCmd->{'pig23'}) { - $oldpig = $testCmd->{'pig'}; - $testCmd->{'pig'} = $testCmd->{'pig23'}; - } - if ( $testCmd->{'hadoopversion'} == '23' && $testCmd->{'expected_err_regex23'}) { - $testCmd->{'expected_err_regex'} = $testCmd->{'expected_err_regex23'}; - } my $res = $self->runPigCmdLine( $testCmd, $log, 1, $resources ); if ($oldpig) { $testCmd->{'pig'} = $oldpig; @@ -231,10 +224,6 @@ sub runTest $testCmd->{'pig'} = $testCmd->{'pig_win'}; } - if ( $testCmd->{'hadoopversion'} == '23' && $testCmd->{'pig23'}) { - $oldpig = $testCmd->{'pig'}; - $testCmd->{'pig'} = $testCmd->{'pig23'}; - } my $res = $self->runPig( $testCmd, $log, 1, $resources ); if ($oldpig) { $testCmd->{'pig'} = $oldpig; @@ -686,9 +675,6 @@ sub generateBenchmark if ((Util::isWindows()||Util::isCygwin()) && $testCmd->{'pig_win'}) { $modifiedTestCmd{'pig'} = $testCmd->{'pig_win'}; } - if ( $testCmd->{'hadoopversion'} == '23' && $testCmd->{'pig23'}) { - $modifiedTestCmd{'pig'} = $testCmd->{'pig23'}; - } # Change so we're looking at the old version of Pig if (defined $testCmd->{'oldpigpath'} && $testCmd->{'oldpigpath'} ne "") { $modifiedTestCmd{'pigpath'} = $testCmd->{'oldpigpath'}; @@ -1058,10 +1044,6 @@ sub wrongExecutionMode($$) } } - if (defined $testCmd->{'ignore23'} && $testCmd->{'hadoopversion'}=='23') { - $wrong = 1; - } - if ($wrong) { print $log "Skipping test $testCmd->{'group'}" . "_" . $testCmd->{'num'} . 
" since it is not suppsed to be run in hadoop 23\n"; Modified: pig/branches/spark/test/e2e/pig/streaming/PigStreaming.pl URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/streaming/PigStreaming.pl?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/streaming/PigStreaming.pl (original) +++ pig/branches/spark/test/e2e/pig/streaming/PigStreaming.pl Fri Feb 24 08:19:42 2017 @@ -73,7 +73,7 @@ while (<$input_handle>) { chomp; $data = $_; - if (defined(%hash) && (exists $hash{$data})) + if (exists $hash{$data}) { print $output_handle "$hash{$data}\n"; } Modified: pig/branches/spark/test/e2e/pig/tests/grunt.conf URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/tests/grunt.conf?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/tests/grunt.conf (original) +++ pig/branches/spark/test/e2e/pig/tests/grunt.conf Fri Feb 24 08:19:42 2017 @@ -46,7 +46,12 @@ $cfg = { 'execonly' => 'mapred,tez', # don't have a clue what their cwd will be for local mode 'expected_out_regex' => "/user", 'rc' => 0 - + },{ + 'num' => 3, + 'pig' => "ls .", + 'execonly' => 'mapred,tez', + 'expected_out_regex' => "/user", + 'rc' => 0 },{ 'num' => 4, 'pig' => "ls :INPATH:", @@ -77,21 +82,22 @@ $cfg = { 'rc' => 0 },{ 'num' => 10, - 'pig' => "cp :INPATH:/singlefile/studenttab10k . 
- ls .", + 'pig' => "mkdir :OUTPATH: + cp :INPATH:/singlefile/studenttab10k :OUTPATH: + ls :OUTPATH:", 'expected_out_regex' => ".*studenttab10k", 'rc' => 0 },{ 'num' => 11, - 'pig' => "cp :INPATH:/singlefile/studenttab10k ./fred - ls .", + 'pig' => "cp :INPATH:/singlefile/studenttab10k :OUTPATH:/fred + ls :OUTPATH:", 'expected_out_regex' => ".*fred", 'rc' => 0 },{ 'num' => 12, - 'pig' => "cp :INPATH:/singlefile/studenttab10k ./jim - mv ./jim ./bob - ls .", + 'pig' => "cp :INPATH:/singlefile/studenttab10k :OUTPATH:/jim + mv :OUTPATH:/jim :OUTPATH:/bob + ls :OUTPATH:", 'expected_out_regex' => ".*bob", 'rc' => 0 },{ @@ -103,18 +109,19 @@ $cfg = { },{ 'num' => 14, 'pig' => "copyToLocal :INPATH:/singlefile/votertab10k :TMP: - copyFromLocal :TMP:/votertab10k ./joe - cat ./joe", + copyFromLocal :TMP:/votertab10k :OUTPATH:/joe + cat :OUTPATH:/joe", 'expected_out_regex' => ":Grunt_14_output:", 'rc' => 0 },{ 'num' => 15, - 'pig' => "rm fred bob joe", - 'not_expected_out_regex' => "joe", + 'pig' => "cp :INPATH:/singlefile/studenttab10k :OUTPATH:/fred + rm :OUTPATH:/fred", + 'not_expected_out_regex' => "fred", 'rc' => 0 },{ 'num' => 16, - 'pig' => "rmf jill", + 'pig' => "rmf :OUTPATH:/jill", 'rc' => 0 } ] Modified: pig/branches/spark/test/e2e/pig/tests/hcat.conf URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/tests/hcat.conf?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/tests/hcat.conf (original) +++ pig/branches/spark/test/e2e/pig/tests/hcat.conf Fri Feb 24 08:19:42 2017 @@ -44,7 +44,7 @@ stored as textfile;\, 'num' => 2, 'java_params' => ['-Dhcat.bin=:HCATBIN:'], 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); -SQL drop table if exists pig_hcat_ddl_1; +SQL drop table if exists pig_hcat_ddl_1 purge; sql create table pig_hcat_ddl_1(name string, age int, gpa double) @@ -55,6 +55,35 @@ store a into ':OUTPATH:';\, }, ] }, 
+ { + 'name' => 'Jython_HCatDDL', + 'tests' => [ + { + # sql command + 'num' => 1 + ,'java_params' => ['-Dhcat.bin=:HCATBIN:'] + ,'pig' => q\#!/usr/bin/python +from org.apache.pig.scripting import Pig + +#create pig script + +Pig.sql("""sql drop table if exists pig_script_hcat_ddl_1;""") +ret = Pig.sql("""sql create table pig_script_hcat_ddl_1(name string, +age int, +gpa double) +stored as textfile; +""") + +if ret==0: + print "SQL command PASSED" + +else: + raise "SQL command FAILED" +\ + ,'rc' => 0 + }, + ] + }, ] } ; Added: pig/branches/spark/test/e2e/pig/tests/join.conf URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/tests/join.conf?rev=1784237&view=auto ============================================================================== --- pig/branches/spark/test/e2e/pig/tests/join.conf (added) +++ pig/branches/spark/test/e2e/pig/tests/join.conf Fri Feb 24 08:19:42 2017 @@ -0,0 +1,310 @@ +#!/usr/bin/env perl +############################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +############################################################################### + +$cfg = { + 'driver' => 'Pig', + + 'groups' => [ + { + 'name' => 'BloomJoin_Map', + 'execonly' => 'tez', + 'tests' => [ + { + # Tuple join key + 'num' => 1, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); +--c = filter a by age < 20; +--d = filter b by age < 20; +e = join a by (name, age), b by (name, age) using 'bloom'; +store e into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); +--c = filter a by age < 20; +--d = filter b by age < 20; +e = join a by (name, age), b by (name, age); +store e into ':OUTPATH:';\, + }, + { + # bytearray join key + 'num' => 2, + 'pig' => q\ +SET mapreduce.input.fileinputformat.split.maxsize '50000'; +SET pig.splitCombination false; +a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); +c = filter a by age < 20; +d = filter b by age < 20; +e = join c by name, d by name using 'bloom'; +store e into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); +c = filter a by age < 20; +d = filter b by age < 20; +e = join c by name, d by name; +store e into ':OUTPATH:';\, + }, + { + # Left outer join and chararray join key + 'num' => 3, + 'pig' => q\ +SET mapreduce.input.fileinputformat.split.maxsize '50000'; +SET pig.splitCombination false; +a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions); +c 
= join a by name left, b by name using 'bloom'; +d = foreach c generate a::name, a::age, gpa, registration, contributions; +store d into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions); +c = join a by name left, b by name; +d = foreach c generate a::name, a::age, gpa, registration, contributions; +store d into ':OUTPATH:';\, + }, + { + # Right outer join + 'num' => 4, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions); +c = join a by (name,age) right, b by (name,age) using 'bloom'; +store c into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions); +c = join a by (name,age) right, b by (name,age); +store c into ':OUTPATH:';\, + }, + { + # Left input from a union + 'num' => 5, + 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); +c = union a, b; +d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +d = filter d by age > 60; +e = join c by name, d by name using 'bloom' PARALLEL 3; +store e into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); +c = union a, b; +d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +d = filter d by age > 60; +e = join c by name, d by name; +store e into 
':OUTPATH:';\, + }, + { + # Right input from a union and integer join key + 'num' => 6, + 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); +c = union a, b; +c = filter c by age > 75; +d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +e = join d by age, c by age using 'bloom' PARALLEL 3; +store e into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); +c = union a, b; +c = filter c by age > 75; +d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +e = join d by age, c by age; +store e into ':OUTPATH:';\, + }, + { + # Left input from a split + 'num' => 7, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); +b = filter b by age > 75; +c = filter a by age > 50; +d = join a by age, b by age using 'bloom'; +store c into ':OUTPATH:.1'; +store d into ':OUTPATH:.2';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); +b = filter b by age > 75; +c = filter a by age > 50; +d = join a by age, b by age; +store c into ':OUTPATH:.1'; +store d into ':OUTPATH:.2';\, + }, + { + # Right input from a split + 'num' => 8, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); +c = filter a by age > 75; +d = filter a by name == 'nick miller'; +e = join b by age, c by age using 'bloom'; +store d 
into ':OUTPATH:.1'; +store e into ':OUTPATH:.2';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); +c = filter a by age > 75; +d = filter a by name == 'nick miller'; +e = join b by age, c by age; +store d into ':OUTPATH:.1'; +store e into ':OUTPATH:.2';\, + }, + ] # end of tests + }, + { + 'name' => 'BloomJoin_Reduce', + 'execonly' => 'tez', + 'java_params' => ['-Dpig.bloomjoin.strategy=reduce'], + 'tests' => [ + { + # Tuple join key + 'num' => 1, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); +--c = filter a by age < 20; +--d = filter b by age < 20; +e = join a by (name, age), b by (name, age) using 'bloom'; +store e into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); +--c = filter a by age < 20; +--d = filter b by age < 20; +e = join a by (name, age), b by (name, age); +store e into ':OUTPATH:';\, + }, + { + # bytearray join key + 'num' => 2, + 'pig' => q\ +SET mapreduce.input.fileinputformat.split.maxsize '50000'; +SET pig.splitCombination false; +a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); +c = filter a by age < 20; +d = filter b by age < 20; +e = join c by name, d by name using 'bloom'; +store e into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); +c = filter a by age < 20; +d = filter b by age < 20; +e = join c by name, d by name; +store e into ':OUTPATH:';\, 
+ }, + { + # Left outer join and chararray join key + 'num' => 3, + 'pig' => q\ +SET mapreduce.input.fileinputformat.split.maxsize '50000'; +SET pig.splitCombination false; +a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions); +c = join a by name left, b by name using 'bloom'; +d = foreach c generate a::name, a::age, gpa, registration, contributions; +store d into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions); +c = join a by name left, b by name; +d = foreach c generate a::name, a::age, gpa, registration, contributions; +store d into ':OUTPATH:';\, + }, + { + # Right outer join + 'num' => 4, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions); +c = join a by (name,age) right, b by (name,age) using 'bloom'; +store c into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions); +c = join a by (name,age) right, b by (name,age); +store c into ':OUTPATH:';\, + }, + { + # Left input from a union + 'num' => 5, + 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); +c = union a, b; +d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +d = filter d by age > 60; +e = join c by name, d by name using 'bloom' PARALLEL 3; +store e into ':OUTPATH:';\, + 'verify_pig_script' => q\a = 
load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); +c = union a, b; +d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +d = filter d by age > 60; +e = join c by name, d by name; +store e into ':OUTPATH:';\, + }, + { + # Right input from a union and integer join key + 'num' => 6, + 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); +c = union a, b; +c = filter c by age > 75; +d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +e = join d by age, c by age using 'bloom' PARALLEL 3; +store e into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); +b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); +c = union a, b; +c = filter c by age > 75; +d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +e = join d by age, c by age; +store e into ':OUTPATH:';\, + }, + { + # Left input from a split + 'num' => 7, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); +b = filter b by age > 75; +c = filter a by age > 50; +d = join a by age, b by age using 'bloom'; +store c into ':OUTPATH:.1'; +store d into ':OUTPATH:.2';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); +b = filter b by age > 75; +c = filter a by age > 50; +d = join a by age, b by age; +store c into ':OUTPATH:.1'; +store d into 
':OUTPATH:.2';\, + }, + { + # Right input from a split + 'num' => 8, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); +c = filter a by age > 75; +d = filter a by name == 'nick miller'; +e = join b by age, c by age using 'bloom'; +store d into ':OUTPATH:.1'; +store e into ':OUTPATH:.2';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); +c = filter a by age > 75; +d = filter a by name == 'nick miller'; +e = join b by age, c by age; +store d into ':OUTPATH:.1'; +store e into ':OUTPATH:.2';\, + }, + ] # end of tests + } + ] # end of groups +}; \ No newline at end of file Modified: pig/branches/spark/test/e2e/pig/tests/multiquery.conf URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/tests/multiquery.conf?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/tests/multiquery.conf (original) +++ pig/branches/spark/test/e2e/pig/tests/multiquery.conf Fri Feb 24 08:19:42 2017 @@ -728,6 +728,52 @@ b = union a1, a2; c = rank b by name ASC, age DESC DENSE; store c into ':OUTPATH:';\, }, + { + # Union + Split + Two replicate join + 'num' => 12, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +a1 = filter a by gpa is null or gpa <= 3.9; +a2 = filter a by gpa < 2; +b = union a1, a2; +c = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); +c1 = filter c by age < 30; +c2 = filter c by age > 50; +d = join b by name, c1 by name using 'replicated'; +e = join d by b::name, c2 by name using 'replicated'; +store e into ':OUTPATH:';\, + }, + { + # Multiple Union + Multiple Split + Single store + 'num' => 13, + 'pig' => 
q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/studenttab10k' as (name, age:int, gpa); +u1 = union onschema a, b; +SPLIT u1 INTO r IF age < 30, s OTHERWISE; +c = load ':INPATH:/singlefile/voternulltab10k' as (votername, voterage, registration, contributions); +d = JOIN r BY name LEFT, c BY votername; +u2 = UNION ONSCHEMA d, s; +e = FILTER u2 BY name == 'nick miller'; +f = FILTER u2 BY age > 70 ; +u3 = UNION ONSCHEMA e, f; +store u3 into ':OUTPATH:';\, + }, + { + # PIG-5082. Similar to MultiQuery_Union_13 but for non-store vertex group + 'num' => 14, + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); +b = load ':INPATH:/singlefile/studenttab10k' as (name, age:int, gpa); +u1 = union onschema a, b; +SPLIT u1 INTO r IF age < 30, s OTHERWISE; +c = load ':INPATH:/singlefile/voternulltab10k' as (votername, voterage, registration, contributions); +d = JOIN r BY name LEFT, c BY votername; +u2 = UNION ONSCHEMA d, s; +e = FILTER u2 BY name == 'nick miller'; +f = FILTER u2 BY age > 70 ; +u3 = UNION ONSCHEMA e, f; +SPLIT u3 INTO t if age > 75, u OTHERWISE; +v = JOIN t BY name LEFT, c BY votername; +store v into ':OUTPATH:';\, + } ] # end of tests }, @@ -860,7 +906,38 @@ m = UNION e, i, j, n; n = JOIN a BY name, m BY name; store n into ':OUTPATH:';\, - } + }, + { + # Self join bloom left outer + 'num' => 12, + 'execonly' => 'tez', + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = filter a by gpa >= 3.9; +c = filter a by gpa > 3; +d = join b by name left outer, c by name using 'bloom'; +store d into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = filter a by gpa >= 3.9; +c = filter a by gpa > 3; +d = join b by name left outer, c by name; +store d into ':OUTPATH:';\, + }, + { + # Self join bloom left outer with strategy as reduce + 'num' => 13, + 'execonly' => 'tez', + 
'java_params' => ['-Dpig.bloomjoin.strategy=reduce'], + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = filter a by gpa >= 3.9; +c = filter a by gpa > 3; +d = join b by name left outer, c by name using 'bloom'; +store d into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); +b = filter a by gpa >= 3.9; +c = filter a by gpa > 3; +d = join b by name left outer, c by name; +store d into ':OUTPATH:';\, + }, ] # end of tests }, Modified: pig/branches/spark/test/e2e/pig/tests/negative.conf URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/tests/negative.conf?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/tests/negative.conf (original) +++ pig/branches/spark/test/e2e/pig/tests/negative.conf Fri Feb 24 08:19:42 2017 @@ -473,7 +473,7 @@ define CMD `perl PigStreaming.pl` input( A = load ':INPATH:/singlefile/studenttab10k'; B = stream A through CMD; store B into ':OUTPATH:';\, - 'expected_err_regex' => "Error reading output from Streaming binary", + 'expected_err_regex' => "Error reading output from Streaming binary|Error while reading from POStream and passing it to the streaming process", }, { # Invalid serializer - throws exception @@ -568,24 +568,7 @@ store D into ':OUTPATH:';\, 'expected_err_regex' => "Could not resolve StringStoreBad using imports", }, ] - }, - { - 'name' => 'LineageErrors', - 'tests' => [ - { - # UDF returns a bytearray that is cast to an integer - 'num' => 1, - 'pig' => q\register :FUNCPATH:/testudf.jar; -a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); -b = filter a by name lt 'b'; -c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age); -d = foreach c generate $0#'alice young'; -split d into e if $0 < 42, f if $0 >= 42; -store e into ':OUTPATH:';\, - 'expected_err_regex' => 
"Received a bytearray from the UDF or Union from two different Loaders. Cannot determine how to convert the bytearray to int", - }, - ] - } + } ] } ; Modified: pig/branches/spark/test/e2e/pig/tests/nightly.conf URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/tests/nightly.conf?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/tests/nightly.conf (original) +++ pig/branches/spark/test/e2e/pig/tests/nightly.conf Fri Feb 24 08:19:42 2017 @@ -567,7 +567,6 @@ store c into ':OUTPATH:';\, { 'num' => 9, 'floatpostprocess' => 1, - 'ignore23' => 'I cannot get it right due to float precision, temporarily disable', 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); b = group a by name; c = foreach b generate group, AVG(a.gpa); @@ -1518,8 +1517,8 @@ store i into ':OUTPATH:';\, { # Union + operators 'num' => 12, - 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa:double); -b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age:int, gpa:double); + 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); +b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa:double); c = union a, b; -- Exercise all expression operators -- d = foreach c generate (name is not NULL? UPPER(name) : 'FNU LNU') as name, (age < 30 ? -1 : age) as age, (gpa is NULL ? 0.0 : ((gpa > 0.5 AND gpa < 1.0) ? 
1 : gpa)) as gpa; @@ -2186,7 +2185,7 @@ store d into ':OUTPATH:';\, b = order a by $0, $1, $2; c = limit b 100; store c into ':OUTPATH:';\, - 'sortArgs' => ['-t', ' ', '-k', '1,3'], + 'sortArgs' => ['-t', ' ', '-k', '1,2'], }, { # Make sure that limit higher than number of rows doesn't mess stuff up @@ -2206,6 +2205,7 @@ store c into ':OUTPATH:';\, }, { 'num' => 5, + 'execonly' => 'mapred,local', #tez may pick either input as part of the optimization so cannot be tested easily 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; b = load ':INPATH:/singlefile/votertab10k'; a1 = foreach a generate $0, $1; @@ -2285,7 +2285,21 @@ store d into ':OUTPATH:';\, 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); b = limit a 2000; store b into ':OUTPATH:';\, - } + }, + { + 'num' => 12, + 'execonly' => 'tez', #Limit_5 was not able to test on tez. + 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; +b = load ':INPATH:/singlefile/studenttab10k'; +a1 = foreach a generate $0, $1; +b1 = foreach b generate $0, $1; +c = union a1, b1; +d = limit c 100; +store d into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int); +b = limit a 100; +store b into ':OUTPATH:';\, + } ] }, { @@ -2736,6 +2750,41 @@ store c into ':OUTPATH:';\, }, ], }, + { + 'name' => 'StoreLoad', + 'tests' => [ + { + 'num' => 1, + 'floatpostprocess' => 1, + 'delimiter' => ' ', + 'pig' => q\ +a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); +b = filter a by age < 25; +c = filter a by age > 70; +store b into ':OUTPATH:.intermediate1' using PigStorage(','); +store c into ':OUTPATH:.intermediate2' using PigStorage(','); +d = load ':OUTPATH:.intermediate1' using PigStorage(',') as (name:chararray, age:int, gpa: double); +e = load ':OUTPATH:.intermediate2' using PigStorage(',') as (name:chararray, age:int, gpa: double); +f = join d by 
name, e by name; +store f into ':OUTPATH:';\, + 'notmq' => 1, + }, + { + # Self join + 'num' => 2, + 'floatpostprocess' => 1, + 'delimiter' => ' ', + 'pig' => q\ +a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); +b = filter a by name == 'nick miller'; +store b into ':OUTPATH:.intermediate' using PigStorage(','); +c = load ':OUTPATH:.intermediate' using PigStorage(',') as (name:chararray, age:int, gpa: double); +d = join a by name, c by name; +store d into ':OUTPATH:';\, + 'notmq' => 1, + }, + ], + }, { 'name' => 'MergeJoin', @@ -3171,6 +3220,25 @@ e = join a by name full outer, b by name store e into ':OUTPATH:';\, }, + # skew join with tuple key + { + 'num' => 15, + 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], + 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); +b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +c = group a by (name, age); +d = group b by (name, age); +e = join c by $0, d by $0 using 'skewed' parallel 5; +f = foreach e generate c::group, flatten(c::a), d::group, flatten(d::b); +store f into ':OUTPATH:';\, + 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); +b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); +c = group a by (name, age); +d = group b by (name, age); +e = join c by $0, d by $0; +f = foreach e generate c::group, flatten(c::a), d::group, flatten(d::b); +store f into ':OUTPATH:';\ + } ] }, @@ -4243,40 +4311,32 @@ store e into ':OUTPATH:';\, # test common 'num' => 1, 'pig' => q\ -rmf table_testNativeMRJobSimple_input -rmf table_testNativeMRJobSimple_output a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); -b = native ':MAPREDJARS:/hadoop-examples.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' `wordcount 
table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; +b = native ':MAPREDJARS:/hadoop-examples.jar' Store a into ':OUTPATH:.intermediate.1' Load ':OUTPATH:.intermediate.2' `wordcount :OUTPATH:.intermediate.1 :OUTPATH:.intermediate.2`; store b into ':OUTPATH:';\, 'notmq' => 1, 'verify_pig_script' => q\ -rmf table_testNativeMRJobSimple_input -rmf table_testNativeMRJobSimple_output a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); -b = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; +b = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store a into ':OUTPATH:.intermediate.1' Load ':OUTPATH:.intermediate.2' `wordcount :OUTPATH:.intermediate.1 :OUTPATH:.intermediate.2`; store b into ':OUTPATH:';\, }, { # test complex 'num' => 2, 'pig' => q\ -rmf table_testNativeMRJobSimple_input -rmf table_testNativeMRJobSimple_output a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = foreach a generate name; c = distinct b; -d = native ':MAPREDJARS:/hadoop-examples.jar' Store c into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; +d = native ':MAPREDJARS:/hadoop-examples.jar' Store c into ':OUTPATH:.intermediate.1' Load ':OUTPATH:.intermediate.2' as (name:chararray, count: int) `wordcount :OUTPATH:.intermediate.1 :OUTPATH:.intermediate.2`; e = order d by name; store e into ':OUTPATH:';\, 'sortArgs' => ['-t', ' '], 'notmq' => 1, 'verify_pig_script' => q\ -rmf table_testNativeMRJobSimple_input -rmf table_testNativeMRJobSimple_output a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); b = foreach a generate name; c = distinct b; -d = mapreduce 
':MAPREDJARS:/hadoop-examples.jar' Store c into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; +d = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store c into ':OUTPATH:.intermediate.1' Load ':OUTPATH:.intermediate.2' as (name:chararray, count: int) `wordcount :OUTPATH:.intermediate.1 :OUTPATH:.intermediate.2`; e = order d by name; store e into ':OUTPATH:';\, }, @@ -4284,16 +4344,8 @@ store e into ':OUTPATH:';\, # test streaming 'num' => 3, 'pig' => q\ -rmf table_testNativeMRJobSimple_input -rmf table_testNativeMRJobSimple_output a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); -b = mapreduce ':MAPREDJARS:/hadoop-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper cat -reducer wc`; -store b into ':OUTPATH:';\, - 'pig23' => q\ -rmf table_testNativeMRJobSimple_input -rmf table_testNativeMRJobSimple_output -a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); -b = mapreduce ':MAPREDJARS:/hadoop-0.23.0-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper cat -reducer wc`; +b = mapreduce ':MAPREDJARS:/hadoop-streaming.jar' Store a into ':OUTPATH:.intermediate.1' Load ':OUTPATH:.intermediate.2' as (name:chararray, count: int) `-input :OUTPATH:.intermediate.1 -output :OUTPATH:.intermediate.2 -mapper cat -reducer wc`; store b into ':OUTPATH:';\, 'notmq' => 1, }, @@ -4884,21 +4936,6 @@ a = load ':INPATH:/singlefile/allscalar1 b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, 
gpa:double, instate:chararray); C = union a, b; store C into ':OUTPATH:';\, - }, - { - # Test Union using merge with incompatible types. float->bytearray and chararray->bytearray - 'num' => 8, - 'delimiter' => ' ', - 'pig' => q\ -A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int); -B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray); -C = union onschema A, B; -store C into ':OUTPATH:';\, - 'verify_pig_script' => q\ -A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:bytearray); -B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:bytearray); -C = union A, B; -store C into ':OUTPATH:';\, } ] @@ -4927,7 +4964,6 @@ store C into ':OUTPATH:';\, 'tests' => [ { 'num' => 1, - 'ignore23' => 'guava version of Pig is higher than hadoop 23', 'pig' => q?register :FUNCPATH:/testudf.jar; define gm org.apache.pig.test.udf.evalfunc.GoodMonitored(); a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); @@ -5297,6 +5333,26 @@ store C into ':OUTPATH:';\, C = UNION A,B; D = filter C by name == 'alice allen'; store D into ':OUTPATH:';", + },{ + 'num' => 5, + 'pig' => "set pig.optimizer.rules.disabled PushUpFilter; + define bb BuildBloom('Hash.JENKINS_HASH', 'fixed', '128', '3'); + A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); + B = filter A by name == 'alice allen'; + C = group B all; + D = foreach C generate bb(B.name) as bloomfilter; + E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); + F = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); + G = union E, F; + -- PushUpFilter is disabled to avoid filter being pushed before union + H = filter G by Bloom(D.bloomfilter, name); + store H into ':OUTPATH:';", + 'verify_pig_script' => " + A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int 
,gpa:double); + B = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double); + C = UNION A,B; + D = filter C by name == 'alice allen'; + store D into ':OUTPATH:';", } ], },{ @@ -5637,13 +5693,15 @@ store a into ':OUTPATH:';\, 'execonly' => 'mapred,tez', 'pig' => q\ SET default_parallel 7; - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + SET mapreduce.input.fileinputformat.split.maxsize '300'; + SET pig.splitCombination false; + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = rank A; C = foreach B generate rank_A,a,b,c; store C into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); C = foreach A generate rownumber,a,b,c; store C into ':OUTPATH:'; \, @@ -5652,13 +5710,15 @@ store a into ':OUTPATH:';\, 'execonly' => 'mapred,tez', 'pig' => q\ SET default_parallel 9; - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + SET mapreduce.input.fileinputformat.split.maxsize '300'; + SET pig.splitCombination false; + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = rank A by b DESC,a ASC; C = foreach B generate rank_A,b,a; store C into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD 
':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); C = foreach A generate rankbdaa,b,a; store C into ':OUTPATH:'; \, @@ -5667,13 +5727,15 @@ store a into ':OUTPATH:';\, 'execonly' => 'mapred,tez', 'pig' => q\ SET default_parallel 7; - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + SET mapreduce.input.fileinputformat.split.maxsize '300'; + SET pig.splitCombination false; + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = rank A by c ASC,b DESC; C = foreach B generate rank_A,c,b; store C into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); C = foreach A generate rankcabd,c,b; store C into ':OUTPATH:'; \, @@ -5681,26 +5743,29 @@ store a into ':OUTPATH:';\, 'num' => 4, 'execonly' => 'mapred,tez', 'pig' => q\ - SET default_parallel 25; - A = LOAD ':INPATH:/singlefile/biggish' using PigStorage(',') as (rownumber:long,idx:long,tail:bytearray); + SET default_parallel 5; + SET mapreduce.input.fileinputformat.split.maxsize '300'; + SET pig.splitCombination false; + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as 
(rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = rank A; C = order B by rank_A; - D = foreach C generate rank_A,rownumber; + D = foreach C generate rank_A,a,b,c; store D into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/biggish' using PigStorage(',') as (rownumber:long,idx:long,tail:bytearray); - D = foreach A generate idx,rownumber; + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); + D = foreach A generate rownumber,a,b,c; store D into ':OUTPATH:'; \, }, { 'num' => 5, 'execonly' => 'mapred,tez', 'pig' => q\ - SET default_parallel 11; + SET default_parallel 5; + SET mapreduce.input.fileinputformat.split.maxsize '300'; SET pig.splitCombination false; - A = LOAD ':INPATH:/singlefile/biggish' using PigStorage(',') as (rownumber:long,idx:long,tail:bytearray); - B = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); + B = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); C = join A by rownumber, B by rownumber; D = order C by B::rankcabd,B::rankbdca,B::rankaaba; E = rank D; @@ -5710,7 +5775,7 @@ store a into ':OUTPATH:';\, store H into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,idx:long,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,idx:long); B = foreach A generate rownumber,1; C = order B by rownumber; store C into 
':OUTPATH:'; @@ -5719,14 +5784,16 @@ store a into ':OUTPATH:';\, 'num' => 6, 'execonly' => 'mapred,tez', 'pig' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + SET mapreduce.input.fileinputformat.split.maxsize '300'; + SET pig.splitCombination false; + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); split A into M if rownumber > 15, N if rownumber < 25; C = rank N; D = foreach C generate $0, a, b, c; store D into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = filter A by rownumber < 25; D = foreach B generate rownumber, a, b, c; store D into ':OUTPATH:'; @@ -5741,14 +5808,16 @@ store a into ':OUTPATH:';\, 'num' => 1, 'execonly' => 'mapred,tez', 'pig' => q\ + SET mapreduce.input.fileinputformat.split.maxsize '300'; + SET pig.splitCombination false; SET default_parallel 9; - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = rank A by a ASC,b ASC DENSE; C = foreach B generate rank_A,a,b; store C into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as 
(rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); C = foreach A generate rankaaba,a,b; store C into ':OUTPATH:'; \, @@ -5756,14 +5825,16 @@ store a into ':OUTPATH:';\, 'num' => 2, 'execonly' => 'mapred,tez', 'pig' => q\ + SET mapreduce.input.fileinputformat.split.maxsize '300'; + SET pig.splitCombination false; SET default_parallel 9; - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = rank A by a ASC,c DESC DENSE; C = foreach B generate rank_A,a,c; store C into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); C = foreach A generate rankaacd,a,c; store C into ':OUTPATH:'; \, @@ -5771,14 +5842,16 @@ store a into ':OUTPATH:';\, 'num' => 3, 'execonly' => 'mapred,tez', 'pig' => q\ + SET mapreduce.input.fileinputformat.split.maxsize '300'; + SET pig.splitCombination false; SET default_parallel 7; - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as 
(rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = rank A by b DESC,c ASC DENSE; C = foreach B generate rank_A,b,c; store C into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); C = foreach A generate rankbdca,b,c; store C into ':OUTPATH:'; \, @@ -5786,9 +5859,11 @@ store a into ':OUTPATH:';\, 'num' => 4, 'execonly' => 'mapred,tez', 'pig' => q\ + SET mapreduce.input.fileinputformat.split.maxsize '300'; + SET pig.splitCombination false; SET default_parallel 7; - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); - B = foreach A generate a,b,c,tail; + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); + B = foreach A generate a,b,c; C = rank B by a ASC,b ASC DENSE; D = rank C by a ASC,c DESC DENSE; E = rank D by b DESC,c ASC DENSE; @@ -5796,7 +5871,7 @@ store a into ':OUTPATH:';\, store F into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = foreach A generate rankbdca,rankaacd,rankaaba,a,b,c; store B into ':OUTPATH:'; \, @@ -5805,8 +5880,9 @@ store a into 
':OUTPATH:';\, 'execonly' => 'mapred,tez', 'pig' => q\ SET default_parallel 9; + SET mapreduce.input.fileinputformat.split.maxsize '300'; SET pig.splitCombination false; - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); B = foreach A generate a,b,c; C = rank B by a ASC,b ASC DENSE; D = rank B by a ASC,c DESC DENSE; @@ -5816,7 +5892,7 @@ store a into ':OUTPATH:';\, store H into ':OUTPATH:'; \, 'verify_pig_script' => q\ - A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); + A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int); C = foreach A generate rankaaba,a,b,c; E = order C by a ASC,b ASC; D = foreach A generate rankaacd,a,b,c; Modified: pig/branches/spark/test/e2e/pig/tests/orc.conf URL: http://svn.apache.org/viewvc/pig/branches/spark/test/e2e/pig/tests/orc.conf?rev=1784237&r1=1784236&r2=1784237&view=diff ============================================================================== --- pig/branches/spark/test/e2e/pig/tests/orc.conf (original) +++ pig/branches/spark/test/e2e/pig/tests/orc.conf Fri Feb 24 08:19:42 2017 @@ -1,3 +1,21 @@ +#!/usr/bin/env perl +############################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################### $cfg = { 'driver' => 'Pig', 'nummachines' => 5,
