Author: daijy
Date: Fri Jan 9 18:55:53 2015
New Revision: 1650628
URL: http://svn.apache.org/r1650628
Log:
PIG-4358: Add test cases for utf8 chinese in Pig
Added:
pig/trunk/test/e2e/pig/tests/utf8.conf
pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java
Modified:
pig/trunk/CHANGES.txt
pig/trunk/test/e2e/pig/build.xml
pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm
pig/trunk/test/e2e/pig/tools/generate/generate_data.pl
Modified: pig/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1650628&r1=1650627&r2=1650628&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Fri Jan 9 18:55:53 2015
@@ -24,6 +24,8 @@ INCOMPATIBLE CHANGES
IMPROVEMENTS
+PIG-4358: Add test cases for utf8 chinese in Pig (nmaheshwari via daijy)
+
PIG-4370: HBaseStorage should support delete markers (bridiver via daijy)
PIG-4360: HBaseStorage should support setting the timestamp field (bridiver
via daijy)
Modified: pig/trunk/test/e2e/pig/build.xml
URL:
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/build.xml?rev=1650628&r1=1650627&r2=1650628&view=diff
==============================================================================
--- pig/trunk/test/e2e/pig/build.xml (original)
+++ pig/trunk/test/e2e/pig/build.xml Fri Jan 9 18:55:53 2015
@@ -305,6 +305,7 @@
<arg value="${test.location}/tests/macro.conf"/>
<arg value="${test.location}/tests/orc.conf"/>
<arg value="${test.location}/tests/hcat.conf"/>
+ <arg value="${test.location}/tests/utf8.conf"/>
</exec>
</target>
Modified: pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm
URL:
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm?rev=1650628&r1=1650627&r2=1650628&view=diff
==============================================================================
--- pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm (original)
+++ pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm Fri Jan 9
18:55:53 2015
@@ -240,6 +240,16 @@ sub generateData
'filetype' => "ranking",
'rows' => 30,
'hdfs' => "singlefile/prerank",
+ }, {
+ 'name' => "utf8Voter",
+ 'filetype' => "utf8Voter",
+ 'rows' => 30,
+ 'hdfs' => "utf8Data/选民/utf8Voter",
+ }, {
+ 'name' => "utf8Student",
+ 'filetype' => "utf8Student",
+ 'rows' => 300,
+ 'hdfs' => "utf8Data/学生/utf8Student",
}
);
Added: pig/trunk/test/e2e/pig/tests/utf8.conf
URL:
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/tests/utf8.conf?rev=1650628&view=auto
==============================================================================
--- pig/trunk/test/e2e/pig/tests/utf8.conf (added)
+++ pig/trunk/test/e2e/pig/tests/utf8.conf Fri Jan 9 18:55:53 2015
@@ -0,0 +1,93 @@
+#!/usr/bin/env perl
+############################################################################
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###############################################################################
+# Nightly tests for pig.
+#
+#
+
+$cfg = {
+ 'driver' => 'Pig',
+
+ 'groups' => [
+
+ {
+ 'name' => 'Utf8Test',
+ 'tests' => [
+ {
+ 'num' => 1,
+# TC1.1 Execute Pig script with Chinese script name and path, filter by Chinese value, and output to Chinese folder
+ ,'pig' => q\#!/usr/bin/python
+ # -*- coding: utf-8 -*-
+from org.apache.pig.scripting import Pig
+import os
+#pig script dir
+pig_script_dir=u":TMP:/脚本路径"
+#delete if the dir already exists
+if os.path.exists(pig_script_dir):
+ os.remove(pig_script_dir)
+#create the dir
+os.mkdir(pig_script_dir)
+#create pig script
+pig_script = u":TMP:/脚本路径/超.pig"
+pigfile = open( pig_script, "w+")
+pigfile.write("""
+A = LOAD ':INPATH:/utf8Data/学生/utf8Student' as (name:chararray, age:int,
gpa:double);
+B = filter A by name == 'ä½';
+store B into ':OUTPATH:';
+""")
+pigfile.close()
+
+#execute pig script
+
+result = Pig.compileFromFile( pig_script ).bind().runSingle()
+
+if result.isSuccessful():
+ print "Pig job PASSED"
+else:
+ raise "Pig job FAILED"
+\,
+
+ 'verify_pig_script' => q\A = LOAD
':INPATH:/utf8Data/学生/utf8Student' as (name:chararray, age:int, gpa:double);
+ B = filter A by name == 'ä½';
+ store B into ':OUTPATH:';
+\
+ },
+
#--------------------------------------------------------------------------------
+ {
+ 'num' => 2,
+# TC1.2 Joining tables on columns storing Chinese data in Pig
+ 'pig' => q\
+A = LOAD ':INPATH:/utf8Data/学生/utf8Student' as (name:chararray, age:int,
gpa:double);
+B = LOAD ':INPATH:/utf8Data/选民/utf8Voter' AS (name:chararray, age:int,
registration:chararray, contributions:double);
+C = JOIN A by name, B by name;
+D = FOREACH C GENERATE gpa, registration, contributions;
+store D into ':OUTPATH:';
+\,
+ 'verify_pig_script' => q\register
:FUNCPATH:/testudf.jar;
+A = LOAD ':INPATH:/utf8Data/学生/utf8Student' as (name:chararray, age:int,
gpa:double);
+B = LOAD ':INPATH:/utf8Data/选民/utf8Voter' AS (name:chararray, age:int,
registration:chararray, contributions:double);
+C = JOIN A by org.apache.pig.test.udf.evalfunc.ENCODE(name), B by
org.apache.pig.test.udf.evalfunc.ENCODE(name);
+D = FOREACH C GENERATE gpa, registration, contributions;
+store D into ':OUTPATH:';
+\,
+ }
+#--------------------------------------------------------------------------------
+ ] # end tests
+ }, # end group
+ ] # end groups
+};
Modified: pig/trunk/test/e2e/pig/tools/generate/generate_data.pl
URL:
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/tools/generate/generate_data.pl?rev=1650628&r1=1650627&r2=1650628&view=diff
==============================================================================
--- pig/trunk/test/e2e/pig/tools/generate/generate_data.pl (original)
+++ pig/trunk/test/e2e/pig/tools/generate/generate_data.pl Fri Jan 9 18:55:53
2015
@@ -249,6 +249,13 @@ sub randomUnicodeNonAscii()
my $testvar = "\N{U+03b1}\N{U+03b3}\N{U+03b1}\N{U+03c0}\N{U+03b7}";
+our @utf8Name = ("ä½","å®","è","ä½","å®","å½","å","çµ","ä¸","马");
+
+# Return one pseudo-randomly chosen entry of @utf8Name.
+# The index range follows the array size, so names can be added to or removed
+# from @utf8Name without touching this sub. The entries are already strings,
+# so no sprintf round-trip is needed.
+sub randomUtf8Name()
+{
+ return $utf8Name[int(rand(scalar(@utf8Name)))];
+}
+
sub getBulkCopyCmd(){
my $sourceDir= shift;
my $tableName = shift;
@@ -511,6 +518,28 @@ sub getBulkCopyCmd(){
}
printf HDFS "\n";
}
+ } elsif ($filetype eq "utf8Student") {
+ srand(3.14159 + $numRows);
+ print PSQL "create table $tableName (name varchar(100), age integer,
gpa float(3));\n" unless defined $nosql;
+ print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined
$nosql;
+ for (my $i = 0; $i < $numRows; $i++) {
+ my $name = randomUtf8Name();
+ my $age = randomAge();
+ my $gpa = randomGpa();
+ printf HDFS "%s\t%d\t%.2f\n", $name, $age, $gpa;
+ }
+ } elsif ($filetype eq "utf8Voter") {
+ srand(3.14159 + $numRows);
+ print PSQL "create table $tableName (name varchar(100), age integer,
registration varchar(20), contributions float);\n" unless defined $nosql;
+ print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined
$nosql;
+ for (my $i = 0; $i < $numRows; $i++) {
+ my $name = randomUtf8Name();
+ my $age = randomAge();
+ my $registration = randomRegistration();
+ my $contribution = randomContribution();
+ printf HDFS "%s\t%d\t%s\t%.2f\n", $name, $age,
+ $registration, $contribution;
+ }
} else {
warn "Unknown filetype $filetype\n";
usage();
Added:
pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java
URL:
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java?rev=1650628&view=auto
==============================================================================
---
pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java
(added)
+++
pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java
Fri Jan 9 18:55:53 2015
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.test.udf.evalfunc;
+
+
+import java.io.IOException;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.data.DataType;
+import org.apache.commons.codec.binary.Base64;
+
+public class ENCODE extends EvalFunc<String>
+{
+ public String exec(Tuple input) throws IOException
+ {
+
+ if (input == null || input.size() == 0)
+ return null;
+
+ try{
+ String decoded_str = (String)input.get(0);
+ return new String(Base64.encodeBase64( decoded_str.getBytes()));
+ }catch (Exception e){
+ System.err.println ("Failed to process input; error - " +
e.getMessage());
+ return null;
+ }
+ }
+}