Author: daijy
Date: Fri Jan  9 18:55:53 2015
New Revision: 1650628

URL: http://svn.apache.org/r1650628
Log:
PIG-4358: Add test cases for utf8 chinese in Pig

Added:
    pig/trunk/test/e2e/pig/tests/utf8.conf
    
pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java
Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/test/e2e/pig/build.xml
    pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm
    pig/trunk/test/e2e/pig/tools/generate/generate_data.pl

Modified: pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1650628&r1=1650627&r2=1650628&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Fri Jan  9 18:55:53 2015
@@ -24,6 +24,8 @@ INCOMPATIBLE CHANGES
  
 IMPROVEMENTS
 
+PIG-4358: Add test cases for utf8 chinese in Pig (nmaheshwari via daijy)
+
 PIG-4370: HBaseStorage should support delete markers (bridiver via daijy)
 
 PIG-4360: HBaseStorage should support setting the timestamp field (bridiver 
via daijy)

Modified: pig/trunk/test/e2e/pig/build.xml
URL: 
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/build.xml?rev=1650628&r1=1650627&r2=1650628&view=diff
==============================================================================
--- pig/trunk/test/e2e/pig/build.xml (original)
+++ pig/trunk/test/e2e/pig/build.xml Fri Jan  9 18:55:53 2015
@@ -305,6 +305,7 @@
       <arg value="${test.location}/tests/macro.conf"/>
       <arg value="${test.location}/tests/orc.conf"/>
       <arg value="${test.location}/tests/hcat.conf"/>
+      <arg value="${test.location}/tests/utf8.conf"/>
     </exec>
   </target>
 

Modified: pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm
URL: 
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm?rev=1650628&r1=1650627&r2=1650628&view=diff
==============================================================================
--- pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm (original)
+++ pig/trunk/test/e2e/pig/deployers/ExistingClusterDeployer.pm Fri Jan  9 
18:55:53 2015
@@ -240,6 +240,16 @@ sub generateData
             'filetype' => "ranking",
             'rows' => 30,
             'hdfs' => "singlefile/prerank",
+        }, {
+            'name' => "utf8Voter",
+            'filetype' => "utf8Voter",
+            'rows' => 30,
+            'hdfs' => "utf8Data/选民/utf8Voter",
+        }, {
+            'name' => "utf8Student",
+            'filetype' => "utf8Student",
+            'rows' => 300,
+            'hdfs' => "utf8Data/学生/utf8Student",
         }
     );
 

Added: pig/trunk/test/e2e/pig/tests/utf8.conf
URL: 
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/tests/utf8.conf?rev=1650628&view=auto
==============================================================================
--- pig/trunk/test/e2e/pig/tests/utf8.conf (added)
+++ pig/trunk/test/e2e/pig/tests/utf8.conf Fri Jan  9 18:55:53 2015
@@ -0,0 +1,93 @@
+#!/usr/bin/env perl
+############################################################################
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+###############################################################################
+# End-to-end tests for UTF-8 (Chinese) data, paths and script names in Pig.
+#
+#
+
+# Test configuration for the 'Pig' driver: one group, 'Utf8Test', with two
+# tests.  Each test supplies a 'pig' script to run and a 'verify_pig_script'.
+# NOTE(review): :TMP:, :INPATH:, :OUTPATH: and :FUNCPATH: look like
+# placeholders substituted by the harness before execution, and
+# 'verify_pig_script' presumably produces the expected output -- confirm
+# against the driver.
+$cfg = {
+        'driver' => 'Pig',
+
+        'groups' => [
+
+                {
+                'name' => 'Utf8Test',
+                'tests' => [
+                        {
+                        'num' => 1,
+# TC1.1 Execute Pig script with Chinese script name and path, filter by Chinese value, and output to Chinese folder
+# The embedded script is Python (see its shebang line): it (re)creates a
+# directory with a Chinese name, writes a Pig script with a Chinese file name
+# into it, runs that script through the Pig scripting API, and raises if the
+# job is not successful.
+                        'pig' => q\#!/usr/bin/python
+               # -*- coding: utf-8 -*-
+from org.apache.pig.scripting import Pig
+import os
+import shutil
+#pig script dir
+pig_script_dir=u":TMP:/脚本路径"
+#delete if the dir (or a stale file of the same name) already exists
+if os.path.isdir(pig_script_dir):
+    shutil.rmtree(pig_script_dir)
+elif os.path.exists(pig_script_dir):
+    os.remove(pig_script_dir)
+#create the dir
+os.mkdir(pig_script_dir)
+#create pig script
+pig_script = u":TMP:/脚本路径/超.pig"
+pigfile = open( pig_script, "w+")
+pigfile.write("""
+A = LOAD ':INPATH:/utf8Data/学生/utf8Student' as (name:chararray, age:int, gpa:double);
+B = filter A by name == '佛';
+store B into ':OUTPATH:';
+""")
+pigfile.close()
+
+#execute pig script
+
+result = Pig.compileFromFile( pig_script ).bind().runSingle()
+
+if result.isSuccessful():
+    print "Pig job PASSED"
+else:
+    raise Exception("Pig job FAILED")
+\,
+
+             'verify_pig_script' => q\A = LOAD ':INPATH:/utf8Data/学生/utf8Student' as (name:chararray, age:int, gpa:double);
+                                      B = filter A by name == '佛';
+                                      store B into ':OUTPATH:';
+\
+                        },
+#--------------------------------------------------------------------------------
+                        {
+                        'num' => 2,
+# TC1.2 Joining tables on columns storing Chinese data in Pig
+# The script under test joins on the raw Chinese name column; the verify
+# script joins on the ENCODE()d form of the same column.
+                        'pig' => q\
+A = LOAD ':INPATH:/utf8Data/学生/utf8Student' as (name:chararray, age:int, gpa:double);
+B = LOAD ':INPATH:/utf8Data/选民/utf8Voter' AS (name:chararray, age:int, registration:chararray, contributions:double);
+C = JOIN A by name, B by name;
+D = FOREACH C GENERATE gpa, registration, contributions;
+store D into ':OUTPATH:';
+\,
+                        'verify_pig_script' => q\register :FUNCPATH:/testudf.jar;
+A = LOAD ':INPATH:/utf8Data/学生/utf8Student' as (name:chararray, age:int, gpa:double);
+B = LOAD ':INPATH:/utf8Data/选民/utf8Voter' AS (name:chararray, age:int, registration:chararray, contributions:double);
+C = JOIN A by org.apache.pig.test.udf.evalfunc.ENCODE(name), B by org.apache.pig.test.udf.evalfunc.ENCODE(name);
+D = FOREACH C GENERATE gpa, registration, contributions;
+store D into ':OUTPATH:';
+\,
+                        }
+#--------------------------------------------------------------------------------
+                    ]  # end tests
+                }, # end group
+     ] # end groups
+};

Modified: pig/trunk/test/e2e/pig/tools/generate/generate_data.pl
URL: 
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/tools/generate/generate_data.pl?rev=1650628&r1=1650627&r2=1650628&view=diff
==============================================================================
--- pig/trunk/test/e2e/pig/tools/generate/generate_data.pl (original)
+++ pig/trunk/test/e2e/pig/tools/generate/generate_data.pl Fri Jan  9 18:55:53 
2015
@@ -249,6 +249,13 @@ sub randomUnicodeNonAscii()
 
 my $testvar = "\N{U+03b1}\N{U+03b3}\N{U+03b1}\N{U+03c0}\N{U+03b7}";
 
+# Pool of single-character Chinese names used when generating the utf8 data sets.
+our @utf8Name = ("佛","实","脑","体","宝","国","双","电","东","马");
+
+# Returns one entry of @utf8Name picked with rand().  The upper bound is taken
+# from the array itself so the pool can be edited without touching this sub;
+# with the current 10 entries the draw is identical to rand(10).
+sub randomUtf8Name()
+{
+    return $utf8Name[int(rand(scalar(@utf8Name)))];
+}
+
 sub getBulkCopyCmd(){
         my $sourceDir= shift;
         my $tableName = shift;
@@ -511,6 +518,28 @@ sub getBulkCopyCmd(){
             }
             printf HDFS "\n";
         }
+    } elsif ($filetype eq "utf8Student") {
+        srand(3.14159 + $numRows);
+        print PSQL "create table $tableName (name varchar(100), age integer, 
gpa float(3));\n" unless defined $nosql;
+        print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined 
$nosql;
+        for (my $i = 0; $i < $numRows; $i++) {
+            my $name = randomUtf8Name();
+            my $age = randomAge();
+            my $gpa = randomGpa();
+            printf HDFS "%s\t%d\t%.2f\n", $name, $age, $gpa;
+        }
+    } elsif ($filetype eq "utf8Voter") {
+        srand(3.14159 + $numRows);
+        print PSQL "create table $tableName (name varchar(100), age integer, 
registration varchar(20), contributions float);\n" unless defined $nosql;
+        print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined 
$nosql;
+        for (my $i = 0; $i < $numRows; $i++) {
+            my $name = randomUtf8Name();
+            my $age = randomAge();
+            my $registration = randomRegistration();
+            my $contribution = randomContribution();
+            printf HDFS "%s\t%d\t%s\t%.2f\n", $name, $age,
+                $registration, $contribution;
+        }
     } else {
         warn "Unknown filetype $filetype\n";
         usage();

Added: 
pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java
URL: 
http://svn.apache.org/viewvc/pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java?rev=1650628&view=auto
==============================================================================
--- 
pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java 
(added)
+++ 
pig/trunk/test/e2e/pig/udfs/java/org/apache/pig/test/udf/evalfunc/ENCODE.java 
Fri Jan  9 18:55:53 2015
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.test.udf.evalfunc;
+
+
+import java.io.IOException;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.data.DataType;
+import org.apache.commons.codec.binary.Base64;
+
+public class ENCODE extends EvalFunc<String>
+{
+    public String exec(Tuple input) throws IOException
+    {
+
+        if (input == null || input.size() == 0)
+            return null;
+
+        try{
+            String decoded_str = (String)input.get(0);
+            return new String(Base64.encodeBase64( decoded_str.getBytes()));
+        }catch (Exception e){
+            System.err.println ("Failed to process input; error - " + 
e.getMessage());
+            return null;
+        }
+    }
+}


Reply via email to