Author: kevinwilfong
Date: Fri May 11 00:42:04 2012
New Revision: 1336986

URL: http://svn.apache.org/viewvc?rev=1336986&view=rev
Log:
HIVE-3012 hive custom scripts do not work well if the data contains new lines (njain via kevinwilfong)

Added:
    hive/trunk/data/scripts/newline.py
    hive/trunk/ql/src/test/queries/clientpositive/newline.q
    hive/trunk/ql/src/test/results/clientpositive/newline.q.out
Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hive/trunk/conf/hive-default.xml.template
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1336986&r1=1336985&r2=1336986&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Fri May 11 00:42:04 2012
@@ -389,6 +389,7 @@ public class HiveConf extends Configurat
         "org.apache.hadoop.hive.ql.exec.TextRecordReader"),
     HIVESCRIPTRECORDWRITER("hive.script.recordwriter",
         "org.apache.hadoop.hive.ql.exec.TextRecordWriter"),
+    HIVESCRIPTESCAPENEWLINES("hive.script.escape.newlines", false),
 
     // HWI
     HIVEHWILISTENHOST("hive.hwi.listen.host", "0.0.0.0"),

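The new ConfVars entry defaults to false, so existing script transforms behave exactly as before unless the flag is turned on. Below is a minimal, hypothetical sketch (the class name FlagDemo is made up) of toggling and reading the flag through HiveConf; getBoolVar is the same call the reader and writer changes below use, and setBoolVar is assumed to be its standard counterpart.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.conf.HiveConf;

    public class FlagDemo {
      public static void main(String[] args) {
        Configuration conf = new HiveConf();

        // Defaults to false, per the ConfVars declaration above.
        System.out.println(HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESCRIPTESCAPENEWLINES));

        // Enable it; equivalent to "set hive.script.escape.newlines=true;" in a
        // session (the test query below does exactly that).
        HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVESCRIPTESCAPENEWLINES, true);
        System.out.println(HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESCRIPTESCAPENEWLINES));
      }
    }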
Modified: hive/trunk/conf/hive-default.xml.template
URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml.template?rev=1336986&r1=1336985&r2=1336986&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml.template (original)
+++ hive/trunk/conf/hive-default.xml.template Fri May 11 00:42:04 2012
@@ -1272,4 +1272,15 @@
   </description>
 </property>
 
+<property>
+  <name>hive.script.escape.newlines</name>
+  <value>false</value>
+  <description>
+    This adds an option to escape newline characters when they are passed
+    to the user script. This is useful if the data in the Hive tables can
+    contain newline characters.
+  </description>
+</property>
+
 </configuration>
+

Added: hive/trunk/data/scripts/newline.py
URL: http://svn.apache.org/viewvc/hive/trunk/data/scripts/newline.py?rev=1336986&view=auto
==============================================================================
--- hive/trunk/data/scripts/newline.py (added)
+++ hive/trunk/data/scripts/newline.py Fri May 11 00:42:04 2012
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+import sys
+
+for line in sys.stdin:
+  print "1\\n2"

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java?rev=1336986&r1=1336985&r2=1336986&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java Fri May 11 00:42:04 2012
@@ -27,6 +27,9 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.LineRecordReader.LineReader;
 
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+
 /**
  * TextRecordReader.
  *
@@ -36,11 +39,13 @@ public class TextRecordReader implements
   private LineReader lineReader;
   private InputStream in;
   private Text row;
+  private Configuration conf;
 
   public void initialize(InputStream in, Configuration conf, Properties tbl)
       throws IOException {
     lineReader = new LineReader(in, conf);
     this.in = in;
+    this.conf = conf;
   }
 
   public Writable createRow() throws IOException {
@@ -53,7 +58,12 @@ public class TextRecordReader implements
       return -1;
     }
 
-    return lineReader.readLine((Text) row);
+    int bytesConsumed = lineReader.readLine((Text) row);
+
+    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESCRIPTESCAPENEWLINES)) {
+      return HiveUtils.unescapeNewLine((Text) row);
+    }
+    return bytesConsumed;
   }
 
   public void close() throws IOException {

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java?rev=1336986&r1=1336985&r2=1336986&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java Fri May 11 00:42:04 2012
@@ -24,6 +24,8 @@ import java.io.OutputStream;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.conf.HiveConf;
 
 /**
  * TextRecordWriter.
@@ -32,15 +34,23 @@ import org.apache.hadoop.io.Writable;
 public class TextRecordWriter implements RecordWriter {
 
   private OutputStream out;
+  private Configuration conf;
 
   public void initialize(OutputStream out, Configuration conf)
       throws IOException {
     this.out = out;
+    this.conf = conf;
   }
 
   public void write(Writable row) throws IOException {
     Text text = (Text) row;
-    out.write(text.getBytes(), 0, text.getLength());
+    Text escapeText = text;
+
+    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESCRIPTESCAPENEWLINES)) {
+      escapeText = HiveUtils.escapeNewLine(text);
+    }
+
+    out.write(escapeText.getBytes(), 0, escapeText.getLength());
     out.write(Utilities.newLineCode);
   }
 

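With the flag off, a row whose value contains a real newline would reach the child script as two physical lines (the embedded newline plus the Utilities.newLineCode separator), breaking the one-line-per-record contract; with the flag on, the newline is escaped first. A minimal, hypothetical sketch of the write path (the class name WriterDemo is made up; it assumes the standard RecordWriter initialize/write/close lifecycle and the assumed HiveConf.setBoolVar setter):

    import java.io.ByteArrayOutputStream;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.ql.exec.TextRecordWriter;
    import org.apache.hadoop.io.Text;

    public class WriterDemo {
      public static void main(String[] args) throws Exception {
        Configuration conf = new HiveConf();
        HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVESCRIPTESCAPENEWLINES, true);

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        TextRecordWriter writer = new TextRecordWriter();
        writer.initialize(out, conf);

        // The embedded newline is escaped to the two bytes '\' and 'n', so the
        // child script receives the single physical line "1\n2" plus one separator.
        writer.write(new Text("1\n2"));
        writer.close();

        System.out.print(out.toString());
      }
    }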
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java?rev=1336986&r1=1336985&r2=1336986&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java Fri May 11 00:42:04 2012
@@ -29,6 +29,7 @@ import org.apache.hadoop.hive.ql.securit
 import org.apache.hadoop.hive.ql.security.HiveAuthenticationProvider;
 import org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider;
 import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.util.ReflectionUtils;
 
 /**
@@ -96,6 +97,71 @@ public final class HiveUtils {
     return (escape.toString());
   }
 
+  static final byte[] newLineEscapeBytes = "\\n".getBytes();
+  static final byte[] newLineUnescapeBytes = "\n".getBytes();
+
+  public static Text escapeNewLine(Text text) {
+    int length = text.getLength();
+    byte[] textBytes = text.getBytes();
+
+    Text escape = new Text(text);
+    escape.clear();
+
+    for (int i = 0; i < length; ++i) {
+      int c = text.charAt(i);
+      switch (c) {
+      case '\n':
+        byte[] escaped = newLineEscapeBytes;
+        escape.append(escaped, 0, escaped.length);
+        break;
+      default:
+        escape.append(textBytes, i, 1);
+        break;
+      }
+    }
+    return escape;
+  }
+
+  public static int unescapeNewLine(Text text) {
+    Text escape = new Text(text);
+    text.clear();
+
+    int length = escape.getLength();
+    byte[] textBytes = escape.getBytes();
+
+    boolean hadSlash = false;
+    for (int i = 0; i < length; ++i) {
+      int c = escape.charAt(i);
+      switch (c) {
+      case '\\':
+        if (hadSlash) {
+          text.append(textBytes, i, 1);
+        }
+        hadSlash = true;
+        break;
+      case 'n':
+        if (hadSlash) {
+          byte[] newLine = newLineUnescapeBytes;
+          text.append(newLine, 0, newLine.length);
+        }
+        else {
+          text.append(textBytes, i, 1);
+        }
+        hadSlash = false;
+        break;
+      default:
+        if (hadSlash) {
+          text.append(textBytes, i-1, 1);
+          hadSlash = false;
+        }
+
+        text.append(textBytes, i, 1);
+        break;
+      }
+    }
+    return text.getLength();
+  }
+
   public static String lightEscapeString(String str) {
     int length = str.length();
     StringBuilder escape = new StringBuilder(length + 16);

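The two helpers are intended to be inverses: escapeNewLine returns a new Text in which each embedded newline becomes the two bytes '\' and 'n', while unescapeNewLine rewrites its argument in place and returns the resulting byte length (which is what the TextRecordReader change returns as its record length when the flag is on). A minimal, hypothetical round-trip sketch based on the code above (the class name RoundTripDemo is made up):

    import org.apache.hadoop.hive.ql.metadata.HiveUtils;
    import org.apache.hadoop.io.Text;

    public class RoundTripDemo {
      public static void main(String[] args) {
        // A value containing a real newline, as it might appear in a Hive column.
        Text value = new Text("1\n2");

        // Write path (TextRecordWriter): the newline becomes '\' 'n', so the
        // user script sees exactly one physical line per record.
        Text escaped = HiveUtils.escapeNewLine(value);
        System.out.println(escaped);     // prints: 1\n2

        // Read path (TextRecordReader): the escape sequence is turned back into
        // a real newline, in place, and the new length is returned.
        int length = HiveUtils.unescapeNewLine(escaped);
        System.out.println(length);      // 3 bytes: '1', '\n', '2'
        System.out.println(escaped);     // prints "1" and "2" on separate lines
      }
    }

This round trip also explains the test below: newline.py emits the literal backslash-n sequence, which the reader unescapes into a real newline in the stored value, so each row displays across two lines in the final select.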
Added: hive/trunk/ql/src/test/queries/clientpositive/newline.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/newline.q?rev=1336986&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/newline.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/newline.q Fri May 11 00:42:04 2012
@@ -0,0 +1,9 @@
+add file ../data/scripts/newline.py;
+set hive.script.escape.newlines=true;
+
+create table tmp_tmp(key string, value string) stored as rcfile;
+insert overwrite table tmp_tmp
+SELECT TRANSFORM(key, value) USING
+'python newline.py' AS key, value FROM src limit 5;
+
+select * from tmp_tmp;
\ No newline at end of file

Added: hive/trunk/ql/src/test/results/clientpositive/newline.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/newline.q.out?rev=1336986&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/newline.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/newline.q.out Fri May 11 00:42:04 2012
@@ -0,0 +1,39 @@
+PREHOOK: query: create table tmp_tmp(key string, value string) stored as rcfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table tmp_tmp(key string, value string) stored as rcfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tmp_tmp
+PREHOOK: query: insert overwrite table tmp_tmp
+SELECT TRANSFORM(key, value) USING
+'python newline.py' AS key, value FROM src limit 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@tmp_tmp
+POSTHOOK: query: insert overwrite table tmp_tmp
+SELECT TRANSFORM(key, value) USING
+'python newline.py' AS key, value FROM src limit 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@tmp_tmp
+POSTHOOK: Lineage: tmp_tmp.key SCRIPT [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tmp_tmp.value SCRIPT [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: select * from tmp_tmp
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tmp_tmp
+#### A masked pattern was here ####
+POSTHOOK: query: select * from tmp_tmp
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tmp_tmp
+#### A masked pattern was here ####
+POSTHOOK: Lineage: tmp_tmp.key SCRIPT [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: tmp_tmp.value SCRIPT [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ]
+1
+2      NULL
+1
+2      NULL
+1
+2      NULL
+1
+2      NULL
+1
+2      NULL

