Author: kevinwilfong Date: Fri May 11 00:42:04 2012 New Revision: 1336986 URL: http://svn.apache.org/viewvc?rev=1336986&view=rev Log: HIVE-3012 hive custom scripts do not work well if the data contains new lines (njain via kevinwilfong)
Added: hive/trunk/data/scripts/newline.py hive/trunk/ql/src/test/queries/clientpositive/newline.q hive/trunk/ql/src/test/results/clientpositive/newline.q.out Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java hive/trunk/conf/hive-default.xml.template hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1336986&r1=1336985&r2=1336986&view=diff ============================================================================== --- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original) +++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Fri May 11 00:42:04 2012 @@ -389,6 +389,7 @@ public class HiveConf extends Configurat "org.apache.hadoop.hive.ql.exec.TextRecordReader"), HIVESCRIPTRECORDWRITER("hive.script.recordwriter", "org.apache.hadoop.hive.ql.exec.TextRecordWriter"), + HIVESCRIPTESCAPENEWLINES("hive.script.escape.newlines", false), // HWI HIVEHWILISTENHOST("hive.hwi.listen.host", "0.0.0.0"), Modified: hive/trunk/conf/hive-default.xml.template URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml.template?rev=1336986&r1=1336985&r2=1336986&view=diff ============================================================================== --- hive/trunk/conf/hive-default.xml.template (original) +++ hive/trunk/conf/hive-default.xml.template Fri May 11 00:42:04 2012 @@ -1272,4 +1272,15 @@ </description> </property> +<property> + <name>hive.script.escape.newlines</name> + <value>false</value> + <description> + This adds an option to escape the newlines when they are passed to the + user script. 
This is useful if the hive tables can contain data that + can contain newlines. + </description> +</property> + </configuration> + Added: hive/trunk/data/scripts/newline.py URL: http://svn.apache.org/viewvc/hive/trunk/data/scripts/newline.py?rev=1336986&view=auto ============================================================================== --- hive/trunk/data/scripts/newline.py (added) +++ hive/trunk/data/scripts/newline.py Fri May 11 00:42:04 2012 @@ -0,0 +1,22 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +import sys + +for line in sys.stdin: + print "1\\n2" Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java?rev=1336986&r1=1336985&r2=1336986&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordReader.java Fri May 11 00:42:04 2012 @@ -27,6 +27,9 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.LineRecordReader.LineReader; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.metadata.HiveUtils; + /** * TextRecordReader. * @@ -36,11 +39,13 @@ public class TextRecordReader implements private LineReader lineReader; private InputStream in; private Text row; + private Configuration conf; public void initialize(InputStream in, Configuration conf, Properties tbl) throws IOException { lineReader = new LineReader(in, conf); this.in = in; + this.conf = conf; } public Writable createRow() throws IOException { @@ -53,7 +58,12 @@ public class TextRecordReader implements return -1; } - return lineReader.readLine((Text) row); + int bytesConsumed = lineReader.readLine((Text) row); + + if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESCRIPTESCAPENEWLINES)) { + return HiveUtils.unescapeNewLine((Text) row); + } + return bytesConsumed; } public void close() throws IOException { Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java?rev=1336986&r1=1336985&r2=1336986&view=diff ============================================================================== --- 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/TextRecordWriter.java Fri May 11 00:42:04 2012 @@ -24,6 +24,8 @@ import java.io.OutputStream; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.hive.ql.metadata.HiveUtils; +import org.apache.hadoop.hive.conf.HiveConf; /** * TextRecordWriter. @@ -32,15 +34,23 @@ import org.apache.hadoop.io.Writable; public class TextRecordWriter implements RecordWriter { private OutputStream out; + private Configuration conf; public void initialize(OutputStream out, Configuration conf) throws IOException { this.out = out; + this.conf = conf; } public void write(Writable row) throws IOException { Text text = (Text) row; - out.write(text.getBytes(), 0, text.getLength()); + Text escapeText = text; + + if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESCRIPTESCAPENEWLINES)) { + escapeText = HiveUtils.escapeNewLine(text); + } + + out.write(escapeText.getBytes(), 0, escapeText.getLength()); out.write(Utilities.newLineCode); } Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java?rev=1336986&r1=1336985&r2=1336986&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java Fri May 11 00:42:04 2012 @@ -29,6 +29,7 @@ import org.apache.hadoop.hive.ql.securit import org.apache.hadoop.hive.ql.security.HiveAuthenticationProvider; import org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider; import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider; +import org.apache.hadoop.io.Text; 
import org.apache.hadoop.util.ReflectionUtils; /** @@ -96,6 +97,71 @@ public final class HiveUtils { return (escape.toString()); } + static final byte[] newLineEscapeBytes = "\\n".getBytes(); + static final byte[] newLineUnescapeBytes = "\n".getBytes(); + + public static Text escapeNewLine(Text text) { + int length = text.getLength(); + byte[] textBytes = text.getBytes(); + + Text escape = new Text(text); + escape.clear(); + + for (int i = 0; i < length; ++i) { + int c = text.charAt(i); + switch (c) { + case '\n': + byte[] escaped = newLineEscapeBytes; + escape.append(escaped, 0, escaped.length); + break; + default: + escape.append(textBytes, i, 1); + break; + } + } + return escape; + } + + public static int unescapeNewLine(Text text) { + Text escape = new Text(text); + text.clear(); + + int length = escape.getLength(); + byte[] textBytes = escape.getBytes(); + + boolean hadSlash = false; + for (int i = 0; i < length; ++i) { + int c = escape.charAt(i); + switch (c) { + case '\\': + if (hadSlash) { + text.append(textBytes, i, 1); + } + hadSlash = true; + break; + case 'n': + if (hadSlash) { + byte[] newLine = newLineUnescapeBytes; + text.append(newLine, 0, newLine.length); + } + else { + text.append(textBytes, i, 1); + } + hadSlash = false; + break; + default: + if (hadSlash) { + text.append(textBytes, i-1, 1); + hadSlash = false; + } + + text.append(textBytes, i, 1); + break; + } + } + return text.getLength(); + } + public static String lightEscapeString(String str) { int length = str.length(); StringBuilder escape = new StringBuilder(length + 16); Added: hive/trunk/ql/src/test/queries/clientpositive/newline.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/newline.q?rev=1336986&view=auto ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/newline.q (added) +++ hive/trunk/ql/src/test/queries/clientpositive/newline.q Fri May 11 00:42:04 2012 @@ -0,0 +1,9 @@ 
+add file ../data/scripts/newline.py; +set hive.script.escape.newlines=true; + +create table tmp_tmp(key string, value string) stored as rcfile; +insert overwrite table tmp_tmp +SELECT TRANSFORM(key, value) USING +'python newline.py' AS key, value FROM src limit 5; + +select * from tmp_tmp; \ No newline at end of file Added: hive/trunk/ql/src/test/results/clientpositive/newline.q.out URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/newline.q.out?rev=1336986&view=auto ============================================================================== --- hive/trunk/ql/src/test/results/clientpositive/newline.q.out (added) +++ hive/trunk/ql/src/test/results/clientpositive/newline.q.out Fri May 11 00:42:04 2012 @@ -0,0 +1,39 @@ +PREHOOK: query: create table tmp_tmp(key string, value string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table tmp_tmp(key string, value string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tmp_tmp +PREHOOK: query: insert overwrite table tmp_tmp +SELECT TRANSFORM(key, value) USING +'python newline.py' AS key, value FROM src limit 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tmp_tmp +POSTHOOK: query: insert overwrite table tmp_tmp +SELECT TRANSFORM(key, value) USING +'python newline.py' AS key, value FROM src limit 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@tmp_tmp +POSTHOOK: Lineage: tmp_tmp.key SCRIPT [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tmp_tmp.value SCRIPT [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: select * from tmp_tmp +PREHOOK: type: QUERY +PREHOOK: Input: default@tmp_tmp +#### A masked pattern was here #### +POSTHOOK: query: select * from tmp_tmp +POSTHOOK: type: QUERY 
+POSTHOOK: Input: default@tmp_tmp +#### A masked pattern was here #### +POSTHOOK: Lineage: tmp_tmp.key SCRIPT [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: tmp_tmp.value SCRIPT [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +1 +2 NULL +1 +2 NULL +1 +2 NULL +1 +2 NULL +1 +2 NULL