Josh Rosen created SPARK-17160:
----------------------------------
Summary: GetExternalRowField does not properly escape field names,
causing generated code not to compile
Key: SPARK-17160
URL: https://issues.apache.org/jira/browse/SPARK-17160
Project: Spark
Issue Type: Bug
Components: SQL
Affects Versions: 2.0.0
Reporter: Josh Rosen
Priority: Critical
The following end-to-end test uncovered a bug in {{GetExternalRowField}}:
{code}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.catalyst.encoders._
spark.sql("set spark.sql.codegen.fallback=false")
val df = Seq(("100-200", "1", "300")).toDF("a", "b", "c")
val df2 = df.select(regexp_replace($"a", "(\\d+)", "num"))
df2.mapPartitions(x => x)(RowEncoder(df2.schema)).collect()
{code}
Running this code causes the following compilation failure:
{code}
java.lang.Exception: failed to compile:
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 55,
Column 64: Invalid escape sequence
{code}
The generated code is:
{code}
/* 001 */ public Object generate(Object[] references) {
/* 002 */ return new GeneratedIterator(references);
/* 003 */ }
/* 004 */
/* 005 */ final class GeneratedIterator extends
org.apache.spark.sql.execution.BufferedRowIterator {
/* 006 */ private Object[] references;
/* 007 */ private scala.collection.Iterator inputadapter_input;
/* 008 */ private java.lang.String serializefromobject_errMsg;
/* 009 */ private java.lang.String serializefromobject_errMsg1;
/* 010 */ private UnsafeRow serializefromobject_result;
/* 011 */ private
org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder
serializefromobject_holder;
/* 012 */ private
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter
serializefromobject_rowWriter;
/* 013 */
/* 014 */ public GeneratedIterator(Object[] references) {
/* 015 */ this.references = references;
/* 016 */ }
/* 017 */
/* 018 */ public void init(int index, scala.collection.Iterator inputs[]) {
/* 019 */ partitionIndex = index;
/* 020 */ inputadapter_input = inputs[0];
/* 021 */ this.serializefromobject_errMsg = (java.lang.String)
references[0];
/* 022 */ this.serializefromobject_errMsg1 = (java.lang.String)
references[1];
/* 023 */ serializefromobject_result = new UnsafeRow(1);
/* 024 */ this.serializefromobject_holder = new
org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(serializefromobject_result,
32);
/* 025 */ this.serializefromobject_rowWriter = new
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(serializefromobject_holder,
1);
/* 026 */ }
/* 027 */
/* 028 */ protected void processNext() throws java.io.IOException {
/* 029 */ while (inputadapter_input.hasNext()) {
/* 030 */ InternalRow inputadapter_row = (InternalRow)
inputadapter_input.next();
/* 031 */ org.apache.spark.sql.Row inputadapter_value =
(org.apache.spark.sql.Row)inputadapter_row.get(0, null);
/* 032 */
/* 033 */ if (false) {
/* 034 */ throw new RuntimeException(serializefromobject_errMsg);
/* 035 */ }
/* 036 */
/* 037 */ boolean serializefromobject_isNull1 = false || false;
/* 038 */ final boolean serializefromobject_value1 =
serializefromobject_isNull1 ? false : inputadapter_value.isNullAt(0);
/* 039 */ boolean serializefromobject_isNull = false;
/* 040 */ UTF8String serializefromobject_value = null;
/* 041 */ if (!serializefromobject_isNull1 && serializefromobject_value1)
{
/* 042 */ final UTF8String serializefromobject_value5 = null;
/* 043 */ serializefromobject_isNull = true;
/* 044 */ serializefromobject_value = serializefromobject_value5;
/* 045 */ } else {
/* 046 */ if (false) {
/* 047 */ throw new RuntimeException(serializefromobject_errMsg1);
/* 048 */ }
/* 049 */
/* 050 */ if (false) {
/* 051 */ throw new RuntimeException("The input external row cannot
be null.");
/* 052 */ }
/* 053 */
/* 054 */ if (inputadapter_value.isNullAt(0)) {
/* 055 */ throw new RuntimeException("The 0th field
'regexp_replace(a, (\d+), num)' of input row " +
/* 056 */ "cannot be null.");
/* 057 */ }
/* 058 */
/* 059 */ final Object serializefromobject_value8 =
inputadapter_value.get(0);
/* 060 */ java.lang.String serializefromobject_value7 = null;
/* 061 */ if (!false) {
/* 062 */ if (serializefromobject_value8 instanceof java.lang.String)
{
/* 063 */ serializefromobject_value7 = (java.lang.String)
serializefromobject_value8;
/* 064 */ } else {
/* 065 */ throw new
RuntimeException(serializefromobject_value8.getClass().getName() + " is not a
valid " +
/* 066 */ "external type for schema of string");
/* 067 */ }
/* 068 */ }
/* 069 */ boolean serializefromobject_isNull6 = false;
/* 070 */ final UTF8String serializefromobject_value6 =
serializefromobject_isNull6 ? null :
org.apache.spark.unsafe.types.UTF8String.fromString(serializefromobject_value7);
/* 071 */ serializefromobject_isNull6 = serializefromobject_value6 ==
null;
/* 072 */ serializefromobject_isNull = serializefromobject_isNull6;
/* 073 */ serializefromobject_value = serializefromobject_value6;
/* 074 */ }
/* 075 */ serializefromobject_holder.reset();
/* 076 */
/* 077 */ serializefromobject_rowWriter.zeroOutNullBytes();
/* 078 */
/* 079 */ if (serializefromobject_isNull) {
/* 080 */ serializefromobject_rowWriter.setNullAt(0);
/* 081 */ } else {
/* 082 */ serializefromobject_rowWriter.write(0,
serializefromobject_value);
/* 083 */ }
/* 084 */
serializefromobject_result.setTotalSize(serializefromobject_holder.totalSize());
/* 085 */ append(serializefromobject_result);
/* 086 */ if (shouldStop()) return;
/* 087 */ }
/* 088 */ }
/* 089 */ }
{code}
Here, the problem is that the auto-generated field name contains special
characters (including backslashes) and those aren't escaped when being
interpolated into the generated code, causing the invalid string literal
{code}
"The 0th field 'regexp_replace(a, (\d+), num)' of input row "
{code}
to appear in the generated code.
We need to update {{GetExternalRowField}} to escape field names and also need
to audit other expressions to make sure that we're not making the same mistake
there.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]