Josh Rosen created SPARK-17160:
----------------------------------

             Summary: GetExternalRowField does not properly escape field names, 
causing generated code not to compile
                 Key: SPARK-17160
                 URL: https://issues.apache.org/jira/browse/SPARK-17160
             Project: Spark
          Issue Type: Bug
          Components: SQL
    Affects Versions: 2.0.0
            Reporter: Josh Rosen
            Priority: Critical


The following end-to-end test uncovered a bug in {{GetExternalRowField}}:

{code}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.catalyst.encoders._

spark.sql("set spark.sql.codegen.fallback=false")

val df = Seq(("100-200", "1", "300")).toDF("a", "b", "c")
val df2 = df.select(regexp_replace($"a", "(\\d+)", "num"))
df2.mapPartitions(x => x)(RowEncoder(df2.schema)).collect()
{code}

Running this code causes the following compilation failure:

{code}
java.lang.Exception: failed to compile: 
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 55, 
Column 64: Invalid escape sequence
{code}

The generated code is:

{code}
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIterator(references);
/* 003 */ }
/* 004 */
/* 005 */ final class GeneratedIterator extends 
org.apache.spark.sql.execution.BufferedRowIterator {
/* 006 */   private Object[] references;
/* 007 */   private scala.collection.Iterator inputadapter_input;
/* 008 */   private java.lang.String serializefromobject_errMsg;
/* 009 */   private java.lang.String serializefromobject_errMsg1;
/* 010 */   private UnsafeRow serializefromobject_result;
/* 011 */   private 
org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder 
serializefromobject_holder;
/* 012 */   private 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter 
serializefromobject_rowWriter;
/* 013 */
/* 014 */   public GeneratedIterator(Object[] references) {
/* 015 */     this.references = references;
/* 016 */   }
/* 017 */
/* 018 */   public void init(int index, scala.collection.Iterator inputs[]) {
/* 019 */     partitionIndex = index;
/* 020 */     inputadapter_input = inputs[0];
/* 021 */     this.serializefromobject_errMsg = (java.lang.String) 
references[0];
/* 022 */     this.serializefromobject_errMsg1 = (java.lang.String) 
references[1];
/* 023 */     serializefromobject_result = new UnsafeRow(1);
/* 024 */     this.serializefromobject_holder = new 
org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(serializefromobject_result,
 32);
/* 025 */     this.serializefromobject_rowWriter = new 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(serializefromobject_holder,
 1);
/* 026 */   }
/* 027 */
/* 028 */   protected void processNext() throws java.io.IOException {
/* 029 */     while (inputadapter_input.hasNext()) {
/* 030 */       InternalRow inputadapter_row = (InternalRow) 
inputadapter_input.next();
/* 031 */       org.apache.spark.sql.Row inputadapter_value = 
(org.apache.spark.sql.Row)inputadapter_row.get(0, null);
/* 032 */
/* 033 */       if (false) {
/* 034 */         throw new RuntimeException(serializefromobject_errMsg);
/* 035 */       }
/* 036 */
/* 037 */       boolean serializefromobject_isNull1 = false || false;
/* 038 */       final boolean serializefromobject_value1 = 
serializefromobject_isNull1 ? false : inputadapter_value.isNullAt(0);
/* 039 */       boolean serializefromobject_isNull = false;
/* 040 */       UTF8String serializefromobject_value = null;
/* 041 */       if (!serializefromobject_isNull1 && serializefromobject_value1) 
{
/* 042 */         final UTF8String serializefromobject_value5 = null;
/* 043 */         serializefromobject_isNull = true;
/* 044 */         serializefromobject_value = serializefromobject_value5;
/* 045 */       } else {
/* 046 */         if (false) {
/* 047 */           throw new RuntimeException(serializefromobject_errMsg1);
/* 048 */         }
/* 049 */
/* 050 */         if (false) {
/* 051 */           throw new RuntimeException("The input external row cannot 
be null.");
/* 052 */         }
/* 053 */
/* 054 */         if (inputadapter_value.isNullAt(0)) {
/* 055 */           throw new RuntimeException("The 0th field 
'regexp_replace(a, (\d+), num)' of input row " +
/* 056 */             "cannot be null.");
/* 057 */         }
/* 058 */
/* 059 */         final Object serializefromobject_value8 = 
inputadapter_value.get(0);
/* 060 */         java.lang.String serializefromobject_value7 = null;
/* 061 */         if (!false) {
/* 062 */           if (serializefromobject_value8 instanceof java.lang.String) 
{
/* 063 */             serializefromobject_value7 = (java.lang.String) 
serializefromobject_value8;
/* 064 */           } else {
/* 065 */             throw new 
RuntimeException(serializefromobject_value8.getClass().getName() + " is not a 
valid " +
/* 066 */               "external type for schema of string");
/* 067 */           }
/* 068 */         }
/* 069 */         boolean serializefromobject_isNull6 = false;
/* 070 */         final UTF8String serializefromobject_value6 = 
serializefromobject_isNull6 ? null : 
org.apache.spark.unsafe.types.UTF8String.fromString(serializefromobject_value7);
/* 071 */         serializefromobject_isNull6 = serializefromobject_value6 == 
null;
/* 072 */         serializefromobject_isNull = serializefromobject_isNull6;
/* 073 */         serializefromobject_value = serializefromobject_value6;
/* 074 */       }
/* 075 */       serializefromobject_holder.reset();
/* 076 */
/* 077 */       serializefromobject_rowWriter.zeroOutNullBytes();
/* 078 */
/* 079 */       if (serializefromobject_isNull) {
/* 080 */         serializefromobject_rowWriter.setNullAt(0);
/* 081 */       } else {
/* 082 */         serializefromobject_rowWriter.write(0, 
serializefromobject_value);
/* 083 */       }
/* 084 */       
serializefromobject_result.setTotalSize(serializefromobject_holder.totalSize());
/* 085 */       append(serializefromobject_result);
/* 086 */       if (shouldStop()) return;
/* 087 */     }
/* 088 */   }
/* 089 */ }
{code}

Here, the problem is that the auto-generated field name contains special 
characters (including backslashes) which are not escaped when interpolated 
into the generated Java source, producing the invalid string literal

{code}
"The 0th field 'regexp_replace(a, (\d+), num)' of input row "
{code}

to appear in the generated code.

We need to update {{GetExternalRowField}} to escape field names before 
embedding them in generated code, and we should also audit other codegen 
expressions to ensure they do not make the same mistake.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to