HeartSaVioR opened a new pull request #28831:
URL: https://github.com/apache/spark/pull/28831


   ### What changes were proposed in this pull request?
   
   This patch fixes the code generation logic for mixed string/array types of 
columns in `concat_ws` to not split methods, because splitting method would 
generate the code which is unable to compile. (Refer the next section `Why are 
the changes needed?` for details.)
   
   The change won't bring performance issue, because it's tricky to trigger 
method split, and due to the relation of parts it should fail to compile 
whenever the method split happens.
   
   Below is the generated code for newly added UT, mixed string/array type of 
columns.
   
   > before the patch
   
   ```
   /* 001 */ public java.lang.Object generate(Object[] references) {
   /* 002 */   return new SpecificUnsafeProjection(references);
   /* 003 */ }
   /* 004 */
   /* 005 */ class SpecificUnsafeProjection extends 
org.apache.spark.sql.catalyst.expressions.UnsafeProjection {
   /* 006 */
   /* 007 */   private Object[] references;
   /* 008 */   private 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] 
mutableStateArray_0 = new 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[1];
   /* 009 */
   /* 010 */   public SpecificUnsafeProjection(Object[] references) {
   /* 011 */     this.references = references;
   /* 012 */     mutableStateArray_0[0] = new 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 32);
   /* 013 */
   /* 014 */   }
   /* 015 */
   /* 016 */   public void initialize(int partitionIndex) {
   /* 017 */
   /* 018 */   }
   /* 019 */
   /* 020 */   // Scala.Function1 need this
   /* 021 */   public java.lang.Object apply(java.lang.Object row) {
   /* 022 */     return apply((InternalRow) row);
   /* 023 */   }
   /* 024 */
   /* 025 */   public UnsafeRow apply(InternalRow i) {
   /* 026 */     mutableStateArray_0[0].reset();
   /* 027 */
   /* 028 */
   /* 029 */     mutableStateArray_0[0].zeroOutNullBytes();
   /* 030 */
   /* 031 */     apply_0_0(i);
   /* 032 */     apply_0_1(i);
   /* 033 */     int varargNum_0 = 30;
   /* 034 */     int idxInVararg_0 = 0;
   /* 035 */
   /* 036 */     if (!isNull_2) {
   /* 037 */       varargNum_0 += value_2.numElements();
   /* 038 */     }
   /* 039 */
   /* 040 */     if (!isNull_3) {
   /* 041 */       varargNum_0 += value_3.numElements();
   /* 042 */     }
   /* 043 */
   /* 044 */     UTF8String[] array_0 = new UTF8String[varargNum_0];
   /* 045 */     idxInVararg_0 = varargBuildsConcatWs_0_0(i, array_0, 
idxInVararg_0);
   /* 046 */     idxInVararg_0 = varargBuildsConcatWs_0_1(i, array_0, 
idxInVararg_0);
   /* 047 */     idxInVararg_0 = varargBuildsConcatWs_0_2(i, array_0, 
idxInVararg_0);
   /* 048 */     UTF8String value_0 = UTF8String.concatWs(((UTF8String) 
references[0] /* literal */), array_0);
   /* 049 */     boolean isNull_0 = value_0 == null;
   /* 050 */     mutableStateArray_0[0].write(0, value_0);
   /* 051 */     return (mutableStateArray_0[0].getRow());
   /* 052 */   }
   /* 053 */
   /* 054 */
   /* 055 */   private void apply_0_1(InternalRow i) {
   /* 056 */     UTF8String value_25 = i.getUTF8String(22);UTF8String value_26 
= i.getUTF8String(23);UTF8String value_27 = i.getUTF8String(24);UTF8String 
value_28 = i.getUTF8String(25);UTF8String value_29 = 
i.getUTF8String(26);UTF8String value_30 = i.getUTF8String(27);UTF8String 
value_31 = i.getUTF8String(28);UTF8String value_32 = 
i.getUTF8String(29);UTF8String value_33 = i.getUTF8String(30);
   /* 057 */   }
   /* 058 */
   /* 059 */
   /* 060 */   private int varargBuildsConcatWs_0_0(InternalRow i, UTF8String 
[] array_0, int idxInVararg_0) {
   /* 061 */
   /* 062 */
   /* 063 */     if (!isNull_2) {
   /* 064 */       final int n_0 = value_2.numElements();
   /* 065 */       for (int j = 0; j < n_0; j ++) {
   /* 066 */         array_0[idxInVararg_0 ++] = value_2.getUTF8String(j);
   /* 067 */       }
   /* 068 */     }
   /* 069 */
   /* 070 */     if (!isNull_3) {
   /* 071 */       final int n_1 = value_3.numElements();
   /* 072 */       for (int j = 0; j < n_1; j ++) {
   /* 073 */         array_0[idxInVararg_0 ++] = value_3.getUTF8String(j);
   /* 074 */       }
   /* 075 */     }
   /* 076 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_4;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_5;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_6;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_7;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_8;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_9;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_10;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : value_11;
   /* 077 */     return idxInVararg_0;
   /* 078 */
   /* 079 */   }
   /* 080 */
   /* 081 */
   /* 082 */   private int varargBuildsConcatWs_0_2(InternalRow i, UTF8String 
[] array_0, int idxInVararg_0) {
   /* 083 */
   /* 084 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_28;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_29;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_30;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_31;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_32;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : value_33;
   /* 085 */     return idxInVararg_0;
   /* 086 */
   /* 087 */   }
   /* 088 */
   /* 089 */
   /* 090 */   private void apply_0_0(InternalRow i) {
   /* 091 */     boolean isNull_2 = i.isNullAt(31);
   /* 092 */     ArrayData value_2 = isNull_2 ?
   /* 093 */     null : (i.getArray(31));boolean isNull_3 = i.isNullAt(32);
   /* 094 */     ArrayData value_3 = isNull_3 ?
   /* 095 */     null : (i.getArray(32));UTF8String value_4 = 
i.getUTF8String(1);UTF8String value_5 = i.getUTF8String(2);UTF8String value_6 = 
i.getUTF8String(3);UTF8String value_7 = i.getUTF8String(4);UTF8String value_8 = 
i.getUTF8String(5);UTF8String value_9 = i.getUTF8String(6);UTF8String value_10 
= i.getUTF8String(7);UTF8String value_11 = i.getUTF8String(8);UTF8String 
value_12 = i.getUTF8String(9);UTF8String value_13 = 
i.getUTF8String(10);UTF8String value_14 = i.getUTF8String(11);UTF8String 
value_15 = i.getUTF8String(12);UTF8String value_16 = 
i.getUTF8String(13);UTF8String value_17 = i.getUTF8String(14);UTF8String 
value_18 = i.getUTF8String(15);UTF8String value_19 = 
i.getUTF8String(16);UTF8String value_20 = i.getUTF8String(17);UTF8String 
value_21 = i.getUTF8String(18);UTF8String value_22 = 
i.getUTF8String(19);UTF8String value_23 = i.getUTF8String(20);UTF8String 
value_24 = i.getUTF8String(21);
   /* 096 */   }
   /* 097 */
   /* 098 */
   /* 099 */   private int varargBuildsConcatWs_0_1(InternalRow i, UTF8String 
[] array_0, int idxInVararg_0) {
   /* 100 */
   /* 101 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_12;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_13;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_14;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_15;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_16;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_17;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_18;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_19;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_20;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_21;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_22;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_23;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_24;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_25;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_26;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : value_27;
   /* 102 */     return idxInVararg_0;
   /* 103 */
   /* 104 */   }
   /* 105 */
   /* 106 */ }
   ```
   
   Compilation of the generated code fails with error message: 
`org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 
36, Column 6: Expression "isNull_2" is not an rvalue`
   
   > after the patch
   
   ```
   /* 001 */ public java.lang.Object generate(Object[] references) {
   /* 002 */   return new SpecificUnsafeProjection(references);
   /* 003 */ }
   /* 004 */
   /* 005 */ class SpecificUnsafeProjection extends 
org.apache.spark.sql.catalyst.expressions.UnsafeProjection {
   /* 006 */
   /* 007 */   private Object[] references;
   /* 008 */   private boolean globalIsNull_0;
   /* 009 */   private 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] 
mutableStateArray_0 = new 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[1];
   /* 010 */
   /* 011 */   public SpecificUnsafeProjection(Object[] references) {
   /* 012 */     this.references = references;
   /* 013 */
   /* 014 */     mutableStateArray_0[0] = new 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 32);
   /* 015 */
   /* 016 */   }
   /* 017 */
   /* 018 */   public void initialize(int partitionIndex) {
   /* 019 */
   /* 020 */   }
   /* 021 */
   /* 022 */   // Scala.Function1 need this
   /* 023 */   public java.lang.Object apply(java.lang.Object row) {
   /* 024 */     return apply((InternalRow) row);
   /* 025 */   }
   /* 026 */
   /* 027 */   public UnsafeRow apply(InternalRow i) {
   /* 028 */     mutableStateArray_0[0].reset();
   /* 029 */
   /* 030 */
   /* 031 */     mutableStateArray_0[0].zeroOutNullBytes();
   /* 032 */
   /* 033 */     UTF8String value_34 = ConcatWs_0(i);
   /* 034 */     mutableStateArray_0[0].write(0, value_34);
   /* 035 */     return (mutableStateArray_0[0].getRow());
   /* 036 */   }
   /* 037 */
   /* 038 */
   /* 039 */   private UTF8String ConcatWs_0(InternalRow i) {
   /* 040 */     boolean isNull_2 = i.isNullAt(31);
   /* 041 */     ArrayData value_2 = isNull_2 ?
   /* 042 */     null : (i.getArray(31));
   /* 043 */     boolean isNull_3 = i.isNullAt(32);
   /* 044 */     ArrayData value_3 = isNull_3 ?
   /* 045 */     null : (i.getArray(32));
   /* 046 */     UTF8String value_4 = i.getUTF8String(1);
   /* 047 */     UTF8String value_5 = i.getUTF8String(2);
   /* 048 */     UTF8String value_6 = i.getUTF8String(3);
   /* 049 */     UTF8String value_7 = i.getUTF8String(4);
   /* 050 */     UTF8String value_8 = i.getUTF8String(5);
   /* 051 */     UTF8String value_9 = i.getUTF8String(6);
   /* 052 */     UTF8String value_10 = i.getUTF8String(7);
   /* 053 */     UTF8String value_11 = i.getUTF8String(8);
   /* 054 */     UTF8String value_12 = i.getUTF8String(9);
   /* 055 */     UTF8String value_13 = i.getUTF8String(10);
   /* 056 */     UTF8String value_14 = i.getUTF8String(11);
   /* 057 */     UTF8String value_15 = i.getUTF8String(12);
   /* 058 */     UTF8String value_16 = i.getUTF8String(13);
   /* 059 */     UTF8String value_17 = i.getUTF8String(14);
   /* 060 */     UTF8String value_18 = i.getUTF8String(15);
   /* 061 */     UTF8String value_19 = i.getUTF8String(16);
   /* 062 */     UTF8String value_20 = i.getUTF8String(17);
   /* 063 */     UTF8String value_21 = i.getUTF8String(18);
   /* 064 */     UTF8String value_22 = i.getUTF8String(19);
   /* 065 */     UTF8String value_23 = i.getUTF8String(20);
   /* 066 */     UTF8String value_24 = i.getUTF8String(21);
   /* 067 */     UTF8String value_25 = i.getUTF8String(22);
   /* 068 */     UTF8String value_26 = i.getUTF8String(23);
   /* 069 */     UTF8String value_27 = i.getUTF8String(24);
   /* 070 */     UTF8String value_28 = i.getUTF8String(25);
   /* 071 */     UTF8String value_29 = i.getUTF8String(26);
   /* 072 */     UTF8String value_30 = i.getUTF8String(27);
   /* 073 */     UTF8String value_31 = i.getUTF8String(28);
   /* 074 */     UTF8String value_32 = i.getUTF8String(29);
   /* 075 */     UTF8String value_33 = i.getUTF8String(30);
   /* 076 */     int varargNum_0 = 30;
   /* 077 */     int idxInVararg_0 = 0;
   /* 078 */
   /* 079 */     if (!isNull_2) {
   /* 080 */       varargNum_0 += value_2.numElements();
   /* 081 */     }
   /* 082 */
   /* 083 */
   /* 084 */     if (!isNull_3) {
   /* 085 */       varargNum_0 += value_3.numElements();
   /* 086 */     }
   /* 087 */
   /* 088 */
   /* 089 */
   /* 090 */
   /* 091 */
   /* 092 */
   /* 093 */
   /* 094 */
   /* 095 */
   /* 096 */
   /* 097 */
   /* 098 */
   /* 099 */
   /* 100 */
   /* 101 */
   /* 102 */
   /* 103 */
   /* 104 */
   /* 105 */
   /* 106 */
   /* 107 */
   /* 108 */
   /* 109 */
   /* 110 */
   /* 111 */
   /* 112 */
   /* 113 */
   /* 114 */
   /* 115 */
   /* 116 */
   /* 117 */
   /* 118 */     UTF8String[] array_0 = new UTF8String[varargNum_0];
   /* 119 */
   /* 120 */     if (!isNull_2) {
   /* 121 */       final int n_0 = value_2.numElements();
   /* 122 */       for (int j = 0; j < n_0; j ++) {
   /* 123 */         array_0[idxInVararg_0 ++] = value_2.getUTF8String(j);
   /* 124 */       }
   /* 125 */     }
   /* 126 */
   /* 127 */
   /* 128 */     if (!isNull_3) {
   /* 129 */       final int n_1 = value_3.numElements();
   /* 130 */       for (int j = 0; j < n_1; j ++) {
   /* 131 */         array_0[idxInVararg_0 ++] = value_3.getUTF8String(j);
   /* 132 */       }
   /* 133 */     }
   /* 134 */
   /* 135 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_4;
   /* 136 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_5;
   /* 137 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_6;
   /* 138 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_7;
   /* 139 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_8;
   /* 140 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_9;
   /* 141 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_10;
   /* 142 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_11;
   /* 143 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_12;
   /* 144 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_13;
   /* 145 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_14;
   /* 146 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_15;
   /* 147 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_16;
   /* 148 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_17;
   /* 149 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_18;
   /* 150 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_19;
   /* 151 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_20;
   /* 152 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_21;
   /* 153 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_22;
   /* 154 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_23;
   /* 155 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_24;
   /* 156 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_25;
   /* 157 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_26;
   /* 158 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_27;
   /* 159 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_28;
   /* 160 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_29;
   /* 161 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_30;
   /* 162 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_31;
   /* 163 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_32;
   /* 164 */     array_0[idxInVararg_0 ++] = false ? (UTF8String) null : 
value_33;
   /* 165 */     UTF8String value_0 = UTF8String.concatWs(((UTF8String) 
references[0] /* literal */), array_0);
   /* 166 */     boolean isNull_0 = value_0 == null;
   /* 167 */     globalIsNull_0 = isNull_0;
   /* 168 */     return value_0;
   /* 169 */   }
   /* 170 */
   /* 171 */ }
   ```
   
   ### Why are the changes needed?
   
   The generated code in `concat_ws` fails to compile when the below conditions 
are met:
   
   * Plenty of columns are provided as input of `concat_ws`.
   * Above columns are mixed up with string type and array[string] type.
   * Splitting methods is triggered in `splitExpressionsWithCurrentInputs`.
     * This is a bit tricky, as the method won't split methods under whole 
stage codegen, as well as it will be simply no-op (inlined) if the number of 
blocks to convert into methods is 1.
   
   
https://github.com/apache/spark/blob/a0187cd6b59a6b6bb2cadc6711bb663d4d35a844/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala#L88-L195
   
   There're three parts of generated code in `concat_ws` (`codes`, 
`varargCounts`, `varargBuilds`) and all parts try to split method by itself, 
while `varargCounts` and `varargBuilds` refer on the generated code in `codes`, 
hence the overall generated code fails to compile if any of part succeeds to 
split.
   
   ### Does this PR introduce _any_ user-facing change?
   
   No.
   
   ### How was this patch tested?
   
   New UTs added. (One for verification of the patch, another one for 
regression test)


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to