HeartSaVioR opened a new pull request #28831:
URL: https://github.com/apache/spark/pull/28831
### What changes were proposed in this pull request?
This patch fixes the code generation logic for mixed string/array types of
columns in `concat_ws` to not split methods, because splitting method would
generate the code which is unable to compile. (Refer the next section `Why are
the changes needed?` for details.)
The change won't bring performance issue, because it's tricky to trigger
method split, and due to the relation of parts it should fail to compile
whenever the method split happens.
Below is the generated code for newly added UT, mixed string/array type of
columns.
> before the patch
```
/* 001 */ public java.lang.Object generate(Object[] references) {
/* 002 */ return new SpecificUnsafeProjection(references);
/* 003 */ }
/* 004 */
/* 005 */ class SpecificUnsafeProjection extends
org.apache.spark.sql.catalyst.expressions.UnsafeProjection {
/* 006 */
/* 007 */ private Object[] references;
/* 008 */ private
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[]
mutableStateArray_0 = new
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[1];
/* 009 */
/* 010 */ public SpecificUnsafeProjection(Object[] references) {
/* 011 */ this.references = references;
/* 012 */ mutableStateArray_0[0] = new
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 32);
/* 013 */
/* 014 */ }
/* 015 */
/* 016 */ public void initialize(int partitionIndex) {
/* 017 */
/* 018 */ }
/* 019 */
/* 020 */ // Scala.Function1 need this
/* 021 */ public java.lang.Object apply(java.lang.Object row) {
/* 022 */ return apply((InternalRow) row);
/* 023 */ }
/* 024 */
/* 025 */ public UnsafeRow apply(InternalRow i) {
/* 026 */ mutableStateArray_0[0].reset();
/* 027 */
/* 028 */
/* 029 */ mutableStateArray_0[0].zeroOutNullBytes();
/* 030 */
/* 031 */ apply_0_0(i);
/* 032 */ apply_0_1(i);
/* 033 */ int varargNum_0 = 30;
/* 034 */ int idxInVararg_0 = 0;
/* 035 */
/* 036 */ if (!isNull_2) {
/* 037 */ varargNum_0 += value_2.numElements();
/* 038 */ }
/* 039 */
/* 040 */ if (!isNull_3) {
/* 041 */ varargNum_0 += value_3.numElements();
/* 042 */ }
/* 043 */
/* 044 */ UTF8String[] array_0 = new UTF8String[varargNum_0];
/* 045 */ idxInVararg_0 = varargBuildsConcatWs_0_0(i, array_0,
idxInVararg_0);
/* 046 */ idxInVararg_0 = varargBuildsConcatWs_0_1(i, array_0,
idxInVararg_0);
/* 047 */ idxInVararg_0 = varargBuildsConcatWs_0_2(i, array_0,
idxInVararg_0);
/* 048 */ UTF8String value_0 = UTF8String.concatWs(((UTF8String)
references[0] /* literal */), array_0);
/* 049 */ boolean isNull_0 = value_0 == null;
/* 050 */ mutableStateArray_0[0].write(0, value_0);
/* 051 */ return (mutableStateArray_0[0].getRow());
/* 052 */ }
/* 053 */
/* 054 */
/* 055 */ private void apply_0_1(InternalRow i) {
/* 056 */ UTF8String value_25 = i.getUTF8String(22);UTF8String value_26
= i.getUTF8String(23);UTF8String value_27 = i.getUTF8String(24);UTF8String
value_28 = i.getUTF8String(25);UTF8String value_29 =
i.getUTF8String(26);UTF8String value_30 = i.getUTF8String(27);UTF8String
value_31 = i.getUTF8String(28);UTF8String value_32 =
i.getUTF8String(29);UTF8String value_33 = i.getUTF8String(30);
/* 057 */ }
/* 058 */
/* 059 */
/* 060 */ private int varargBuildsConcatWs_0_0(InternalRow i, UTF8String
[] array_0, int idxInVararg_0) {
/* 061 */
/* 062 */
/* 063 */ if (!isNull_2) {
/* 064 */ final int n_0 = value_2.numElements();
/* 065 */ for (int j = 0; j < n_0; j ++) {
/* 066 */ array_0[idxInVararg_0 ++] = value_2.getUTF8String(j);
/* 067 */ }
/* 068 */ }
/* 069 */
/* 070 */ if (!isNull_3) {
/* 071 */ final int n_1 = value_3.numElements();
/* 072 */ for (int j = 0; j < n_1; j ++) {
/* 073 */ array_0[idxInVararg_0 ++] = value_3.getUTF8String(j);
/* 074 */ }
/* 075 */ }
/* 076 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_4;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_5;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_6;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_7;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_8;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_9;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_10;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : value_11;
/* 077 */ return idxInVararg_0;
/* 078 */
/* 079 */ }
/* 080 */
/* 081 */
/* 082 */ private int varargBuildsConcatWs_0_2(InternalRow i, UTF8String
[] array_0, int idxInVararg_0) {
/* 083 */
/* 084 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_28;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_29;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_30;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_31;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_32;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : value_33;
/* 085 */ return idxInVararg_0;
/* 086 */
/* 087 */ }
/* 088 */
/* 089 */
/* 090 */ private void apply_0_0(InternalRow i) {
/* 091 */ boolean isNull_2 = i.isNullAt(31);
/* 092 */ ArrayData value_2 = isNull_2 ?
/* 093 */ null : (i.getArray(31));boolean isNull_3 = i.isNullAt(32);
/* 094 */ ArrayData value_3 = isNull_3 ?
/* 095 */ null : (i.getArray(32));UTF8String value_4 =
i.getUTF8String(1);UTF8String value_5 = i.getUTF8String(2);UTF8String value_6 =
i.getUTF8String(3);UTF8String value_7 = i.getUTF8String(4);UTF8String value_8 =
i.getUTF8String(5);UTF8String value_9 = i.getUTF8String(6);UTF8String value_10
= i.getUTF8String(7);UTF8String value_11 = i.getUTF8String(8);UTF8String
value_12 = i.getUTF8String(9);UTF8String value_13 =
i.getUTF8String(10);UTF8String value_14 = i.getUTF8String(11);UTF8String
value_15 = i.getUTF8String(12);UTF8String value_16 =
i.getUTF8String(13);UTF8String value_17 = i.getUTF8String(14);UTF8String
value_18 = i.getUTF8String(15);UTF8String value_19 =
i.getUTF8String(16);UTF8String value_20 = i.getUTF8String(17);UTF8String
value_21 = i.getUTF8String(18);UTF8String value_22 =
i.getUTF8String(19);UTF8String value_23 = i.getUTF8String(20);UTF8String
value_24 = i.getUTF8String(21);
/* 096 */ }
/* 097 */
/* 098 */
/* 099 */ private int varargBuildsConcatWs_0_1(InternalRow i, UTF8String
[] array_0, int idxInVararg_0) {
/* 100 */
/* 101 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_12;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_13;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_14;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_15;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_16;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_17;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_18;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_19;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_20;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_21;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_22;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_23;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_24;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_25;array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_26;array_0[idxInVararg_0 ++] = false ? (UTF8String) null : value_27;
/* 102 */ return idxInVararg_0;
/* 103 */
/* 104 */ }
/* 105 */
/* 106 */ }
```
Compilation of the generated code fails with error message:
`org.codehaus.commons.compiler.CompileException: File 'generated.java', Line
36, Column 6: Expression "isNull_2" is not an rvalue`
> after the patch
```
/* 001 */ public java.lang.Object generate(Object[] references) {
/* 002 */ return new SpecificUnsafeProjection(references);
/* 003 */ }
/* 004 */
/* 005 */ class SpecificUnsafeProjection extends
org.apache.spark.sql.catalyst.expressions.UnsafeProjection {
/* 006 */
/* 007 */ private Object[] references;
/* 008 */ private boolean globalIsNull_0;
/* 009 */ private
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[]
mutableStateArray_0 = new
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[1];
/* 010 */
/* 011 */ public SpecificUnsafeProjection(Object[] references) {
/* 012 */ this.references = references;
/* 013 */
/* 014 */ mutableStateArray_0[0] = new
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 32);
/* 015 */
/* 016 */ }
/* 017 */
/* 018 */ public void initialize(int partitionIndex) {
/* 019 */
/* 020 */ }
/* 021 */
/* 022 */ // Scala.Function1 need this
/* 023 */ public java.lang.Object apply(java.lang.Object row) {
/* 024 */ return apply((InternalRow) row);
/* 025 */ }
/* 026 */
/* 027 */ public UnsafeRow apply(InternalRow i) {
/* 028 */ mutableStateArray_0[0].reset();
/* 029 */
/* 030 */
/* 031 */ mutableStateArray_0[0].zeroOutNullBytes();
/* 032 */
/* 033 */ UTF8String value_34 = ConcatWs_0(i);
/* 034 */ mutableStateArray_0[0].write(0, value_34);
/* 035 */ return (mutableStateArray_0[0].getRow());
/* 036 */ }
/* 037 */
/* 038 */
/* 039 */ private UTF8String ConcatWs_0(InternalRow i) {
/* 040 */ boolean isNull_2 = i.isNullAt(31);
/* 041 */ ArrayData value_2 = isNull_2 ?
/* 042 */ null : (i.getArray(31));
/* 043 */ boolean isNull_3 = i.isNullAt(32);
/* 044 */ ArrayData value_3 = isNull_3 ?
/* 045 */ null : (i.getArray(32));
/* 046 */ UTF8String value_4 = i.getUTF8String(1);
/* 047 */ UTF8String value_5 = i.getUTF8String(2);
/* 048 */ UTF8String value_6 = i.getUTF8String(3);
/* 049 */ UTF8String value_7 = i.getUTF8String(4);
/* 050 */ UTF8String value_8 = i.getUTF8String(5);
/* 051 */ UTF8String value_9 = i.getUTF8String(6);
/* 052 */ UTF8String value_10 = i.getUTF8String(7);
/* 053 */ UTF8String value_11 = i.getUTF8String(8);
/* 054 */ UTF8String value_12 = i.getUTF8String(9);
/* 055 */ UTF8String value_13 = i.getUTF8String(10);
/* 056 */ UTF8String value_14 = i.getUTF8String(11);
/* 057 */ UTF8String value_15 = i.getUTF8String(12);
/* 058 */ UTF8String value_16 = i.getUTF8String(13);
/* 059 */ UTF8String value_17 = i.getUTF8String(14);
/* 060 */ UTF8String value_18 = i.getUTF8String(15);
/* 061 */ UTF8String value_19 = i.getUTF8String(16);
/* 062 */ UTF8String value_20 = i.getUTF8String(17);
/* 063 */ UTF8String value_21 = i.getUTF8String(18);
/* 064 */ UTF8String value_22 = i.getUTF8String(19);
/* 065 */ UTF8String value_23 = i.getUTF8String(20);
/* 066 */ UTF8String value_24 = i.getUTF8String(21);
/* 067 */ UTF8String value_25 = i.getUTF8String(22);
/* 068 */ UTF8String value_26 = i.getUTF8String(23);
/* 069 */ UTF8String value_27 = i.getUTF8String(24);
/* 070 */ UTF8String value_28 = i.getUTF8String(25);
/* 071 */ UTF8String value_29 = i.getUTF8String(26);
/* 072 */ UTF8String value_30 = i.getUTF8String(27);
/* 073 */ UTF8String value_31 = i.getUTF8String(28);
/* 074 */ UTF8String value_32 = i.getUTF8String(29);
/* 075 */ UTF8String value_33 = i.getUTF8String(30);
/* 076 */ int varargNum_0 = 30;
/* 077 */ int idxInVararg_0 = 0;
/* 078 */
/* 079 */ if (!isNull_2) {
/* 080 */ varargNum_0 += value_2.numElements();
/* 081 */ }
/* 082 */
/* 083 */
/* 084 */ if (!isNull_3) {
/* 085 */ varargNum_0 += value_3.numElements();
/* 086 */ }
/* 087 */
/* 088 */
/* 089 */
/* 090 */
/* 091 */
/* 092 */
/* 093 */
/* 094 */
/* 095 */
/* 096 */
/* 097 */
/* 098 */
/* 099 */
/* 100 */
/* 101 */
/* 102 */
/* 103 */
/* 104 */
/* 105 */
/* 106 */
/* 107 */
/* 108 */
/* 109 */
/* 110 */
/* 111 */
/* 112 */
/* 113 */
/* 114 */
/* 115 */
/* 116 */
/* 117 */
/* 118 */ UTF8String[] array_0 = new UTF8String[varargNum_0];
/* 119 */
/* 120 */ if (!isNull_2) {
/* 121 */ final int n_0 = value_2.numElements();
/* 122 */ for (int j = 0; j < n_0; j ++) {
/* 123 */ array_0[idxInVararg_0 ++] = value_2.getUTF8String(j);
/* 124 */ }
/* 125 */ }
/* 126 */
/* 127 */
/* 128 */ if (!isNull_3) {
/* 129 */ final int n_1 = value_3.numElements();
/* 130 */ for (int j = 0; j < n_1; j ++) {
/* 131 */ array_0[idxInVararg_0 ++] = value_3.getUTF8String(j);
/* 132 */ }
/* 133 */ }
/* 134 */
/* 135 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_4;
/* 136 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_5;
/* 137 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_6;
/* 138 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_7;
/* 139 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_8;
/* 140 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_9;
/* 141 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_10;
/* 142 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_11;
/* 143 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_12;
/* 144 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_13;
/* 145 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_14;
/* 146 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_15;
/* 147 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_16;
/* 148 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_17;
/* 149 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_18;
/* 150 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_19;
/* 151 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_20;
/* 152 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_21;
/* 153 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_22;
/* 154 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_23;
/* 155 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_24;
/* 156 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_25;
/* 157 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_26;
/* 158 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_27;
/* 159 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_28;
/* 160 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_29;
/* 161 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_30;
/* 162 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_31;
/* 163 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_32;
/* 164 */ array_0[idxInVararg_0 ++] = false ? (UTF8String) null :
value_33;
/* 165 */ UTF8String value_0 = UTF8String.concatWs(((UTF8String)
references[0] /* literal */), array_0);
/* 166 */ boolean isNull_0 = value_0 == null;
/* 167 */ globalIsNull_0 = isNull_0;
/* 168 */ return value_0;
/* 169 */ }
/* 170 */
/* 171 */ }
```
### Why are the changes needed?
The generated code in `concat_ws` fails to compile when the below conditions
are met:
* Plenty of columns are provided as input of `concat_ws`.
* Above columns are mixed up with string type and array[string] type.
* Splitting methods is triggered in `splitExpressionsWithCurrentInputs`.
* This is a bit tricky, as the method won't split methods under whole
stage codegen, as well as it will be simply no-op (inlined) if the number of
blocks to convert into methods is 1.
https://github.com/apache/spark/blob/a0187cd6b59a6b6bb2cadc6711bb663d4d35a844/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala#L88-L195
There're three parts of generated code in `concat_ws` (`codes`,
`varargCounts`, `varargBuilds`) and all parts try to split method by itself,
while `varargCounts` and `varargBuilds` refer on the generated code in `codes`,
hence the overall generated code fails to compile if any of part succeeds to
split.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
New UTs added. (One for verification of the patch, another one for
regression test)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]