[ https://issues.apache.org/jira/browse/SPARK-16913?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15410840#comment-15410840 ]
Kazuaki Ishizaki commented on SPARK-16913: ------------------------------------------ It seems to copy each element in a struct. Since {{InternalRow}} does not include a structure, the {{internalRow}} keeps two scalar values, which consist of {{isNull}} and {{value}} in this case. If we can provide a better schema property (i.e. {{nullable = false}} for {{a}} and {{b}}), lines 44-62 would be simpler. > [SQL] Better codegen where querying nested struct > ------------------------------------------------- > > Key: SPARK-16913 > URL: https://issues.apache.org/jira/browse/SPARK-16913 > Project: Spark > Issue Type: Improvement > Components: SQL > Affects Versions: 2.0.0 > Reporter: Maciej Bryński > > I have parquet file created as result of: > {code} > spark.range(100).selectExpr("id as a", "id as b").selectExpr("struct(a, b) as > c").write.parquet("/mnt/mfs/codegen_test") > {code} > Then I'm querying whole nested structure with: > {code} > spark.read.parquet("/mnt/mfs/codegen_test").selectExpr("c.*") > {code} > As a result of spark whole stage codegen I'm getting following code. > Is it possible to remove part from line 044 and just return whole result of > getStruct ? 
(maybe just copied) > {code} > Generated code: > /* 001 */ public Object generate(Object[] references) { > /* 002 */ return new GeneratedIterator(references); > /* 003 */ } > /* 004 */ > /* 005 */ final class GeneratedIterator extends > org.apache.spark.sql.execution.BufferedRowIterator { > /* 006 */ private Object[] references; > /* 007 */ private org.apache.spark.sql.execution.metric.SQLMetric > scan_numOutputRows; > /* 008 */ private scala.collection.Iterator scan_input; > /* 009 */ private UnsafeRow scan_result; > /* 010 */ private > org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder scan_holder; > /* 011 */ private > org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter > scan_rowWriter; > /* 012 */ private > org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter > scan_rowWriter1; > /* 013 */ private UnsafeRow project_result; > /* 014 */ private > org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder project_holder; > /* 015 */ private > org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter > project_rowWriter; > /* 016 */ > /* 017 */ public GeneratedIterator(Object[] references) { > /* 018 */ this.references = references; > /* 019 */ } > /* 020 */ > /* 021 */ public void init(int index, scala.collection.Iterator inputs[]) { > /* 022 */ partitionIndex = index; > /* 023 */ this.scan_numOutputRows = > (org.apache.spark.sql.execution.metric.SQLMetric) references[0]; > /* 024 */ scan_input = inputs[0]; > /* 025 */ scan_result = new UnsafeRow(1); > /* 026 */ this.scan_holder = new > org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(scan_result, > 32); > /* 027 */ this.scan_rowWriter = new > org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(scan_holder, > 1); > /* 028 */ this.scan_rowWriter1 = new > org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(scan_holder, > 2); > /* 029 */ project_result = new UnsafeRow(2); > /* 030 */ this.project_holder = new > 
org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(project_result, > 0); > /* 031 */ this.project_rowWriter = new > org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(project_holder, > 2); > /* 032 */ } > /* 033 */ > /* 034 */ protected void processNext() throws java.io.IOException { > /* 035 */ while (scan_input.hasNext()) { > /* 036 */ InternalRow scan_row = (InternalRow) scan_input.next(); > /* 037 */ scan_numOutputRows.add(1); > /* 038 */ boolean scan_isNull = scan_row.isNullAt(0); > /* 039 */ InternalRow scan_value = scan_isNull ? null : > (scan_row.getStruct(0, 2)); > /* 040 */ > /* 041 */ boolean project_isNull = scan_isNull; > /* 042 */ long project_value = -1L; > /* 043 */ > /* 044 */ if (!scan_isNull) { > /* 045 */ if (scan_value.isNullAt(0)) { > /* 046 */ project_isNull = true; > /* 047 */ } else { > /* 048 */ project_value = scan_value.getLong(0); > /* 049 */ } > /* 050 */ > /* 051 */ } > /* 052 */ boolean project_isNull2 = scan_isNull; > /* 053 */ long project_value2 = -1L; > /* 054 */ > /* 055 */ if (!scan_isNull) { > /* 056 */ if (scan_value.isNullAt(1)) { > /* 057 */ project_isNull2 = true; > /* 058 */ } else { > /* 059 */ project_value2 = scan_value.getLong(1); > /* 060 */ } > /* 061 */ > /* 062 */ } > /* 063 */ project_rowWriter.zeroOutNullBytes(); > /* 064 */ > /* 065 */ if (project_isNull) { > /* 066 */ project_rowWriter.setNullAt(0); > /* 067 */ } else { > /* 068 */ project_rowWriter.write(0, project_value); > /* 069 */ } > /* 070 */ > /* 071 */ if (project_isNull2) { > /* 072 */ project_rowWriter.setNullAt(1); > /* 073 */ } else { > /* 074 */ project_rowWriter.write(1, project_value2); > /* 075 */ } > /* 076 */ append(project_result); > /* 077 */ if (shouldStop()) return; > /* 078 */ } > /* 079 */ } > /* 080 */ } > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For 
additional commands, e-mail: issues-h...@spark.apache.org