c21 commented on a change in pull request #31958:
URL: https://github.com/apache/spark/pull/31958#discussion_r603014404
##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
##########
@@ -838,6 +838,13 @@ object SQLConf {
.intConf
.createWithDefault(4096)
+ val ORC_VECTORIZED_READER_NESTED_COLUMN_ENABLED =
+ buildConf("spark.sql.orc.enableNestedColumnVectorizedReader")
+ .doc("Enables vectorized ORC decoding for nested columns.")
+ .version("3.2.0")
+ .booleanConf
+ .createWithDefault(true)
Review comment:
@dongjoon-hyun - makes sense to me. Updated. For all reviewers,
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/136587/testReport
is the report of the unit tests that passed with the nested column vectorized
reader enabled by default.
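
For context, a minimal sketch of how a user could opt out of the new code path. The config key comes from the diff above; the session setup and input path are illustrative:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder()
      .appName("orc-nested-vectorized-demo") // hypothetical app name
      .master("local[*]")
      .getOrCreate()

    // Fall back to the non-vectorized ORC reader for nested columns.
    spark.conf.set("spark.sql.orc.enableNestedColumnVectorizedReader", "false")

    // Scans of schemas containing array/map/struct columns now skip the
    // vectorized batch path; flat schemas are unaffected by this conf.
    val df = spark.read.orc("/path/to/data.orc") // hypothetical path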
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
##########
@@ -131,11 +131,27 @@ class OrcFileFormat
}
}
+ private def supportBatchForNestedColumn(
+ sparkSession: SparkSession,
+ schema: StructType): Boolean = {
+ val hasNestedColumn = schema.map(_.dataType).exists {
+ case _: ArrayType | _: MapType | _: StructType => true
+ case _ => false
+ }
+ if (hasNestedColumn) {
+ sparkSession.sessionState.conf.orcVectorizedReaderNestedColumnEnabled
+ } else {
+ true
+ }
+ }
+
override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = {
val conf = sparkSession.sessionState.conf
conf.orcVectorizedReaderEnabled && conf.wholeStageEnabled &&
schema.length <= conf.wholeStageMaxNumFields &&
- schema.forall(_.dataType.isInstanceOf[AtomicType])
+ schema.forall(s => supportDataType(s.dataType) &&
+ !s.dataType.isInstanceOf[UserDefinedType[_]]) &&
+ supportBatchForNestedColumn(sparkSession, schema)
Review comment:
@dongjoon-hyun - do you mean implementing the Parquet vectorized reader for
nested columns? I created https://issues.apache.org/jira/browse/SPARK-34863 and
plan to work on it after this one, thanks.
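
To make the gating logic concrete, here is a standalone sketch of the top-level nested-type check from the diff above; the example schemas are illustrative:

    import org.apache.spark.sql.types._

    // Mirrors supportBatchForNestedColumn's detection step: true if any
    // top-level field is an array, map, or struct.
    def hasNestedColumn(schema: StructType): Boolean =
      schema.map(_.dataType).exists {
        case _: ArrayType | _: MapType | _: StructType => true
        case _ => false
      }

    val flat = StructType(Seq(StructField("id", LongType)))
    val nested = StructType(Seq(
      StructField("id", LongType),
      StructField("tags", ArrayType(StringType))))

    assert(!hasNestedColumn(flat))   // batching depends only on the other checks
    assert(hasNestedColumn(nested))  // batching additionally requires the new conf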
##########
File path: project/MimaExcludes.scala
##########
@@ -417,6 +417,21 @@ object MimaExcludes {
case _ => true
},
+ // [SPARK-34862][SQL] Support nested column in ORC vectorized reader
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getBoolean"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getByte"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getShort"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getInt"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getLong"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getFloat"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getDouble"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getDecimal"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getUTF8String"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getBinary"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getArray"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getMap"),
+ ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getChild"),
Review comment:
@dongjoon-hyun - updated, thanks. Sorry, I was not looking at this file
closely enough.
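
For reviewers unfamiliar with these filters: MiMa reports a DirectAbstractMethodProblem when the new version declares an abstract method directly on a class that old subclasses cannot satisfy, e.g. a newly added abstract method on a class third parties extend. A toy illustration with made-up classes (not Spark's actual ColumnVector changes):

    // Old binary: one abstract method.
    abstract class WidgetV1 {
      def getInt(rowId: Int): Int
    }

    // New binary: a second abstract method is added. A subclass compiled
    // against WidgetV1 provides no body for getLong, so calls fail with
    // AbstractMethodError at runtime; MiMa flags this unless an exclusion
    // like the ones above declares the break intentional.
    abstract class WidgetV2 {
      def getInt(rowId: Int): Int
      def getLong(rowId: Int): Long
    }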