yma11 commented on code in PR #5351:
URL: https://github.com/apache/incubator-gluten/pull/5351#discussion_r1616608418
##########
shims/spark32/src/main/scala/org/apache/gluten/sql/shims/spark32/Spark32Shims.scala:
##########
@@ -189,8 +189,15 @@ class Spark32Shims extends SparkShims {
def getFileStatus(partition: PartitionDirectory): Seq[FileStatus] =
partition.files
+ def isFileSplittable(
+ relation: HadoopFsRelation,
+ filePath: Path,
+ sparkSchema: StructType): Boolean = true
+
def isRowIndexMetadataColumn(name: String): Boolean = false
+ def findRowIndexColumnIndexInSchema(sparkSchema: StructType): Int = -1
+
Review Comment:
What does `-1` mean?
##########
backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala:
##########
@@ -272,6 +272,10 @@ object VeloxBackendSettings extends BackendSettingsApi {
override def supportNativeMetadataColumns(): Boolean = true
+ override def supportNativeRowIndexColumn(): Boolean = {
+ GlutenConfig.getConf.enableNativeRowIndexColumn
+ }
Review Comment:
Can't we use `true` instead of a config here? similar as
`supportNativeMetadataColumns`.
##########
gluten-core/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala:
##########
@@ -649,15 +650,21 @@ trait SparkPlanExecApi {
sparkExecNode: LeafExecNode): Seq[Expression] = {
sparkExecNode match {
case fileSourceScan: FileSourceScanExec =>
- fileSourceScan.dataFilters ++ FilterHandler.getRemainingFilters(
- fileSourceScan.dataFilters,
- extraFilters)
+ val dataFilters = fileSourceScan.dataFilters
+ val pushedFilters =
+ dataFilters ++ FilterHandler.getRemainingFilters(dataFilters,
extraFilters)
+ pushedFilters.filterNot(_.references.exists {
+ attr =>
SparkShimLoader.getSparkShims.isRowIndexMetadataColumn(attr.name)
+ })
case batchScan: BatchScanExec =>
batchScan.scan match {
case fileScan: FileScan =>
- fileScan.dataFilters ++ FilterHandler.getRemainingFilters(
- fileScan.dataFilters,
- extraFilters)
+ val dataFilters = fileScan.dataFilters
+ val pushedFilters =
+ dataFilters ++ FilterHandler.getRemainingFilters(dataFilters,
extraFilters)
+ pushedFilters.filterNot(_.references.exists {
+ attr =>
SparkShimLoader.getSparkShims.isRowIndexMetadataColumn(attr.name)
+ })
Review Comment:
Add a method to avoid code duplication?
##########
gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala:
##########
@@ -759,6 +759,10 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenFileFormatWriterSuite]
enableSuite[GlutenFileIndexSuite]
enableSuite[GlutenFileMetadataStructSuite]
+ .exclude("metadata struct (parquet): select only metadata")
Review Comment:
Add comment `// Rewrite because of ....`
##########
shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/Spark34Shims.scala:
##########
@@ -327,10 +327,35 @@ class Spark34Shims extends SparkShims {
def getFileStatus(partition: PartitionDirectory): Seq[FileStatus] =
partition.files
+ def isFileSplittable(
+ relation: HadoopFsRelation,
+ filePath: Path,
+ sparkSchema: StructType): Boolean = {
+ relation.fileFormat
+ .isSplitable(relation.sparkSession, relation.options, filePath) &&
+ !(RowIndexUtil.findRowIndexColumnIndexInSchema(sparkSchema) >= 0)
Review Comment:
Add comments for this logic?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]