[ 
https://issues.apache.org/jira/browse/SPARK-34204?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Nick Hryhoriev updated SPARK-34204:
-----------------------------------
    Description: 
The input_file_name() function breaks the application of projection to the physical plan of 
the query.
 If this function is used to add a new column, column-oriented formats like Parquet 
and ORC put all columns into the physical plan.
 Without it, only the selected columns are loaded.
 In my case, the performance impact is x30.
{code:java}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
 * Reproducer for SPARK-34204: runs two Parquet queries that each select
 * `app_id`, `idfa` and `input_file_name()` so their physical plans can be compared.
 */
object TestSize {

  /**
   * Builds a local SparkSession, runs both queries, then blocks the main thread.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    implicit val spark: SparkSession = SparkSession.builder()
      .master("local")
      .config("spark.sql.shuffle.partitions", "5")
      .getOrCreate()

    import spark.implicits._

    val query1 = spark.read.parquet(
      "s3a:/part-00040-a19f0d20-eab3-48ef-be5a-602c7f9a8e58.c000.gz.parquet"
    )
      .select($"app_id", $"idfa", input_file_name().as("fileName"))
      .distinct()
      .count()

    // The path literal was previously wrapped across two lines, which is not a
    // valid single-line Scala string literal; it is rejoined here.
    // NOTE(review): as written, this query is identical to query1, while the
    // report says query 2 shows only two columns in the plan — presumably it
    // was meant to omit input_file_name(); confirm with the reporter.
    val query2 = spark.read.parquet(
      "s3a:/part-00040-a19f0d20-eab3-48ef-be5a-602c7f9a8e58.c000.gz.parquet"
    )
      .select($"app_id", $"idfa", input_file_name().as("fileName"))
      .distinct()
      .count()

    // Blocks the main thread for a very long time; presumably to keep the
    // application alive so the plans can be inspected — confirm.
    Thread.sleep(10000000000L)

  }

}
{code}
Query 1 has all columns in the physical plan, while query 2 has only two.

  was:
input_file_name() function damage applying projection to the physical plan of 
the query.
 if use this function and a new column, column-oriented formats like parquet 
and orc put all columns to Physical plan.
 While without it, only selected columns uploaded.
 In my case, performance influence is x30.
{code:java}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
 * Single-query reproducer: reads one Parquet file, selects `app_id`, `idfa`
 * and `input_file_name()` (aliased to `fileName`), and counts distinct rows.
 */
object TestSize {

  /**
   * Starts a local SparkSession, runs the query, then blocks the main thread.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sessionBuilder = SparkSession.builder()
      .master("local")
      .config("spark.sql.shuffle.partitions", "5")
    implicit val spark: SparkSession = sessionBuilder.getOrCreate()

    import spark.implicits._

    val source = spark.read.parquet(
      "s3a:/part-00040-a19f0d20-eab3-48ef-be5a-602c7f9a8e58.c000.gz.parquet"
    )
    val projected = source.select($"app_id", $"idfa", input_file_name().as("fileName"))
    // The count is computed for its side effect of executing the query; the value is discarded.
    projected.distinct().count()

    // Blocks the main thread for a very long time after the query finishes.
    Thread.sleep(10000000000L)
  }
}

{code}


> input_file_name() func damage applying projection to physical plan of query.
> ----------------------------------------------------------------------------
>
>                 Key: SPARK-34204
>                 URL: https://issues.apache.org/jira/browse/SPARK-34204
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.4.7
>            Reporter: Nick Hryhoriev
>            Priority: Major
>
> The input_file_name() function breaks the application of projection to the physical plan of 
> the query.
>  If this function is used to add a new column, column-oriented formats like Parquet 
> and ORC put all columns into the physical plan.
>  Without it, only the selected columns are loaded.
>  In my case, the performance impact is x30.
> {code:java}
> import org.apache.spark.sql.SparkSession
> import org.apache.spark.sql.functions._
> object TestSize {
>   def main(args: Array[String]): Unit = {
>     implicit val spark: SparkSession = SparkSession.builder()
>       .master("local")
>       .config("spark.sql.shuffle.partitions", "5")
>       .getOrCreate()
>     import spark.implicits._
>     val quey1 = spark.read.parquet(
>       "s3a:/part-00040-a19f0d20-eab3-48ef-be5a-602c7f9a8e58.c000.gz.parquet"
>     )
>       .select($"app_id", $"idfa", input_file_name().as("fileName"))
>       .distinct()
>       .count()
>    val quey2 = spark.read.parquet( "s3a:/part-00040-a19f0d20-eab3-48ef-be5a- 
> 602c7f9a8e58.c000.gz.parquet" ) 
>       .select($"app_id", $"idfa", input_file_name().as("fileName"))
>       .distinct() 
>       .count()
>     Thread.sleep(10000000000L)
>   }
> }
> {code}
> Query 1 has all columns in the physical plan, while query 2 has only two.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to