smohr003 created SPARK-24233:
--------------------------------

             Summary: union operation on read of dataframe does nor produce 
correct result 
                 Key: SPARK-24233
                 URL: https://issues.apache.org/jira/browse/SPARK-24233
             Project: Spark
          Issue Type: Bug
          Components: SQL
    Affects Versions: 2.1.0
            Reporter: smohr003


I know that I can use wild card * to read all subfolders. But, I am trying to 
use .par and .schema to speed up the read process. 

val absolutePath = "adl://datalakename.azuredatalakestore.net/testU/"

Seq((1, "one"), (2, "two")).toDF("k", 
"v").write.mode("overwrite").parquet(absolutePath + "1")
Seq((11, "one"), (22, "two")).toDF("k", 
"v").write.mode("overwrite").parquet(absolutePath + "2")
Seq((111, "one"), (222, "two")).toDF("k", 
"v").write.mode("overwrite").parquet(absolutePath + "3")
Seq((1111, "one"), (2222, "two")).toDF("k", 
"v").write.mode("overwrite").parquet(absolutePath + "4")
Seq((2, "one"), (2, "two")).toDF("k", 
"v").write.mode("overwrite").parquet(absolutePath + "5")

 

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.\{FileSystem, Path}
import java.net.URI
def readDir(path: String): DataFrame = {
 val fs = FileSystem.get(new URI(path), new Configuration())
 val subDir = fs.listStatus(new Path(path)).map(i => i.getPath.toString)
 var df = spark.read.parquet(subDir.head)
 val dfSchema = df.schema
 subDir.tail.par.foreach(p => df = 
df.union(spark.read.schema(dfSchema).parquet(p)).select(df.columns.head, 
df.columns.tail:_*))
 df
}
val dfAll = readDir(absolutePath)
dfAll.count

 



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to