Reynold Xin created SPARK-7158:
----------------------------------
Summary: collect and show return different results
Key: SPARK-7158
URL: https://issues.apache.org/jira/browse/SPARK-7158
Project: Spark
Issue Type: Sub-task
Components: SQL
Reporter: Reynold Xin
Priority: Blocker
Reported by [~rams]
{code}
import java.util.UUID
import org.apache.spark.sql._
import org.apache.spark.sql.types._
val rdd = sc.parallelize(List(1,2,3), 2)
val schema = StructType(List(StructField("index",IntegerType,true)))
val df = sqlContext.createDataFrame(rdd.map(p => Row(p)), schema)
def id:() => String = () => {UUID.randomUUID().toString()}
def square:Int => Int = (x: Int) => {x * x}
val dfWithId = df.withColumn("id",callUDF(id, StringType)).cache() //expect the
//ID to have materialized at this point
dfWithId.collect()
//res0: Array[org.apache.spark.sql.Row] =
//Array([1,43c7b8e2-b4a3-43ee-beff-0bb4b7d6c1b1],
//[2,efd061be-e8cc-43fa-956e-cfd6e7355982],
//[3,79b0baab-627c-4761-af0d-8995b8c5a125])
val dfWithIdAndSquare = dfWithId.withColumn("square",callUDF(square,
IntegerType, col("index")))
dfWithIdAndSquare.collect()
//res1: Array[org.apache.spark.sql.Row] =
//Array([1,a3b2e744-a0a1-40fe-8133-87a67660b4ab,1],
//[2,0a7052a0-6071-4ef5-a25a-2670248ea5cd,4],
//[3,209f269e-207a-4dfd-a186-738be5db2eff,9])
//why are the IDs in res0 and res1 different?
{code}
The randomly generated IDs are the same in both results if show() is used instead of collect().
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]