[ https://issues.apache.org/jira/browse/SPARK-27027?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16783478#comment-16783478 ]
Gabor Somogyi commented on SPARK-27027: --------------------------------------- What I've found is really interesting. The following test: {code:java} test("should print wrong result") { val df = Seq((1, "John Doe", 30), (2, "Mary Jane", 25), (3, "Josh Duke", 50)).toDF("id", "name", "age") val dfStruct = df.withColumn("value", struct("name","age")) val dfKV = dfStruct.select(to_avro('id).as("key"), to_avro('value).as("value")) val expectedSchema = StructType(Seq(StructField("name", StringType, true),StructField("age", IntegerType, false))) val avroTypeStruct = SchemaConverters.toAvroType(expectedSchema).toString val avroTypeStr = s""" |{ | "type": "int", | "name": "key" |} """.stripMargin dfKV.select(from_avro('key, avroTypeStr)).show dfKV.select(from_avro('value, avroTypeStruct)).show } {code} prints out the following result on v2.4.0 branch: {code:java} +-------------------+ |from_avro(key, int)| +-------------------+ | 1| | 2| | 3| +-------------------+ +---------------------------------------------+ |from_avro(value, struct<name:string,age:int>)| +---------------------------------------------+ | [John Doe, 30]| | [Mary Jane, 25]| | [Josh Duke, 50]| +---------------------------------------------+ {code} When I've tried the exact same stuff with: {code:java} spark-shell --packages org.apache.spark:spark-avro_2.11:2.4.0 {code} then it fails: {code:java} scala> spark.version res0: String = 2.4.0 scala> val df = Seq((1, "John Doe", 30), (2, "Mary Jane", 25), (3, "Josh Duke", 50)).toDF("id", "name", "age") df: org.apache.spark.sql.DataFrame = [id: int, name: string ... 1 more field] scala> val dfStruct = df.withColumn("value", struct("name","age")) dfStruct: org.apache.spark.sql.DataFrame = [id: int, name: string ... 
2 more fields] scala> import org.apache.spark.sql.avro._ import org.apache.spark.sql.avro._ scala> val dfKV = dfStruct.select(to_avro('id).as("key"), to_avro('value).as("value")) dfKV: org.apache.spark.sql.DataFrame = [key: binary, value: binary] scala> import org.apache.spark.sql.types._ import org.apache.spark.sql.types._ scala> val expectedSchema = StructType(Seq(StructField("name", StringType, true),StructField("age", IntegerType, false))) expectedSchema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,false)) scala> val avroTypeStruct = SchemaConverters.toAvroType(expectedSchema).toString avroTypeStruct: String = {"type":"record","name":"topLevelRecord","fields":[{"name":"name","type":["string","null"]},{"name":"age","type":"int"}]} scala> val avroTypeStr = s""" | |{ | | "type": "int", | | "name": "key" | |} | """.stripMargin avroTypeStr: String = " { "type": "int", "name": "key" } " scala> import org.apache.spark.sql.avro.from_avro import org.apache.spark.sql.avro.from_avro scala> dfKV.select(from_avro('key, avroTypeStr)).show +-------------------+ |from_avro(key, int)| +-------------------+ | 1| | 2| | 3| +-------------------+ scala> dfKV.select(from_avro('value, avroTypeStruct)).show +---------------------------------------------+ |from_avro(value, struct<name:string,age:int>)| +---------------------------------------------+ | [Josh Duke, 50]| | [Josh Duke, 50]| | [Josh Duke, 50]| +---------------------------------------------+ {code} > from_avro function does not deserialize the Avro record of a struct column > type correctly > ----------------------------------------------------------------------------------------- > > Key: SPARK-27027 > URL: https://issues.apache.org/jira/browse/SPARK-27027 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 2.4.0 > Reporter: Hien Luu > Priority: Minor > > {{from_avro}} function produces wrong output of a struct field. 
See the > output at the bottom of the description. > {code} > import org.apache.spark.sql.types._ > import org.apache.spark.sql.avro._ > import org.apache.spark.sql.functions._ > spark.version > val df = Seq((1, "John Doe", 30), (2, "Mary Jane", 25), (3, "Josh Duke", > 50)).toDF("id", "name", "age") > val dfStruct = df.withColumn("value", struct("name","age")) > dfStruct.show > dfStruct.printSchema > val dfKV = dfStruct.select(to_avro('id).as("key"), > to_avro('value).as("value")) > val expectedSchema = StructType(Seq(StructField("name", StringType, > true),StructField("age", IntegerType, false))) > val avroTypeStruct = SchemaConverters.toAvroType(expectedSchema).toString > val avroTypeStr = s""" > |{ > | "type": "int", > | "name": "key" > |} > """.stripMargin > dfKV.select(from_avro('key, avroTypeStr)).show > dfKV.select(from_avro('value, avroTypeStruct)).show > // output of the last statement, which is not correct > +---------------------------------------------+ > |from_avro(value, struct<name:string,age:int>)| > +---------------------------------------------+ > | [Josh Duke, 50]| > | [Josh Duke, 50]| > | [Josh Duke, 50]| > +---------------------------------------------+ > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org