[ https://issues.apache.org/jira/browse/SPARK-26179?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
DB Tsai updated SPARK-26179: ---------------------------- Description: Spark SQL map is internally represented by two arrays, and when two maps are concatenated, Spark simply concatenates arrays resulting in duplicated keys. The concat function should replace (k, v) in the left side. See the following example, {code:java} import org.apache.spark.sql.functions.udf val udf_map_concat = udf((a: Map[String, String], b: Map[String, String]) => { // ++ replaces any (k,v) in the left side a ++ b }) val newColumn = udf_map_concat($"data", map(lit("1"), concat(df("data")("1"), lit(", "), df("data")("2")))) val newColumnBuggy = map_concat($"data", map(lit("1"), concat(df("data")("1"), lit(", "), df("data")("2")))) val df2 = df.select(newColumn) val df3 = df.select(newColumnBuggy) df2.show(false) """ |+------------------------------------------------------------+ ||udf_map_concat(data, map(1, concat(data[1], , , data[2]))) | |+------------------------------------------------------------+ ||[2 -> Bananas, 3 -> Orange, 1 -> Apple, Bananas]| |+------------------------------------------------------------+ """.stripMargin df3.show(false) """ |+------------------------------------------------------------+ ||map_concat(data, map(1, concat(data[1], , , data[2]))) | |+------------------------------------------------------------+ ||[1 -> Apple, 2 -> Bananas, 3 -> Orange, 1 -> Apple, Bananas]| |+------------------------------------------------------------+ | """.stripMargin {code} was: See the following example, {noformat} import org.apache.spark.sql.functions.udf val udf_map_concat = udf((a: Map[String, String], b: Map[String, String]) => { // ++ replaces any (k,v) in the left side a ++ b }) val newColumn = udf_map_concat($"data", map(lit("1"), concat(df("data")("1"), lit(", "), df("data")("2")))) val newColumnBuggy = map_concat($"data", map(lit("1"), concat(df("data")("1"), lit(", "), df("data")("2")))) val df2 = df.select(newColumn) val df3 = df.select(newColumnBuggy) df2.printSchema() 
df2.show(false) """ |+------------------------------------------------------------+ ||udf_map_concat(data, map(1, concat(data[1], , , data[2]))) | |+------------------------------------------------------------+ ||[2 -> Bananas, 3 -> Orange, 1 -> Apple, Bananas]| |+------------------------------------------------------------+ """.stripMargin df3.show(false) """ |+------------------------------------------------------------+ ||map_concat(data, map(1, concat(data[1], , , data[2]))) | |+------------------------------------------------------------+ ||[1 -> Apple, 2 -> Bananas, 3 -> Orange, 1 -> Apple, Bananas]| |+------------------------------------------------------------+ | """.stripMargin {noformat} > `map_concat` should replace the value in the left side > ------------------------------------------------------ > > Key: SPARK-26179 > URL: https://issues.apache.org/jira/browse/SPARK-26179 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 2.4.0 > Reporter: DB Tsai > Priority: Major > > Spark SQL map is internally represented by two arrays, and when two maps are > concatenated, Spark simply concatenates arrays resulting in duplicated keys. The > concat function should replace (k, v) in the left side. 
> See the following example, > {code:java} > import org.apache.spark.sql.functions.udf > val udf_map_concat = udf((a: Map[String, String], b: Map[String, String]) => { > // ++ replaces any (k,v) in the left side > a ++ b > }) > val newColumn = udf_map_concat($"data", map(lit("1"), concat(df("data")("1"), > lit(", "), df("data")("2")))) > val newColumnBuggy = map_concat($"data", map(lit("1"), > concat(df("data")("1"), lit(", "), df("data")("2")))) > val df2 = df.select(newColumn) > val df3 = df.select(newColumnBuggy) > df2.show(false) > """ > |+------------------------------------------------------------+ > ||udf_map_concat(data, map(1, concat(data[1], , , data[2]))) | > |+------------------------------------------------------------+ > ||[2 -> Bananas, 3 -> Orange, 1 -> Apple, Bananas]| > |+------------------------------------------------------------+ > """.stripMargin > df3.show(false) > """ > |+------------------------------------------------------------+ > ||map_concat(data, map(1, concat(data[1], , , data[2]))) | > |+------------------------------------------------------------+ > ||[1 -> Apple, 2 -> Bananas, 3 -> Orange, 1 -> Apple, Bananas]| > |+------------------------------------------------------------+ > | > """.stripMargin > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org