[ https://issues.apache.org/jira/browse/SPARK-23621?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ravikumar Ramasamy updated SPARK-23621: --------------------------------------- Description: The configuration data is stored in Cassandra as unstructured data that contains string columns and one JSON-structured string column. To reproduce the issue, I stored the data in a Hive table and read it from there. DataFrame saveAsTable persists all the column values properly, but the insertInto function does not store all the columns: in particular, the JSON data is truncated and the subsequent column is not stored in the Hive table. {code:java} CREATE TABLE zone_status ( siteid string, orgid string, groupid string, zoneid string, parkingtype string, capacity int, config string, ts bigint) STORED AS TEXTFILE; {code} {code:java} val spark = SparkSession.builder().appName("Spark SQL Test"). config("hive.exec.dynamic.partition", "true"). config("hive.exec.dynamic.partition.mode", "nonstrict"). enableHiveSupport().getOrCreate() val zoneStatus = spark.table("zone_status") zoneStatus.select(col("siteid"),col("orgid"), col("parkinggroupid"), col("parkingzoneid"), col("parkingtype"), lit(0), col("config"), unix_timestamp().alias("ts")). 
write.mode(SaveMode.Overwrite).saveAsTable("dwh_zone_status_save") {code} Records in dwh_zone_status_save table {noformat} a8f11f90-20c9-11e8-b93e-2fc569d27605 efe5bdb3-baac-5d8e-6cae57771c13 Unknown E657F298-2D96-4C7D-8516-E228153FE010 NonDemarcated 0 {"orgid":"efe5bdb3-baac-5d8e-6cae57771c13","nodeid":"N02c00056","parkingzoneid":"E657F298-2D96-4C7D-8516-E228153FE010","siteid":"a8f11f90-20c9-11e8-b93e-2fc569d27605","channel":1,"type":"NonDemarcatedParkingConfig","active":true,"tag":"","configured_date":"2017-10-23 23:29:11.20","roi":{"roiid":"7854D5F1-9ECD-4E02-8364-7BFB15C2A01C","name":"Parking_Area_1","image_bounding_box":[[{"x":0.5083333253860474,"y":0.25468748807907104},{"x":0.6277777552604675,"y":0.45781248807907104},{"x":0.855555534362793,"y":0.42656248807907104},{"x":0.7138888835906982,"y":0.17656250298023224}]],"world_bounding_box":[[{"latitude":41.88759132852836,"longitude":-87.62231239554004},{"latitude":41.887652271934634,"longitude":-87.62230098708424},{"latitude":41.88765219325104,"longitude":-87.62227158629935},{"latitude":41.88757153728604,"longitude":-87.62227165116063}]],"vs":[5.0,1.7999999523162842,1.5]}} 1520453589{noformat} {code:java} zoneStatus. select(col("siteid"),col("orgid"), col("parkinggroupid"), col("parkingzoneid"), col("parkingtype"), lit(0), col("config"), unix_timestamp().alias("ts")). write.mode(SaveMode.Overwrite).insertInto("dwh_zone_status_insert") {code} Records in dwh_zone_status_insert table is {noformat} 985feb70-18f4-11e8-9912-e9bbd4db7f62 efe5bdb3-baac-5d8e-6cae57771c13 Unknown 04ABD29C-FA0F-4E4D-BFF2-4EC290DC29AE Demarcated 0 {"description":"" NULL{noformat} The json string column is not storing entire content and sub-sequent columns values also not stored in table. The defined table is TEXT format only. Our Environment is : scala 2.11.8 Spark 2.2.0 Hive EMR was: The configuration data is stored in Cassandra which is unstructured data contains string columns and one json structure string column. 
In this case, DataFrame saveAsTable is persisting all the column values properly but insertInto function is not storing all the columns especially json data is truncated and sub-sequent column in not stored. To reproduce the issue, I stored the data into Hive table and reading from there. {code:java} CREATE TABLE zone_status ( siteid string, orgid string, groupid string, zoneid string, parkingtype string, capacity int, config string, ts bigint) STORED AS TEXTFILE; {code} {code:java} val spark = SparkSession.builder().appName("Spark SQL Test"). config("hive.exec.dynamic.partition", "true"). config("hive.exec.dynamic.partition.mode", "nonstrict"). enableHiveSupport().getOrCreate() val zoneStatus = spark.table("zone_status") zoneStatus.select(col("siteid"),col("orgid"), col("parkinggroupid"), col("parkingzoneid"), col("parkingtype"), lit(0), col("config"), unix_timestamp().alias("ts")). write.mode(SaveMode.Overwrite).saveAsTable("dwh_zone_status_save") {code} Records in dwh_zone_status_save table {noformat} a8f11f90-20c9-11e8-b93e-2fc569d27605 efe5bdb3-baac-5d8e-6cae57771c13 Unknown E657F298-2D96-4C7D-8516-E228153FE010 NonDemarcated 0 {"orgid":"efe5bdb3-baac-5d8e-6cae57771c13","nodeid":"N02c00056","parkingzoneid":"E657F298-2D96-4C7D-8516-E228153FE010","siteid":"a8f11f90-20c9-11e8-b93e-2fc569d27605","channel":1,"type":"NonDemarcatedParkingConfig","active":true,"tag":"","configured_date":"2017-10-23 
23:29:11.20","roi":{"roiid":"7854D5F1-9ECD-4E02-8364-7BFB15C2A01C","name":"Parking_Area_1","image_bounding_box":[[{"x":0.5083333253860474,"y":0.25468748807907104},{"x":0.6277777552604675,"y":0.45781248807907104},{"x":0.855555534362793,"y":0.42656248807907104},{"x":0.7138888835906982,"y":0.17656250298023224}]],"world_bounding_box":[[{"latitude":41.88759132852836,"longitude":-87.62231239554004},{"latitude":41.887652271934634,"longitude":-87.62230098708424},{"latitude":41.88765219325104,"longitude":-87.62227158629935},{"latitude":41.88757153728604,"longitude":-87.62227165116063}]],"vs":[5.0,1.7999999523162842,1.5]}} 1520453589{noformat} {code:java} zoneStatus. select(col("siteid"),col("orgid"), col("parkinggroupid"), col("parkingzoneid"), col("parkingtype"), lit(0), col("config"), unix_timestamp().alias("ts")). write.mode(SaveMode.Overwrite).insertInto("dwh_zone_status_insert") {code} Records in dwh_zone_status_insert table is {noformat} 985feb70-18f4-11e8-9912-e9bbd4db7f62 efe5bdb3-baac-5d8e-6cae57771c13 Unknown 04ABD29C-FA0F-4E4D-BFF2-4EC290DC29AE Demarcated 0 {"description":"" NULL{noformat} The json string column is not storing entire content and sub-sequent columns values also not stored in table. The defined table is TEXT format only. Our Environment is : scala 2.11.8 Spark 2.2.0 Hive EMR > DataFrame.insertInto() is persisting all columns for mixed structured > data-type > ------------------------------------------------------------------------------- > > Key: SPARK-23621 > URL: https://issues.apache.org/jira/browse/SPARK-23621 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 2.2.0 > Reporter: Ravikumar Ramasamy > Priority: Major > Attachments: sample_data.csv > > > The configuration data is stored in Cassandra which is unstructured data > contains string columns and one json structure string column. > To reproduce the issue, I stored the data into Hive table and reading from > there. 
> DataFrame saveAsTable persists all the column values properly, but the > insertInto function does not store all the columns: in particular, the JSON data is > truncated and the subsequent column is not stored in the Hive table. > > {code:java} > CREATE TABLE zone_status ( > siteid string, > orgid string, > groupid string, > zoneid string, > parkingtype string, > capacity int, > config string, > ts bigint) > STORED AS TEXTFILE; > {code} > {code:java} > val spark = SparkSession.builder().appName("Spark SQL Test"). > config("hive.exec.dynamic.partition", "true"). > config("hive.exec.dynamic.partition.mode", "nonstrict"). > enableHiveSupport().getOrCreate() > val zoneStatus = spark.table("zone_status") > zoneStatus.select(col("siteid"),col("orgid"), col("parkinggroupid"), > col("parkingzoneid"), col("parkingtype"), lit(0), col("config"), > unix_timestamp().alias("ts")). > write.mode(SaveMode.Overwrite).saveAsTable("dwh_zone_status_save") > {code} > Records in dwh_zone_status_save table > {noformat} > a8f11f90-20c9-11e8-b93e-2fc569d27605 efe5bdb3-baac-5d8e-6cae57771c13 Unknown > E657F298-2D96-4C7D-8516-E228153FE010 NonDemarcated 0 > {"orgid":"efe5bdb3-baac-5d8e-6cae57771c13","nodeid":"N02c00056","parkingzoneid":"E657F298-2D96-4C7D-8516-E228153FE010","siteid":"a8f11f90-20c9-11e8-b93e-2fc569d27605","channel":1,"type":"NonDemarcatedParkingConfig","active":true,"tag":"","configured_date":"2017-10-23 > > 
23:29:11.20","roi":{"roiid":"7854D5F1-9ECD-4E02-8364-7BFB15C2A01C","name":"Parking_Area_1","image_bounding_box":[[{"x":0.5083333253860474,"y":0.25468748807907104},{"x":0.6277777552604675,"y":0.45781248807907104},{"x":0.855555534362793,"y":0.42656248807907104},{"x":0.7138888835906982,"y":0.17656250298023224}]],"world_bounding_box":[[{"latitude":41.88759132852836,"longitude":-87.62231239554004},{"latitude":41.887652271934634,"longitude":-87.62230098708424},{"latitude":41.88765219325104,"longitude":-87.62227158629935},{"latitude":41.88757153728604,"longitude":-87.62227165116063}]],"vs":[5.0,1.7999999523162842,1.5]}} > 1520453589{noformat} > > {code:java} > zoneStatus. > select(col("siteid"),col("orgid"), col("parkinggroupid"), > col("parkingzoneid"), col("parkingtype"), lit(0), col("config"), > unix_timestamp().alias("ts")). > write.mode(SaveMode.Overwrite).insertInto("dwh_zone_status_insert") > {code} > Records in dwh_zone_status_insert table is > {noformat} > 985feb70-18f4-11e8-9912-e9bbd4db7f62 efe5bdb3-baac-5d8e-6cae57771c13 Unknown > 04ABD29C-FA0F-4E4D-BFF2-4EC290DC29AE Demarcated 0 {"description":"" > NULL{noformat} > > The json string column is not storing entire content and sub-sequent columns > values also not stored in table. The defined table is TEXT format only. > Our Environment is : > scala 2.11.8 > Spark 2.2.0 > Hive > EMR > -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org