Github user falaki commented on a diff in the pull request:

    https://github.com/apache/spark/pull/14745#discussion_r75766440
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala ---
    @@ -57,28 +57,45 @@ class CSVFileFormat extends TextBasedFileFormat with 
DataSourceRegister {
         val rdd = baseRdd(sparkSession, csvOptions, paths)
         val firstLine = findFirstLine(csvOptions, rdd)
         val firstRow = new CsvReader(csvOptions).parseLine(firstLine)
    -
    -    val header = if (csvOptions.headerFlag) {
    -      firstRow.zipWithIndex.map { case (value, index) =>
    -        if (value == null || value.isEmpty || value == 
csvOptions.nullValue) s"_c$index" else value
    -      }
    -    } else {
    -      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    -    }
    +    val header = makeSafeHeader(firstRow, csvOptions)
     
         val parsedRdd = tokenRdd(sparkSession, csvOptions, header, paths)
         val schema = if (csvOptions.inferSchemaFlag) {
           CSVInferSchema.infer(parsedRdd, header, csvOptions)
         } else {
           // By default fields are assumed to be StringType
           val schemaFields = header.map { fieldName =>
    -        StructField(fieldName.toString, StringType, nullable = true)
    +        StructField(fieldName, StringType, nullable = true)
           }
           StructType(schemaFields)
         }
         Some(schema)
       }
     
    +  /**
    +   * Generates a header from the given row which is null-safe and 
duplicates-safe.
    +   */
    +  private def makeSafeHeader(row: Array[String], options: CSVOptions): 
Array[String] = {
    --- End diff --
    
    I suggest putting this function in utils and writing a separate unit test
    for it.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working,
please contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to