[GitHub] spark pull request #20894: [SPARK-23786][SQL] Checking column names of csv h...

gatorsmile Sun, 25 Mar 2018 09:40:59 -0700

Github user gatorsmile commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20894#discussion_r176946135
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala ---
    @@ -289,27 +294,52 @@ private[csv] object UnivocityParser {
        */
       def parseIterator(
           lines: Iterator[String],
    -      shouldDropHeader: Boolean,
           parser: UnivocityParser,
           schema: StructType): Iterator[InternalRow] = {
         val options = parser.options
     
    -    val linesWithoutHeader = if (shouldDropHeader) {
    -      // Note that if there are only comments in the first block, the header would probably
    -      // be not dropped.
    -      CSVUtils.dropHeaderLine(lines, options)
    -    } else {
    -      lines
    -    }
    -
         val filteredLines: Iterator[String] =
    -      CSVUtils.filterCommentAndEmpty(linesWithoutHeader, options)
    +      CSVUtils.filterCommentAndEmpty(lines, options)
     
         val safeParser = new FailureSafeParser[String](
           input => Seq(parser.parse(input)),
           parser.options.parseMode,
           schema,
           parser.options.columnNameOfCorruptRecord)
    +
         filteredLines.flatMap(safeParser.parse)
       }
    +
    +  def checkHeaderColumnNames(
    +    parser: UnivocityParser,
    +    schema: StructType,
    +    columnNames: Array[String],
    +    fileName: String
    +  ): Unit = {
    +    if (parser.options.checkHeader && columnNames != null) {
    +      val fieldNames = schema.map(_.name)
    +      val isMatched = fieldNames.zip(columnNames).forall { pair =>
    +        val (nameInSchema, nameInHeader) = pair
    +        nameInSchema == nameInHeader
    --- End diff --
    
    Do we care about case sensitivity here?



---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #20894: [SPARK-23786][SQL] Checking column names of csv h...

Reply via email to