Github user HyukjinKwon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20959#discussion_r182607885

    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala ---
    @@ -161,7 +161,8 @@ object TextInputCSVDataSource extends CSVDataSource {
           val firstRow = new CsvParser(parsedOptions.asParserSettings).parseLine(firstLine)
           val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
           val header = makeSafeHeader(firstRow, caseSensitive, parsedOptions)
    -      val tokenRDD = csv.rdd.mapPartitions { iter =>
    +      val sampled: Dataset[String] = CSVUtils.sample(csv, parsedOptions)
    +      val tokenRDD = sampled.rdd.mapPartitions { iter =>
    --- End diff --

    ```
    $ tree .
    .
    ├── a.json
    ├── b.json
    ├── c.json
    ├── d.json
    ├── e.json
    ├── f.json
    └── h.json

    0 directories, 7 files
    ```

    ```
    $ cat *
    {"a": "a"}
    {"a": 1}
    {"a": 1}
    {"a": 1}
    {"a": 1}
    {"a": 1}
    {"a": 1}
    ```

    ```scala
    scala> spark.read.option("samplingRatio", 0.1).json("tmp").schema
    res0: org.apache.spark.sql.types.StructType = StructType(StructField(a,LongType,true))

    scala> spark.read.option("samplingRatio", 1).json("tmp").schema
    res1: org.apache.spark.sql.types.StructType = StructType(StructField(a,StringType,true))
    ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org