GitHub user gatorsmile commented on a diff in the pull request:
https://github.com/apache/spark/pull/20963#discussion_r179934815
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
---
@@ -2127,4 +2127,39 @@ class JsonSuite extends QueryTest with
SharedSQLContext with TestJsonData {
assert(df.schema === expectedSchema)
}
}
+
+ test("SPARK-23849: schema inferring touches less data if samplingRation
< 1.0") {
+ val predefinedSample = Set[Int](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
+ 57, 62, 68, 72)
+ withTempPath { path =>
+ val writer = Files.newBufferedWriter(Paths.get(path.getAbsolutePath),
+ StandardCharsets.UTF_8, StandardOpenOption.CREATE_NEW)
+ for (i <- 0 until 100) {
+ if (predefinedSample.contains(i)) {
+ writer.write(s"""{"f1":${i.toString}}""" + "\n")
+ } else {
+ writer.write(s"""{"f1":${(i.toDouble + 0.1).toString}}""" + "\n")
+ }
+ }
+ writer.close()
+
+ val ds = spark.read.option("samplingRatio",
0.1).json(path.getCanonicalPath)
--- End diff --
Yes. The seed is also given.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]