Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/19464#discussion_r144353093
--- Diff: core/src/test/scala/org/apache/spark/FileSuite.scala ---
@@ -510,4 +510,54 @@ class FileSuite extends SparkFunSuite with
LocalSparkContext {
}
}
+ test("spark.files.ignoreEmptySplits work correctly (old Hadoop API)") {
+ val conf = new SparkConf()
+ conf.setAppName("test").setMaster("local").set(IGNORE_EMPTY_SPLITS,
true)
+ sc = new SparkContext(conf)
+
+ def testIgnoreEmptySplits(data: Array[Tuple2[String, String]],
numSlices: Int,
+ outputSuffix: Int, checkPart: String,
partitionLength: Int): Unit = {
+ val dataRDD = sc.parallelize(data, numSlices)
+ val output = new File(tempDir, "output" + outputSuffix)
+ dataRDD.saveAsHadoopFile[TextOutputFormat[String,
String]](output.getPath)
+ assert(new File(output, checkPart).exists() === true)
+ val hadoopRDD = sc.textFile(new File(output, "part-*").getPath)
+ assert(hadoopRDD.partitions.length === partitionLength)
+ }
+
+ // Ensure that if all of the splits are empty, we remove the splits
correctly
+ testIgnoreEmptySplits(Array.empty[Tuple2[String, String]], 1, 0,
"part-00000", 0)
--- End diff --
I'd call it with named arguments (this also assumes `partitionLength` is renamed to `expectedPartitionNum`), for example:
```scala
testIgnoreEmptySplits(
Array.empty[Tuple2[String, String]],
numSlices = 1,
outputSuffix = 0,
checkPart = "part-00000",
expectedPartitionNum = 0)
```
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]