[ https://issues.apache.org/jira/browse/SPARK-6780?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14485586#comment-14485586 ]
Ilya Ganelin commented on SPARK-6780:
-------------------------------------
Matching test code:
{code}
test("saveAsHadoopFileByKey should generate a text file per key") {
val testPairs : JavaRDD[Array[Byte]] = sc.parallelize(
Seq(
Array(1.toByte,1.toByte),
Array(2.toByte,4.toByte),
Array(3.toByte,9.toByte),
Array(4.toByte,16.toByte),
Array(5.toByte,25.toByte))
).toJavaRDD()
val fs = FileSystem.get(new Configuration())
val basePath = sc.conf.get("spark.local.dir", "/tmp")
val fullPath = basePath + "/testPath"
fs.delete(new Path(fullPath), true)
PythonRDD.saveAsHadoopFileByKey(
testPairs,
false,
fullPath,
classOf[RDDMultipleTextOutputFormat].toString,
classOf[Int].toString,
classOf[Int].toString,
null,
null,
new java.util.HashMap(), "")
// Test that a file was created for each key
(1 to 5).foreach(key => {
val testPath = new Path(fullPath + "/" + key)
assert(fs.exists(testPath))
// Read the file and test that the contents are the values matching that
key split by line
val input = fs.open(testPath)
val reader = new BufferedReader(new InputStreamReader(input))
val values = new HashSet[Int]
val lines = Stream.continually(reader.readLine()).takeWhile(_ != null)
lines.foreach(s => values += s.toInt)
assert(values.contains(key*key))
})
fs.delete(new Path(fullPath), true)
}
{code}
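For reference, the RDDMultipleTextOutputFormat the test relies on is not shown in this comment. Below is a minimal sketch of what the per-key assertion assumes, following the MultipleTextOutputFormat pattern from SPARK-3533; the override here is an assumption about the eventual implementation, not committed code:
{code}
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat

// Sketch only: route each record to a file named after its key, so the
// values for key 1 land in <outputDir>/1, key 2 in <outputDir>/2, and so on.
class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key.toString
}
{code}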
> Add saveAsTextFileByKey method for PySpark
> ------------------------------------------
>
> Key: SPARK-6780
> URL: https://issues.apache.org/jira/browse/SPARK-6780
> Project: Spark
> Issue Type: Improvement
> Components: PySpark
> Reporter: Ilya Ganelin
>
> The PySpark API should have a method to allow saving a key-value RDD to
> subdirectories organized by key, as in:
> https://issues.apache.org/jira/browse/SPARK-3533