GitHub user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21546#discussion_r194949076
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala ---
@@ -34,17 +34,36 @@ private[sql] object PythonSQLUtils {
   }
 
   /**
-   * Python Callable function to convert ArrowPayloads into a [[DataFrame]].
+   * Python callable function to convert an RDD of serialized ArrowRecordBatches into
+   * a [[DataFrame]].
    *
-   * @param payloadRDD A JavaRDD of ArrowPayloads.
-   * @param schemaString JSON Formatted Schema for ArrowPayloads.
+   * @param arrowBatchRDD A JavaRDD of serialized ArrowRecordBatches.
+   * @param schemaString JSON Formatted Spark schema for Arrow batches.
    * @param sqlContext The active [[SQLContext]].
    * @return The converted [[DataFrame]].
    */
-  def arrowPayloadToDataFrame(
-      payloadRDD: JavaRDD[Array[Byte]],
+  def arrowStreamToDataFrame(
+      arrowBatchRDD: JavaRDD[Array[Byte]],
       schemaString: String,
       sqlContext: SQLContext): DataFrame = {
-    ArrowConverters.toDataFrame(payloadRDD, schemaString, sqlContext)
+    ArrowConverters.toDataFrame(arrowBatchRDD, schemaString, sqlContext)
+  }
+
+  /**
+   * Python callable function to read a file in Arrow stream format and create a [[DataFrame]]
+   * using each serialized ArrowRecordBatch as a partition.
+   *
+   * @param sqlContext The active [[SQLContext]].
+   * @param filename File to read the Arrow stream from.
+   * @param schemaString JSON Formatted Spark schema for Arrow batches.
+   * @return A new [[DataFrame]].
+   */
+  def arrowReadStreamFromFile(
+      sqlContext: SQLContext,
+      filename: String,
+      schemaString: String): DataFrame = {
+    JavaSparkContext.fromSparkContext(sqlContext.sparkContext)
--- End diff ---
What is this line for?
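
For reference, a minimal sketch of how `arrowReadStreamFromFile` could be exercised from the JVM side. Everything here except the method signature is an assumption for illustration: the session setup, the schema JSON, and the file path are placeholders, and the caller is placed under the `org.apache.spark.sql` package tree only because `PythonSQLUtils` is `private[sql]`.

```scala
package org.apache.spark.sql.api.python

import org.apache.spark.sql.{DataFrame, SparkSession}

// Hypothetical caller (illustration only); PythonSQLUtils needs no import
// here because this sketch lives in the same package.
object ArrowReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("arrow-read-sketch")
      .getOrCreate()

    // JSON-formatted Spark schema for the Arrow batches (assumed example:
    // a single non-nullable long column named "id").
    val schemaJson =
      """{"type":"struct","fields":[{"name":"id","type":"long","nullable":false,"metadata":{}}]}"""

    // Read a file in Arrow stream format into a DataFrame, one partition per
    // serialized ArrowRecordBatch ("/tmp/batches.arrow" is a placeholder path).
    val df: DataFrame = PythonSQLUtils.arrowReadStreamFromFile(
      spark.sqlContext, "/tmp/batches.arrow", schemaJson)
    df.show()

    spark.stop()
  }
}
```

In PySpark these helpers would be reached through the Py4J gateway rather than called directly as above.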