Github user mateiz commented on a diff in the pull request:
https://github.com/apache/spark/pull/363#discussion_r11554591
--- Diff: python/pyspark/rdd.py ---
@@ -1387,6 +1387,95 @@ def _jrdd(self):
def _is_pipelinable(self):
return not (self.is_cached or self.is_checkpointed)
+class Row(dict):
+ """
+ An extended L{dict} that takes a L{dict} in its constructor, and
exposes those items as fields.
+
+ >>> r = Row({"hello" : "world", "foo" : "bar"})
+ >>> r.hello
+ 'world'
+ >>> r.foo
+ 'bar'
+ """
+
+ def __init__(self, d):
+ d.update(self.__dict__)
+ self.__dict__ = d
+ dict.__init__(self, d)
+
+class SchemaRDD(RDD):
+ """
+ An RDD of Row objects that has an associated schema. The underlying
JVM object is a SchemaRDD,
+ not a PythonRDD, so we can utilize the relational query api exposed by
SparkSQL.
+
+ For normal L{RDD} operations (map, count, etc.) the L{SchemaRDD} is
not operated on directly, as
+ its underlying implementation is an RDD composed of Java objects.
Instead it is converted to a
+ PythonRDD in the JVM, on which Python operations can be done.
+ """
+
+ def __init__(self, jschema_rdd, sql_ctx):
+ self.sql_ctx = sql_ctx
+ self._sc = sql_ctx._sc
+ self._jschema_rdd = jschema_rdd
+
+ self.is_cached = False
+ self.is_checkpointed = False
+ self.ctx = self.sql_ctx._sc
+ self._jrdd_deserializer = self.ctx.serializer
+
+ @property
+ def _jrdd(self):
+ """
+ Lazy evaluation of PythonRDD object. Only done when a user calls
methods defined by the
+ L{RDD} super class (map, count, etc.).
+ """
+ return self.toPython()._jrdd
+
+ @property
+ def _id(self):
+ return self._jrdd.id()
+
+ def saveAsParquetFile(self, path):
+ """
+ Saves the contents of this L{SchemaRDD} as a parquet file,
preserving the schema. Files
+ that are written out using this method can be read back in as a
SchemaRDD using the
+ L{SQLContext.parquetFile} method.
+
+ >>> from pyspark.context import SQLContext
+ >>> sqlCtx = SQLContext(sc)
+ >>> rdd = sc.parallelize([{"field1" : 1, "field2" : "row1"},
+ ... {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2":
"row3"}])
+ >>> srdd = sqlCtx.inferSchema(rdd)
+ >>> srdd.saveAsParquetFile("/tmp/test.parquet")
+ >>> srdd2 = sqlCtx.parquetFile("/tmp/test.parquet")
+ >>> srdd2.collect() == srdd.collect()
+ True
+ """
+ self._jschema_rdd.saveAsParquetFile(path)
+
+ def registerAsTable(self, name):
+ """
+ Registers this RDD as a temporary table using the given name. The
lifetime of this temporary
+ table is tied to the L{SQLContext} that was used to create this
SchemaRDD.
+
+ >>> from pyspark.context import SQLContext
+ >>> sqlCtx = SQLContext(sc)
+ >>> rdd = sc.parallelize([{"field1" : 1, "field2" : "row1"},
+ ... {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2":
"row3"}])
+ >>> srdd = sqlCtx.inferSchema(rdd)
+ >>> srdd.registerAsTable("test")
+ >>> srdd2 = sqlCtx.sql("select * from test")
+ >>> srdd.collect() == srdd2.collect()
+ True
+ """
+ self._jschema_rdd.registerAsTable(name)
+
+ def toPython(self):
--- End diff --
Is this supposed to be a public method? From the examples it seems that you
can call map and collect and such on the SchemaRDD itself. In that case this
should be called _toPython to make it private, or you can inline it in the
computation of _jrdd.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---