Repository: spark Updated Branches: refs/heads/branch-1.5 0f4ccdc4c -> 5bbb2d327
[SPARK-8670] [SQL] Nested columns can't be referenced in pyspark This bug is caused by an incorrect column-existence check in `__getitem__` of the PySpark DataFrame. `DataFrame.apply` accepts not only top-level column names, but also nested column names like `a.b`, so we should remove that check from `__getitem__`. Author: Wenchen Fan <cloud0...@outlook.com> Closes #8202 from cloud-fan/nested. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5bbb2d32 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5bbb2d32 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5bbb2d32 Branch: refs/heads/branch-1.5 Commit: 5bbb2d327d3fbcdb0d631ad3b8d960dfada64f9a Parents: 0f4ccdc Author: Wenchen Fan <cloud0...@outlook.com> Authored: Fri Aug 14 14:09:46 2015 -0700 Committer: Michael Armbrust <mich...@databricks.com> Committed: Fri Aug 14 14:12:57 2015 -0700 ---------------------------------------------------------------------- python/pyspark/sql/dataframe.py | 2 -- python/pyspark/sql/tests.py | 4 +++- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/5bbb2d32/python/pyspark/sql/dataframe.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 09647ff..da742d7 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -722,8 +722,6 @@ class DataFrame(object): [Row(age=5, name=u'Bob')] """ if isinstance(item, basestring): - if item not in self.columns: - raise IndexError("no such column: %s" % item) jc = self._jdf.apply(item) return Column(jc) elif isinstance(item, Column): http://git-wip-us.apache.org/repos/asf/spark/blob/5bbb2d32/python/pyspark/sql/tests.py 
---------------------------------------------------------------------- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 9b74810..13cf647 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -770,7 +770,7 @@ class SQLTests(ReusedPySparkTestCase): self.assertTrue(isinstance(df['key'], Column)) self.assertTrue(isinstance(df[0], Column)) self.assertRaises(IndexError, lambda: df[2]) - self.assertRaises(IndexError, lambda: df["bad_key"]) + self.assertRaises(AnalysisException, lambda: df["bad_key"]) self.assertRaises(TypeError, lambda: df[{}]) def test_column_name_with_non_ascii(self): @@ -794,7 +794,9 @@ class SQLTests(ReusedPySparkTestCase): df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF() self.assertEqual(1, df.select(df.l[0]).first()[0]) self.assertEqual(1, df.select(df.r["a"]).first()[0]) + self.assertEqual(1, df.select(df["r.a"]).first()[0]) self.assertEqual("b", df.select(df.r["b"]).first()[0]) + self.assertEqual("b", df.select(df["r.b"]).first()[0]) self.assertEqual("v", df.select(df.d["k"]).first()[0]) def test_infer_long_type(self): http://git-wip-us.apache.org/repos/asf/spark/blob/5bbb2d32/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index c466d9e..ec5084a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -634,6 +634,7 @@ class DataFrame private[sql]( /** * Selects column based on the column name and return it as a [[Column]]. + * Note that the column name can also reference to a nested column like `a.b`. * @group dfops * @since 1.3.0 */ @@ -641,6 +642,7 @@ class DataFrame private[sql]( /** * Selects column based on the column name and return it as a [[Column]]. 
+ * Note that the column name can also reference to a nested column like `a.b`. * @group dfops * @since 1.3.0 */ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org