Repository: spark Updated Branches: refs/heads/master 0ef9fe64e -> b56f79cc3
[SPARK-20090][PYTHON] Add StructType.fieldNames in PySpark ## What changes were proposed in this pull request? This PR proposes `StructType.fieldNames` that returns a copy of a field name list rather than an (undocumented) `StructType.names`. There are two points here: - API consistency with Scala/Java - Provide a safe way to get the field names. Manipulating these might cause unexpected behaviour as below: ```python from pyspark.sql.types import * struct = StructType([StructField("f1", StringType(), True)]) names = struct.names del names[0] spark.createDataFrame([{"f1": 1}], struct).show() ``` ``` ... java.lang.IllegalStateException: Input row doesn't have expected number of values required by the schema. 1 fields are required while 0 values are provided. at org.apache.spark.sql.execution.python.EvaluatePython$.fromJava(EvaluatePython.scala:138) at org.apache.spark.sql.SparkSession$$anonfun$6.apply(SparkSession.scala:741) at org.apache.spark.sql.SparkSession$$anonfun$6.apply(SparkSession.scala:741) ... ``` ## How was this patch tested? Added tests in `python/pyspark/sql/tests.py`. Author: hyukjinkwon <gurwls...@gmail.com> Closes #18618 from HyukjinKwon/SPARK-20090. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b56f79cc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b56f79cc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b56f79cc Branch: refs/heads/master Commit: b56f79cc359d093d757af83171175cfd933162d1 Parents: 0ef9fe6 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Fri Jul 28 20:59:32 2017 -0700 Committer: Holden Karau <hol...@us.ibm.com> Committed: Fri Jul 28 20:59:32 2017 -0700 ---------------------------------------------------------------------- python/pyspark/sql/tests.py | 16 ++++++++-------- python/pyspark/sql/types.py | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/b56f79cc/python/pyspark/sql/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 54756ed..cfd9c55 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1241,26 +1241,29 @@ class SQLTests(ReusedPySparkTestCase): struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) struct2 = StructType([StructField("f1", StringType(), True), StructField("f2", StringType(), True, None)]) + self.assertEqual(struct1.fieldNames(), struct2.names) self.assertEqual(struct1, struct2) struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) struct2 = StructType([StructField("f1", StringType(), True)]) + self.assertNotEqual(struct1.fieldNames(), struct2.names) self.assertNotEqual(struct1, struct2) struct1 = (StructType().add(StructField("f1", StringType(), True)) .add(StructField("f2", StringType(), True, None))) struct2 = StructType([StructField("f1", StringType(), True), StructField("f2", StringType(), True, None)]) + 
self.assertEqual(struct1.fieldNames(), struct2.names) self.assertEqual(struct1, struct2) struct1 = (StructType().add(StructField("f1", StringType(), True)) .add(StructField("f2", StringType(), True, None))) struct2 = StructType([StructField("f1", StringType(), True)]) + self.assertNotEqual(struct1.fieldNames(), struct2.names) self.assertNotEqual(struct1, struct2) # Catch exception raised during improper construction - with self.assertRaises(ValueError): - struct1 = StructType().add("name") + self.assertRaises(ValueError, lambda: StructType().add("name")) struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) for field in struct1: @@ -1273,12 +1276,9 @@ class SQLTests(ReusedPySparkTestCase): self.assertIs(struct1["f1"], struct1.fields[0]) self.assertIs(struct1[0], struct1.fields[0]) self.assertEqual(struct1[0:1], StructType(struct1.fields[0:1])) - with self.assertRaises(KeyError): - not_a_field = struct1["f9"] - with self.assertRaises(IndexError): - not_a_field = struct1[9] - with self.assertRaises(TypeError): - not_a_field = struct1[9.9] + self.assertRaises(KeyError, lambda: struct1["f9"]) + self.assertRaises(IndexError, lambda: struct1[9]) + self.assertRaises(TypeError, lambda: struct1[9.9]) def test_parse_datatype_string(self): from pyspark.sql.types import _all_atomic_types, _parse_datatype_string http://git-wip-us.apache.org/repos/asf/spark/blob/b56f79cc/python/pyspark/sql/types.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index c376805..ecb8eb9 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -446,9 +446,12 @@ class StructType(DataType): This is the data type representing a :class:`Row`. - Iterating a :class:`StructType` will iterate its :class:`StructField`s. + Iterating a :class:`StructType` will iterate its :class:`StructField`\\s. 
A contained :class:`StructField` can be accessed by name or position. + .. note:: `names` attribute is deprecated in 2.3. Use `fieldNames` method instead + to get a list of field names. + >>> struct1 = StructType([StructField("f1", StringType(), True)]) >>> struct1["f1"] StructField(f1,StringType,true) @@ -563,6 +566,16 @@ class StructType(DataType): def fromJson(cls, json): return StructType([StructField.fromJson(f) for f in json["fields"]]) + def fieldNames(self): + """ + Returns all field names in a list. + + >>> struct = StructType([StructField("f1", StringType(), True)]) + >>> struct.fieldNames() + ['f1'] + """ + return list(self.names) + def needConversion(self): # We need convert Row()/namedtuple into tuple() return True --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org