This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new f6be769  [SPARK-37668][PYTHON] 'Index' object has no attribute 'levels' in pyspark.pandas.frame.DataFrame.insert
f6be769 is described below

commit f6be7693ba66c81fda8ee97ec7c6346e34495235
Author: itholic <haejoon....@databricks.com>
AuthorDate: Thu Dec 23 22:15:07 2021 +0900

    [SPARK-37668][PYTHON] 'Index' object has no attribute 'levels' in pyspark.pandas.frame.DataFrame.insert

    ### What changes were proposed in this pull request?

    This PR proposes to address the unexpected error in `pyspark.pandas.frame.DataFrame.insert`.

    Assigning a tuple as a column name is only supported for MultiIndex columns for now in pandas API on Spark:

    ```python
    # MultiIndex column
    >>> psdf
       x
       y
    0  1
    1  2
    2  3
    >>> psdf[('a', 'b')] = [4, 5, 6]
    >>> psdf
       x  a
       y  b
    0  1  4
    1  2  5
    2  3  6

    # However, not supported for non-MultiIndex column
    >>> psdf
       A
    0  1
    1  2
    2  3
    >>> psdf[('a', 'b')] = [4, 5, 6]
    Traceback (most recent call last):
    ...
    KeyError: 'Key length (2) exceeds index depth (1)'
    ```

    So, we should show a proper error message rather than `AttributeError: 'Index' object has no attribute 'levels'` when users try to insert a tuple-named column.

    **Before**
    ```python
    >>> psdf.insert(0, ("a", "b"), 10)
    Traceback (most recent call last):
    ...
    AttributeError: 'Index' object has no attribute 'levels'
    ```

    **After**
    ```python
    >>> psdf.insert(0, ("a", "b"), 10)
    Traceback (most recent call last):
    ...
    NotImplementedError: Assigning column name as tuple is only supported for MultiIndex columns for now.
    ```

    ### Why are the changes needed?

    Let users know the proper usage.

    ### Does this PR introduce _any_ user-facing change?

    Yes, the exception message is changed as described in the **After**.

    ### How was this patch tested?

    Unittests.

    Closes #34957 from itholic/SPARK-37668.
Authored-by: itholic <haejoon....@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/pandas/frame.py                | 14 ++++++++++----
 python/pyspark/pandas/tests/test_dataframe.py | 11 +++++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index c2a7385..985bd9b 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -3988,13 +3988,19 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
                 '"column" should be a scalar value or tuple that contains scalar values'
             )

+        # TODO(SPARK-37723): Support tuple for non-MultiIndex column name.
         if is_name_like_tuple(column):
-            if len(column) != len(self.columns.levels):  # type: ignore[attr-defined]  # SPARK-37668
-                # To be consistent with pandas
-                raise ValueError('"column" must have length equal to number of column levels.')
+            if self._internal.column_labels_level > 1:
+                if len(column) != len(self.columns.levels):  # type: ignore[attr-defined]
+                    # To be consistent with pandas
+                    raise ValueError('"column" must have length equal to number of column levels.')
+            else:
+                raise NotImplementedError(
+                    "Assigning column name as tuple is only supported for MultiIndex columns for now."
+                )

         if column in self.columns:
-            raise ValueError("cannot insert %s, already exists" % column)
+            raise ValueError("cannot insert %s, already exists" % str(column))

         psdf = self.copy()
         psdf[column] = value
diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index 84a53c0..88416d3 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -223,6 +223,12 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
             "loc must be int",
             lambda: psdf.insert((1,), "b", 10),
         )
+        self.assertRaisesRegex(
+            NotImplementedError,
+            "Assigning column name as tuple is only supported for MultiIndex columns for now.",
+            lambda: psdf.insert(0, ("e",), 10),
+        )
+
         self.assertRaises(ValueError, lambda: psdf.insert(0, "e", [7, 8, 9, 10]))
         self.assertRaises(ValueError, lambda: psdf.insert(0, "f", ps.Series([7, 8])))
         self.assertRaises(AssertionError, lambda: psdf.insert(100, "y", psser))
@@ -249,6 +255,11 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
             )
         self.assertRaisesRegex(
             ValueError,
+            r"cannot insert \('x', 'a', 'b'\), already exists",
+            lambda: psdf.insert(4, ("x", "a", "b"), 11),
+        )
+        self.assertRaisesRegex(
+            ValueError,
             '"column" must have length equal to number of column levels.',
             lambda: psdf.insert(4, ("e",), 11),
         )

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org