zhengruifeng commented on code in PR #43583:
URL: https://github.com/apache/spark/pull/43583#discussion_r1375995141


##########
python/pyspark/sql/connect/catalog.py:
##########
@@ -83,138 +85,125 @@ def setCurrentDatabase(self, dbName: str) -> None:
     setCurrentDatabase.__doc__ = PySparkCatalog.setCurrentDatabase.__doc__
 
     def listDatabases(self, pattern: Optional[str] = None) -> List[Database]:
-        pdf = self._execute_and_fetch(plan.ListDatabases(pattern=pattern))
+        table = self._execute_and_fetch(plan.ListDatabases(pattern=pattern))
         return [
             Database(
-                name=row.iloc[0],
-                catalog=row.iloc[1],
-                description=row.iloc[2],
-                locationUri=row.iloc[3],
+                name=table[0][i].as_py(),
+                catalog=table[1][i].as_py(),
+                description=table[2][i].as_py(),
+                locationUri=table[3][i].as_py(),
             )
-            for _, row in pdf.iterrows()
+            for i in range(table.num_rows)
         ]
 
     listDatabases.__doc__ = PySparkCatalog.listDatabases.__doc__
 
     def getDatabase(self, dbName: str) -> Database:
-        pdf = self._execute_and_fetch(plan.GetDatabase(db_name=dbName))
-        assert pdf is not None
-        row = pdf.iloc[0]
+        table = self._execute_and_fetch(plan.GetDatabase(db_name=dbName))
         return Database(
-            name=row[0],
-            catalog=row[1],
-            description=row[2],
-            locationUri=row[3],
+            name=table[0][0].as_py(),
+            catalog=table[1][0].as_py(),
+            description=table[2][0].as_py(),
+            locationUri=table[3][0].as_py(),
         )
 
     getDatabase.__doc__ = PySparkCatalog.getDatabase.__doc__
 
     def databaseExists(self, dbName: str) -> bool:
-        pdf = self._execute_and_fetch(plan.DatabaseExists(db_name=dbName))
-        assert pdf is not None
-        return pdf.iloc[0].iloc[0]
+        table = self._execute_and_fetch(plan.DatabaseExists(db_name=dbName))
+        return table[0][0].as_py()
 
     databaseExists.__doc__ = PySparkCatalog.databaseExists.__doc__
 
     def listTables(
         self, dbName: Optional[str] = None, pattern: Optional[str] = None
     ) -> List[Table]:
-        pdf = self._execute_and_fetch(plan.ListTables(db_name=dbName, pattern=pattern))
+        table = self._execute_and_fetch(plan.ListTables(db_name=dbName, pattern=pattern))
         return [
             Table(
-                name=row.iloc[0],
-                catalog=row.iloc[1],
-                # If None, returns None.
-                namespace=None if row.iloc[2] is None else list(row.iloc[2]),
-                description=row.iloc[3],
-                tableType=row.iloc[4],
-                isTemporary=row.iloc[5],
+                name=table[0][i].as_py(),
+                catalog=table[1][i].as_py(),
+                namespace=table[2][i].as_py(),
+                description=table[3][i].as_py(),
+                tableType=table[4][i].as_py(),
+                isTemporary=table[5][i].as_py(),
             )
-            for _, row in pdf.iterrows()
+            for i in range(table.num_rows)
         ]
 
     listTables.__doc__ = PySparkCatalog.listTables.__doc__
 
     def getTable(self, tableName: str) -> Table:
-        pdf = self._execute_and_fetch(plan.GetTable(table_name=tableName))
-        assert pdf is not None
-        row = pdf.iloc[0]
+        table = self._execute_and_fetch(plan.GetTable(table_name=tableName))
         return Table(
-            name=row.iloc[0],
-            catalog=row.iloc[1],
-            # If None, returns None.
-            namespace=None if row.iloc[2] is None else list(row.iloc[2]),
-            description=row.iloc[3],
-            tableType=row.iloc[4],
-            isTemporary=row.iloc[5],
+            name=table[0][0].as_py(),
+            catalog=table[1][0].as_py(),
+            namespace=table[2][0].as_py(),

Review Comment:
   `table[2][0]` returns a `pyarrow.StringScalar` value here.
   If the underlying value is null, `table[2][0]` is `<pyarrow.StringScalar: None>` and its `as_py()` returns `None`.
   So we no longer need the explicit `None` check for the optional field.
   
   ```
   In [11]: pdf = pd.DataFrame({"A": ["x", "y", "z", None]})
   
   In [12]: table = pa.Table.from_pandas(pdf)
   
   In [13]: table
   Out[13]:
   pyarrow.Table
   A: string
   ----
   A: [["x","y","z",null]]
   
   In [14]: table[0][3]
   Out[14]: <pyarrow.StringScalar: None>
   
   In [15]: table[0][3].as_py()
   
   In [16]: table[0][3].as_py() is None
   Out[16]: True
   ```
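
   The same applies to the list-typed `namespace` column that previously needed the explicit `None`/`list()` handling: a null entry yields a `ListScalar` whose `as_py()` is `None`, and a non-null entry converts to a plain Python list. A minimal sketch (the column values below are made up for illustration):

   ```
   import pyarrow as pa

   # A list<string> column shaped like `namespace`; the second entry is null.
   col = pa.array([["spark_catalog", "default"], None])

   col[0]          # <pyarrow.ListScalar: ['spark_catalog', 'default']>
   col[0].as_py()  # ['spark_catalog', 'default']
   col[1].as_py()  # None, so no explicit None check is needed
   ```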


