Re: [PR] Optimization of I/O path of python interface [systemds]

via GitHub Tue, 30 Jul 2024 04:47:02 -0700


Baunsgaard commented on code in PR #2032:
URL: https://github.com/apache/systemds/pull/2032#discussion_r1696817654



##########
src/main/java/org/apache/sysds/runtime/util/Py4jConverterUtils.java:
##########
@@ -114,6 +120,82 @@ public static MatrixBlock convertPy4JArrayToMB(byte[] 
data, int rlen, int clen,
                return mb;
        }
 
+       public static Array<?> convert(byte[] data, int numElements, 
Types.ValueType valueType) {

Review Comment:
   We need to add some internal (Java-only) tests for this function.
   However, the implementation does look fine. Some minor improvements are
   possible: instead of casting the array types inside the inner loops,
   assign the cast arrays once outside the loops and use those references.



##########
src/main/python/tests/iotests/test_io_pandas_systemds.py:
##########
@@ -0,0 +1,89 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+
+import os
+import shutil
+import unittest
+import pandas as pd
+from systemds.context import SystemDSContext
+
+class TestPandasFromToSystemds(unittest.TestCase):
+
+    sds: SystemDSContext = None
+    temp_dir: str = "tests/iotests/temp_write_csv/"
+    n_cols = 3
+    n_rows = 5
+    df = pd.DataFrame(
+        {
+            'C1': [f"col1_string_{i}" for i in range(n_rows)],
+            'C2': [i for i in range(n_rows)],
+        }
+    )
+
+    @classmethod
+    def setUpClass(cls):
+        cls.sds = SystemDSContext()
+        if not os.path.exists(cls.temp_dir):
+            os.makedirs(cls.temp_dir)
+
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.sds.close()
+        shutil.rmtree(cls.temp_dir, ignore_errors=True)
+
+
+    def test_into_systemds(self):
+        # Transfer into SystemDS and write to CSV
+        frame = self.sds.from_pandas(self.df)
+        frame.write(self.temp_dir + "into_systemds.csv", format="csv", 
header=True).compute(verbose=True)
+
+        # Read the CSV file using pandas
+        result_df = pd.read_csv(self.temp_dir + "into_systemds.csv")
+
+        # Verify the data
+        print("result_df:")
+        print(result_df)
+        print("self.df:")
+        print(self.df)   
+        self.assertTrue(isinstance(result_df, pd.DataFrame))
+        self.assertTrue(self.df.equals(result_df))
+
+    def test_out_of_systemds(self):
+        # Create a CSV file to read into SystemDS
+        self.df.to_csv(self.temp_dir + "out_of_systemds.csv", header=False, 
index=False)
+
+        # Read the CSV file into SystemDS and then compute back to pandas
+        frame = self.sds.read(self.temp_dir + "out_of_systemds.csv", 
data_type="frame", format="csv")
+        result_df = frame.replace("xyz", "yzx").compute()
+
+        # Verify the data
+        print("result_df:")
+        result_df['C2'] = result_df['C2'].astype(int)
+        print(result_df)
+        print("self.df:")
+        print(self.df)   
+        self.assertTrue(isinstance(result_df, pd.DataFrame))
+        self.assertTrue(self.df.equals(result_df))
+
+if __name__ == "__main__":
+    unittest.main(exit=False)

Review Comment:
   Please add a newline at the end of the file.



##########
src/main/python/systemds/utils/converters.py:
##########
@@ -116,20 +121,46 @@ def pandas_to_frame_block(sds, pd_df: pd.DataFrame):
         jc_FrameBlock = jvm.org.apache.sysds.runtime.frame.data.FrameBlock
         j_valueTypeArray = java_gate.new_array(jc_ValueType, len(schema))
         j_colNameArray = java_gate.new_array(jc_String, len(col_names))
-        j_dataArray = java_gate.new_array(jc_String, rows, cols)
-        for i in range(len(schema)):
-            j_valueTypeArray[i] = schema[i]
-        for i in range(len(col_names)):
-            j_colNameArray[i] = str(col_names[i])
-        j = 0
-        for j, col_name in enumerate(col_names):
-            col_data = pd_df[col_name].fillna("").to_numpy(dtype=str)
-            for i in range(col_data.shape[0]):
-                if col_data[i]:
-                    j_dataArray[i][j] = col_data[i]
-        fb = jc_FrameBlock(j_valueTypeArray, j_colNameArray, j_dataArray)
-
-        return fb
+        
+        if rows > 4:

Review Comment:
   Add a comment explaining why we make this differentiation.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@systemds.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Re: [PR] Optimization of I/O path of python interface [systemds]

Reply via email to