This is an automated email from the ASF dual-hosted git repository.

rong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 279b775b85 [CLIENT-PY] Optimize array concatenation performance in `todf()` (#9001)
279b775b85 is described below

commit 279b775b85943d518a1b13f08c817e7f06eda84e
Author: Wei Fu <[email protected]>
AuthorDate: Thu Mar 16 11:50:26 2023 +0800

    [CLIENT-PY] Optimize array concatenation performance in `todf()` (#9001)
    
    * [CLIENT-PY] When fetching data from the server in batches, each 
np.concatenate() call creates a new array and copies the entire array 
accumulated so far, so the total copying cost grows with every additional 
batch. To optimize performance, parse all batches into a list of arrays first 
and then execute np.concatenate() only once.
    
    * Update pom.xml
    
    * Update test_todf.py
    
    * Update IoTDBRpcDataSet.py
    
    * Update client-py/tests/test_todf.py
    
    * Update client-py/iotdb/utils/IoTDBRpcDataSet.py
    
    ---------
    
    Co-authored-by: wei.fu <[email protected]>
    Co-authored-by: Haonan <[email protected]>
---
 client-py/iotdb/utils/IoTDBRpcDataSet.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/client-py/iotdb/utils/IoTDBRpcDataSet.py 
b/client-py/iotdb/utils/IoTDBRpcDataSet.py
index e9dfa7bfe6..6870f57be7 100644
--- a/client-py/iotdb/utils/IoTDBRpcDataSet.py
+++ b/client-py/iotdb/utils/IoTDBRpcDataSet.py
@@ -162,7 +162,7 @@ class IoTDBRpcDataSet(object):
     def resultset_to_pandas(self):
         result = {}
         for column_name in self.__column_name_list:
-            result[column_name] = None
+            result[column_name] = []
         while self._has_next_result_set():
             time_array = np.frombuffer(
                 self.__query_data_set.time, 
np.dtype(np.longlong).newbyteorder(">")
@@ -173,12 +173,8 @@ class IoTDBRpcDataSet(object):
                 self.get_ignore_timestamp() is None
                 or self.get_ignore_timestamp() is False
             ):
-                if result[IoTDBRpcDataSet.TIMESTAMP_STR] is None:
-                    result[IoTDBRpcDataSet.TIMESTAMP_STR] = time_array
-                else:
-                    result[IoTDBRpcDataSet.TIMESTAMP_STR] = np.concatenate(
-                        (result[IoTDBRpcDataSet.TIMESTAMP_STR], time_array), 
axis=0
-                    )
+                result[IoTDBRpcDataSet.TIMESTAMP_STR].append(time_array)
+
             self.__query_data_set.time = []
             total_length = len(time_array)
 
@@ -266,18 +262,19 @@ class IoTDBRpcDataSet(object):
 
                     data_array = tmp_array
 
-                if result[column_name] is None:
-                    result[column_name] = data_array
-                else:
-                    if isinstance(result[column_name], pd.Series):
-                        result[column_name] = 
result[column_name].append(data_array)
-                    else:
-                        result[column_name] = np.concatenate(
-                            (result[column_name], data_array), axis=0
-                        )
+                result[column_name].append(data_array)
+
         for k, v in result.items():
-            if v is None:
+            if v is None or len(v) < 1 or v[0] is None:
                 result[k] = []
+            elif v[0].dtype == "Int32":
+                result[k] = pd.Series(np.concatenate(v, 
axis=0)).astype("Int32")
+            elif v[0].dtype == "Int64":
+                result[k] = pd.Series(np.concatenate(v, 
axis=0)).astype("Int64")
+            elif v[0].dtype == "boolean":
+                result[k] = pd.Series(np.concatenate(v, 
axis=0)).astype("boolean")
+            else:
+                result[k] = np.concatenate(v, axis=0)
 
         df = pd.DataFrame(result)
         return df

Reply via email to