This is an automated email from the ASF dual-hosted git repository.
rong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git
The following commit(s) were added to refs/heads/master by this push:
new 279b775b85 [CLIENT-PY] Optimize array concatenation performance in
`todf()` (#9001)
279b775b85 is described below
commit 279b775b85943d518a1b13f08c817e7f06eda84e
Author: Wei Fu <[email protected]>
AuthorDate: Thu Mar 16 11:50:26 2023 +0800
[CLIENT-PY] Optimize array concatenation performance in `todf()` (#9001)
* [CLIENT-PY] When fetching data from the server in batches, each call to
np.concatenate() creates a new array and copies the already-accumulated array.
To optimize performance, parse all batches of arrays first and then execute
np.concatenate() only once.
* Update pom.xml
* Update test_todf.py
* Update IoTDBRpcDataSet.py
* Update client-py/tests/test_todf.py
* Update client-py/iotdb/utils/IoTDBRpcDataSet.py
---------
Co-authored-by: wei.fu <[email protected]>
Co-authored-by: Haonan <[email protected]>
---
client-py/iotdb/utils/IoTDBRpcDataSet.py | 31 ++++++++++++++-----------------
1 file changed, 14 insertions(+), 17 deletions(-)
diff --git a/client-py/iotdb/utils/IoTDBRpcDataSet.py
b/client-py/iotdb/utils/IoTDBRpcDataSet.py
index e9dfa7bfe6..6870f57be7 100644
--- a/client-py/iotdb/utils/IoTDBRpcDataSet.py
+++ b/client-py/iotdb/utils/IoTDBRpcDataSet.py
@@ -162,7 +162,7 @@ class IoTDBRpcDataSet(object):
def resultset_to_pandas(self):
result = {}
for column_name in self.__column_name_list:
- result[column_name] = None
+ result[column_name] = []
while self._has_next_result_set():
time_array = np.frombuffer(
self.__query_data_set.time,
np.dtype(np.longlong).newbyteorder(">")
@@ -173,12 +173,8 @@ class IoTDBRpcDataSet(object):
self.get_ignore_timestamp() is None
or self.get_ignore_timestamp() is False
):
- if result[IoTDBRpcDataSet.TIMESTAMP_STR] is None:
- result[IoTDBRpcDataSet.TIMESTAMP_STR] = time_array
- else:
- result[IoTDBRpcDataSet.TIMESTAMP_STR] = np.concatenate(
- (result[IoTDBRpcDataSet.TIMESTAMP_STR], time_array),
axis=0
- )
+ result[IoTDBRpcDataSet.TIMESTAMP_STR].append(time_array)
+
self.__query_data_set.time = []
total_length = len(time_array)
@@ -266,18 +262,19 @@ class IoTDBRpcDataSet(object):
data_array = tmp_array
- if result[column_name] is None:
- result[column_name] = data_array
- else:
- if isinstance(result[column_name], pd.Series):
- result[column_name] =
result[column_name].append(data_array)
- else:
- result[column_name] = np.concatenate(
- (result[column_name], data_array), axis=0
- )
+ result[column_name].append(data_array)
+
for k, v in result.items():
- if v is None:
+ if v is None or len(v) < 1 or v[0] is None:
result[k] = []
+ elif v[0].dtype == "Int32":
+ result[k] = pd.Series(np.concatenate(v,
axis=0)).astype("Int32")
+ elif v[0].dtype == "Int64":
+ result[k] = pd.Series(np.concatenate(v,
axis=0)).astype("Int64")
+ elif v[0].dtype == "boolean":
+ result[k] = pd.Series(np.concatenate(v,
axis=0)).astype("boolean")
+ else:
+ result[k] = np.concatenate(v, axis=0)
df = pd.DataFrame(result)
return df