Re: [PR] support python to_dataframe. [tsfile]

via GitHub Sun, 02 Nov 2025 04:58:26 -0800


Copilot commented on code in PR #624:
URL: https://github.com/apache/tsfile/pull/624#discussion_r2484802552



##########
python/tests/test_write_and_read.py:
##########
@@ -268,3 +272,35 @@ def test_tsfile_config():
         set_tsfile_config({"float_encoding_type_": TSEncoding.BITMAP})
     with pytest.raises(NotSupportedError):
         set_tsfile_config({"time_compress_type_": Compressor.PAA})
+
+
+def test_tsfile_to_df():
+    table = TableSchema("test_table",
+                        [ColumnSchema("device", TSDataType.STRING, 
ColumnCategory.TAG),
+                         ColumnSchema("value", TSDataType.DOUBLE, 
ColumnCategory.FIELD),
+                         ColumnSchema("value2", TSDataType.INT64, 
ColumnCategory.FIELD)])
+    try:
+        with TsFileTableWriter("table_write_to_df.tsfile", table) as writer:
+            tablet = Tablet(["device", "value", "value2"],
+                            [TSDataType.STRING, TSDataType.DOUBLE, 
TSDataType.INT64], 4097)
+            for i in range(4097):
+                tablet.add_timestamp(i, i)
+                tablet.add_value_by_name("device", i, "device" + str(i))
+                tablet.add_value_by_index(1, i, i * 100.0)
+                tablet.add_value_by_index(2, i, i * 100)
+            writer.write_table(tablet)
+        df1 = tsfile.to_dataframe("table_write_to_df.tsfile")
+        assert df1.shape == (4097, 4)
+        assert df1["value2"].sum() == 100 * (1 + 4096) / 2 * 4096
+        assert df1["time"].dtype == np.int64
+        assert df1["value"].dtype == np.float64
+        assert df1["value2"].dtype == np.int64
+        df2 = tsfile.to_dataframe("table_write_to_df.tsfile", 
column_names=["device", "value2"])
+        assert df2.shape == (4097, 3)
+        assert df1["value2"].equals(df2["value2"])
+        with pytest.raises(TableNotExistError):
+            df2 = tsfile.to_dataframe("table_write_to_df.tsfile", "test_tb")
+        with pytest.raises(ColumnNotExistError):
+            df3 = tsfile.to_dataframe("table_write_to_df.tsfile", 
"test_table", "device1")

Review Comment:
   The test passes a string `'device1'` for the `column_names` parameter, but 
the function signature expects `list[str]`. This should be `['device1']` (a 
list) to match the API design. The test will fail unless the implementation 
handles this type mismatch, which it doesn't.
   ```suggestion
               df3 = tsfile.to_dataframe("table_write_to_df.tsfile", 
"test_table", ["device1"])
   ```



##########
python/tsfile/utils.py:
##########
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import pandas as pd
+
+from tsfile.exceptions import *
+from tsfile.tsfile_reader import TsFileReaderPy
+
+
+def to_dataframe(file_path: str,
+                 table_name: str = None,
+                 column_names: list[str] = None,
+                 max_row_num: int = None) -> pd.DataFrame:
+    with TsFileReaderPy(file_path) as reader:
+        total_rows = 0
+        table_schema = reader.get_all_table_schemas()
+        if len(table_schema) == 0:
+            raise TableNotExistError("Not found any table in the TsFile.")
+        if table_name is None:
+            # get the first table name by default
+            table_name, columns = next(iter(table_schema.items()))
+        else:
+            if table_name not in table_schema:
+                raise TableNotExistError(table_name)
+            columns = table_schema[table_name]
+
+        column_names_in_file = columns.get_column_names()
+
+        if column_names is not None:
+            for column in column_names:
+                if column not in column_names_in_file:
+                    raise ColumnNotExistError(column)
+        else:
+            column_names = column_names_in_file
+
+        df_list: list[pd.DataFrame] = []
+
+        with reader.query_table(table_name, column_names) as result:
+            while result.next():
+                if max_row_num is not None:
+                    remaining_rows = max_row_num - total_rows
+                    if remaining_rows <= 0:
+                        break
+                    else:
+                        batch_rows = min(remaining_rows, 1024)
+                    df = result.read_data_frame(batch_rows)
+                    total_rows += batch_rows
+                else:
+                    df = result.read_data_frame()
+                df_list.append(df)

Review Comment:
   If no data is read (empty result set), `df_list` will be empty and 
`pd.concat()` will raise a ValueError. Consider adding a check: `if not 
df_list: return pd.DataFrame()` before the concat operation.
   ```suggestion
                   df_list.append(df)
           if not df_list:
               return pd.DataFrame(columns=column_names)
   ```



##########
python/tests/test_write_and_read.py:
##########
@@ -18,15 +18,18 @@
 
 import os
 
+import numpy as np
 import pytest
 
-from tsfile import ColumnSchema, TableSchema, TSEncoding, NotSupportedError
+import tsfile

Review Comment:
   Module 'tsfile' is imported with both 'import' and 'import from'.
   ```suggestion
   
   ```



##########
python/tsfile/utils.py:
##########
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import pandas as pd
+
+from tsfile.exceptions import *
+from tsfile.tsfile_reader import TsFileReaderPy
+
+
+def to_dataframe(file_path: str,
+                 table_name: str = None,
+                 column_names: list[str] = None,
+                 max_row_num: int = None) -> pd.DataFrame:
+    with TsFileReaderPy(file_path) as reader:
+        total_rows = 0
+        table_schema = reader.get_all_table_schemas()
+        if len(table_schema) == 0:
+            raise TableNotExistError("Not found any table in the TsFile.")
+        if table_name is None:
+            # get the first table name by default
+            table_name, columns = next(iter(table_schema.items()))
+        else:
+            if table_name not in table_schema:
+                raise TableNotExistError(table_name)
+            columns = table_schema[table_name]
+
+        column_names_in_file = columns.get_column_names()
+
+        if column_names is not None:
+            for column in column_names:
+                if column not in column_names_in_file:
+                    raise ColumnNotExistError(column)
+        else:
+            column_names = column_names_in_file
+
+        df_list: list[pd.DataFrame] = []
+
+        with reader.query_table(table_name, column_names) as result:
+            while result.next():
+                if max_row_num is not None:
+                    remaining_rows = max_row_num - total_rows
+                    if remaining_rows <= 0:
+                        break
+                    else:
+                        batch_rows = min(remaining_rows, 1024)

Review Comment:
   The magic number `1024` for batch size is hardcoded. Consider defining it as 
a named constant at the module level (e.g., `BATCH_SIZE = 1024`) to improve 
maintainability and make it easier to adjust.



##########
python/tests/test_write_and_read.py:
##########
@@ -268,3 +272,35 @@ def test_tsfile_config():
         set_tsfile_config({"float_encoding_type_": TSEncoding.BITMAP})
     with pytest.raises(NotSupportedError):
         set_tsfile_config({"time_compress_type_": Compressor.PAA})
+
+
+def test_tsfile_to_df():
+    table = TableSchema("test_table",
+                        [ColumnSchema("device", TSDataType.STRING, 
ColumnCategory.TAG),
+                         ColumnSchema("value", TSDataType.DOUBLE, 
ColumnCategory.FIELD),
+                         ColumnSchema("value2", TSDataType.INT64, 
ColumnCategory.FIELD)])
+    try:
+        with TsFileTableWriter("table_write_to_df.tsfile", table) as writer:
+            tablet = Tablet(["device", "value", "value2"],
+                            [TSDataType.STRING, TSDataType.DOUBLE, 
TSDataType.INT64], 4097)
+            for i in range(4097):
+                tablet.add_timestamp(i, i)
+                tablet.add_value_by_name("device", i, "device" + str(i))
+                tablet.add_value_by_index(1, i, i * 100.0)
+                tablet.add_value_by_index(2, i, i * 100)
+            writer.write_table(tablet)
+        df1 = tsfile.to_dataframe("table_write_to_df.tsfile")
+        assert df1.shape == (4097, 4)
+        assert df1["value2"].sum() == 100 * (1 + 4096) / 2 * 4096
+        assert df1["time"].dtype == np.int64
+        assert df1["value"].dtype == np.float64
+        assert df1["value2"].dtype == np.int64
+        df2 = tsfile.to_dataframe("table_write_to_df.tsfile", 
column_names=["device", "value2"])
+        assert df2.shape == (4097, 3)
+        assert df1["value2"].equals(df2["value2"])
+        with pytest.raises(TableNotExistError):
+            df2 = tsfile.to_dataframe("table_write_to_df.tsfile", "test_tb")
+        with pytest.raises(ColumnNotExistError):
+            df3 = tsfile.to_dataframe("table_write_to_df.tsfile", 
"test_table", "device1")

Review Comment:
   Variable df3 is not used.
   ```suggestion
               tsfile.to_dataframe("table_write_to_df.tsfile", "test_table", 
"device1")
   ```



##########
python/tsfile/utils.py:
##########
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import pandas as pd
+
+from tsfile.exceptions import *
+from tsfile.tsfile_reader import TsFileReaderPy
+
+
+def to_dataframe(file_path: str,
+                 table_name: str = None,
+                 column_names: list[str] = None,
+                 max_row_num: int = None) -> pd.DataFrame:
+    with TsFileReaderPy(file_path) as reader:
+        total_rows = 0
+        table_schema = reader.get_all_table_schemas()
+        if len(table_schema) == 0:
+            raise TableNotExistError("Not found any table in the TsFile.")
+        if table_name is None:
+            # get the first table name by default
+            table_name, columns = next(iter(table_schema.items()))
+        else:
+            if table_name not in table_schema:
+                raise TableNotExistError(table_name)
+            columns = table_schema[table_name]
+
+        column_names_in_file = columns.get_column_names()
+
+        if column_names is not None:
+            for column in column_names:
+                if column not in column_names_in_file:
+                    raise ColumnNotExistError(column)
+        else:
+            column_names = column_names_in_file
+
+        df_list: list[pd.DataFrame] = []
+
+        with reader.query_table(table_name, column_names) as result:
+            while result.next():
+                if max_row_num is not None:
+                    remaining_rows = max_row_num - total_rows
+                    if remaining_rows <= 0:
+                        break
+                    else:
+                        batch_rows = min(remaining_rows, 1024)
+                    df = result.read_data_frame(batch_rows)
+                    total_rows += batch_rows

Review Comment:
   The `total_rows` counter is incremented by `batch_rows` (the requested 
rows), but this assumes all requested rows were successfully read. If the 
result set has fewer rows than `batch_rows`, this will cause an inaccurate 
count and potentially allow more rows to be read than `max_row_num`. Consider 
using `total_rows += len(df)` instead.
   ```suggestion
                       total_rows += len(df)
   ```



##########
python/tests/test_write_and_read.py:
##########
@@ -268,3 +272,35 @@ def test_tsfile_config():
         set_tsfile_config({"float_encoding_type_": TSEncoding.BITMAP})
     with pytest.raises(NotSupportedError):
         set_tsfile_config({"time_compress_type_": Compressor.PAA})
+
+
+def test_tsfile_to_df():
+    table = TableSchema("test_table",
+                        [ColumnSchema("device", TSDataType.STRING, 
ColumnCategory.TAG),
+                         ColumnSchema("value", TSDataType.DOUBLE, 
ColumnCategory.FIELD),
+                         ColumnSchema("value2", TSDataType.INT64, 
ColumnCategory.FIELD)])
+    try:
+        with TsFileTableWriter("table_write_to_df.tsfile", table) as writer:
+            tablet = Tablet(["device", "value", "value2"],
+                            [TSDataType.STRING, TSDataType.DOUBLE, 
TSDataType.INT64], 4097)
+            for i in range(4097):
+                tablet.add_timestamp(i, i)
+                tablet.add_value_by_name("device", i, "device" + str(i))
+                tablet.add_value_by_index(1, i, i * 100.0)
+                tablet.add_value_by_index(2, i, i * 100)
+            writer.write_table(tablet)
+        df1 = tsfile.to_dataframe("table_write_to_df.tsfile")
+        assert df1.shape == (4097, 4)
+        assert df1["value2"].sum() == 100 * (1 + 4096) / 2 * 4096
+        assert df1["time"].dtype == np.int64
+        assert df1["value"].dtype == np.float64
+        assert df1["value2"].dtype == np.int64
+        df2 = tsfile.to_dataframe("table_write_to_df.tsfile", 
column_names=["device", "value2"])
+        assert df2.shape == (4097, 3)
+        assert df1["value2"].equals(df2["value2"])
+        with pytest.raises(TableNotExistError):
+            df2 = tsfile.to_dataframe("table_write_to_df.tsfile", "test_tb")

Review Comment:
   Variable df2 is not used.
   ```suggestion
               tsfile.to_dataframe("table_write_to_df.tsfile", "test_tb")
   ```



##########
python/tests/test_write_and_read.py:
##########
@@ -18,15 +18,18 @@
 
 import os
 
+import numpy as np
 import pytest
 
-from tsfile import ColumnSchema, TableSchema, TSEncoding, NotSupportedError
+import tsfile
+from tsfile import ColumnSchema, TableSchema, TSEncoding
+from tsfile import Compressor
 from tsfile import TSDataType
 from tsfile import Tablet, RowRecord, Field
 from tsfile import TimeseriesSchema
 from tsfile import TsFileTableWriter
 from tsfile import TsFileWriter, TsFileReader, ColumnCategory
-from tsfile import Compressor
+from tsfile.exceptions import *
 
 

Review Comment:
   Import pollutes the enclosing namespace, as the imported module 
[tsfile.exceptions](1) does not define '__all__'.
   ```suggestion
   
   ```



##########
python/tsfile/utils.py:
##########
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import pandas as pd
+
+from tsfile.exceptions import *

Review Comment:
   Import pollutes the enclosing namespace, as the imported module 
[tsfile.exceptions](1) does not define '__all__'.
   ```suggestion
   from tsfile.exceptions import TableNotExistError, ColumnNotExistError
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] support python to_dataframe. [tsfile]

Reply via email to