This is an automated email from the ASF dual-hosted git repository. colinlee pushed a commit to branch support_dataframe_to_tsfile in repository https://gitbox.apache.org/repos/asf/tsfile.git
commit 9e48483996ee585a64802fcdc5e65cc8442507ae Author: ColinLee <[email protected]> AuthorDate: Wed Feb 11 00:25:38 2026 +0800 tmp code. --- cpp/src/common/constant/tsfile_constant.h | 4 ++-- cpp/src/common/global.cc | 2 +- cpp/src/cwrapper/tsfile_cwrapper.cc | 5 ----- cpp/src/utils/db_utils.h | 6 ++++-- .../tests/resources/table_with_time_column.tsfile | Bin 0 -> 644 bytes python/tests/test_dataframe.py | 24 ++++++++++----------- python/tests/test_load_tsfile_from_iotdb.py | 23 ++++++++++++++------ python/tests/test_to_tsfile.py | 12 +++++------ python/tests/test_write_and_read.py | 16 +++++++------- python/tsfile/constants.py | 18 ++++++++-------- python/tsfile/schema.py | 8 ++++--- 11 files changed, 63 insertions(+), 55 deletions(-) diff --git a/cpp/src/common/constant/tsfile_constant.h b/cpp/src/common/constant/tsfile_constant.h index d3f4dec1..096c645a 100644 --- a/cpp/src/common/constant/tsfile_constant.h +++ b/cpp/src/common/constant/tsfile_constant.h @@ -37,15 +37,15 @@ static const std::string BACK_QUOTE_STRING = "`"; static const std::string DOUBLE_BACK_QUOTE_STRING = "``"; static const unsigned char TIME_COLUMN_MASK = 0x80; +static const std::string TIME_COLUMN_NAME = "time"; static const unsigned char VALUE_COLUMN_MASK = 0x40; - -static const std::string TIME_COLUMN_ID = ""; static const int NO_STR_TO_READ = -1; static const std::regex IDENTIFIER_PATTERN("([a-zA-Z0-9_\\u2E80-\\u9FFF]+)"); static const std::regex NODE_NAME_PATTERN( "(\\*{0,2}[a-zA-Z0-9_\\u2E80-\\u9FFF]+\\*{0,2})"); static const int DEFAULT_SEGMENT_NUM_FOR_TABLE_NAME = 3; + } // namespace storage #endif diff --git a/cpp/src/common/global.cc b/cpp/src/common/global.cc index 37b8c1bb..fd1d0132 100644 --- a/cpp/src/common/global.cc +++ b/cpp/src/common/global.cc @@ -122,7 +122,7 @@ int init_common() { g_time_column_schema.data_type_ = INT64; g_time_column_schema.encoding_ = PLAIN; g_time_column_schema.compression_ = UNCOMPRESSED; - g_time_column_schema.column_name_ = std::string("time"); + g_time_column_schema.column_name_ = storage::TIME_COLUMN_NAME; return ret; } diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc index f384698b..539d5b96 100644 --- a/cpp/src/cwrapper/tsfile_cwrapper.cc +++ b/cpp/src/cwrapper/tsfile_cwrapper.cc @@ -116,11 +116,6 @@ TsFileWriter tsfile_writer_new(WriteFile file, TableSchema* schema, *err_code = common::E_INVALID_SCHEMA; return nullptr; } - // Ignore time column definition. - if (cur_schema.column_category == TIME) { - continue; - } - column_schemas.emplace_back( cur_schema.column_name, static_cast<common::TSDataType>(cur_schema.data_type), diff --git a/cpp/src/utils/db_utils.h b/cpp/src/utils/db_utils.h index 85d99b1a..5a1dea8d 100644 --- a/cpp/src/utils/db_utils.h +++ b/cpp/src/utils/db_utils.h @@ -37,6 +37,7 @@ namespace common { extern TSEncoding get_value_encoder(TSDataType data_type); extern CompressionType get_default_compressor(); +// TODO: remove this. typedef struct FileID { int64_t seq_; // timestamp when create int32_t version_; @@ -64,13 +65,14 @@ typedef struct FileID { #endif } FileID; +// TODO: remove this. typedef uint16_t NodeID; struct TsID { NodeID db_nid_; NodeID device_nid_; NodeID measurement_nid_; - TsID() : db_nid_(0), device_nid_(0), measurement_nid_(0){}; + TsID() : db_nid_(0), device_nid_(0), measurement_nid_(0) {}; TsID(NodeID db_nid, NodeID device_nid, NodeID measurement_nid) : db_nid_(db_nid), @@ -157,7 +159,7 @@ struct TsID { * This enumeration class defines the supported categories for columns within a * table schema, distinguishing between tag and field columns. */ -enum class ColumnCategory { TAG = 0, FIELD = 1 }; +enum class ColumnCategory { TAG = 0, FIELD = 1, ATTRIBUTE = 2, TIME = 3 }; /** * @brief Represents the schema information for a single column. diff --git a/python/tests/resources/table_with_time_column.tsfile b/python/tests/resources/table_with_time_column.tsfile new file mode 100644 index 00000000..66be782a Binary files /dev/null and b/python/tests/resources/table_with_time_column.tsfile differ diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 09d0001b..de49bc1c 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -22,7 +22,7 @@ import numpy as np import pandas as pd import pytest -from tsfile import ColumnSchema, TableSchema, TSDataType +from tsfile import ColumnSchema, TableSchema, TSDataType, TIME_COLUMN from tsfile import TsFileTableWriter, ColumnCategory from tsfile import to_dataframe from tsfile.exceptions import ColumnNotExistError, TypeMismatchError @@ -70,10 +70,10 @@ def test_write_dataframe_basic(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (100, 4) - assert df_read["time"].equals(df_sorted["time"]) + assert df_read[TIME_COLUMN].equals(df_sorted["time"]) assert df_read["device"].equals(df_sorted["device"]) assert df_read["value"].equals(df_sorted["value"]) assert df_read["value2"].equals(df_sorted["value2"]) @@ -99,12 +99,12 @@ def test_write_dataframe_with_index(): df.index = [i * 10 for i in range(50)] # Set index as timestamps writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = df.sort_index() df_sorted = convert_to_nullable_types(df_sorted.reset_index(drop=True)) time_series = pd.Series(df.sort_index().index.values, dtype='Int64') assert df_read.shape == (50, 3) - assert df_read["time"].equals(time_series) + assert df_read[TIME_COLUMN].equals(time_series) assert df_read["device"].equals(df_sorted["device"]) assert df_read["value"].equals(df_sorted["value"]) finally: @@ -130,10 +130,10 @@ def test_write_dataframe_case_insensitive(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('Time').reset_index(drop=True)) assert df_read.shape == (30, 3) - assert df_read["time"].equals(df_sorted["Time"]) + assert df_read[TIME_COLUMN].equals(df_sorted["Time"]) assert df_read["device"].equals(df_sorted["Device"]) assert df_read["value"].equals(df_sorted["VALUE"]) finally: @@ -218,7 +218,7 @@ def test_write_dataframe_all_datatypes(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (50, 11) assert df_read["bool_col"].equals(df_sorted["bool_col"]) @@ -257,10 +257,10 @@ def test_write_dataframe_schema_time_column(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (50, 3) - assert df_read["time"].equals(df_sorted["time"]) + assert df_read[TIME_COLUMN].equals(df_sorted[TIME_COLUMN]) assert df_read["device"].equals(df_sorted["device"]) assert df_read["value"].equals(df_sorted["value"]) finally: @@ -286,7 +286,7 @@ def test_write_dataframe_schema_time_and_dataframe_time(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types( df.sort_values('Time').rename(columns=str.lower).reset_index(drop=True) ) @@ -312,7 +312,7 @@ def test_write_dataframe_empty(): 'time': [], 'value': [] }) - with pytest.raises(ValueError) as err: + with pytest.raises(ValueError): writer.write_dataframe(df) finally: diff --git a/python/tests/test_load_tsfile_from_iotdb.py b/python/tests/test_load_tsfile_from_iotdb.py index d865dd35..8dcc0b1c 100644 --- a/python/tests/test_load_tsfile_from_iotdb.py +++ b/python/tests/test_load_tsfile_from_iotdb.py @@ -15,12 +15,13 @@ # specific language governing permissions and limitations # under the License. # - +import math import os import numpy as np import tsfile as ts +from tsfile import TIME_COLUMN def test_load_tsfile_from_iotdb(): @@ -31,8 +32,8 @@ def test_load_tsfile_from_iotdb(): ## -------- assert len(df) == 105, "row count mismatch" - assert df["time"].isna().sum() == 0 - assert int(df["time"].sum()) == 15960 + assert df[TIME_COLUMN].isna().sum() == 0 + assert int(df[TIME_COLUMN].sum()) == 15960 assert df["temperature"].isna().sum() == 5 assert df["status"].isna().sum() == 5 assert (df["status"] == True).sum() == 50 @@ -44,8 +45,8 @@ def test_load_tsfile_from_iotdb(): df = ts.to_dataframe(simple_tabl1_path) ## --------- assert len(df) == 60 - assert df["time"].isna().sum() == 0 - assert df["time"].sum() == ( + assert df[TIME_COLUMN].isna().sum() == 0 + assert df[TIME_COLUMN].sum() == ( (1760106020000 + 1760106049000) * 30 // 2 + (1760106080000 + 1760106109000) * 30 // 2 ) @@ -78,8 +79,8 @@ def test_load_tsfile_from_iotdb(): df = ts.to_dataframe(simple_tabl2_path) ## --------- assert len(df) == 40 - assert df["time"].isna().sum() == 0 - assert int(df["time"].sum()) == 70404242080000 + assert df[TIME_COLUMN].isna().sum() == 0 + assert int(df[TIME_COLUMN].sum()) == 70404242080000 assert df["s0"].isna().sum() == 0 assert df["s1"].isna().sum() == 0 @@ -109,3 +110,11 @@ def test_load_tsfile_from_iotdb(): assert df["s9"].isna().sum() == 5 ## --------- + table_with_time_column_path = os.path.join(dir_path, 'table_with_time_column.tsfile') + df = ts.to_dataframe(table_with_time_column_path) + + assert len(df) == 25 + assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) + assert math.isclose(df["humidity"].sum(), 2.5, rel_tol=1e-9) + assert (df["region_id"] == "loc").sum() == 25 + diff --git a/python/tests/test_to_tsfile.py b/python/tests/test_to_tsfile.py index c3a970e3..a35d5e89 100644 --- a/python/tests/test_to_tsfile.py +++ b/python/tests/test_to_tsfile.py @@ -22,7 +22,7 @@ import numpy as np import pandas as pd import pytest -from tsfile import to_dataframe, TsFileReader, ColumnCategory +from tsfile import to_dataframe, TsFileReader, ColumnCategory, TIME_COLUMN from tsfile.utils import dataframe_to_tsfile @@ -132,11 +132,11 @@ def test_dataframe_to_tsfile_custom_time_column(): dataframe_to_tsfile(df, tsfile_path, table_name="test_table", time_column="timestamp") df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('timestamp').reset_index(drop=True)) assert df_read.shape == (30, 3) - assert df_read["time"].equals(df_sorted["timestamp"]) + assert df_read[TIME_COLUMN].equals(df_sorted["timestamp"]) assert df_read["device"].equals(df_sorted["device"]) assert df_read["value"].equals(df_sorted["value"]) finally: @@ -181,7 +181,7 @@ def test_dataframe_to_tsfile_with_tag_columns(): dataframe_to_tsfile(df, tsfile_path, table_name="test_table", tag_column=["device", "location"]) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (20, 4) @@ -214,7 +214,7 @@ def test_dataframe_to_tsfile_tag_time_unsorted(): assert df_read.shape == (10, 3) assert df_read["device"].equals(df_expected["device"]) - assert df_read["time"].equals(df_expected["time"]) + assert df_read[TIME_COLUMN].equals(df_expected["time"]) assert df_read["value"].equals(df_expected["value"]) finally: if os.path.exists(tsfile_path): @@ -244,7 +244,7 @@ def test_dataframe_to_tsfile_all_datatypes(): dataframe_to_tsfile(df, tsfile_path, table_name="test_table") df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (50, 11) diff --git a/python/tests/test_write_and_read.py b/python/tests/test_write_and_read.py index 3cef99c4..57294a84 100644 --- a/python/tests/test_write_and_read.py +++ b/python/tests/test_write_and_read.py @@ -25,7 +25,7 @@ import pytest from pandas import Float64Dtype from pandas.core.dtypes.common import is_integer_dtype -from tsfile import ColumnSchema, TableSchema, TSEncoding +from tsfile import ColumnSchema, TableSchema, TSEncoding, TIME_COLUMN from tsfile import Compressor from tsfile import TSDataType from tsfile import Tablet, RowRecord, Field @@ -170,7 +170,7 @@ def test_tree_query_to_dataframe_variants(): assert df_all.shape[0] == total_rows for measurement in all_measurements: assert measurement in df_all.columns - assert "time" in df_all.columns + assert TIME_COLUMN in df_all.columns path_columns = sorted( [col for col in df_all.columns if col.startswith("col_")], key=lambda name: int(name.split("_")[1]), @@ -179,7 +179,7 @@ def test_tree_query_to_dataframe_variants(): for _, row in df_all.iterrows(): device = _extract_device(row, path_columns) - timestamp = int(row["time"]) + timestamp = int(row[TIME_COLUMN]) assert (device, timestamp) in expected_values expected_row = expected_values[(device, timestamp)] for measurement in all_measurements: @@ -201,7 +201,7 @@ def test_tree_query_to_dataframe_variants(): assert measurement not in df_subset.columns for _, row in df_subset.iterrows(): device = _extract_device(row, path_columns) - timestamp = int(row["time"]) + timestamp = int(row[TIME_COLUMN]) expected_row = expected_values[(device, timestamp)] for measurement in requested_columns: value = row.get(measurement) @@ -227,7 +227,7 @@ def test_tree_query_to_dataframe_variants(): iter_rows = 0 for batch in iterator: assert isinstance(batch, pd.DataFrame) - assert set(batch.columns).issuperset({"time", "level"}) + assert set(batch.columns).issuperset({TIME_COLUMN, "level"}) iter_rows += len(batch) assert iter_rows == 18 @@ -242,7 +242,7 @@ def test_tree_query_to_dataframe_variants(): iter_rows = 0 for batch in iterator: assert isinstance(batch, pd.DataFrame) - assert set(batch.columns).issuperset({"time", "level"}) + assert set(batch.columns).issuperset({TIME_COLUMN, "level"}) iter_rows += len(batch) assert iter_rows == 9 @@ -384,7 +384,7 @@ def test_table_writer_and_reader(): 0, 10) as result: cur_line = 0 while result.next(): - cur_time = result.get_value_by_name("time") + cur_time = result.get_value_by_name(TIME_COLUMN) assert result.get_value_by_name("device") == "device" + str(cur_time) assert result.is_null_by_name("device") == False assert result.is_null_by_name("value") == False @@ -545,7 +545,7 @@ def test_tsfile_to_df(): df1 = to_dataframe("table_write_to_df.tsfile") assert df1.shape == (4097, 4) assert df1["value2"].sum() == 100 * (1 + 4096) / 2 * 4096 - assert is_integer_dtype(df1["time"]) + assert is_integer_dtype(df1[TIME_COLUMN]) assert df1["value"].dtype == Float64Dtype() assert is_integer_dtype(df1["value2"]) df2 = to_dataframe("table_write_to_df.tsfile", column_names=["device", "value2"]) diff --git a/python/tsfile/constants.py b/python/tsfile/constants.py index 6f233e27..18da3aef 100644 --- a/python/tsfile/constants.py +++ b/python/tsfile/constants.py @@ -15,10 +15,12 @@ # specific language governing permissions and limitations # under the License. # -from datetime import datetime from enum import unique, IntEnum + import numpy as np +TIME_COLUMN = "time" + @unique class TSDataType(IntEnum): BOOLEAN = 0 @@ -103,7 +105,7 @@ class TSDataType(IntEnum): return cls.STRING except (ImportError, AttributeError): pass - + if hasattr(dtype, 'type'): dtype = dtype.type if dtype is np.bool_: @@ -118,12 +120,12 @@ class TSDataType(IntEnum): return cls.DOUBLE elif dtype is np.object_: return cls.STRING - + dtype_str = str(dtype) if 'stringdtype' in dtype_str.lower() or dtype_str.startswith('string'): return cls.STRING - + dtype_map = { 'bool': cls.BOOLEAN, 'boolean': cls.BOOLEAN, @@ -137,17 +139,17 @@ class TSDataType(IntEnum): 'object': cls.STRING, 'string': cls.STRING, } - + if dtype_str in dtype_map: return dtype_map[dtype_str] - + dtype_lower = dtype_str.lower() if dtype_lower in dtype_map: return dtype_map[dtype_lower] if 'object_' in dtype_lower or dtype_str == "<class 'numpy.object_'>": return cls.STRING - + if dtype_str.startswith('datetime64'): return cls.TIMESTAMP @@ -163,8 +165,6 @@ _TSDATATYPE_COMPATIBLE_SOURCES = { } - - @unique class TSEncoding(IntEnum): PLAIN = 0 diff --git a/python/tsfile/schema.py b/python/tsfile/schema.py index 91732eee..f0fa39b1 100644 --- a/python/tsfile/schema.py +++ b/python/tsfile/schema.py @@ -119,15 +119,17 @@ class TableSchema: self.table_name = table_name.lower() if len(columns) == 0: raise ValueError("Columns cannot be empty") - self.columns = columns - for column in self.columns: + self.columns = [] + for column in columns: if column.get_category() == ColumnCategory.TIME: if self.time_column is not None: raise ValueError( f"Table '{self.table_name}' cannot have multiple time columns: " - f"'{self.time_column.name}' and '{column.name}'" + f"'{self.time_column.get_column_name()}' and '{column.get_column_name()}'" ) self.time_column = column + else: + self.columns.append(column) def get_table_name(self): return self.table_name
