jt2594838 commented on code in PR #734:
URL: https://github.com/apache/tsfile/pull/734#discussion_r2881480654
##########
cpp/src/writer/tsfile_writer.cc:
##########
@@ -808,26 +874,38 @@ int TsFileWriter::write_table(Tablet& tablet) {
value_chunk_writers))) {
return ret;
}
+ // Row-by-row write so that when time page seals (e.g. by memory
+ // threshold), we can seal all value pages together (Java
+ // semantics).
for (int i = start_idx; i < end_idx; i++) {
+ int32_t time_pages_before = time_chunk_writer->num_of_pages();
if (RET_FAIL(time_chunk_writer->write(tablet.timestamps_[i])))
{
return ret;
}
- }
- uint32_t field_col_count = 0;
- for (uint32_t i = 0; i < tablet.get_column_count(); ++i) {
- if (tablet.column_categories_[i] ==
- common::ColumnCategory::FIELD) {
- ValueChunkWriter* value_chunk_writer =
- value_chunk_writers[field_col_count];
- if (IS_NULL(value_chunk_writer)) {
- continue;
+ uint32_t field_col_count = 0;
+ for (uint32_t col = 0; col < tablet.get_column_count(); ++col)
{
+ if (tablet.column_categories_[col] ==
+ common::ColumnCategory::FIELD) {
+ ValueChunkWriter* value_chunk_writer =
+ value_chunk_writers[field_col_count];
+ if (!IS_NULL(value_chunk_writer) &&
+ RET_FAIL(value_write_column(
+ value_chunk_writer, tablet, col, i, i + 1))) {
+ return ret;
+ }
+ field_col_count++;
}
-
- if (RET_FAIL(value_write_column(value_chunk_writer, tablet,
- i, start_idx, end_idx))) {
- return ret;
+ }
+ int32_t time_pages_after = time_chunk_writer->num_of_pages();
+ if (time_pages_after > time_pages_before) {
+ for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
+ if (!IS_NULL(value_chunk_writers[k]) &&
+ value_chunk_writers[k]->get_point_numer() > 0 &&
+ RET_FAIL(
+ value_chunk_writers[k]->seal_current_page())) {
+ return ret;
+ }
}
Review Comment:
Why is the number of value pages not checked.
##########
cpp/src/writer/tsfile_writer.cc:
##########
@@ -716,15 +764,33 @@ int TsFileWriter::write_tablet_aligned(const Tablet&
tablet) {
data_types))) {
return ret;
}
- time_write_column(time_chunk_writer, tablet);
- ASSERT(value_chunk_writers.size() == tablet.get_column_count());
- for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
- ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
- if (IS_NULL(value_chunk_writer)) {
- continue;
+ for (uint32_t row = 0; row < tablet.get_cur_row_size(); row++) {
+ int32_t time_pages_before = time_chunk_writer->num_of_pages();
+ std::vector<int32_t> value_pages_before(value_chunk_writers.size(), 0);
+ for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
+ ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
+ if (!IS_NULL(value_chunk_writer)) {
+ value_pages_before[c] = value_chunk_writer->num_of_pages();
+ }
+ }
Review Comment:
Changing from a column-based order to a row-based order may cause a
performance downgrade.
May add a configuration to optionally sacrifice the page size limit:
If strict_page_size=true, use the row-based insertion with checking the page
size each time.
Otherwise, insert each column wholly without the page size check, and
perform it at the very end.
##########
cpp/src/writer/tsfile_writer.cc:
##########
@@ -716,15 +764,33 @@ int TsFileWriter::write_tablet_aligned(const Tablet&
tablet) {
data_types))) {
return ret;
}
- time_write_column(time_chunk_writer, tablet);
- ASSERT(value_chunk_writers.size() == tablet.get_column_count());
- for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
- ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
- if (IS_NULL(value_chunk_writer)) {
- continue;
+ for (uint32_t row = 0; row < tablet.get_cur_row_size(); row++) {
+ int32_t time_pages_before = time_chunk_writer->num_of_pages();
+ std::vector<int32_t> value_pages_before(value_chunk_writers.size(), 0);
+ for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
+ ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
+ if (!IS_NULL(value_chunk_writer)) {
+ value_pages_before[c] = value_chunk_writer->num_of_pages();
+ }
+ }
Review Comment:
Or, when there is no string/blob/text column, the memory size of each column
can be directly computed with its length. Thus, we can divide the incoming
column into splits that will cause a page seal, and write them one by one in a
column-based order.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]