jt2594838 commented on code in PR #110:
URL: https://github.com/apache/tsfile/pull/110#discussion_r1639515547
##########
cpp/src/encoding/bitpack_encoder.h:
##########
@@ -58,9 +58,7 @@ class BitPackEncoder {
packer_ = nullptr;
}
- void destroy() { /* do nothing for BitPackEncoder */
- delete (packer_);
- }
+ void destroy() { /* do nothing for BitPackEncoder */ delete (packer_); }
Review Comment:
Why does it say "do nothing"?
##########
cpp/src/encoding/encoder.h:
##########
@@ -38,6 +38,14 @@ class Encoder {
virtual int encode(float value, common::ByteStream &out_stream) = 0;
virtual int encode(double value, common::ByteStream &out_stream) = 0;
virtual int flush(common::ByteStream &out_stream) = 0;
+
+ /**
+ * The maximal possible memory size occupied by current Encoder. This
+ * statistic value doesn't involve OutputStream.
+ *
+ * @return the maximal size of possible memory occupied by current encoder
+ */
+ virtual int get_max_byte_size() = 0;
Review Comment:
The implementation in BitPackEncoder does not return the occupied memory size,
because the in-memory data has not been encoded yet; it calculates the encoded size instead.
##########
cpp/src/common/global.cc:
##########
@@ -50,6 +50,8 @@ void init_config_value() {
g_config_value_.page_writer_max_memory_bytes_ = 128 * 1024; // 128 k
g_config_value_.max_degree_of_index_node_ = 256;
g_config_value_.tsfile_index_bloom_filter_error_percent_ = 0.05;
+ g_config_value_.record_count_for_next_mem_check_ = 100;
+ g_config_value_.chunk_group_size_threshold_ = 128 * 1024;
Review Comment:
It is not reasonable enough to set the size of a chunk group to 128 KB when
the max memory of a single page is also 128 KB. At least it should be one or
two magnitudes higher.
##########
cpp/src/writer/tsfile_writer.cc:
##########
@@ -393,15 +433,35 @@ int TsFileWriter::flush() {
so map itself is ordered by device name. */
std::map<std::string, MeasurementSchemaGroup *>::iterator device_iter;
for (device_iter = schemas_.begin(); device_iter != schemas_.end();
- device_iter++) { // cppcheck-suppress postfixOperator
+ device_iter++) {
+
+ if (check_chunk_group_empty(device_iter->second)) {
+ continue;
+ }
+
if (RET_FAIL(io_writer_->start_flush_chunk_group(device_iter->first)))
{
} else if (RET_FAIL(flush_chunk_group(device_iter->second))) {
} else if (RET_FAIL(io_writer_->end_flush_chunk_group())) {
}
Review Comment:
If the return value indicates an error, it should be returned immediately.
##########
cpp/src/encoding/bitpack_encoder.h:
##########
@@ -167,6 +165,24 @@ class BitPackEncoder {
bytes_buffer_.clear();
bitpacked_group_count_ = 0;
}
+
+ int get_max_byte_size() {
+ if (values_.empty()) {
+ return 0;
+ }
+ int maxBitWidth = get_int_max_bit_width(values_);
+ int totalValues = values_.size();
+ int fullGroups = totalValues / 8;
+ int remainingValues = totalValues % 8;
+ int bytesPerGroup = (maxBitWidth * 8 + 7) / 8;
+ int maxSize = 0;
+ maxSize += fullGroups * bytesPerGroup;
+ if (remainingValues > 0) {
+ maxSize += bytesPerGroup;
+ }
+ maxSize += fullGroups * (1 + 1) + (remainingValues > 0 ? (1 + 1) : 0);
Review Comment:
Please add some comments for this (1 + 1).
##########
cpp/src/writer/tsfile_writer.cc:
##########
@@ -242,6 +241,44 @@ int TsFileWriter::do_check_schema(const std::string
&device_name,
return ret;
}
+int64_t TsFileWriter::calculate_mem_size_for_all_group() {
+ int64_t mem_total_size = 0;
+ DeviceSchemaIter device_iter;
+ for (device_iter = schemas_.begin(); device_iter != schemas_.end();
+ device_iter++) {
+ MeasurementSchemaGroup *chunk_group = device_iter->second;
+ MeasurementSchemaMap &map = chunk_group->measurement_schema_map_;
+ for (MeasurementSchemaMapIter ms_iter = map.begin();
+ ms_iter != map.end(); ms_iter++) {
+ MeasurementSchema *m_schema = ms_iter->second;
+ ChunkWriter *&chunk_writer = m_schema->chunk_writer_;
+ mem_total_size += chunk_writer->estimate_max_series_mem_size();
+ }
Review Comment:
Is there a chance that chunk_writer is null?
##########
cpp/src/writer/tsfile_writer.cc:
##########
@@ -419,6 +479,8 @@ int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup
*chunk_group) {
chunk_writer->get_chunk_data()))) {
} else if (RET_FAIL(io_writer_->end_flush_chunk(
chunk_writer->get_chunk_statistic()))) {
+ } else {
+ chunk_writer->destroy();
}
Review Comment:
If the return value indicates an error, it should be returned immediately.
##########
cpp/src/writer/tsfile_writer.cc:
##########
@@ -220,8 +220,7 @@ int TsFileWriter::do_check_schema(const std::string
&device_name,
if (UNLIKELY(ms_iter == msm.end())) {
chunk_writers.push_back(NULL);
} else {
- // Here we may check data_type against ms_iter. But in Java
- // libtsfile, no check here.
+ // In Java we will check data_type. But in C++, no check here.
Review Comment:
Please give a reason for this.
##########
cpp/src/encoding/dictionary_encoder.h:
##########
@@ -95,6 +95,11 @@ class DictionaryEncoder {
void write_encoded_data(common::ByteStream &out) {
values_encoder_.encode_flush(out);
}
+
+ int get_max_byte_size()
+ {
+ return 4 + map_size_ + values_encoder_.get_max_byte_size();
Review Comment:
Please also add comments here.
I notice that the DictionaryEncoder uses a BitPackingEncoder as the inner
encoder. However, in the Java edition, the inner encoder is an RleEncoder (with
bit-packing). This may cause an incompatibility between the two editions.
##########
cpp/src/writer/tsfile_writer.cc:
##########
@@ -242,6 +241,44 @@ int TsFileWriter::do_check_schema(const std::string
&device_name,
return ret;
}
+int64_t TsFileWriter::calculate_mem_size_for_all_group() {
+ int64_t mem_total_size = 0;
+ DeviceSchemaIter device_iter;
+ for (device_iter = schemas_.begin(); device_iter != schemas_.end();
+ device_iter++) {
+ MeasurementSchemaGroup *chunk_group = device_iter->second;
+ MeasurementSchemaMap &map = chunk_group->measurement_schema_map_;
+ for (MeasurementSchemaMapIter ms_iter = map.begin();
+ ms_iter != map.end(); ms_iter++) {
+ MeasurementSchema *m_schema = ms_iter->second;
+ ChunkWriter *&chunk_writer = m_schema->chunk_writer_;
+ mem_total_size += chunk_writer->estimate_max_series_mem_size();
+ }
+ }
+ return mem_total_size;
+}
+
+/**
+ * check occupied memory size, if it exceeds the chunkGroupSize threshold,
flush
+ * them to given OutputStream.
+ *
+ * @return true - size of tsfile or metadata reaches the threshold. false -
+ * otherwise
+ */
+int TsFileWriter::check_memory_size_and_may_flush_chunks() {
Review Comment:
I do not think the return value is boolean.
##########
cpp/src/encoding/bitpack_encoder.h:
##########
@@ -167,6 +165,24 @@ class BitPackEncoder {
bytes_buffer_.clear();
bitpacked_group_count_ = 0;
}
+
+ int get_max_byte_size() {
+ if (values_.empty()) {
+ return 0;
+ }
+ int maxBitWidth = get_int_max_bit_width(values_);
Review Comment:
Optional: the max_bit_width can be calculated in a streaming manner so that
you will not have to calculate it every time.
##########
cpp/src/encoding/gorilla_encoder.h:
##########
@@ -120,6 +120,7 @@ class GorillaEncoder : public Encoder {
}
int get_one_item_max_size();
+ int get_max_byte_size();
Review Comment:
What is the difference between the two? Their implementations are identical.
##########
cpp/src/file/tsfile_io_writer.cc:
##########
@@ -169,7 +169,7 @@ int TsFileIOWriter::flush_chunk(ByteStream &chunk_data) {
// log_err("flush stream error, ret=%d", ret);
}
- chunk_data.destroy();
+ // chunk_data.destroy();
return ret;
Review Comment:
Remove it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]