This is an automated email from the ASF dual-hosted git repository.

yangzhg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 4d64612  [ARRAY]Save array's size instead of offset. (#5983)
4d64612 is described below

commit 4d64612b96ac1093095c3f3a6310160826988d37
Author: Lijia Liu <[email protected]>
AuthorDate: Thu Jun 10 12:32:58 2021 +0800

    [ARRAY]Save array's size instead of offset. (#5983)
    
    * Save array's size instead of offset.
    
    * Optimize variable name
    
    * Fix comment
---
 be/src/olap/collection.h                           |   7 +-
 be/src/olap/column_vector.cpp                      |  64 ++++---
 be/src/olap/column_vector.h                        |  89 ++++++++--
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 162 +++++++++--------
 be/src/olap/rowset/segment_v2/column_reader.h      |  55 +++++-
 be/src/olap/rowset/segment_v2/column_writer.cpp    | 196 +++++++++++++++------
 be/src/olap/rowset/segment_v2/column_writer.h      |  22 ++-
 be/src/olap/rowset/segment_v2/parsed_page.h        |   4 +-
 be/src/olap/rowset/segment_v2/segment_writer.cpp   |  10 +-
 be/src/olap/rowset/segment_v2/segment_writer.h     |   3 +-
 be/src/olap/tablet_schema.cpp                      |   1 +
 be/src/olap/tablet_schema.h                        |   2 +
 be/src/olap/types.cpp                              |   5 +-
 be/test/olap/column_vector_test.cpp                | 101 ++++++-----
 .../segment_v2/column_reader_writer_test.cpp       |  28 +--
 gensrc/proto/segment_v2.proto                      |   6 +-
 16 files changed, 506 insertions(+), 249 deletions(-)

diff --git a/be/src/olap/collection.h b/be/src/olap/collection.h
index dbdc717..328869b 100644
--- a/be/src/olap/collection.h
+++ b/be/src/olap/collection.h
@@ -25,16 +25,17 @@ namespace doris {
 struct Collection {
     // child column data
     void* data;
-    uint32_t length;
+    uint64_t length;
     // item has no null value if has_null is false.
     // item ```may``` has null value if has_null is true.
+    // null_count is better?
     bool has_null;
     // null bitmap
     bool* null_signs;
 
     Collection() : data(nullptr), length(0), has_null(false), 
null_signs(nullptr) {}
 
-    explicit Collection(uint32_t length)
+    explicit Collection(uint64_t length)
             : data(nullptr), length(length), has_null(false), 
null_signs(nullptr) {}
 
     Collection(void* data, size_t length)
@@ -46,7 +47,7 @@ struct Collection {
     Collection(void* data, size_t length, bool has_null, bool* null_signs)
             : data(data), length(length), has_null(has_null), 
null_signs(null_signs) {}
 
-    bool is_null_at(uint32_t index) { return this->has_null && 
this->null_signs[index]; }
+    bool is_null_at(uint64_t index) { return this->has_null && 
this->null_signs[index]; }
 
     bool operator==(const Collection& y) const;
     bool operator!=(const Collection& value) const;
diff --git a/be/src/olap/column_vector.cpp b/be/src/olap/column_vector.cpp
index 15f158d..e21952f 100644
--- a/be/src/olap/column_vector.cpp
+++ b/be/src/olap/column_vector.cpp
@@ -127,8 +127,22 @@ Status ColumnVectorBatch::create(size_t init_capacity, 
bool is_nullable, const T
                 return Status::NotSupported(
                         "When create ArrayColumnVectorBatch, `Field` is 
indispensable");
             }
+
+            std::unique_ptr<ColumnVectorBatch> elements;
+            auto array_type_info = reinterpret_cast<const 
ArrayTypeInfo*>(type_info);
+            RETURN_IF_ERROR(ColumnVectorBatch::create(init_capacity * 2, 
field->get_sub_field(0)->is_nullable(),
+                    array_type_info->item_type_info(), 
field->get_sub_field(0), &elements));
+
+            std::unique_ptr<ColumnVectorBatch> offsets;
+            TypeInfo* bigint_type_info = 
get_scalar_type_info(FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT);
+            RETURN_IF_ERROR(ColumnVectorBatch::create(init_capacity + 1, false,
+                                      bigint_type_info, nullptr, &offsets));
+
             std::unique_ptr<ColumnVectorBatch> local(
-                    new ArrayColumnVectorBatch(type_info, is_nullable, 
init_capacity, field));
+                    new ArrayColumnVectorBatch(type_info,
+                            is_nullable,
+                            
reinterpret_cast<ScalarColumnVectorBatch<uint64_t>*>(offsets.release()),
+                            elements.release()));
             RETURN_IF_ERROR(local->resize(init_capacity));
             *column_vector_batch = std::move(local);
             return Status::OK();
@@ -158,13 +172,12 @@ Status ScalarColumnVectorBatch<ScalarType>::resize(size_t 
new_cap) {
 }
 
 ArrayColumnVectorBatch::ArrayColumnVectorBatch(const TypeInfo* type_info, bool 
is_nullable,
-                                               size_t init_capacity, Field* 
field)
-        : ColumnVectorBatch(type_info, is_nullable), _data(0), 
_item_offsets(1) {
-    auto array_type_info = reinterpret_cast<const ArrayTypeInfo*>(type_info);
-    _item_offsets[0] = 0;
-    ColumnVectorBatch::create(init_capacity * 2, 
field->get_sub_field(0)->is_nullable(),
-                              array_type_info->item_type_info(), 
field->get_sub_field(0),
-                              &_elements);
+                                               
ScalarColumnVectorBatch<uint64_t>* offsets,
+                                               ColumnVectorBatch* elements)
+        : ColumnVectorBatch(type_info, is_nullable), _data(0) {
+    _offsets.reset(offsets);
+    *(_offsets->scalar_cell_ptr(0)) = 0;
+    _elements.reset(elements);
 }
 
 ArrayColumnVectorBatch::~ArrayColumnVectorBatch() = default;
@@ -173,31 +186,32 @@ Status ArrayColumnVectorBatch::resize(size_t new_cap) {
     if (capacity() < new_cap) {
         RETURN_IF_ERROR(ColumnVectorBatch::resize(new_cap));
         _data.resize(new_cap);
-        _item_offsets.resize(new_cap + 1);
+        _offsets->resize(new_cap + 1);
     }
     return Status::OK();
 }
 
-void ArrayColumnVectorBatch::put_item_ordinal(segment_v2::ordinal_t* ordinals, 
size_t start_idx,
-                                              size_t size) {
-    size_t first_offset = _item_offsets[start_idx];
-    segment_v2::ordinal_t first_ordinal = ordinals[0];
-    size_t i = 0;
-    while (++i < size) {
-        _item_offsets[start_idx + i] = first_offset + (ordinals[i] - 
first_ordinal);
+void ArrayColumnVectorBatch::get_offset_by_length(size_t start_idx, size_t 
size) {
+    DCHECK(start_idx >= 0);
+    DCHECK(start_idx + size < _offsets->capacity());
+
+    for (size_t i = start_idx; i < start_idx + size; ++i) {
+        *(_offsets->scalar_cell_ptr(i + 1)) =
+                *(_offsets->scalar_cell_ptr(i)) + 
*(_offsets->scalar_cell_ptr(i + 1));
     }
 }
 
-void ArrayColumnVectorBatch::prepare_for_read(size_t start_idx, size_t end_idx,
-                                              bool item_has_null) {
-    for (size_t idx = start_idx; idx < end_idx; ++idx) {
-        if (!is_null_at(idx)) {
-            _data[idx] = Collection(
-                    _elements->mutable_cell_ptr(_item_offsets[idx]),
-                    _item_offsets[idx + 1] - _item_offsets[idx], item_has_null,
+void ArrayColumnVectorBatch::prepare_for_read(size_t start_idx, size_t size, 
bool item_has_null) {
+    DCHECK(start_idx + size <= capacity());
+    for (size_t i = 0; i < size; ++i) {
+        if (!is_null_at(start_idx + i)) {
+            _data[start_idx + i] = Collection(
+                    
_elements->mutable_cell_ptr(*(_offsets->scalar_cell_ptr(start_idx + i))),
+                    *(_offsets->scalar_cell_ptr(start_idx + i + 1)) - 
*(_offsets->scalar_cell_ptr(start_idx + i)),
+                    item_has_null,
                     _elements->is_nullable()
-                            ? 
const_cast<bool*>(&_elements->null_signs()[_item_offsets[idx]])
-                            : nullptr);
+                    ? 
const_cast<bool*>(&_elements->null_signs()[*(_offsets->scalar_cell_ptr(start_idx
 + i))])
+                    : nullptr);
         }
     }
 }
diff --git a/be/src/olap/column_vector.h b/be/src/olap/column_vector.h
index ed4a878..67a79fb 100644
--- a/be/src/olap/column_vector.h
+++ b/be/src/olap/column_vector.h
@@ -155,19 +155,54 @@ public:
         return reinterpret_cast<uint8_t*>(&_data[idx]);
     }
 
+    ScalarCppType* scalar_cell_ptr(size_t idx) {
+        return &_data[idx];
+    }
+
 private:
     DataBuffer<ScalarCppType> _data;
 };
 
+// util class for read array's null signs.
+class ArrayNullColumnVectorBatch : public ColumnVectorBatch {
+public:
+    explicit ArrayNullColumnVectorBatch(ColumnVectorBatch* array) :
+    
ColumnVectorBatch(get_scalar_type_info(FieldType::OLAP_FIELD_TYPE_TINYINT), 
false), _array(array) {}
+
+    ~ArrayNullColumnVectorBatch() override = default;
+
+    Status resize(size_t new_cap) override {
+        return Status::NotSupported("unsupported for resize 
ArrayNullColumnVectorBatch");
+    }
+
+    uint8_t* data() const override {
+        return const_cast<uint8_t*>(reinterpret_cast<const 
uint8_t*>(_array->null_signs()));
+    }
+
+    const uint8_t* cell_ptr(size_t idx) const override {
+        return reinterpret_cast<const uint8_t*>(_array->null_signs() + idx);
+    }
+
+    uint8_t* mutable_cell_ptr(size_t idx) override {
+        return const_cast<uint8_t*>(reinterpret_cast<const 
uint8_t*>(_array->null_signs() + idx));
+    }
+
+private:
+    ColumnVectorBatch* _array;
+};
+
 class ArrayColumnVectorBatch : public ColumnVectorBatch {
 public:
     explicit ArrayColumnVectorBatch(const TypeInfo* type_info, bool 
is_nullable,
-                                    size_t init_capacity, Field* field);
+                                    ScalarColumnVectorBatch<uint64_t>* offsets,
+                                    ColumnVectorBatch* elements);
     ~ArrayColumnVectorBatch() override;
     Status resize(size_t new_cap) override;
 
     ColumnVectorBatch* elements() const { return _elements.get(); }
 
+    ColumnVectorBatch* offsets() const { return _offsets.get(); }
+
     // Get the start of the data.
     uint8_t* data() const override {
         return reinterpret_cast<uint8*>(const_cast<Collection*>(_data.data()));
@@ -181,14 +216,48 @@ public:
     // Get thr idx's cell_ptr for write
     uint8_t* mutable_cell_ptr(size_t idx) override { return 
reinterpret_cast<uint8*>(&_data[idx]); }
 
-    size_t item_offset(size_t idx) const { return _item_offsets[idx]; }
+    size_t item_offset(size_t idx) const {
+        return *(_offsets->scalar_cell_ptr(idx));
+    }
 
-    // From `start_idx`, put `size` ordinals to _item_offsets
-    // Ex:
-    // original _item_offsets: 0 3 5 9; ordinals to be added: 100 105 111; 
size: 3; satart_idx: 3
-    // --> _item_offsets: 0 3 5 (9 + 100 - 100) (9 + 105 - 100) (9 + 111 - 100)
-    // _item_offsets becomes 0 3 5 9 14 20
-    void put_item_ordinal(segment_v2::ordinal_t* ordinals, size_t start_idx, 
size_t size);
+    /**
+     * Change array size to offset in this batch
+     *
+     * The caller must ensure that _offsets[start_idx] is the sum of the lengths of 
the arrays from 0 to start_idx - 1
+     * and that the lengths of the arrays from start_idx to start_idx + size - 
1 have been written correctly
+     * to _offsets[start_idx + 1 ... start_idx + size] before calling this method
+     *
+     * Ex:
+     * get_offset_by_length(2, 3)
+     *
+     * before exec:
+     *
+     * _offsets: [ 0      3      5      2      1      3 ]
+     *
+     * 1)
+     *
+     * _offsets: [ 0      3      5     (7)     1      3 ]
+     *
+     * 2)
+     *
+     * _offsets: [ 0      3      5      7     (8)     3 ]
+     *
+     * 3)
+     *
+     * _offsets: [ 0      3      5      7      8     (11) ]
+     *
+     * @param start_idx the starting position of the first array that we want 
to change
+     * @param size the number of array that we want to change
+     */
+    void get_offset_by_length(size_t start_idx, size_t size);
+
+    size_t get_item_size(size_t start_idx, size_t size) {
+        return *(_offsets->scalar_cell_ptr(start_idx + size)) - 
*(_offsets->scalar_cell_ptr(start_idx));
+    }
+
+    ArrayNullColumnVectorBatch get_null_as_batch() {
+        return ArrayNullColumnVectorBatch(this);
+    }
 
     // Generate collection slots.
     void prepare_for_read(size_t start_idx, size_t end_idx, bool 
item_has_null);
@@ -198,8 +267,8 @@ private:
 
     std::unique_ptr<ColumnVectorBatch> _elements;
 
-    // Stores each collection's start offsets in _elements.
-    DataBuffer<size_t> _item_offsets;
+    // Stores each array's start offsets in _elements.
+    std::unique_ptr<ScalarColumnVectorBatch<uint64_t>> _offsets;
 };
 
 template class ScalarColumnVectorBatch<bool>;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 496a391..37ead54 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -49,18 +49,37 @@ Status ColumnReader::create(const ColumnReaderOptions& 
opts, const ColumnMetaPB&
         auto type = (FieldType)meta.type();
         switch (type) {
         case FieldType::OLAP_FIELD_TYPE_ARRAY: {
+            DCHECK(meta.children_columns_size() == 2 || 
meta.children_columns_size() == 3);
+
             std::unique_ptr<ColumnReader> item_reader;
-            DCHECK(meta.children_columns_size() == 1);
             RETURN_IF_ERROR(ColumnReader::create(opts, 
meta.children_columns(0),
                                                  
meta.children_columns(0).num_rows(), file_name,
                                                  &item_reader));
             RETURN_IF_ERROR(item_reader->init());
 
+            std::unique_ptr<ColumnReader> offset_reader;
+            RETURN_IF_ERROR(ColumnReader::create(opts, 
meta.children_columns(1),
+                                                 
meta.children_columns(1).num_rows(), file_name,
+                                                 &offset_reader));
+            RETURN_IF_ERROR(offset_reader->init());
+
+            std::unique_ptr<ColumnReader> null_reader;
+            if (meta.is_nullable()) {
+                RETURN_IF_ERROR(ColumnReader::create(opts, 
meta.children_columns(2),
+                                                     
meta.children_columns(2).num_rows(), file_name,
+                                                     &null_reader));
+                RETURN_IF_ERROR(null_reader->init());
+            }
+
             std::unique_ptr<ColumnReader> array_reader(
                     new ColumnReader(opts, meta, num_rows, file_name));
-            RETURN_IF_ERROR(array_reader->init());
-            array_reader->_sub_readers.resize(1);
+            // the array reader itself does not need to be initialized
+            array_reader->_sub_readers.resize(meta.children_columns_size());
             array_reader->_sub_readers[0] = std::move(item_reader);
+            array_reader->_sub_readers[1] = std::move(offset_reader);
+            if (meta.is_nullable()) {
+                array_reader->_sub_readers[2] = std::move(null_reader);
+            }
             *reader = std::move(array_reader);
             return Status::OK();
         }
@@ -330,8 +349,15 @@ Status ColumnReader::new_iterator(ColumnIterator** 
iterator) {
         case FieldType::OLAP_FIELD_TYPE_ARRAY: {
             ColumnIterator* item_iterator;
             RETURN_IF_ERROR(_sub_readers[0]->new_iterator(&item_iterator));
-            FileColumnIterator* offset_iterator = new FileColumnIterator(this);
-            *iterator = new ArrayFileColumnIterator(offset_iterator, 
item_iterator);
+
+            ColumnIterator* offset_iterator;
+            RETURN_IF_ERROR(_sub_readers[1]->new_iterator(&offset_iterator));
+
+            ColumnIterator* null_iterator = nullptr;
+            if (is_nullable()) {
+                RETURN_IF_ERROR(_sub_readers[2]->new_iterator(&null_iterator));
+            }
+            *iterator = new ArrayFileColumnIterator(this, 
reinterpret_cast<FileColumnIterator*>(offset_iterator), item_iterator, 
null_iterator);
             return Status::OK();
         }
         default:
@@ -343,93 +369,82 @@ Status ColumnReader::new_iterator(ColumnIterator** 
iterator) {
 
 
////////////////////////////////////////////////////////////////////////////////
 
-ArrayFileColumnIterator::ArrayFileColumnIterator(FileColumnIterator* 
offset_reader,
-                                                 ColumnIterator* 
item_iterator) {
-    _offset_iterator.reset(offset_reader);
+ArrayFileColumnIterator::ArrayFileColumnIterator(ColumnReader* reader,
+        FileColumnIterator* offset_reader,
+        ColumnIterator* item_iterator,
+        ColumnIterator* null_iterator) : _array_reader(reader) {
+    _length_iterator.reset(offset_reader);
     _item_iterator.reset(item_iterator);
+    if (_array_reader->is_nullable()) {
+        _null_iterator.reset(null_iterator);
+    }
 }
 
 Status ArrayFileColumnIterator::init(const ColumnIteratorOptions& opts) {
-    RETURN_IF_ERROR(_offset_iterator->init(opts));
+    RETURN_IF_ERROR(_length_iterator->init(opts));
     RETURN_IF_ERROR(_item_iterator->init(opts));
-    TypeInfo* bigint_type_info = 
get_scalar_type_info(FieldType::OLAP_FIELD_TYPE_BIGINT);
-    RETURN_IF_ERROR(ColumnVectorBatch::create(1024, 
_offset_iterator->is_nullable(),
-                                              bigint_type_info, nullptr, 
&_offset_batch));
+    if (_array_reader->is_nullable()) {
+        RETURN_IF_ERROR(_null_iterator->init(opts));
+    }
+    TypeInfo* bigint_type_info = 
get_scalar_type_info(FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT);
+    RETURN_IF_ERROR(ColumnVectorBatch::create(1024, false, bigint_type_info, 
nullptr, &_length_batch));
     return Status::OK();
 }
 
-// every invoke this method, _offset_batch will be cover, so this method is 
not thread safe.
 Status ArrayFileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, 
bool* has_null) {
-    // 1. read n offsets into  _offset_batch;
-    _offset_batch->resize(*n + 1);
-    ColumnBlock ordinal_block(_offset_batch.get(), nullptr);
-    ColumnBlockView ordinal_view(&ordinal_block);
-    RETURN_IF_ERROR(_offset_iterator->next_batch(n, &ordinal_view, has_null));
+    ColumnBlock* array_block = dst->column_block();
+    auto* array_batch = 
dynamic_cast<ArrayColumnVectorBatch*>(array_block->vector_batch());
+
+    // 1. read n offsets
+    ColumnBlock offset_block(array_batch->offsets(), nullptr);
+    ColumnBlockView offset_view(&offset_block, dst->current_offset() + 1); // 
the offset position should be one ahead of the collection's cursor
+    bool offset_has_null = false;
+    RETURN_IF_ERROR(_length_iterator->next_batch(n, &offset_view, 
&offset_has_null));
+    DCHECK(!offset_has_null);
 
     if (*n == 0) {
         return Status::OK();
     }
+    array_batch->get_offset_by_length(dst->current_offset(), *n);
 
-    // 2. Because we should read n + 1 offsets, so read one more here.
-    PageDecoder* offset_page_decoder = 
_offset_iterator->get_current_page()->data_decoder;
-    if (offset_page_decoder->has_remaining()) { // not _page->has_remaining()
-        size_t i = 1;
-        offset_page_decoder->peek_next_batch(&i, &ordinal_view); // not null
-        DCHECK(i == 1);
+    // 2. read null
+    if (dst->is_nullable()) {
+        auto null_batch = array_batch->get_null_as_batch();
+        ColumnBlock null_block(&null_batch, nullptr);
+        ColumnBlockView null_view(&null_block, dst->current_offset());
+        size_t size = *n;
+        bool null_signs_has_null = false;
+        _null_iterator->next_batch(&size, &null_view, &null_signs_has_null);
+        DCHECK(!null_signs_has_null);
+        *has_null = true; // just set has_null to is_nullable
     } else {
-        *(reinterpret_cast<ordinal_t*>(ordinal_view.data())) =
-                _offset_iterator->get_current_page()->next_array_item_ordinal;
-    }
-    ordinal_view.set_null_bits(1, false);
-    ordinal_view.advance(1);
-
-    // 3. For nullable data,fill null ordinals from last to start: 0 N N 3 N 5 
-> 0 3 3 3 5 5
-    if (_offset_iterator->is_nullable()) {
-        size_t j = *n + 1;
-        while (--j > 0) { // j can not be less than 0
-            ColumnBlockCell cell = ordinal_block.cell(j - 1);
-            if (cell.is_null()) {
-                ordinal_t pre =
-                        
*(reinterpret_cast<ordinal_t*>(ordinal_block.cell(j).mutable_cell_ptr()));
-                *(reinterpret_cast<ordinal_t*>(cell.mutable_cell_ptr())) = pre;
-            }
-        }
+        *has_null = false;
     }
 
-    // 4. read child column's data and generate collections.
-    ColumnBlock* collection_block = dst->column_block();
-    auto* collection_batch =
-            
reinterpret_cast<ArrayColumnVectorBatch*>(collection_block->vector_batch());
-    size_t start_offset = dst->current_offset();
-    size_t end_offset = start_offset + *n;
-    auto* ordinals = reinterpret_cast<ordinal_t*>(ordinal_block.data());
-    collection_batch->put_item_ordinal(ordinals, start_offset, *n + 1);
-
-    size_t size_to_read = ordinals[*n] - ordinals[0];
-    bool item_has_null = false;
-    if (size_to_read > 0) {
-        _item_iterator->seek_to_ordinal(ordinals[0]);
-        ColumnVectorBatch* item_vector_batch = collection_batch->elements();
-        
RETURN_IF_ERROR(item_vector_batch->resize(collection_batch->item_offset(end_offset)));
+    // read item
+    size_t item_size = array_batch->get_item_size(dst->current_offset(), *n);
+    if (item_size > 0) {
+        bool item_has_null = false;
+        ColumnVectorBatch* item_vector_batch = array_batch->elements();
+
+        bool rebuild_array_from0 = false;
+        if (item_vector_batch->capacity() < 
array_batch->item_offset(dst->current_offset() + *n)) {
+            
item_vector_batch->resize(array_batch->item_offset(dst->current_offset() + *n));
+            rebuild_array_from0 = true;
+        }
+
         ColumnBlock item_block = ColumnBlock(item_vector_batch, dst->pool());
-        ColumnBlockView item_view =
-                ColumnBlockView(&item_block, 
collection_batch->item_offset(start_offset));
-        size_t real_read = size_to_read;
+        ColumnBlockView item_view = ColumnBlockView(&item_block, 
array_batch->item_offset(dst->current_offset()));
+        size_t real_read = item_size;
         RETURN_IF_ERROR(_item_iterator->next_batch(&real_read, &item_view, 
&item_has_null));
-        DCHECK(size_to_read == real_read);
-    }
+        DCHECK(item_size == real_read);
 
-    if (dst->is_nullable()) {
-        bool* collection_nulls =
-                
const_cast<bool*>(&collection_batch->null_signs()[dst->current_offset()]);
-        memcpy(collection_nulls, ordinal_block.vector_batch()->null_signs(), 
sizeof(bool) * *n);
-        dst->advance(*n);
-    } else {
-        dst->set_null_bits(*n, false);
-        dst->advance(*n);
+        size_t rebuild_start_offset = rebuild_array_from0 ? 0 : 
dst->current_offset();
+        size_t rebuild_size = rebuild_array_from0 ? dst->current_offset() + *n 
: *n;
+        array_batch->prepare_for_read(rebuild_start_offset, rebuild_size, 
item_has_null);
     }
 
-    collection_batch->prepare_for_read(0, end_offset, item_has_null);
+    dst->advance(*n);
     return Status::OK();
 }
 
@@ -461,6 +476,13 @@ Status FileColumnIterator::seek_to_ordinal(ordinal_t ord) {
     return Status::OK();
 }
 
+Status FileColumnIterator::seek_to_page_start() {
+    if (_page == nullptr) {
+        return Status::NotSupported("Can not seek to page first when page is 
NULL");
+    }
+    return seek_to_ordinal(_page->first_ordinal);
+}
+
 void FileColumnIterator::_seek_to_pos_in_page(ParsedPage* page, ordinal_t 
offset_in_page) {
     if (page->offset_in_page == offset_in_page) {
         // fast path, do nothing
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h 
b/be/src/olap/rowset/segment_v2/column_reader.h
index fa4af6a..bb873e9 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -269,6 +269,8 @@ public:
 
     Status seek_to_ordinal(ordinal_t ord) override;
 
+    Status seek_to_page_start();
+
     Status next_batch(size_t* n, ColumnBlockView* dst, bool* has_null) 
override;
 
     ordinal_t get_current_ordinal() const override { return _current_ordinal; }
@@ -318,8 +320,10 @@ private:
 
 class ArrayFileColumnIterator final : public ColumnIterator {
 public:
-    explicit ArrayFileColumnIterator(FileColumnIterator* offset_iterator,
-                                     ColumnIterator* item_iterator);
+    explicit ArrayFileColumnIterator(ColumnReader* reader,
+                                     FileColumnIterator* length_reader,
+                                     ColumnIterator* item_iterator,
+                                     ColumnIterator* null_iterator);
 
     ~ArrayFileColumnIterator() override = default;
 
@@ -327,20 +331,55 @@ public:
 
     Status next_batch(size_t* n, ColumnBlockView* dst, bool* has_null) 
override;
 
-    Status seek_to_first() override { return 
_offset_iterator->seek_to_first(); };
+    Status seek_to_first() override {
+        RETURN_IF_ERROR(_length_iterator->seek_to_first());
+        RETURN_IF_ERROR(_item_iterator->seek_to_first()); // lazy???
+        if (_array_reader->is_nullable()) {
+            RETURN_IF_ERROR(_null_iterator->seek_to_first());
+        }
+        return Status::OK();
+    }
 
     Status seek_to_ordinal(ordinal_t ord) override {
-        return _offset_iterator->seek_to_ordinal(ord);
-    };
+        RETURN_IF_ERROR(_length_iterator->seek_to_ordinal(ord));
+        if (_array_reader->is_nullable()) {
+            RETURN_IF_ERROR(_null_iterator->seek_to_ordinal(ord));
+        }
+
+        RETURN_IF_ERROR(_length_iterator->seek_to_page_start());
+        if (_length_iterator->get_current_ordinal() == ord) {
+            
RETURN_IF_ERROR(_item_iterator->seek_to_ordinal(_length_iterator->get_current_page()->first_array_item_ordinal));
+        } else {
+            ordinal_t start_offset_in_this_page = 
_length_iterator->get_current_page()->first_array_item_ordinal;
+            ColumnBlock ordinal_block(_length_batch.get(), nullptr);
+            ordinal_t size_to_read = ord - start_offset_in_this_page;
+            bool has_null = false;
+            ordinal_t item_ordinal = start_offset_in_this_page;
+            while (size_to_read > 0) {
+                size_t this_read = _length_batch->capacity() < size_to_read ? 
_length_batch->capacity() : size_to_read;
+                ColumnBlockView ordinal_view(&ordinal_block);
+                RETURN_IF_ERROR(_length_iterator->next_batch(&this_read, 
&ordinal_view, &has_null));
+                auto* ordinals = 
reinterpret_cast<ordinal_t*>(_length_batch->data());
+                for (int i = 0; i < this_read; ++i) {
+                    item_ordinal += ordinals[i];
+                }
+                size_to_read -= this_read;
+            }
+            RETURN_IF_ERROR(_item_iterator->seek_to_ordinal(item_ordinal));
+        }
+        return Status::OK();
+    }
 
     ordinal_t get_current_ordinal() const override {
-        return _offset_iterator->get_current_ordinal();
+        return _length_iterator->get_current_ordinal();
     }
 
 private:
-    std::unique_ptr<FileColumnIterator> _offset_iterator;
+    ColumnReader* _array_reader;
+    std::unique_ptr<FileColumnIterator> _length_iterator;
+    std::unique_ptr<ColumnIterator> _null_iterator;
     std::unique_ptr<ColumnIterator> _item_iterator;
-    std::unique_ptr<ColumnVectorBatch> _offset_batch;
+    std::unique_ptr<ColumnVectorBatch> _length_batch;
 };
 
 // This iterator is used to read default value column
diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp 
b/be/src/olap/rowset/segment_v2/column_writer.cpp
index 3e9654d..e2c6189 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/column_writer.cpp
@@ -91,6 +91,7 @@ Status ColumnWriter::create(const ColumnWriterOptions& opts, 
const TabletColumn*
             DCHECK(column->get_subtype_count() == 1);
             const TabletColumn& item_column = column->get_sub_column(0);
 
+            // create item writer
             ColumnWriterOptions item_options;
             item_options.meta = opts.meta->mutable_children_columns(0);
             item_options.need_zone_map = false;
@@ -104,20 +105,65 @@ Status ColumnWriter::create(const ColumnWriterOptions& 
opts, const TabletColumn*
                     return Status::NotSupported("Do not support bitmap index 
for array type");
                 }
             }
-
             std::unique_ptr<ColumnWriter> item_writer;
             RETURN_IF_ERROR(
                     ColumnWriter::create(item_options, &item_column, _wblock, 
&item_writer));
 
+            // create length writer
+            FieldType length_type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT;
+
+            ColumnWriterOptions length_options;
+            length_options.meta = opts.meta->add_children_columns();
+            length_options.meta->set_column_id(2);
+            length_options.meta->set_unique_id(2);
+            length_options.meta->set_type(length_type);
+            length_options.meta->set_is_nullable(false);
+            
length_options.meta->set_length(get_scalar_type_info(length_type)->size());
+            length_options.meta->set_encoding(DEFAULT_ENCODING);
+            length_options.meta->set_compression(LZ4F);
+
+            length_options.need_zone_map = false;
+            length_options.need_bloom_filter = false;
+            length_options.need_bitmap_index = false;
+
+            TabletColumn length_column = 
TabletColumn(OLAP_FIELD_AGGREGATION_NONE, length_type, 
length_options.meta->is_nullable(),
+                                                      
length_options.meta->unique_id(), length_options.meta->length());
+            length_column.set_name("length");
+            length_column.set_index_length(-1); // no short key index
             std::unique_ptr<Field> bigint_field(
-                    
FieldFactory::create_by_type(FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT));
-
-            ScalarColumnWriter* offset_writer =
-                    new ScalarColumnWriter(opts, std::move(bigint_field), 
_wblock);
+                    FieldFactory::create(length_column));
+            auto* length_writer = new ScalarColumnWriter(length_options, 
std::move(bigint_field), _wblock);
+
+            // if nullable, create null writer
+            ScalarColumnWriter* null_writer = nullptr;
+            if (opts.meta->is_nullable()) {
+                FieldType null_type = FieldType::OLAP_FIELD_TYPE_TINYINT;
+                ColumnWriterOptions null_options;
+                null_options.meta = opts.meta->add_children_columns();
+                null_options.meta->set_column_id(3);
+                null_options.meta->set_unique_id(3);
+                null_options.meta->set_type(null_type);
+                null_options.meta->set_is_nullable(false);
+                
null_options.meta->set_length(get_scalar_type_info(null_type)->size());
+                null_options.meta->set_encoding(DEFAULT_ENCODING);
+                null_options.meta->set_compression(LZ4F);
+
+                null_options.need_zone_map = false;
+                null_options.need_bloom_filter = false;
+                null_options.need_bitmap_index = false;
+
+                TabletColumn null_column = 
TabletColumn(OLAP_FIELD_AGGREGATION_NONE, null_type, 
length_options.meta->is_nullable(),
+                                                          
null_options.meta->unique_id(), null_options.meta->length());
+                null_column.set_name("nullable");
+                null_column.set_index_length(-1); // no short key index
+                std::unique_ptr<Field> null_field(
+                        FieldFactory::create(null_column));
+                null_writer = new ScalarColumnWriter(null_options, 
std::move(null_field), _wblock);
+            }
 
             std::unique_ptr<ColumnWriter> writer_local =
                     std::unique_ptr<ColumnWriter>(new ArrayColumnWriter(
-                            opts, std::move(field), offset_writer, 
std::move(item_writer)));
+                            opts, std::move(field), length_writer, 
null_writer, std::move(item_writer)));
             *writer = std::move(writer_local);
             return Status::OK();
         }
@@ -144,10 +190,6 @@ Status ColumnWriter::append_nullable(const uint8_t* 
is_null_bits, const void* da
     return Status::OK();
 }
 
-Status ColumnWriter::append_not_nulls(const void* data, size_t num_rows) {
-    return append_data((const uint8_t**)&data, num_rows);
-}
-
 
///////////////////////////////////////////////////////////////////////////////////
 
 ScalarColumnWriter::ScalarColumnWriter(const ColumnWriterOptions& opts,
@@ -240,26 +282,10 @@ Status ScalarColumnWriter::append_data(const uint8_t** 
ptr, size_t num_rows) {
     size_t remaining = num_rows;
     while (remaining > 0) {
         size_t num_written = remaining;
-        RETURN_IF_ERROR(_page_builder->add(*ptr, &num_written));
-        if (_opts.need_zone_map) {
-            _zone_map_index_builder->add_values(*ptr, num_written);
-        }
-        if (_opts.need_bitmap_index) {
-            _bitmap_index_builder->add_values(*ptr, num_written);
-        }
-        if (_opts.need_bloom_filter) {
-            _bloom_filter_index_builder->add_values(*ptr, num_written);
-        }
+        RETURN_IF_ERROR(append_data_in_current_page(ptr, &num_written));
 
         bool is_page_full = (num_written < remaining);
         remaining -= num_written;
-        _next_rowid += num_written;
-        *ptr += get_field()->size() * num_written;
-        // we must write null bits after write data, because we don't
-        // know how many rows can be written into current page
-        if (is_nullable()) {
-            _null_bitmap_builder->add_run(false, num_written);
-        }
 
         if (is_page_full) {
             RETURN_IF_ERROR(finish_current_page());
@@ -268,6 +294,28 @@ Status ScalarColumnWriter::append_data(const uint8_t** 
ptr, size_t num_rows) {
     return Status::OK();
 }
 
+Status ScalarColumnWriter::append_data_in_current_page(const uint8_t** ptr, 
size_t* num_written) {
+    RETURN_IF_ERROR(_page_builder->add(*ptr, num_written));
+    if (_opts.need_zone_map) {
+        _zone_map_index_builder->add_values(*ptr, *num_written);
+    }
+    if (_opts.need_bitmap_index) {
+        _bitmap_index_builder->add_values(*ptr, *num_written);
+    }
+    if (_opts.need_bloom_filter) {
+        _bloom_filter_index_builder->add_values(*ptr, *num_written);
+    }
+
+    _next_rowid += *num_written;
+    *ptr += get_field()->size() * (*num_written);
+    // we must write null bits after write data, because we don't
+    // know how many rows can be written into current page
+    if (is_nullable()) {
+        _null_bitmap_builder->add_run(false, *num_written);
+    }
+    return Status::OK();
+}
+
 uint64_t ScalarColumnWriter::estimate_buffer_size() {
     uint64_t size = _data_size;
     size += _page_builder->size();
@@ -417,22 +465,29 @@ Status ScalarColumnWriter::finish_current_page() {
 
////////////////////////////////////////////////////////////////////////////////
 
 ArrayColumnWriter::ArrayColumnWriter(const ColumnWriterOptions& opts, 
std::unique_ptr<Field> field,
-                                     ScalarColumnWriter* offset_writer,
+                                     ScalarColumnWriter* length_writer,
+                                     ScalarColumnWriter* null_writer,
                                      std::unique_ptr<ColumnWriter> item_writer)
         : ColumnWriter(std::move(field), opts.meta->is_nullable()),
           _item_writer(std::move(item_writer)) {
-    _offset_writer.reset(offset_writer);
+    _length_writer.reset(length_writer);
+    if (is_nullable()) {
+        _null_writer.reset(null_writer);
+    }
 }
 
 Status ArrayColumnWriter::init() {
-    RETURN_IF_ERROR(_offset_writer->init());
+    RETURN_IF_ERROR(_length_writer->init());
+    if (is_nullable()) {
+        RETURN_IF_ERROR(_null_writer->init());
+    }
     RETURN_IF_ERROR(_item_writer->init());
-    _offset_writer->register_flush_page_callback(this);
+    _length_writer->register_flush_page_callback(this);
     return Status::OK();
 }
 
 Status ArrayColumnWriter::put_extra_info_in_page(DataPageFooterPB* footer) {
-    footer->set_next_array_item_ordinal(_item_writer->get_next_rowid());
+    footer->set_first_array_item_ordinal(_current_length_page_first_ordinal);
     return Status::OK();
 }
 
@@ -440,59 +495,96 @@ Status 
ArrayColumnWriter::put_extra_info_in_page(DataPageFooterPB* footer) {
 Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) {
     size_t remaining = num_rows;
     const auto* col_cursor = reinterpret_cast<const Collection*>(*ptr);
+
     while (remaining > 0) {
         // TODO llj: bulk write
         size_t num_written = 1;
-        ordinal_t next_item_ordinal = _item_writer->get_next_rowid();
-        ordinal_t* next_item_ordinal_ptr = &next_item_ordinal;
-        RETURN_IF_ERROR(
-                _offset_writer->append_data((const 
uint8_t**)&next_item_ordinal_ptr, num_written));
-
-        // write child item.
-        if (_item_writer->is_nullable()) {
-            auto* item_data_ptr = col_cursor->data;
-            for (size_t i = 0; i < col_cursor->length; ++i) {
-                
RETURN_IF_ERROR(_item_writer->append(col_cursor->null_signs[i], item_data_ptr));
-                item_data_ptr = (uint8_t*)item_data_ptr + 
_item_writer->get_field()->size();
-            }
+        auto size_ptr = &(col_cursor->length);
+        RETURN_IF_ERROR(_length_writer->append_data_in_current_page((const 
uint8_t**)&size_ptr, &num_written));
+        if (num_written < 1) { // page is full, write first item offset and 
update current length page's start ordinal
+            RETURN_IF_ERROR(_length_writer->finish_current_page());
+            _current_length_page_first_ordinal += _lengh_sum_in_cur_page;
+            _lengh_sum_in_cur_page = 0;
         } else {
-            RETURN_IF_ERROR(_item_writer->append_not_nulls(col_cursor->data, 
col_cursor->length));
+            // write child item.
+            if (_item_writer->is_nullable()) {
+                auto* item_data_ptr = col_cursor->data;
+                for (size_t i = 0; i < col_cursor->length; ++i) {
+                    
RETURN_IF_ERROR(_item_writer->append(col_cursor->null_signs[i], item_data_ptr));
+                    item_data_ptr = (uint8_t*)item_data_ptr + 
_item_writer->get_field()->size();
+                }
+            } else {
+                RETURN_IF_ERROR(_item_writer->append_data((const 
uint8_t**)&(col_cursor->data), col_cursor->length));
+            }
+            _lengh_sum_in_cur_page += col_cursor->length;
         }
-
         remaining -= num_written;
         col_cursor += num_written;
     }
+    if (is_nullable()) {
+        return write_null_column(num_rows, false);
+    }
     return Status::OK();
 }
 
 uint64_t ArrayColumnWriter::estimate_buffer_size() {
-    return _offset_writer->estimate_buffer_size() + 
_item_writer->estimate_buffer_size();
+    return _length_writer->estimate_buffer_size() +
+    (is_nullable() ? _null_writer->estimate_buffer_size() : 0) +
+    _item_writer->estimate_buffer_size();
 }
 
 Status ArrayColumnWriter::finish() {
-    RETURN_IF_ERROR(_offset_writer->finish());
+    RETURN_IF_ERROR(_length_writer->finish());
+    if (is_nullable()) {
+        RETURN_IF_ERROR(_null_writer->finish());
+    }
     RETURN_IF_ERROR(_item_writer->finish());
     return Status::OK();
 }
 
 Status ArrayColumnWriter::write_data() {
-    RETURN_IF_ERROR(_offset_writer->write_data());
+    RETURN_IF_ERROR(_length_writer->write_data());
+    if (is_nullable()) {
+        RETURN_IF_ERROR(_null_writer->write_data());
+    }
     RETURN_IF_ERROR(_item_writer->write_data());
     return Status::OK();
 }
 
 Status ArrayColumnWriter::write_ordinal_index() {
-    RETURN_IF_ERROR(_offset_writer->write_ordinal_index());
+    RETURN_IF_ERROR(_length_writer->write_ordinal_index());
+    if (is_nullable()) {
+        RETURN_IF_ERROR(_null_writer->write_ordinal_index());
+    }
     RETURN_IF_ERROR(_item_writer->write_ordinal_index());
     return Status::OK();
 }
 
 Status ArrayColumnWriter::append_nulls(size_t num_rows) {
-    return _offset_writer->append_nulls(num_rows);
+    size_t num_lengths = num_rows;
+    const ordinal_t zero = 0;
+    while(num_lengths > 0) {
+        // TODO llj bulk write
+        const auto* zero_ptr = reinterpret_cast<const uint8_t *>(&zero);
+        RETURN_IF_ERROR(_length_writer->append_data(&zero_ptr, 1));
+        --num_lengths;
+    }
+    return write_null_column(num_rows, true);
+}
+
+Status ArrayColumnWriter::write_null_column(size_t num_rows, bool is_null) {
+    uint8_t null_sign = is_null ? 1 : 0;
+    while(num_rows > 0) {
+        // TODO llj bulk write
+        const uint8_t* null_sign_ptr = &null_sign;
+        RETURN_IF_ERROR(_null_writer->append_data(&null_sign_ptr, 1));
+        --num_rows;
+    }
+    return Status::OK();
 }
 
 Status ArrayColumnWriter::finish_current_page() {
-    return _offset_writer->finish_current_page();
+    return Status::NotSupported("array writer has no data, can not 
finish_current_page");
 }
 
 } // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/column_writer.h 
b/be/src/olap/rowset/segment_v2/column_writer.h
index 022245f..ba904cc 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.h
+++ b/be/src/olap/rowset/segment_v2/column_writer.h
@@ -89,7 +89,8 @@ public:
             BitmapChange(&nullmap, 0, cell.is_null());
             return append_nullable(&nullmap, cell.cell_ptr(), 1);
         } else {
-            return append_not_nulls(cell.cell_ptr(), 1);
+            auto* cel_ptr = cell.cell_ptr();
+            return append_data((const uint8_t**)&cel_ptr, 1);
         }
     }
 
@@ -103,8 +104,6 @@ public:
 
     Status append_nullable(const uint8_t* nullmap, const void* data, size_t 
num_rows);
 
-    Status append_not_nulls(const void* data, size_t num_rows);
-
     virtual Status append_nulls(size_t num_rows) = 0;
 
     virtual Status finish_current_page() = 0;
@@ -130,6 +129,9 @@ public:
     // used for append not null data.
     virtual Status append_data(const uint8_t** ptr, size_t num_rows) = 0;
 
+    // Used to append non-null data to the current page only. When the page is 
full, fewer than num_rows rows are appended.
+    virtual Status append_data_in_current_page(const uint8_t** ptr, size_t* 
num_rows) = 0;
+
     bool is_nullable() const { return _is_nullable; }
 
     Field* get_field() const { return _field.get(); }
@@ -180,6 +182,8 @@ public:
     }
     Status append_data(const uint8_t** ptr, size_t num_rows) override;
 
+    Status append_data_in_current_page(const uint8_t** ptr, size_t* num_rows) 
override;
+
 private:
     std::unique_ptr<PageBuilder> _page_builder;
 
@@ -250,12 +254,16 @@ class ArrayColumnWriter final : public ColumnWriter, 
public FlushPageCallback {
 public:
     explicit ArrayColumnWriter(const ColumnWriterOptions& opts, 
std::unique_ptr<Field> field,
                                ScalarColumnWriter* offset_writer,
+                               ScalarColumnWriter* null_writer,
                                std::unique_ptr<ColumnWriter> item_writer);
     ~ArrayColumnWriter() override = default;
 
     Status init() override;
 
     Status append_data(const uint8_t** ptr, size_t num_rows) override;
+    Status append_data_in_current_page(const uint8_t** ptr, size_t* num_rows) 
override {
+        return Status::NotSupported("array writer has no data, can not 
append_data_in_current_page");
+    }
 
     uint64_t estimate_buffer_size() override;
 
@@ -272,14 +280,18 @@ public:
 
     Status write_bloom_filter_index() override { return Status::OK(); }
 
-    ordinal_t get_next_rowid() const override { return 
_offset_writer->get_next_rowid(); }
+    ordinal_t get_next_rowid() const override { return 
_length_writer->get_next_rowid(); }
 
 private:
     Status put_extra_info_in_page(DataPageFooterPB* header) override;
+    inline Status write_null_column(size_t num_rows, bool is_null); // 
write num_rows null flags
 
 private:
-    std::unique_ptr<ScalarColumnWriter> _offset_writer;
+    std::unique_ptr<ScalarColumnWriter> _length_writer;
+    std::unique_ptr<ScalarColumnWriter> _null_writer;
     std::unique_ptr<ColumnWriter> _item_writer;
+    ordinal_t _current_length_page_first_ordinal = 0;
+    ordinal_t _lengh_sum_in_cur_page = 0;
 };
 
 } // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h 
b/be/src/olap/rowset/segment_v2/parsed_page.h
index 0f64a87..cf49b89 100644
--- a/be/src/olap/rowset/segment_v2/parsed_page.h
+++ b/be/src/olap/rowset/segment_v2/parsed_page.h
@@ -61,7 +61,7 @@ struct ParsedPage {
         page->page_pointer = page_pointer;
         page->page_index = page_index;
 
-        page->next_array_item_ordinal = footer.next_array_item_ordinal();
+        page->first_array_item_ordinal = footer.first_array_item_ordinal();
 
         *result = std::move(page);
         return Status::OK();
@@ -81,7 +81,7 @@ struct ParsedPage {
     // number of rows including nulls and not-nulls
     ordinal_t num_rows = 0;
     // just for array type
-    ordinal_t next_array_item_ordinal = 0;
+    ordinal_t first_array_item_ordinal = 0;
 
     PagePointer page_pointer;
     uint32_t page_index = 0;
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index be7099d..6ac3538 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -47,7 +47,7 @@ SegmentWriter::~SegmentWriter() {
     _mem_tracker->Release(_mem_tracker->consumption());
 };
 
-void SegmentWriter::_init_column_meta(ColumnMetaPB* meta, uint32_t* column_id,
+void SegmentWriter::init_column_meta(ColumnMetaPB* meta, uint32_t* column_id,
                                       const TabletColumn& column) {
     // TODO(zc): Do we need this column_id??
     meta->set_column_id((*column_id)++);
@@ -59,7 +59,7 @@ void SegmentWriter::_init_column_meta(ColumnMetaPB* meta, 
uint32_t* column_id,
     meta->set_is_nullable(column.is_nullable());
     if (column.get_subtype_count() > 0) {
         for (uint32_t i = 0; i < column.get_subtype_count(); ++i) {
-            _init_column_meta(meta->add_children_columns(), column_id, 
column.get_sub_column(i));
+            init_column_meta(meta->add_children_columns(), column_id, 
column.get_sub_column(i));
         }
     }
 }
@@ -71,17 +71,15 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec 
__attribute__((unused))
         ColumnWriterOptions opts;
         opts.meta = _footer.add_columns();
 
-        _init_column_meta(opts.meta, &column_id, column);
+        init_column_meta(opts.meta, &column_id, column);
 
         // now we create zone map for key columns in AGG_KEYS or all column in 
UNIQUE_KEYS or DUP_KEYS
         // and not support zone map for array type.
         opts.need_zone_map = column.is_key() || _tablet_schema->keys_type() != 
KeysType::AGG_KEYS;
-        if (column.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) {
-            opts.need_zone_map = false;
-        }
         opts.need_bloom_filter = column.is_bf_column();
         opts.need_bitmap_index = column.has_bitmap_index();
         if (column.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) {
+            opts.need_zone_map = false;
             if (opts.need_bloom_filter) {
                 return Status::NotSupported("Do not support bloom filter for 
array type");
             }
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h 
b/be/src/olap/rowset/segment_v2/segment_writer.h
index a075ca2..d060099 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.h
+++ b/be/src/olap/rowset/segment_v2/segment_writer.h
@@ -67,6 +67,8 @@ public:
 
     Status finalize(uint64_t* segment_file_size, uint64_t* index_size);
 
+    static void init_column_meta(ColumnMetaPB* meta, uint32_t* column_id, 
const TabletColumn& column);
+
 private:
     DISALLOW_COPY_AND_ASSIGN(SegmentWriter);
     Status _write_data();
@@ -77,7 +79,6 @@ private:
     Status _write_short_key_index();
     Status _write_footer();
     Status _write_raw_data(const std::vector<Slice>& slices);
-    void _init_column_meta(ColumnMetaPB* meta, uint32_t* column_id, const 
TabletColumn& column);
 
 private:
     uint32_t _segment_id;
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index 3ffa513..67279f9 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -275,6 +275,7 @@ TabletColumn::TabletColumn(FieldAggregationMethod agg, 
FieldType filed_type, boo
     _unique_id = unique_id;
     _length = length;
 }
+
 void TabletColumn::init_from_pb(const ColumnPB& column) {
     _unique_id = column.unique_id();
     _col_name = column.name();
diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h
index 61a1991..2a5053f 100644
--- a/be/src/olap/tablet_schema.h
+++ b/be/src/olap/tablet_schema.h
@@ -39,6 +39,7 @@ public:
 
     inline int32_t unique_id() const { return _unique_id; }
     inline std::string name() const { return _col_name; }
+    inline void set_name(std::string col_name) { _col_name = col_name; }
     inline FieldType type() const { return _type; }
     inline bool is_key() const { return _is_key; }
     inline bool is_nullable() const { return _is_nullable; }
@@ -51,6 +52,7 @@ public:
     std::string referenced_column() const { return _referenced_column; }
     size_t length() const { return _length; }
     size_t index_length() const { return _index_length; }
+    inline void set_index_length(size_t index_length) { _index_length = 
index_length; }
     FieldAggregationMethod aggregation() const { return _aggregation; }
     int precision() const { return _precision; }
     int frac() const { return _frac; }
diff --git a/be/src/olap/types.cpp b/be/src/olap/types.cpp
index 030eb81..f52fe5e 100644
--- a/be/src/olap/types.cpp
+++ b/be/src/olap/types.cpp
@@ -152,8 +152,9 @@ TypeInfo* get_type_info(segment_v2::ColumnMetaPB* 
column_meta_pb) {
     } else {
         switch (type) {
         case OLAP_FIELD_TYPE_ARRAY: {
-            DCHECK(column_meta_pb->children_columns_size() == 1) << "more than 
1 child type.";
-            FieldType child_type = 
(FieldType)column_meta_pb->children_columns(0).type();
+            DCHECK(column_meta_pb->children_columns_size() >= 1 && 
column_meta_pb->children_columns_size() <=3)
+            << "more than 3 children or no children.";
+            auto child_type = 
(FieldType)column_meta_pb->children_columns(0).type();
             return 
ArrayTypeInfoResolver::instance()->get_type_info(child_type);
         }
         default:
diff --git a/be/test/olap/column_vector_test.cpp 
b/be/test/olap/column_vector_test.cpp
index 7523098..39e6b02 100644
--- a/be/test/olap/column_vector_test.cpp
+++ b/be/test/olap/column_vector_test.cpp
@@ -72,52 +72,61 @@ void test_read_write_scalar_column_vector(const TypeInfo* 
type_info, const uint8
 }
 
 template <FieldType item_type>
-void test_read_write_list_column_vector(const ArrayTypeInfo* list_type_info,
-                                        segment_v2::ordinal_t* ordinals, // n 
+ 1
-                                        size_t list_size, const uint8_t* 
src_item_data,
-                                        Collection* result) {
-    DCHECK(list_size > 1);
+void test_read_write_array_column_vector(const ArrayTypeInfo* array_type_info, 
size_t array_size, Collection* result) {
+    DCHECK(array_size > 1);
 
     using ItemType = typename TypeTraits<item_type>::CppType;
-    ItemType* src_item = (ItemType*)src_item_data;
     size_t ITEM_TYPE_SIZE = sizeof(ItemType);
 
-    TabletColumn list_column(OLAP_FIELD_AGGREGATION_NONE, 
OLAP_FIELD_TYPE_ARRAY);
+    TabletColumn array_column(OLAP_FIELD_AGGREGATION_NONE, 
OLAP_FIELD_TYPE_ARRAY);
     TabletColumn item_column(OLAP_FIELD_AGGREGATION_NONE, item_type, true, 0, 
0);
-    list_column.add_sub_column(item_column);
-    Field* field = FieldFactory::create(list_column);
+    array_column.add_sub_column(item_column);
+    Field* field = FieldFactory::create(array_column);
 
-    size_t list_init_size = list_size / 2;
+    size_t array_init_size = array_size / 2;
     std::unique_ptr<ColumnVectorBatch> cvb;
-    ASSERT_TRUE(ColumnVectorBatch::create(list_init_size, true, 
list_type_info, field, &cvb).ok());
+    ASSERT_TRUE(ColumnVectorBatch::create(array_init_size, true, 
array_type_info, field, &cvb).ok());
 
-    ArrayColumnVectorBatch* list_cvb = 
reinterpret_cast<ArrayColumnVectorBatch*>(cvb.get());
-    ColumnVectorBatch* item_cvb = list_cvb->elements();
+    auto* array_cvb = reinterpret_cast<ArrayColumnVectorBatch*>(cvb.get());
+    ColumnVectorBatch* item_cvb = array_cvb->elements();
+    ColumnVectorBatch* offset_cvb = array_cvb->offsets();
 
     // first write
-    list_cvb->put_item_ordinal(ordinals, 0, list_init_size + 1);
-    list_cvb->set_null_bits(0, list_init_size, false);
-    size_t first_write_item = ordinals[list_init_size] - ordinals[0];
+    for (size_t i = 0; i < array_init_size; ++i) {
+        memcpy(offset_cvb->mutable_cell_ptr(1 + i), &(result[i].length), 
sizeof(segment_v2::ordinal_t));
+    }
+    array_cvb->set_null_bits(0, array_init_size, false);
+    array_cvb->get_offset_by_length(0, array_init_size);
+
+    size_t first_write_item = array_cvb->item_offset(array_init_size) - 
array_cvb->item_offset(0);
     ASSERT_TRUE(item_cvb->resize(first_write_item).ok());
-    memcpy(item_cvb->mutable_cell_ptr(0), src_item, first_write_item * 
ITEM_TYPE_SIZE);
+    for (size_t i = 0; i < array_init_size; ++i) {
+        memcpy(item_cvb->mutable_cell_ptr(array_cvb->item_offset(i)), 
result[i].data, result[i].length * ITEM_TYPE_SIZE);
+    }
+
     item_cvb->set_null_bits(0, first_write_item, false);
-    list_cvb->prepare_for_read(0, list_init_size, false);
+    array_cvb->prepare_for_read(0, array_init_size, false);
 
     // second write
-    ASSERT_TRUE(list_cvb->resize(list_size).ok());
-    list_cvb->put_item_ordinal(ordinals + list_init_size, list_init_size,
-                               list_size - list_init_size + 1);
-    list_cvb->set_null_bits(list_init_size, list_size - list_init_size, false);
-    size_t item_size = ordinals[list_size] - ordinals[0];
-    ASSERT_TRUE(item_cvb->resize(item_size).ok());
-    size_t second_write_item = item_size - first_write_item;
-    memcpy(item_cvb->mutable_cell_ptr(first_write_item), src_item + 
first_write_item,
-           second_write_item * ITEM_TYPE_SIZE);
+    ASSERT_TRUE(array_cvb->resize(array_size).ok());
+    for (int i = array_init_size; i < array_size; ++i) {
+        memcpy(offset_cvb->mutable_cell_ptr(i + 1), &(result[i].length), 
sizeof(segment_v2::ordinal_t));
+    }
+    array_cvb->set_null_bits(array_init_size, array_size - array_init_size, 
false);
+    array_cvb->get_offset_by_length(array_init_size, array_size - 
array_init_size);
+
+    size_t total_item_size = array_cvb->item_offset(array_size);
+    ASSERT_TRUE(item_cvb->resize(total_item_size).ok());
+
+    for (size_t i = array_init_size; i < array_size; ++i) {
+        memcpy(item_cvb->mutable_cell_ptr(array_cvb->item_offset(i)), 
result[i].data, result[i].length * ITEM_TYPE_SIZE);
+    }
+    size_t second_write_item = total_item_size - first_write_item;
     item_cvb->set_null_bits(first_write_item, second_write_item, false);
-    list_cvb->prepare_for_read(0, list_size, false);
+    array_cvb->prepare_for_read(0, array_size, false);
 
-    for (size_t idx = 0; idx < list_size; ++idx) {
-        ASSERT_TRUE(list_type_info->equal(&result[idx], 
list_cvb->cell_ptr(idx))) << "idx:" << idx;
+    for (size_t idx = 0; idx < array_size; ++idx) {
+        ASSERT_TRUE(array_type_info->equal(&result[idx], 
array_cvb->cell_ptr(idx))) << "idx:" << idx;
     }
     delete field;
 }
@@ -125,7 +134,7 @@ void test_read_write_list_column_vector(const 
ArrayTypeInfo* list_type_info,
 TEST_F(ColumnVectorTest, scalar_column_vector_test) {
     {
         size_t size = 1024;
-        uint8_t* val = new uint8_t[size];
+        auto* val = new uint8_t[size];
         for (int i = 0; i < size; ++i) {
             val[i] = i;
         }
@@ -135,7 +144,7 @@ TEST_F(ColumnVectorTest, scalar_column_vector_test) {
     }
     {
         size_t size = 1024;
-        Slice* char_vals = new Slice[size];
+        auto* char_vals = new Slice[size];
         for (int i = 0; i < size; ++i) {
             set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, i, 
(char*)&char_vals[i], &_pool, 8);
         }
@@ -145,33 +154,29 @@ TEST_F(ColumnVectorTest, scalar_column_vector_test) {
     }
 }
 
-TEST_F(ColumnVectorTest, list_column_vector_test) {
-    size_t num_list = 1024;
-    size_t num_item = num_list * 3;
+TEST_F(ColumnVectorTest, array_column_vector_test) {
+    size_t num_array = 1024;
+    size_t num_item = num_array * 3;
     {
-        Collection* list_val = new Collection[num_list];
-        uint8_t* item_val = new uint8_t[num_item];
-        segment_v2::ordinal_t* ordinals = new segment_v2::ordinal_t[num_list + 
1];
+        auto* array_val = new Collection[num_array];
         bool null_signs[3] = {false, false, false};
+
+        auto* item_val = new uint8_t[num_item];
         memset(null_signs, 0, sizeof(bool) * 3);
         for (int i = 0; i < num_item; ++i) {
             item_val[i] = i;
             if (i % 3 == 0) {
-                size_t list_index = i / 3;
-                list_val[list_index].data = &item_val[i];
-                list_val[list_index].null_signs = null_signs;
-                list_val[list_index].length = 3;
-                ordinals[list_index] = i;
+                size_t array_index = i / 3;
+                array_val[array_index].data = &item_val[i];
+                array_val[array_index].null_signs = null_signs;
+                array_val[array_index].length = 3;
             }
         }
-        ordinals[num_list] = num_item;
         auto type_info = reinterpret_cast<ArrayTypeInfo*>(
                 
ArrayTypeInfoResolver::instance()->get_type_info(OLAP_FIELD_TYPE_TINYINT));
-        test_read_write_list_column_vector<OLAP_FIELD_TYPE_TINYINT>(type_info, 
ordinals, num_list,
-                                                                    item_val, 
list_val);
+        
test_read_write_array_column_vector<OLAP_FIELD_TYPE_TINYINT>(type_info, 
num_array, array_val);
 
-        delete[] ordinals;
-        delete[] list_val;
+        delete[] array_val;
         delete[] item_val;
     }
 }
diff --git a/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp 
b/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp
index a49d21c..546b47c 100644
--- a/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp
@@ -369,36 +369,36 @@ void test_array_nullable_data(Collection* src_data, 
uint8_t* src_is_null, int nu
 }
 
 TEST_F(ColumnReaderWriterTest, test_array_type) {
-    size_t num_list = LOOP_LESS_OR_MORE(1024, 24 * 1024);
-    size_t num_item = num_list * 3;
+    size_t num_array = LOOP_LESS_OR_MORE(1024, 24 * 1024);
+    size_t num_item = num_array * 3;
 
-    uint8_t* array_is_null = new uint8_t[BitmapSize(num_list)];
-    Collection* array_val = new Collection[num_list];
+    uint8_t* array_is_null = new uint8_t[BitmapSize(num_array)];
+    Collection* array_val = new Collection[num_array];
     bool* item_is_null = new bool[num_item];
     uint8_t* item_val = new uint8_t[num_item];
     for (int i = 0; i < num_item; ++i) {
         item_val[i] = i;
         item_is_null[i] = (i % 4) == 0;
         if (i % 3 == 0) {
-            size_t list_index = i / 3;
-            bool is_null = (list_index % 4) == 1;
-            BitmapChange(array_is_null, list_index, is_null);
+            size_t array_index = i / 3;
+            bool is_null = (array_index % 4) == 1;
+            BitmapChange(array_is_null, array_index, is_null);
             if (is_null) {
                 continue;
             }
-            array_val[list_index].data = &item_val[i];
-            array_val[list_index].null_signs = &item_is_null[i];
-            array_val[list_index].length = 3;
+            array_val[array_index].data = &item_val[i];
+            array_val[array_index].null_signs = &item_is_null[i];
+            array_val[array_index].length = 3;
         }
     }
     test_array_nullable_data<OLAP_FIELD_TYPE_TINYINT, BIT_SHUFFLE, 
BIT_SHUFFLE>(
-            array_val, array_is_null, num_list, "null_array_bs");
+            array_val, array_is_null, num_array, "null_array_bs");
 
     delete[] array_val;
     delete[] item_val;
     delete[] item_is_null;
 
-    array_val = new Collection[num_list];
+    array_val = new Collection[num_array];
     Slice* varchar_vals = new Slice[3];
     item_is_null = new bool[3];
     for (int i = 0; i < 3; ++i) {
@@ -407,7 +407,7 @@ TEST_F(ColumnReaderWriterTest, test_array_type) {
             set_column_value_by_type(OLAP_FIELD_TYPE_VARCHAR, i, 
(char*)&varchar_vals[i], &_pool);
         }
     }
-    for (int i = 0; i < num_list; ++i) {
+    for (int i = 0; i < num_array; ++i) {
         bool is_null = (i % 4) == 1;
         BitmapChange(array_is_null, i, is_null);
         if (is_null) {
@@ -418,7 +418,7 @@ TEST_F(ColumnReaderWriterTest, test_array_type) {
         array_val[i].length = 3;
     }
     test_array_nullable_data<OLAP_FIELD_TYPE_VARCHAR, DICT_ENCODING, 
BIT_SHUFFLE>(
-            array_val, array_is_null, num_list, "null_array_chars");
+            array_val, array_is_null, num_array, "null_array_chars");
 
     delete[] array_val;
     delete[] varchar_vals;
diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto
index cc4f9a4..5696285 100644
--- a/gensrc/proto/segment_v2.proto
+++ b/gensrc/proto/segment_v2.proto
@@ -68,9 +68,9 @@ message DataPageFooterPB {
     optional uint64 num_values = 2;
     // required: size of nullmap, 0 if the page doesn't contain NULL
     optional uint32 nullmap_size = 3;
-    // only for array column, largest array item ordinal + 1,
-    // used to calculate the length of last array in this page
-    optional uint64 next_array_item_ordinal = 4;
+    // only for array column
+    // Save the first array's first item's ordinal.
+    optional uint64 first_array_item_ordinal = 4;
 }
 
 message IndexPageFooterPB {

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to