github-actions[bot] commented on code in PR #63498:
URL: https://github.com/apache/doris/pull/63498#discussion_r3285124854
##########
be/src/core/column/predicate_column.h:
##########
@@ -293,6 +293,41 @@ class PredicateColumnType final : public
COWHelper<IColumn, PredicateColumnType<
}
}
+ // Insert `num` entries with only length information (no actual char data).
+ // The chars buffer is zero-filled so that filter_by_selector can safely
+ // memcpy without reading meaningful content. Used in OFFSET_ONLY reading
+ // mode where only string lengths (for length() function) are needed.
+ void insert_offsets_from_lengths(const uint32_t* lengths, size_t num)
override {
+ if constexpr (std::is_same_v<T, StringRef>) {
+ if (UNLIKELY(num == 0)) {
+ return;
+ }
+ size_t total_bytes = 0;
+ for (size_t i = 0; i < num; ++i) {
+ total_bytes += lengths[i];
+ }
+ // Allocate and zero-fill a single backing buffer so that each
StringRef
+ // points to valid (though meaningless) memory. filter_by_selector
will
+ // memcpy from these pointers, so they must not be null for
non-zero lengths.
+ char* buf = total_bytes > 0 ? _arena.alloc(total_bytes) : nullptr;
+ if (total_bytes > 0) {
+ memset(buf, 0, total_bytes);
+ }
+ size_t org_elem_num = data.size();
+ data.resize(org_elem_num + num);
+ size_t offset = 0;
+ for (size_t i = 0; i < num; ++i) {
+ // For zero-length strings, data pointer is null;
insert_many_strings
+ // and filter_by_selector both guard on size > 0 before
dereferencing.
+ data[org_elem_num + i].data = (lengths[i] > 0) ? (buf +
offset) : nullptr;
Review Comment:
This leaves zero-length rows with a null `StringRef::data`, but
`PredicateColumnType::filter_by_selector()` later forwards these refs to
`ColumnString::insert_many_strings_without_reserve()`. That code does pointer
arithmetic/comparison on `strings[i].data + len` before checking `length != 0`,
so an empty string read through this OFFSET_ONLY predicate path can invoke
undefined behavior when the predicate column is materialized by selector
filtering. Please keep zero-length refs pointing at valid arena memory as well,
for example by allocating at least one byte when `total_bytes == 0` and
assigning `buf + offset` for every row.
##########
regression-test/suites/query_p0/sql_functions/string_functions/test_length_dict_encoded.groovy:
##########
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression test for: length() / char_length() on dict-encoded
(low-cardinality)
+// varchar columns must not throw NOT_IMPLEMENTED_ERROR when the
"only_read_offsets"
+// optimisation is active (i.e. when the storage layer resolves dict codes to
string
+// lengths without materialising actual character data).
+suite("test_length_dict_encoded") {
+ sql "DROP TABLE IF EXISTS test_length_dict_varchar"
+ sql """
+ CREATE TABLE test_length_dict_varchar (
+ col_int_undef_signed int NULL,
+ col_int_undef_signed_not_null int NOT NULL,
+ col_date_undef_signed date NULL,
+ col_date_undef_signed_not_null date NOT NULL,
+ col_varchar_5__undef_signed varchar(5) NULL,
+ col_varchar_5__undef_signed_not_null varchar(5) NOT NULL,
+ pk int NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(col_int_undef_signed)
+ PARTITION BY RANGE(col_int_undef_signed) (
+ PARTITION p0 VALUES [('-2147483648'), ('4')),
+ PARTITION p1 VALUES [('4'), ('6')),
+ PARTITION p2 VALUES [('6'), ('7')),
+ PARTITION p3 VALUES [('7'), ('8')),
+ PARTITION p4 VALUES [('8'), ('10')),
+ PARTITION p5 VALUES [('10'), ('83647')),
+ PARTITION p100 VALUES [('83647'), ('2147483647'))
+ )
+ DISTRIBUTED BY HASH(pk) BUCKETS 10
+ PROPERTIES ('replication_allocation' = 'tag.location.default: 1')
+ """
+
+ sql """
+ INSERT INTO test_length_dict_varchar VALUES
+ (6, 5, '2023-12-13', '2023-12-11', 'o', 'i', 30),
+ (6, 6, NULL, '2023-12-18', 'w', 'l', 24),
+ (8, -8278102, '2023-12-13', '2023-12-11', 'x', 'c', 28),
+ (15971, 8, NULL, '2015-06-11', 'h', 'r', 41),
+ (6, 5, '2023-12-11', '2023-12-17', 'd', 'q', 5)
Review Comment:
The test comment below says `length()` returns NULL for NULL varchar values,
but every inserted value for `col_varchar_5__undef_signed` is non-NULL; only
`col_date_undef_signed` has NULLs. This means the nullable NULL path for
dict-encoded OFFSET_ONLY `length()` is not exercised. Please insert at least
one NULL in the nullable varchar column and regenerate the `.out` file so this
regression is covered.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]