morningman commented on code in PR #42004:
URL: https://github.com/apache/doris/pull/42004#discussion_r1808671305
##########
fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java:
##########
@@ -1677,6 +1683,46 @@ public void setEnableLeftZigZag(boolean
enableLeftZigZag) {
public boolean enableOrcLazyMat = true;
+ @VariableMgr.VarAttr(
+ name = ORC_TINY_STRIPE_THRESHOLD,
+ description =
{"在orc文件中如果一个stripe的字节大小小于`orc_tiny_stripe_threshold`,"
+ + "我们认为该stripe为 tiny stripe。对于多个连续的tiny
stripe我们会进行读取优化,即一次性读多个tiny stripe."
+ + "如果你不想使用该优化,可以将该值设置为0。默认为 8M。",
+ "In an orc file, if the byte size of a stripe is less than
`orc_tiny_stripe_threshold`,"
+ + "we consider the stripe to be a tiny stripe. For
multiple consecutive tiny stripes,"
+ + "we will perform read optimization, that is,
read multiple tiny stripes at a time."
+ + "If you do not want to use this optimization,
you can set this value to 0."
+ + "The default is 8M."},
+ needForward = true,
+ setter = "setOrcTinyStripeThreshold")
+ public long orcTinyStripeThreshold = 8L * 1024L * 1024L;
Review Comment:
```suggestion
public long orcTinyStripeThresholdBytes = 8L * 1024L * 1024L;
```
##########
be/src/vec/exec/format/orc/vorc_reader.cpp:
##########
@@ -853,6 +853,58 @@ Status OrcReader::set_fill_columns(
_lazy_read_ctx.can_lazy_read = false;
}
+ _row_reader_options.range(_range_start_offset, _range_size);
+ _row_reader_options.setTimezoneName(_ctz == "CST" ? "Asia/Shanghai" :
_ctz);
+ _row_reader_options.include(_read_cols);
+ _row_reader_options.setEnableLazyDecoding(true);
+
+ uint64_t number_of_stripes = _reader->getNumberOfStripes();
+ auto allStripesNeeded = _reader->getNeedReadStripes(_row_reader_options);
+
+ int64_t range_end_offset = _range_start_offset + _range_size;
+
+ // 三个参数 todo
+ int orc_tiny_stripe_threshold = 8L * 1024L * 1024L;
+ int orc_once_max_read_size = 8L * 1024L * 1024L;
+ int orc_max_merge_distance = 1L * 1024L * 1024L;
+
+ bool all_tiny_stripes = true;
+ std::vector<io::PrefetchRange> tiny_stripe_ranges;
+
+ for (uint64_t i = 0; i < number_of_stripes; i++) {
+ std::unique_ptr<orc::StripeInformation> strip_info =
_reader->getStripe(i);
+ uint64_t strip_start_offset = strip_info->getOffset();
+ uint64_t strip_end_offset = strip_start_offset +
strip_info->getLength();
+
+ if (strip_start_offset >= range_end_offset || strip_end_offset <
_range_start_offset ||
+ !allStripesNeeded[i]) {
+ continue;
+ }
+ if (strip_info->getLength() > orc_tiny_stripe_threshold) {
+ all_tiny_stripes = false;
+ break;
+ }
+
+ tiny_stripe_ranges.emplace_back(strip_start_offset, strip_end_offset);
+ }
+ if (all_tiny_stripes && number_of_stripes > 0) {
+ std::vector<io::PrefetchRange> prefetch_merge_ranges =
+ io::PrefetchRange::mergeAdjacentSeqRanges(
+ tiny_stripe_ranges, orc_max_merge_distance,
orc_once_max_read_size);
+
+ auto range_finder =
+
std::make_shared<io::LinearProbeRangeFinder>(std::move(prefetch_merge_ranges));
+
+ auto* orcInputStreamPtr =
static_cast<ORCFileInputStream*>(_reader->getStream());
+ orcInputStreamPtr->set_all_tiny_stripes();
+ auto& orc_file_reader = orcInputStreamPtr->get_file_reader();
+ orc_file_reader->collect_profile_before_close();
Review Comment:
Why is `collect_profile_before_close()` being called here?
##########
fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java:
##########
@@ -1677,6 +1683,46 @@ public void setEnableLeftZigZag(boolean
enableLeftZigZag) {
public boolean enableOrcLazyMat = true;
+ @VariableMgr.VarAttr(
+ name = ORC_TINY_STRIPE_THRESHOLD,
+ description =
{"在orc文件中如果一个stripe的字节大小小于`orc_tiny_stripe_threshold`,"
+ + "我们认为该stripe为 tiny stripe。对于多个连续的tiny
stripe我们会进行读取优化,即一次性读多个tiny stripe."
+ + "如果你不想使用该优化,可以将该值设置为0。默认为 8M。",
+ "In an orc file, if the byte size of a stripe is less than
`orc_tiny_stripe_threshold`,"
+ + "we consider the stripe to be a tiny stripe. For
multiple consecutive tiny stripes,"
+ + "we will perform read optimization, that is,
read multiple tiny stripes at a time."
+ + "If you do not want to use this optimization,
you can set this value to 0."
+ + "The default is 8M."},
+ needForward = true,
+ setter = "setOrcTinyStripeThreshold")
+ public long orcTinyStripeThreshold = 8L * 1024L * 1024L;
+
+
+ @VariableMgr.VarAttr(
+ name = ORC_ONCE_MAX_READ_SIZE,
+ description = {"在使用tiny stripe读取优化的时候,会对多个tiny stripe合并成一次IO,"
+ +
"该参数用来控制每次IO请求的最大字节大小。你不应该将值设置的小于`orc_tiny_stripe_threshold`。默认为 8M。",
+ "When using tiny stripe read optimization, multiple tiny
stripes will be merged into one IO."
+ + "This parameter is used to control the maximum
byte size of each IO request."
+ + "You should not set the value less than
`orc_tiny_stripe_threshold`."
+ + "The default is 8M."},
+ needForward = true,
+ setter = "setOrcOnceMaxReadSize")
+ public long orcOnceMaxReadSize = 8L * 1024L * 1024L;
Review Comment:
```suggestion
public long orcOnceMaxReadBytes = 8L * 1024L * 1024L;
```
##########
fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java:
##########
@@ -1677,6 +1683,46 @@ public void setEnableLeftZigZag(boolean
enableLeftZigZag) {
public boolean enableOrcLazyMat = true;
+ @VariableMgr.VarAttr(
+ name = ORC_TINY_STRIPE_THRESHOLD,
+ description =
{"在orc文件中如果一个stripe的字节大小小于`orc_tiny_stripe_threshold`,"
+ + "我们认为该stripe为 tiny stripe。对于多个连续的tiny
stripe我们会进行读取优化,即一次性读多个tiny stripe."
+ + "如果你不想使用该优化,可以将该值设置为0。默认为 8M。",
+ "In an orc file, if the byte size of a stripe is less than
`orc_tiny_stripe_threshold`,"
+ + "we consider the stripe to be a tiny stripe. For
multiple consecutive tiny stripes,"
+ + "we will perform read optimization, that is,
read multiple tiny stripes at a time."
+ + "If you do not want to use this optimization,
you can set this value to 0."
+ + "The default is 8M."},
+ needForward = true,
+ setter = "setOrcTinyStripeThreshold")
+ public long orcTinyStripeThreshold = 8L * 1024L * 1024L;
+
+
+ @VariableMgr.VarAttr(
+ name = ORC_ONCE_MAX_READ_SIZE,
+ description = {"在使用tiny stripe读取优化的时候,会对多个tiny stripe合并成一次IO,"
+ +
"该参数用来控制每次IO请求的最大字节大小。你不应该将值设置的小于`orc_tiny_stripe_threshold`。默认为 8M。",
+ "When using tiny stripe read optimization, multiple tiny
stripes will be merged into one IO."
+ + "This parameter is used to control the maximum
byte size of each IO request."
+ + "You should not set the value less than
`orc_tiny_stripe_threshold`."
+ + "The default is 8M."},
+ needForward = true,
+ setter = "setOrcOnceMaxReadSize")
+ public long orcOnceMaxReadSize = 8L * 1024L * 1024L;
+
+
+ @VariableMgr.VarAttr(
+ name = ORC_MAX_MERGE_DISTANCE,
+ description = {"在使用tiny stripe读取优化的时候,由于tiny stripe并不一定连续。"
+ + "当两个tiny stripe之间距离大于该参数时,我们不会将其合并成一次IO。默认为 1M。",
+ "When using tiny stripe read optimization, since tiny
stripes are not necessarily continuous,"
+ + "when the distance between two tiny stripes is
greater than this parameter,"
+ + "we will not merge them into one IO. The default
value is 1M."},
+ needForward = true,
+ setter = "setOrcMaxMergeDistance")
+ public long orcMaxMergeDistance = 1024L * 1024L;
Review Comment:
```suggestion
public long orcMaxMergeDistanceBytes = 1024L * 1024L;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]