This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 449c595f9d [opt](FileReader) InMemoryReader is only used in s3 (#23486)
449c595f9d is described below
commit 449c595f9d862e5d2e2fcbb79f0d06122fbf2b76
Author: Ashin Gau <[email protected]>
AuthorDate: Wed Aug 30 20:43:39 2023 +0800
[opt](FileReader) InMemoryReader is only used in s3 (#23486)
Previously, if the file size was < 8MB, the whole file was read into memory.
This idea is from
https://github.com/apache/hadoop/blob/trunk/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md#s3inmemoryinputstream.
However, in some cases we only read one or two columns of a file, and the
bytes actually required are only 1% of the file, resulting in a many-fold
increase in the amount of data read. Therefore, `InMemoryReader` is now only
used for object storage, and the threshold is reduced (from 8MB to 1MB by default).
---
be/src/common/config.cpp | 3 +++
be/src/common/config.h | 3 +++
be/src/io/fs/buffered_reader.cpp | 8 ++++++--
be/src/io/fs/buffered_reader.h | 4 +---
4 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 2c6dc99876..3ada8cb82f 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -850,6 +850,9 @@ DEFINE_Validator(jsonb_type_length_soft_limit_bytes,
// is greater than object_pool_buffer_size, release the object in the
unused_object_pool.
DEFINE_Int32(object_pool_buffer_size, "100");
+// Threshold of reading a small file into memory
+DEFINE_mInt32(in_memory_file_size, "1048576"); // 1MB
+
// ParquetReaderWrap prefetch buffer size
DEFINE_Int32(parquet_reader_max_buffer_size, "50");
// Max size of parquet page header in bytes
diff --git a/be/src/common/config.h b/be/src/common/config.h
index e66c618369..beca3957ef 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -897,6 +897,9 @@ DECLARE_mInt32(jsonb_type_length_soft_limit_bytes);
// is greater than object_pool_buffer_size, release the object in the
unused_object_pool.
DECLARE_Int32(object_pool_buffer_size);
+// Threshold of reading a small file into memory
+DECLARE_mInt32(in_memory_file_size);
+
// ParquetReaderWrap prefetch buffer size
DECLARE_Int32(parquet_reader_max_buffer_size);
// Max size of parquet page header in bytes
diff --git a/be/src/io/fs/buffered_reader.cpp b/be/src/io/fs/buffered_reader.cpp
index 726f5331c9..00f88c7515 100644
--- a/be/src/io/fs/buffered_reader.cpp
+++ b/be/src/io/fs/buffered_reader.cpp
@@ -778,8 +778,12 @@ Status DelegateReader::create_file_reader(RuntimeProfile*
profile,
io::FileReaderSPtr reader;
RETURN_IF_ERROR(FileFactory::create_file_reader(system_properties,
file_description,
reader_options,
file_system, &reader, profile));
- if (reader->size() < IN_MEMORY_FILE_SIZE) {
- *file_reader = std::make_shared<InMemoryFileReader>(reader);
+ if (reader->size() < config::in_memory_file_size) {
+ if (typeid_cast<io::S3FileReader*>(reader.get())) {
+ *file_reader = std::make_shared<InMemoryFileReader>(reader);
+ } else {
+ *file_reader = std::move(reader);
+ }
} else if (access_mode == AccessMode::SEQUENTIAL) {
bool is_thread_safe = false;
if (typeid_cast<io::S3FileReader*>(reader.get())) {
diff --git a/be/src/io/fs/buffered_reader.h b/be/src/io/fs/buffered_reader.h
index 34e1ff34fe..25a6811330 100644
--- a/be/src/io/fs/buffered_reader.h
+++ b/be/src/io/fs/buffered_reader.h
@@ -238,7 +238,7 @@ private:
/**
* Create a file reader suitable for accessing scenarios:
- * 1. When file size < 8MB, create InMemoryFileReader file reader
+ * 1. When file size < config::in_memory_file_size, create InMemoryFileReader
file reader
* 2. When reading sequential file(csv/json), create PrefetchBufferedReader
* 3. When reading random access file(parquet/orc), create normal file reader
*/
@@ -246,8 +246,6 @@ class DelegateReader {
public:
enum AccessMode { SEQUENTIAL, RANDOM };
- static constexpr size_t IN_MEMORY_FILE_SIZE = 8 * 1024 * 1024;
-
static Status create_file_reader(
RuntimeProfile* profile, const FileSystemProperties&
system_properties,
const FileDescription& file_description, const
io::FileReaderOptions& reader_options,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]