This is an automated email from the ASF dual-hosted git repository.
yangzhg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 926540c [feature] Support return bitmap/hll data in select statement
(#7276)
926540c is described below
commit 926540c561b60326161036236057426c863bdd8f
Author: Zhengguo Yang <[email protected]>
AuthorDate: Wed Dec 15 09:48:27 2021 +0800
[feature] Support return bitmap/hll data in select statement (#7276)
Support return bitmap/hll data in select statement, this can be used when
set return_object_data_as_binary=true;
---
be/src/exec/parquet_writer.cpp | 38 ++++++++++++-
be/src/exec/parquet_writer.h | 4 +-
be/src/runtime/file_result_writer.cpp | 30 ++++++++--
be/src/runtime/file_result_writer.h | 9 ++-
be/src/runtime/mysql_result_writer.cpp | 17 +++++-
be/src/runtime/mysql_result_writer.h | 6 +-
be/src/runtime/result_file_sink.cpp | 4 +-
be/src/runtime/result_sink.cpp | 7 ++-
be/src/runtime/result_writer.h | 8 ++-
be/src/runtime/runtime_state.h | 4 ++
docs/.vuepress/sidebar/en.js | 5 +-
docs/.vuepress/sidebar/zh-CN.js | 3 +-
docs/en/administrator-guide/variables.md | 9 ++-
docs/en/developer-guide/bitmap-hll-file-format.md | 64 ++++++++++++++++++++++
docs/zh-CN/administrator-guide/variables.md | 5 +-
.../developer-guide/bitmap-hll-file-format.md | 64 ++++++++++++++++++++++
fe/fe-common/pom.xml | 5 ++
.../org/apache/doris/common/io}/BitmapValue.java | 2 +-
.../java/org/apache/doris/common/io}/Codec.java | 2 +-
.../main/java/org/apache/doris/common/io}/Hll.java | 2 +-
.../org/apache/doris/common/io}/Roaring64Map.java | 5 +-
.../apache/doris/common/io}/BitmapValueTest.java | 5 +-
.../java/org/apache/doris/common/io}/HllTest.java | 4 +-
.../org/apache/doris/analysis/OutFileClause.java | 17 ++++++
.../java/org/apache/doris/qe/SessionVariable.java | 14 +++++
fe/spark-dpp/pom.xml | 4 --
.../load/loadv2/dpp/DorisKryoRegistrator.java | 3 +
.../doris/load/loadv2/dpp/SparkRDDAggregator.java | 2 +
gensrc/thrift/PaloInternalService.thrift | 4 ++
29 files changed, 296 insertions(+), 50 deletions(-)
diff --git a/be/src/exec/parquet_writer.cpp b/be/src/exec/parquet_writer.cpp
index 2e428ec..13cafac 100644
--- a/be/src/exec/parquet_writer.cpp
+++ b/be/src/exec/parquet_writer.cpp
@@ -88,11 +88,13 @@ void ParquetOutputStream::set_written_len(int64_t
written_len) {
ParquetWriterWrapper::ParquetWriterWrapper(FileWriter* file_writer,
const std::vector<ExprContext*>&
output_expr_ctxs,
const std::map<std::string,
std::string>& properties,
- const
std::vector<std::vector<std::string>>& schema)
+ const
std::vector<std::vector<std::string>>& schema,
+ bool output_object_data)
: _output_expr_ctxs(output_expr_ctxs),
_str_schema(schema),
_cur_writed_rows(0),
- _rg_writer(nullptr) {
+ _rg_writer(nullptr),
+ _output_object_data(output_object_data) {
_outstream = std::shared_ptr<ParquetOutputStream>(new
ParquetOutputStream(file_writer));
parse_properties(properties);
parse_schema(schema);
@@ -354,6 +356,38 @@ Status ParquetWriterWrapper::_write_one_row(TupleRow* row)
{
}
break;
}
+
+ case TYPE_HLL:
+ case TYPE_OBJECT: {
+ if (_output_object_data) {
+ if (_str_schema[index][1] != "byte_array") {
+ std::stringstream ss;
+ ss << "project field type is hll/bitmap, should use
byte_array, but the "
+ "definition type of column "
+ << _str_schema[index][2] << " is " <<
_str_schema[index][1];
+ return Status::InvalidArgument(ss.str());
+ }
+ parquet::RowGroupWriter* rgWriter = get_rg_writer();
+ parquet::ByteArrayWriter* col_writer =
+
static_cast<parquet::ByteArrayWriter*>(rgWriter->column(index));
+ if (item != nullptr) {
+ const StringValue* string_val = (const
StringValue*)(item);
+ parquet::ByteArray value;
+ value.ptr = reinterpret_cast<const
uint8_t*>(string_val->ptr);
+ value.len = string_val->len;
+ col_writer->WriteBatch(1, nullptr, nullptr, &value);
+ } else {
+ parquet::ByteArray value;
+ col_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+ } else {
+ std::stringstream ss;
+ ss << "unsupported file format: "
+ << _output_expr_ctxs[index]->root()->type().type;
+ return Status::InvalidArgument(ss.str());
+ }
+ break;
+ }
case TYPE_CHAR:
case TYPE_VARCHAR:
case TYPE_STRING: {
diff --git a/be/src/exec/parquet_writer.h b/be/src/exec/parquet_writer.h
index d5c021d..c076aed 100644
--- a/be/src/exec/parquet_writer.h
+++ b/be/src/exec/parquet_writer.h
@@ -72,7 +72,8 @@ class ParquetWriterWrapper {
public:
ParquetWriterWrapper(FileWriter* file_writer, const
std::vector<ExprContext*>& output_expr_ctxs,
const std::map<std::string, std::string>& properties,
- const std::vector<std::vector<std::string>>& schema);
+ const std::vector<std::vector<std::string>>& schema,
+ bool output_object_data);
virtual ~ParquetWriterWrapper();
Status write(const RowBatch& row_batch);
@@ -101,6 +102,7 @@ private:
int64_t _cur_writed_rows = 0;
parquet::RowGroupWriter* _rg_writer;
const int64_t _max_row_per_group = 10;
+ bool _output_object_data;
};
} // namespace doris
diff --git a/be/src/runtime/file_result_writer.cpp
b/be/src/runtime/file_result_writer.cpp
index 23cd6b0..6b20f5d 100644
--- a/be/src/runtime/file_result_writer.cpp
+++ b/be/src/runtime/file_result_writer.cpp
@@ -39,6 +39,7 @@
#include "util/mysql_row_buffer.h"
#include "util/types.h"
#include "util/uid_util.h"
+#include "util/url_coding.h"
namespace doris {
@@ -47,7 +48,8 @@ const size_t FileResultWriter::OUTSTREAM_BUFFER_SIZE_BYTES =
1024 * 1024;
// deprecated
FileResultWriter::FileResultWriter(const ResultFileOptions* file_opts,
const std::vector<ExprContext*>&
output_expr_ctxs,
- RuntimeProfile* parent_profile,
BufferControlBlock* sinker)
+ RuntimeProfile* parent_profile,
BufferControlBlock* sinker,
+ bool output_object_data)
: _file_opts(file_opts),
_output_expr_ctxs(output_expr_ctxs),
_parent_profile(parent_profile),
@@ -62,6 +64,7 @@ FileResultWriter::FileResultWriter(const ResultFileOptions*
file_opts,
// resulting in no such attribute. So we need a mock here.
_fragment_instance_id.hi = 12345678987654321;
_fragment_instance_id.lo = 98765432123456789;
+ _output_object_data = output_object_data;
}
FileResultWriter::FileResultWriter(const ResultFileOptions* file_opts,
@@ -69,14 +72,16 @@ FileResultWriter::FileResultWriter(const ResultFileOptions*
file_opts,
const TUniqueId fragment_instance_id,
const std::vector<ExprContext*>&
output_expr_ctxs,
RuntimeProfile* parent_profile,
BufferControlBlock* sinker,
- RowBatch* output_batch)
+ RowBatch* output_batch, bool
output_object_data)
: _file_opts(file_opts),
_storage_type(storage_type),
_fragment_instance_id(fragment_instance_id),
_output_expr_ctxs(output_expr_ctxs),
_parent_profile(parent_profile),
_sinker(sinker),
- _output_batch(output_batch) {}
+ _output_batch(output_batch) {
+ _output_object_data = output_object_data;
+}
FileResultWriter::~FileResultWriter() {
_close_file_writer(true);
@@ -151,7 +156,8 @@ Status FileResultWriter::_create_file_writer(const
std::string& file_name) {
break;
case TFileFormatType::FORMAT_PARQUET:
_parquet_writer = new ParquetWriterWrapper(_file_writer,
_output_expr_ctxs,
-
_file_opts->file_properties, _file_opts->schema);
+
_file_opts->file_properties, _file_opts->schema,
+ _output_object_data);
break;
default:
return Status::InternalError(
@@ -329,6 +335,22 @@ Status FileResultWriter::_write_one_row_as_csv(TupleRow*
row) {
_plain_text_outstream << decimal_str;
break;
}
+ case TYPE_OBJECT:
+ case TYPE_HLL: {
+ if (_output_object_data) {
+ const StringValue* string_val = (const StringValue*)(item);
+ if (string_val->ptr == nullptr) {
+ _plain_text_outstream << NULL_IN_CSV;
+ } else {
+ std::string base64_str;
+ base64_encode(string_val->to_string(), &base64_str);
+ _plain_text_outstream << base64_str;
+ }
+ } else {
+ _plain_text_outstream << NULL_IN_CSV;
+ }
+ break;
+ }
default: {
// not supported type, like BITMAP, HLL, just export null
_plain_text_outstream << NULL_IN_CSV;
diff --git a/be/src/runtime/file_result_writer.h
b/be/src/runtime/file_result_writer.h
index 98ccfda..ea0c363 100644
--- a/be/src/runtime/file_result_writer.h
+++ b/be/src/runtime/file_result_writer.h
@@ -79,15 +79,14 @@ class FileResultWriter final : public ResultWriter {
public:
FileResultWriter(const ResultFileOptions* file_option,
const std::vector<ExprContext*>& output_expr_ctxs,
- RuntimeProfile* parent_profile,
- BufferControlBlock* sinker);
+ RuntimeProfile* parent_profile, BufferControlBlock*
sinker,
+ bool output_object_data);
FileResultWriter(const ResultFileOptions* file_option,
const TStorageBackendType::type storage_type,
const TUniqueId fragment_instance_id,
const std::vector<ExprContext*>& output_expr_ctxs,
- RuntimeProfile* parent_profile,
- BufferControlBlock* sinker,
- RowBatch* output_batch);
+ RuntimeProfile* parent_profile, BufferControlBlock*
sinker,
+ RowBatch* output_batch, bool output_object_data);
virtual ~FileResultWriter();
virtual Status init(RuntimeState* state) override;
diff --git a/be/src/runtime/mysql_result_writer.cpp
b/be/src/runtime/mysql_result_writer.cpp
index 7d3c522..1c2589c 100644
--- a/be/src/runtime/mysql_result_writer.cpp
+++ b/be/src/runtime/mysql_result_writer.cpp
@@ -33,8 +33,8 @@ namespace doris {
MysqlResultWriter::MysqlResultWriter(BufferControlBlock* sinker,
const std::vector<ExprContext*>&
output_expr_ctxs,
- RuntimeProfile* parent_profile)
- : ResultWriter(),
+ RuntimeProfile* parent_profile, bool
output_object_data)
+ : ResultWriter(output_object_data),
_sinker(sinker),
_output_expr_ctxs(output_expr_ctxs),
_row_buffer(nullptr),
@@ -115,7 +115,18 @@ int MysqlResultWriter::_add_row_value(int index, const
TypeDescriptor& type, voi
case TYPE_HLL:
case TYPE_OBJECT: {
- buf_ret = _row_buffer->push_null();
+ if (_output_object_data) {
+ const StringValue* string_val = (const StringValue*)(item);
+
+ if (string_val->ptr == nullptr) {
+ buf_ret = _row_buffer->push_null();
+ } else {
+ buf_ret = _row_buffer->push_string(string_val->ptr,
string_val->len);
+ }
+ } else {
+ buf_ret = _row_buffer->push_null();
+ }
+
break;
}
diff --git a/be/src/runtime/mysql_result_writer.h
b/be/src/runtime/mysql_result_writer.h
index 6811488..2754e0c 100644
--- a/be/src/runtime/mysql_result_writer.h
+++ b/be/src/runtime/mysql_result_writer.h
@@ -17,9 +17,9 @@
#pragma once
+#include "primitive_type.h"
#include "runtime/result_writer.h"
#include "runtime/runtime_state.h"
-#include "primitive_type.h"
namespace doris {
@@ -31,14 +31,14 @@ class BufferControlBlock;
class RuntimeProfile;
namespace vectorized {
- class VExprContext;
+class VExprContext;
}
// convert the row batch to mysql protocol row
class MysqlResultWriter final : public ResultWriter {
public:
MysqlResultWriter(BufferControlBlock* sinker, const
std::vector<ExprContext*>& output_expr_ctxs,
- RuntimeProfile* parent_profile);
+ RuntimeProfile* parent_profile, bool output_object_data);
virtual ~MysqlResultWriter();
diff --git a/be/src/runtime/result_file_sink.cpp
b/be/src/runtime/result_file_sink.cpp
index 9ee71eb..697a452 100644
--- a/be/src/runtime/result_file_sink.cpp
+++ b/be/src/runtime/result_file_sink.cpp
@@ -96,7 +96,7 @@ Status ResultFileSink::prepare(RuntimeState* state) {
// create writer
_writer.reset(new (std::nothrow) FileResultWriter(
_file_opts.get(), _storage_type,
state->fragment_instance_id(), _output_expr_ctxs,
- _profile, _sender.get(), nullptr));
+ _profile, _sender.get(), nullptr,
state->return_object_data_as_binary()));
} else {
// init channel
_profile = _pool->add(new RuntimeProfile(title.str()));
@@ -113,7 +113,7 @@ Status ResultFileSink::prepare(RuntimeState* state) {
_output_batch = new RowBatch(_output_row_descriptor, 1024,
_mem_tracker.get());
_writer.reset(new (std::nothrow) FileResultWriter(
_file_opts.get(), _storage_type,
state->fragment_instance_id(), _output_expr_ctxs,
- _profile, nullptr, _output_batch));
+ _profile, nullptr, _output_batch,
state->return_object_data_as_binary()));
}
RETURN_IF_ERROR(_writer->init(state));
for (int i = 0; i < _channels.size(); ++i) {
diff --git a/be/src/runtime/result_sink.cpp b/be/src/runtime/result_sink.cpp
index 1392037..206b0f0 100644
--- a/be/src/runtime/result_sink.cpp
+++ b/be/src/runtime/result_sink.cpp
@@ -75,14 +75,15 @@ Status ResultSink::prepare(RuntimeState* state) {
// create writer based on sink type
switch (_sink_type) {
case TResultSinkType::MYSQL_PROTOCAL:
- _writer.reset(new (std::nothrow)
- MysqlResultWriter(_sender.get(),
_output_expr_ctxs, _profile));
+ _writer.reset(new (std::nothrow) MysqlResultWriter(
+ _sender.get(), _output_expr_ctxs, _profile,
state->return_object_data_as_binary()));
break;
// deprecated
case TResultSinkType::FILE:
CHECK(_file_opts.get() != nullptr);
_writer.reset(new (std::nothrow) FileResultWriter(_file_opts.get(),
_output_expr_ctxs,
- _profile,
_sender.get()));
+ _profile,
_sender.get(),
+
state->return_object_data_as_binary()));
break;
default:
return Status::InternalError("Unknown result sink type");
diff --git a/be/src/runtime/result_writer.h b/be/src/runtime/result_writer.h
index 3c914d0..5e2bf33 100644
--- a/be/src/runtime/result_writer.h
+++ b/be/src/runtime/result_writer.h
@@ -28,14 +28,15 @@ class RuntimeState;
class TypeDescriptor;
namespace vectorized {
- class Block;
+class Block;
}
// abstract class of the result writer
class ResultWriter {
public:
ResultWriter() {};
- ~ResultWriter(){};
+ ResultWriter(bool output_object_data) :
_output_object_data(output_object_data) {};
+ ~ResultWriter() {};
virtual Status init(RuntimeState* state) = 0;
// convert and write one row batch
@@ -49,10 +50,13 @@ public:
virtual int64_t get_written_rows() const { return _written_rows; }
+ virtual bool output_object_data() const { return _output_object_data; }
+
static const std::string NULL_IN_CSV;
protected:
int64_t _written_rows = 0; // number of rows written
+ bool _output_object_data = false;
};
} // namespace doris
diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h
index 19b2a78..3e4a014 100644
--- a/be/src/runtime/runtime_state.h
+++ b/be/src/runtime/runtime_state.h
@@ -351,6 +351,10 @@ public:
bool enable_vectorized_exec() const { return
_query_options.enable_vectorized_engine; }
+ bool return_object_data_as_binary() const {
+ return _query_options.return_object_data_as_binary;
+ }
+
bool enable_exchange_node_parallel_merge() const {
return _query_options.enable_enable_exchange_node_parallel_merge;
}
diff --git a/docs/.vuepress/sidebar/en.js b/docs/.vuepress/sidebar/en.js
index 1f00708..a772152 100644
--- a/docs/.vuepress/sidebar/en.js
+++ b/docs/.vuepress/sidebar/en.js
@@ -625,11 +625,12 @@ module.exports = [
"benchmark-tool",
"fe-eclipse-dev",
"fe-idea-dev",
- "be-vscode-dev",
+ "be-vscode-dev",
"java-format-code",
"cpp-format-code",
"How-to-Share-blogs",
- "minidump"
+ "minidump",
+ "bitmap-hll-file-format",
],
},
{
diff --git a/docs/.vuepress/sidebar/zh-CN.js b/docs/.vuepress/sidebar/zh-CN.js
index 9e11e7d..46f2154 100644
--- a/docs/.vuepress/sidebar/zh-CN.js
+++ b/docs/.vuepress/sidebar/zh-CN.js
@@ -633,7 +633,8 @@ module.exports = [
"java-format-code",
"cpp-format-code",
"How-to-Share-blogs",
- "minidump"
+ "minidump",
+ "bitmap-hll-file-format",
],
},
{
diff --git a/docs/en/administrator-guide/variables.md
b/docs/en/administrator-guide/variables.md
index e998a3e..a77b658 100644
--- a/docs/en/administrator-guide/variables.md
+++ b/docs/en/administrator-guide/variables.md
@@ -122,7 +122,7 @@ Note that the comment must start with /*+ and can only
follow the SELECT.
* `character_set_client`
- Used for compatibility with MySQL clients. No practical effect.
+ Used for compatibility with MySQL clients. No practical effect.
* `character_set_connection`
@@ -222,7 +222,7 @@ Note that the comment must start with /*+ and can only
follow the SELECT.
Forward to Master to view information about the relevant PROC stored
in the Master FE metadata. Mainly used for metadata comparison.
* `init_connect`
-
+
Used for compatibility with MySQL clients. No practical effect.
* `interactive_timeout`
@@ -416,7 +416,7 @@ Translated with www.DeepL.com/Translator (free version)
When execute insert statement, doris will wait for the transaction to
commit and visible after the import is completed.
This parameter controls the timeout of waiting for transaction to be
visible. The default value is 10000, and the minimum value is 1000.
-* `enable_exchange_node_parallel_merge`
+* `enable_exchange_node_parallel_merge`
In a sort query, when an upper level node receives the ordered data of the
lower level node, it will sort the corresponding data on the exchange node to
ensure that the final data is ordered. However, when a single thread merges
multiple channels of data, if the amount of data is too large, it will lead to
a single point of exchange node merge bottleneck.
@@ -443,3 +443,6 @@ Translated with www.DeepL.com/Translator (free version)
* `disable_join_reorder`
Used to turn off all automatic join reorder algorithms in the system.
There are two values: true and false.It is closed by default, that is, the
automatic join reorder algorithm of the system is adopted. After set to true,
the system will close all automatic sorting algorithms, adopt the original SQL
table order, and execute join
+
+* `return_object_data_as_binary`
+ Used to indicate whether to return the bitmap/hll result in the select
result. In the select into outfile statement, if the export file format is csv,
the bitmap/hll data will be base64-encoded; if it is the parquet file format,
the data will be stored as a byte array
\ No newline at end of file
diff --git a/docs/en/developer-guide/bitmap-hll-file-format.md
b/docs/en/developer-guide/bitmap-hll-file-format.md
new file mode 100644
index 0000000..201e99a
--- /dev/null
+++ b/docs/en/developer-guide/bitmap-hll-file-format.md
@@ -0,0 +1,64 @@
+---
+{
+"title": "Bitmap/HLL data format",
+"language": "en"
+}
+
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## Bitmap format
+### Format description
+The bitmap in Doris uses roaring bitmap storage, and the BE side uses
CRoaring. The serialization format of `Roaring` is compatible across languages
such as C++/Java/Go, while the serialization format of the C++ `Roaring64Map`
is not compatible with that of `Roaring64NavigableMap` in Java. There are 5
types of Doris bitmap, each of which is identified by one byte
+
+The bitmap serialization format in Doris is explained as follows:
+
+```
+ | flag | data .....|
+ <--1Byte--><--n bytes-->
+```
+
+The Flag value description is as follows:
+
+| Value | Description |
+| ---- | ------------------------------------------------------------ |
+| 0 | EMPTY, empty bitmap, the following data part is empty, the whole
serialization result is only one byte |
+| 1 | SINGLE32, there is only one 32-bit unsigned integer value in the bitmap,
and the next 4 bytes represent the 32-bit unsigned integer value |
+| 2 | BITMAP32, 32-bit bitmap, corresponding to the type
`org.roaringbitmap.RoaringBitmap` in Java and `roaring::Roaring` in C++. The
following data is the serialized form of `roaring::Roaring`; it can be
deserialized directly with `org.roaringbitmap.RoaringBitmap` in Java or
`roaring::Roaring` in C++ |
+| 3 | SINGLE64, there is only one 64-bit unsigned integer value in the bitmap,
and the next 8 bytes represent the 64-bit unsigned integer value |
+| 4 | BITMAP64, 64-bit bitmap, corresponding to `org.roaringbitmap.RoaringBitmap`
in Java and Doris's `Roaring64Map` in C++. The data structure is the same as in
the roaring library, but the serialization and deserialization methods differ:
the bitmap size is first encoded as a 1-8 byte variable-length uint64, and the
following data consists of repeated groups of a 4-byte high-order part followed
by serialized 32-bit roaring bitmap data |
+
+C++ serialization and deserialization examples are in the
`BitmapValue::write()` method in `be/src/util/bitmap_value.h` and the Java
examples are in the `serialize()` `deserialize()` method in
`fe/fe-common/src/main/java/org/apache/doris/common/io/BitmapValue.java`.
+
+## HLL format description
+
+Serialized data in HLL format is implemented in Doris itself. Similar to the
Bitmap type, the HLL format is composed of a 1-byte flag followed by multiple
bytes of data. The meaning of the flag is as follows
+
+
+
+| Value | Description |
+| ---- | ------------------------------------------------------------ |
+| 0 | HLL_DATA_EMPTY, empty HLL, the following data part is empty, the entire
serialization result is only one byte |
+| 1 | HLL_DATA_EXPLICIT, the next byte is the number of explicit data blocks,
followed by multiple data blocks; each data block is composed of an 8-byte
length and the data |
+| 2 | HLL_DATA_SPARSE, only non-zero values are stored, the next 4 bytes
indicate the number of registers, and there are multiple register structures in
the back. Each register is composed of the index of the first 2 bytes and the
value of 1 byte |
+| 3 | HLL_DATA_FULL, which means that all 16 * 1024 registers have values,
followed by 16 * 1024 bytes of value data |
+
+C++ serialization and deserialization examples are in the `serialize()`
`deserialize()` method of `be/src/olap/hll.h`, and the Java examples are in the
`serialize()` `deserialize()` method in
`fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java`.
diff --git a/docs/zh-CN/administrator-guide/variables.md
b/docs/zh-CN/administrator-guide/variables.md
index 665c606..cc63cdf 100644
--- a/docs/zh-CN/administrator-guide/variables.md
+++ b/docs/zh-CN/administrator-guide/variables.md
@@ -409,7 +409,7 @@ SELECT /*+ SET_VAR(query_timeout = 1,
enable_partition_cache=true) */ sleep(3);
在执行insert语句时,导入动作(查询和插入)完成后,还需要等待事务提交,使数据可见。此参数控制等待数据可见的超时时间,默认为10000,最小为1000。
-* `enable_exchange_node_parallel_merge`
+* `enable_exchange_node_parallel_merge`
在一个排序的查询之中,一个上层节点接收下层节点有序数据时,会在exchange
node上进行对应的排序来保证最终的数据是有序的。但是单线程进行多路数据归并时,如果数据量过大,会导致exchange node的单点的归并瓶颈。
@@ -436,3 +436,6 @@ SELECT /*+ SET_VAR(query_timeout = 1,
enable_partition_cache=true) */ sleep(3);
* `disable_join_reorder`
用于关闭所有系统自动的 join reorder 算法。取值有两种:true 和 false。默认行况下关闭,也就是采用系统自动的 join
reorder 算法。设置为 true 后,系统会关闭所有自动排序的算法,采用 SQL 原始的表顺序,执行 join
+
+* `return_object_data_as_binary`
+ 用于标识是否在select 结果中返回bitmap/hll 结果。在 select into outfile 语句中,如果导出文件格式为csv 则会将
bitmap/hll 数据进行base64编码,如果是parquet 文件格式 将会把数据作为byte array 存储
diff --git a/docs/zh-CN/developer-guide/bitmap-hll-file-format.md
b/docs/zh-CN/developer-guide/bitmap-hll-file-format.md
new file mode 100644
index 0000000..8e2aacd
--- /dev/null
+++ b/docs/zh-CN/developer-guide/bitmap-hll-file-format.md
@@ -0,0 +1,64 @@
+---
+{
+"title": "Bitmap/HLL 数据格式",
+"language": "zh-CN"
+}
+
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## Bitmap 格式
+### 格式说明
+Doris 中的bitmap 采用的是roaring bitmap 存储, be 端使用CRoaring,`Roaring`
的序列化格式在C++/java/go 等语言中兼容, 而C++ `Roaring64Map`
的格式序列化结果和Java中`Roaring64NavigableMap` 不兼容。Doris bitmap 有5种类型, 分别用一个字节表示
+
+Doris 中的bitmap 序列化格式说明如下:
+
+```
+ | flag | data .....|
+ <--1Byte--><--n bytes-->
+```
+
+Flag 值说明如下:
+
+| 值 | 描述 |
+| ---- | ------------------------------------------------------------ |
+| 0 | EMPTY,空 bitmap, 后面data 部分为空,整个序列化结果只有一个字节 |
+| 1 | SINGLE32,bitmap 中只有一个32位无符号整型值, 后面4个字节表示32位无符号整型值 |
+| 2 | BITMAP32,32 位bitmap 对应java 中类型为 `org.roaringbitmap.RoaringBitmap` C++
中类型为`roaring::Roaring`, 后面data 为roaring::Roaring 序列后的结构, 可以使用java 中的
`org.roaringbitmap.RoaringBitmap` 或c++中`roaring::Roaring` 直接反序列化 |
+| 3 | SINGLE64 ,bitmap 中只有一个64位无符号整型值,后面8个字节表示64位无符号整型值 |
+| 4 | BITMAP64, 64 位bitmap 对应java 中类型为 `org.roaringbitmap.RoaringBitmap;`
c++ 中类型为doris 中的`Roaring64Map`,数据结构和 roaring
库中的结果一致,但是序列化和反序列话方法有所不同,后面会有1-8个字节的变长编码的uint64 的表述bitmap 中size。后面的数据是各式是多个
4个字节的高位表示和32位 roaring bitmap 序列化数据重复而成 |
+
+c++ 序列化和反序列化的示例 在 `be/src/util/bitmap_value.h` 的`BitmapValue::write()`
`BitmapValue::deserialize()` 方法中, Java示例在
`fe/fe-common/src/main/java/org/apache/doris/common/io/BitmapValue.java`
中的`serialize()` `deserialize()` 方法中。
+
+## HLL 格式说明
+
+HLL 格式序列化数据在Doris 中自己实现的。同Bitmap 类型类似,HLL 格式是1个字节的flag 后面跟随多个字节数据组成,flag 含义如下
+
+
+
+| 值 | 描述 |
+| ---- | ------------------------------------------------------------ |
+| 0 | HLL_DATA_EMPTY,空 HLL, 后面data 部分为空,整个序列化结果只有一个字节 |
+| 1 | HLL_DATA_EXPLICIT, 后面1个字节 explicit 数据块个数,后面接多个数据块,每个数据块由8个字节长度和数据组成, |
+| 2 | HLL_DATA_SPARSE,只存非0 值,后面4个字节 表示 register 个数, 后面连续多个 register
结构,每个register 由前面2个字节的index 和1个字节的值组成 |
+| 3 | HLL_DATA_FULL ,表示所有16 * 1024个register都有值, 后面连续16 * 1024个字节的值数据 |
+
+c++ 序列化和反序列化的示例 在 `be/src/olap/hll.h` 的`serialize()` `deserialize()` 方法中,
Java示例在 `fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java`
中的`serialize()` `deserialize()` 方法中。
\ No newline at end of file
diff --git a/fe/fe-common/pom.xml b/fe/fe-common/pom.xml
index ea7669e..1adc844 100644
--- a/fe/fe-common/pom.xml
+++ b/fe/fe-common/pom.xml
@@ -89,6 +89,11 @@ under the License.
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
+
+ <dependency>
+ <groupId>org.roaringbitmap</groupId>
+ <artifactId>RoaringBitmap</artifactId>
+ </dependency>
</dependencies>
<build>
diff --git
a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/BitmapValue.java
b/fe/fe-common/src/main/java/org/apache/doris/common/io/BitmapValue.java
similarity index 99%
rename from
fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/BitmapValue.java
rename to fe/fe-common/src/main/java/org/apache/doris/common/io/BitmapValue.java
index a8e0cad..fe857bf 100644
---
a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/BitmapValue.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/io/BitmapValue.java
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-package org.apache.doris.load.loadv2.dpp;
+package org.apache.doris.common.io;
import org.roaringbitmap.Util;
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/common/Codec.java
b/fe/fe-common/src/main/java/org/apache/doris/common/io/Codec.java
similarity index 98%
rename from fe/spark-dpp/src/main/java/org/apache/doris/common/Codec.java
rename to fe/fe-common/src/main/java/org/apache/doris/common/io/Codec.java
index af9bad9..ceb7a51 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/common/Codec.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/io/Codec.java
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-package org.apache.doris.common;
+package org.apache.doris.common.io;
import java.io.DataInput;
import java.io.DataOutput;
diff --git
a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/Hll.java
b/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java
similarity index 99%
rename from fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/Hll.java
rename to fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java
index c95a3a4..0f6993a 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/Hll.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-package org.apache.doris.load.loadv2.dpp;
+package org.apache.doris.common.io;
import org.apache.commons.codec.binary.StringUtils;
diff --git
a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/Roaring64Map.java
b/fe/fe-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java
similarity index 99%
rename from
fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/Roaring64Map.java
rename to
fe/fe-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java
index a606766..5a20138 100644
---
a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/Roaring64Map.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/io/Roaring64Map.java
@@ -15,10 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-package org.apache.doris.load.loadv2.dpp;
-
-import org.apache.doris.common.Codec;
-import org.apache.doris.load.loadv2.dpp.BitmapValue;
+package org.apache.doris.common.io;
import org.roaringbitmap.BitmapDataProvider;
import org.roaringbitmap.BitmapDataProviderSupplier;
diff --git
a/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/BitmapValueTest.java
b/fe/fe-common/src/test/java/org/apache/doris/common/io/BitmapValueTest.java
similarity index 99%
rename from
fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/BitmapValueTest.java
rename to
fe/fe-common/src/test/java/org/apache/doris/common/io/BitmapValueTest.java
index b3c0724..8a33ced 100644
---
a/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/BitmapValueTest.java
+++ b/fe/fe-common/src/test/java/org/apache/doris/common/io/BitmapValueTest.java
@@ -15,10 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-package org.apache.doris.load.loadv2.dpp;
-
-import org.apache.doris.load.loadv2.dpp.BitmapValue;
-import org.apache.doris.common.Codec;
+package org.apache.doris.common.io;
import org.junit.Assert;
import org.junit.Test;
diff --git
a/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/HllTest.java
b/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java
similarity index 99%
rename from
fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/HllTest.java
rename to fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java
index 3c742c9..4253d3f 100644
--- a/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/HllTest.java
+++ b/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java
@@ -15,9 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-package org.apache.doris.load.loadv2.dpp;
-
-import org.apache.doris.load.loadv2.dpp.Hll;
+package org.apache.doris.common.io;
import org.junit.Assert;
import org.junit.Test;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
index 5c907cd..c82cd1a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
@@ -27,6 +27,7 @@ import org.apache.doris.common.FeNameFormat;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.ParseUtil;
import org.apache.doris.common.util.PrintableMap;
+import org.apache.doris.qe.ConnectContext;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TResultFileSinkOptions;
@@ -251,6 +252,17 @@ public class OutFileClause {
"but the definition type of column " + i + "
is " + type);
}
break;
+                case HLL:
+                case BITMAP:
+                    if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().isReturnObjectDataAsBinary()) {
+                        if (!type.equals("byte_array")) {
+                            throw new AnalysisException("project field type is HLL/BITMAP, should use byte_array, " +
+                                    "but the definition type of column " + i + " is " + type);
+                        }
+                    } else {
+                        throw new AnalysisException("Parquet format does not support column type: " + resultType.getPrimitiveType());
+                    }
+                    break;
                 default:
                     throw new AnalysisException("Parquet format does not support column type: " + resultType.getPrimitiveType());
             }
@@ -289,6 +301,12 @@
             case DECIMALV2:
                 column.add("byte_array");
                 break;
+            case HLL:
+            case BITMAP:
+                if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().isReturnObjectDataAsBinary()) {
+                    column.add("byte_array");
+                    break;
+                }
             default:
                 throw new AnalysisException("currently parquet do not support column type: " + expr.getType().getPrimitiveType());
             }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index d7a4c0f..2f3a5fc 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -167,6 +167,8 @@ public class SessionVariable implements Serializable,
Writable {
public static final String SQL_QUOTE_SHOW_CREATE = "sql_quote_show_create";
+ public static final String RETURN_OBJECT_DATA_AS_BINARY =
"return_object_data_as_binary";
+
// session origin value
public Map<Field, String> sessionOriginValue = new HashMap<Field,
String>();
// check stmt is or not [select /*+ SET_VAR(...)*/ ...]
@@ -395,6 +397,9 @@ public class SessionVariable implements Serializable,
Writable {
@VariableMgr.VarAttr(name = SQL_QUOTE_SHOW_CREATE)
public boolean sqlQuoteShowCreate = true;
+ @VariableMgr.VarAttr(name = RETURN_OBJECT_DATA_AS_BINARY)
+ private boolean returnObjectDataAsBinary = false;
+
public long getMaxExecMemByte() {
return maxExecMemByte;
}
@@ -822,6 +827,14 @@ public class SessionVariable implements Serializable,
Writable {
return disableJoinReorder;
}
+ public boolean isReturnObjectDataAsBinary() {
+ return returnObjectDataAsBinary;
+ }
+
+ public void setReturnObjectDataAsBinary(boolean returnObjectDataAsBinary) {
+ this.returnObjectDataAsBinary = returnObjectDataAsBinary;
+ }
+
// Serialize to thrift object
// used for rest api
public TQueryOptions toThrift() {
@@ -838,6 +851,7 @@ public class SessionVariable implements Serializable,
Writable {
tResult.setIsReportSuccess(enableProfile);
tResult.setCodegenLevel(codegenLevel);
tResult.setEnableVectorizedEngine(enableVectorizedEngine);
+ tResult.setReturnObjectDataAsBinary(returnObjectDataAsBinary);
tResult.setBatchSize(batchSize);
tResult.setDisableStreamPreaggregations(disableStreamPreaggregations);
diff --git a/fe/spark-dpp/pom.xml b/fe/spark-dpp/pom.xml
index 49bc831..1a44c13 100644
--- a/fe/spark-dpp/pom.xml
+++ b/fe/spark-dpp/pom.xml
@@ -92,10 +92,6 @@ under the License.
<scope>test</scope>
</dependency>
- <dependency>
- <groupId>org.roaringbitmap</groupId>
- <artifactId>RoaringBitmap</artifactId>
- </dependency>
<!-- spark -->
<!--
https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.12 -->
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java
index d434c2f..f58367d 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DorisKryoRegistrator.java
@@ -17,6 +17,9 @@
package org.apache.doris.load.loadv2.dpp;
+import org.apache.doris.common.io.BitmapValue;
+import org.apache.doris.common.io.Roaring64Map;
+
import com.esotericsoftware.kryo.Kryo;
import org.apache.spark.serializer.KryoRegistrator;
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java
index 2d90a9d..c411883 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkRDDAggregator.java
@@ -20,6 +20,8 @@ package org.apache.doris.load.loadv2.dpp;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.doris.common.SparkDppException;
+import org.apache.doris.common.io.BitmapValue;
+import org.apache.doris.common.io.Hll;
import org.apache.doris.load.loadv2.etl.EtlJobConfig;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.function.Function2;
diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift
index f347dec..664dab0 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -156,6 +156,10 @@ struct TQueryOptions {
// the resource limitation of this query
42: optional TResourceLimit resource_limit
+
+    // Return bitmap/HLL object data as binary in the result set. Enabling this
+    // in the MySQL CLI may print corrupted (unreadable) characters to the terminal.
+ 43: optional bool return_object_data_as_binary = false
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]