This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 50015d76f0d [feature](csv)Supports reading CSV data using LF and CRLF
as line separators. (#37687) (#39980)
50015d76f0d is described below
commit 50015d76f0d19d4effa5f6c013cd2fdcca24a86b
Author: daidai <[email protected]>
AuthorDate: Wed Aug 28 14:35:17 2024 +0800
[feature](csv)Supports reading CSV data using LF and CRLF as line
separators. (#37687) (#39980)
bp #37687
---
be/src/vec/exec/format/csv/csv_reader.cpp | 20 +-
be/src/vec/exec/format/csv/csv_reader.h | 1 +
.../file_reader/new_plain_text_line_reader.cpp | 9 +-
.../file_reader/new_plain_text_line_reader.h | 106 ++++++++-
be/src/vec/exec/format/json/new_json_reader.cpp | 5 +-
.../java/org/apache/doris/qe/SessionVariable.java | 17 ++
.../lf_crlf_and_quotes.csv | 134 ++++++++++++
.../lf_crlf_and_quotes.csv.gz | Bin 0 -> 337 bytes
.../lf_crlf_not_quotes.csv | 135 ++++++++++++
.../lf_crlf_not_quotes.csv.gz | Bin 0 -> 264 bytes
.../tvf/test_tvf_csv_line_end.out | 201 +++++++++++++++++
.../tvf/test_tvf_csv_line_end.groovy | 243 +++++++++++++++++++++
12 files changed, 849 insertions(+), 22 deletions(-)
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 46e5f176b21..cee1c99eefb 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -368,17 +368,21 @@ Status CsvReader::init_reader(bool is_load) {
_options.converted_from_string = _trim_double_quotes;
_not_trim_enclose = (!_trim_double_quotes && _enclose == '\"');
+ if (_state != nullptr) {
+ _keep_cr = _state->query_options().keep_carriage_return;
+ }
+
std::shared_ptr<TextLineReaderContextIf> text_line_reader_ctx;
if (_enclose == 0) {
- text_line_reader_ctx =
- std::make_shared<PlainTextLineReaderCtx>(_line_delimiter,
_line_delimiter_length);
+ text_line_reader_ctx = std::make_shared<PlainTextLineReaderCtx>(
+ _line_delimiter, _line_delimiter_length, _keep_cr);
_fields_splitter = std::make_unique<PlainCsvTextFieldSplitter>(
_trim_tailing_spaces, false, _value_separator,
_value_separator_length, -1);
} else {
text_line_reader_ctx = std::make_shared<EncloseCsvLineReaderContext>(
_line_delimiter, _line_delimiter_length, _value_separator,
_value_separator_length,
- _file_slot_descs.size() - 1, _enclose, _escape);
+ _file_slot_descs.size() - 1, _enclose, _escape, _keep_cr);
_fields_splitter = std::make_unique<EncloseCsvTextFieldSplitter>(
_trim_tailing_spaces, !_not_trim_enclose,
@@ -881,20 +885,24 @@ Status CsvReader::_prepare_parse(size_t* read_line, bool*
is_parse_name) {
_options.map_key_delim =
_params.file_attributes.text_params.mapkv_delimiter[0];
}
+ if (_state != nullptr) {
+ _keep_cr = _state->query_options().keep_carriage_return;
+ }
+
// create decompressor.
// _decompressor may be nullptr if this is not a compressed file
RETURN_IF_ERROR(_create_decompressor());
std::shared_ptr<TextLineReaderContextIf> text_line_reader_ctx;
if (_enclose == 0) {
- text_line_reader_ctx =
- std::make_shared<PlainTextLineReaderCtx>(_line_delimiter,
_line_delimiter_length);
+ text_line_reader_ctx = std::make_shared<PlainTextLineReaderCtx>(
+ _line_delimiter, _line_delimiter_length, _keep_cr);
_fields_splitter = std::make_unique<PlainCsvTextFieldSplitter>(
_trim_tailing_spaces, _trim_double_quotes, _value_separator,
_value_separator_length);
} else {
text_line_reader_ctx = std::make_shared<EncloseCsvLineReaderContext>(
_line_delimiter, _line_delimiter_length, _value_separator,
_value_separator_length,
- _file_slot_descs.size() - 1, _enclose, _escape);
+ _file_slot_descs.size() - 1, _enclose, _escape, _keep_cr);
_fields_splitter = std::make_unique<EncloseCsvTextFieldSplitter>(
_trim_tailing_spaces, false,
std::static_pointer_cast<EncloseCsvLineReaderContext>(text_line_reader_ctx),
diff --git a/be/src/vec/exec/format/csv/csv_reader.h
b/be/src/vec/exec/format/csv/csv_reader.h
index 7fb3b3a1231..5e2865e45cb 100644
--- a/be/src/vec/exec/format/csv/csv_reader.h
+++ b/be/src/vec/exec/format/csv/csv_reader.h
@@ -286,6 +286,7 @@ private:
bool _trim_tailing_spaces = false;
// `should_not_trim` is to manage the case that: user do not expect to
trim double quotes but enclose is double quotes
bool _not_trim_enclose = true;
+ bool _keep_cr = false;
io::IOContext* _io_ctx = nullptr;
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index 75350890aee..9a09a90d1aa 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -21,6 +21,9 @@
#include <glog/logging.h>
#include <string.h>
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
#include <algorithm>
#include <cstddef>
#include <cstring>
@@ -42,7 +45,6 @@
// leave these 2 size small for debugging
namespace doris {
-
const uint8_t* EncloseCsvLineReaderContext::read_line_impl(const uint8_t*
start,
const size_t
length) {
_total_len = length;
@@ -82,12 +84,11 @@ void EncloseCsvLineReaderContext::on_col_sep_found(const
uint8_t* start,
}
size_t EncloseCsvLineReaderContext::update_reading_bound(const uint8_t* start)
{
- _result = (uint8_t*)memmem(start + _idx, _total_len - _idx,
line_delimiter.c_str(),
- line_delimiter_len);
+ _result = call_find_line_sep(start + _idx, _total_len - _idx);
if (_result == nullptr) {
return _total_len;
}
- return _result - start + line_delimiter_len;
+ return _result - start + line_delimiter_length();
}
template <bool SingleChar>
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
index 0b0d9f133fa..c91b503cbe5 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
@@ -47,7 +47,6 @@ public:
// info about the current line may be record to the ctx, like column
seprator pos.
/// @return line delimiter pos if found, otherwise return nullptr.
virtual const uint8_t* read_line(const uint8_t* start, const size_t len) =
0;
-
/// @return length of line delimiter
[[nodiscard]] virtual size_t line_delimiter_length() const = 0;
@@ -62,30 +61,117 @@ class BaseTextLineReaderContext : public
TextLineReaderContextIf {
public:
explicit BaseTextLineReaderContext(const std::string& line_delimiter_,
- const size_t line_delimiter_len_)
- : line_delimiter(line_delimiter_),
line_delimiter_len(line_delimiter_len_) {}
+ const size_t line_delimiter_len_, const
bool keep_cr_)
+ : line_delimiter(line_delimiter_),
+ line_delimiter_len(line_delimiter_len_),
+ keep_cr(keep_cr_) {
+ use_memmem = line_delimiter_len != 1 || line_delimiter != "\n" ||
keep_cr;
+ if (use_memmem) {
+ find_line_delimiter_func =
&BaseTextLineReaderContext::find_multi_char_line_sep;
+ } else {
+ find_line_delimiter_func =
&BaseTextLineReaderContext::find_lf_crlf_line_sep;
+ }
+ }
inline const uint8_t* read_line(const uint8_t* start, const size_t len)
final {
return static_cast<Ctx*>(this)->read_line_impl(start, len);
}
- [[nodiscard]] inline size_t line_delimiter_length() const final { return
line_delimiter_len; }
+ [[nodiscard]] inline size_t line_delimiter_length() const final {
+ return line_delimiter_len + line_crlf;
+ }
inline void refresh() final { return
static_cast<Ctx*>(this)->refresh_impl(); };
+    /// Searches [start, start + length) for the first occurrence of the configured
+    /// `line_delimiter` byte sequence.
+    /// @return pointer to the first byte of the match, or nullptr when there is none.
+    inline const uint8_t* find_multi_char_line_sep(const uint8_t* start, const size_t length) {
+        void* const hit = memmem(start, length, line_delimiter.c_str(), line_delimiter_len);
+        return static_cast<uint8_t*>(hit);
+    }
+
+    /// Finds the first line terminator in [start, start + length), accepting both
+    /// "\n" and "\r\n". A '\r' that is not immediately followed by '\n' is treated
+    /// as ordinary data.
+    ///
+    /// Side effect: `line_crlf` is cleared on entry and set to true only when the
+    /// terminator that was found is "\r\n"; line_delimiter_length() adds it to the
+    /// reported delimiter length.
+    ///
+    /// @param start   first byte of the range to scan (may be nullptr)
+    /// @param length  number of bytes in the range
+    /// @return pointer to the first byte of the terminator (the '\r' of "\r\n", or
+    ///         the lone '\n'), or nullptr if the range contains no terminator.
+    const uint8_t* find_lf_crlf_line_sep(const uint8_t* start, const size_t length) {
+        line_crlf = false;
+        if (start == nullptr || length == 0) {
+            return nullptr;
+        }
+        size_t i = 0;
+#ifdef __AVX2__
+        const __m256i newline = _mm256_set1_epi8('\n');
+        const __m256i carriage_return = _mm256_set1_epi8('\r');
+
+        constexpr size_t simd_width = 32;
+        // "Byte not present in this block". Offsets are kept as size_t so that
+        // they can neither be truncated nor collide with the sentinel on large buffers.
+        constexpr size_t not_found = static_cast<size_t>(-1);
+
+        // Process 32 bytes at a time using AVX2
+        for (; i + simd_width <= length; i += simd_width) {
+            const __m256i data =
+                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(start + i));
+
+            // One bit per byte of the block: set where the byte is '\n' / '\r'.
+            const auto mask_newline = static_cast<unsigned int>(
+                    _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, newline)));
+            const auto mask_carriage_return = static_cast<unsigned int>(
+                    _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, carriage_return)));
+
+            if (mask_newline == 0 && mask_carriage_return == 0) {
+                continue;
+            }
+
+            const size_t pos_lf =
+                    (mask_newline != 0)
+                            ? i + static_cast<size_t>(__builtin_ctz(mask_newline))
+                            : not_found;
+            const size_t pos_cr =
+                    (mask_carriage_return != 0)
+                            ? i + static_cast<size_t>(__builtin_ctz(mask_carriage_return))
+                            : not_found;
+
+            if (pos_lf < pos_cr) {
+                // No '\r' precedes the first '\n' inside this block. A "\r\n" that
+                // straddles the previous block boundary was already handled below.
+                return start + pos_lf;
+            }
+
+            // From here on pos_cr < pos_lf: the two cannot be equal because at
+            // least one of them is a real offset and a byte cannot be both.
+            if (pos_lf != not_found) {
+                // pos_lf > pos_cr >= i, so pos_lf - 1 never underflows.
+                if (start[pos_lf - 1] == '\r') {
+                    // e.g. xxx\r\r\r\nxxx
+                    line_crlf = true;
+                    return start + pos_lf - 1;
+                }
+                // e.g. xxx\rxxxx\nxx
+                return start + pos_lf;
+            }
+
+            // Only '\r' in this block: check for "\r\n" split across the block
+            // boundary, e.g. [\r\r\r\rxxx\r] [\nxxxx]
+            if (i + simd_width < length && start[i + simd_width - 1] == '\r' &&
+                start[i + simd_width] == '\n') {
+                line_crlf = true;
+                return start + i + simd_width - 1;
+            }
+        }
+
+        // Process remaining bytes
+#endif
+        for (; i < length; ++i) {
+            if (start[i] == '\n') {
+                return &start[i];
+            }
+            if (start[i] == '\r' && (i + 1 < length) && start[i + 1] == '\n') {
+                line_crlf = true;
+                return &start[i];
+            }
+        }
+        return nullptr;
+    }
+    /// Runs the line-separator search routine currently stored in
+    /// `find_line_delimiter_func` on [start, start + length) and returns its result.
+    const uint8_t* call_find_line_sep(const uint8_t* start, const size_t length) {
+        const auto finder = find_line_delimiter_func;
+        return (this->*finder)(start, length);
+    }
+
protected:
const std::string line_delimiter;
const size_t line_delimiter_len;
+ bool keep_cr = false;
+ bool line_crlf = false;
+ bool use_memmem = true;
+ using FindLineDelimiterFunc = const uint8_t*
(BaseTextLineReaderContext::*)(const uint8_t*,
+
size_t);
+ FindLineDelimiterFunc find_line_delimiter_func;
};
-
class PlainTextLineReaderCtx final : public
BaseTextLineReaderContext<PlainTextLineReaderCtx> {
public:
explicit PlainTextLineReaderCtx(const std::string& line_delimiter_,
- const size_t line_delimiter_len_)
- : BaseTextLineReaderContext(line_delimiter_, line_delimiter_len_)
{}
+ const size_t line_delimiter_len_, const
bool keep_cr_)
+ : BaseTextLineReaderContext(line_delimiter_, line_delimiter_len_,
keep_cr_) {}
inline const uint8_t* read_line_impl(const uint8_t* start, const size_t
length) {
- return (uint8_t*)memmem(start, length, line_delimiter.c_str(),
line_delimiter_len);
+ return call_find_line_sep(start, length);
}
inline void refresh_impl() {}
@@ -119,8 +205,8 @@ public:
const size_t line_delimiter_len_,
const std::string& column_sep_,
const size_t column_sep_len_, size_t
col_sep_num,
- const char enclose, const char escape)
- : BaseTextLineReaderContext(line_delimiter_, line_delimiter_len_),
+ const char enclose, const char
escape, const bool keep_cr_)
+ : BaseTextLineReaderContext(line_delimiter_, line_delimiter_len_,
keep_cr_),
_enclose(enclose),
_escape(escape),
_column_sep_len(column_sep_len_),
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp
b/be/src/vec/exec/format/json/new_json_reader.cpp
index 3b5affa6fb6..2aff2cb4e7e 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -420,8 +420,9 @@ Status NewJsonReader::_open_line_reader() {
}
_line_reader = NewPlainTextLineReader::create_unique(
_profile, _file_reader, _decompressor.get(),
- std::make_shared<PlainTextLineReaderCtx>(_line_delimiter,
_line_delimiter_length), size,
- _current_offset);
+ std::make_shared<PlainTextLineReaderCtx>(_line_delimiter,
_line_delimiter_length,
+ false),
+ size, _current_offset);
return Status::OK();
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 2239d585be3..60aff519443 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -593,6 +593,8 @@ public class SessionVariable implements Serializable,
Writable {
public static final String ENABLE_PUSHDOWN_MINMAX_ON_UNIQUE =
"enable_pushdown_minmax_on_unique";
+ public static final String KEEP_CARRIAGE_RETURN = "keep_carriage_return";
+
public static final String ENABLE_PUSHDOWN_STRING_MINMAX =
"enable_pushdown_string_minmax";
// When set use fix replica = true, the fixed replica maybe bad, try to
use the health one if
@@ -1880,6 +1882,12 @@ public class SessionVariable implements Serializable,
Writable {
"The maximum number of partitions created during table
creation"})
public int createTablePartitionMaxNum = 10000;
+
+    // Session switch forwarded to the backend as `keep_carriage_return`.
+    // The escape sequences in both descriptions are double-escaped so that the help text
+    // shows the literal characters "\n", "\r\n" and "\r" instead of embedding control characters.
+    @VariableMgr.VarAttr(name = KEEP_CARRIAGE_RETURN,
+            description = {"在同时处理\\n和\\r\\n作为CSV的行分隔符时,是否保留\\r",
+                    "When processing both \\n and \\r\\n as CSV line separators, should \\r be retained?"})
+    public boolean keepCarriageReturn = false;
+
@VariableMgr.VarAttr(name = FORCE_JNI_SCANNER,
description = {"强制使用jni方式读取外表", "Force the use of jni mode to read
external table"})
private boolean forceJniScanner = false;
@@ -3355,6 +3363,14 @@ public class SessionVariable implements Serializable,
Writable {
this.enableUnicodeNameSupport = enableUnicodeNameSupport;
}
+ /** Returns the current value of the {@code keepCarriageReturn} session variable. */
+ public boolean isKeepCarriageReturn() {
+ return keepCarriageReturn;
+ }
+
+ /** Sets the {@code keepCarriageReturn} session variable to the given value. */
+ public void setKeepCarriageReturn(boolean keepCarriageReturn) {
+ this.keepCarriageReturn = keepCarriageReturn;
+ }
+
public boolean isDropTableIfCtasFailed() {
return dropTableIfCtasFailed;
}
@@ -3643,6 +3659,7 @@ public class SessionVariable implements Serializable,
Writable {
tResult.setEnableFallbackOnMissingInvertedIndex(enableFallbackOnMissingInvertedIndex);
tResult.setEnableSegmentCache(enableSegmentCache);
+ tResult.setKeepCarriageReturn(keepCarriageReturn);
return tResult;
}
diff --git
a/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv
new file mode 100644
index 00000000000..2564cafe1ad
--- /dev/null
+++
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv
@@ -0,0 +1,134 @@
+1 ,Alice , 30 ,New York
+w,w,w,"w"
+"w",w,"w","w"
+"10","abc" ,"ttt","def"
+2,Bob,25,Los Angeles
+3,"Ch a rlie ",35,Chicago
+4,abc,def,sss
+w,w,w,w
+ 5 , ttt , d e f ,sss
+w,w,w,w
+" wrweqreqer ","234 ", 32323 ,"3232"
+w, w, w,w
+w,"w",w,"w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,"w",w,w
+w,w,w,w
+w,w,w, w
+w,w, w , w
+w,w,w,w
+w,"w","w",w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+"w","w",w,"w"
+""w,"w",w,"w
+""w,"w",w,"w
+""w,"w",w,"w
+""w,"w",w,"w
+""w,"w",w,"w
+w,w,w,w
+w,w,w,w
+w,"w","w","w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,""w
+w,w,w,w
+w,w,w,w
+w,w,"w",w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+"w","w","w","w"
+w,w,w,"w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+12,34,"abc ",def
+"w",w,w,"w"
+"w","w","w","w"
diff --git
a/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv.gz
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv.gz
new file mode 100644
index 00000000000..84f73cb2100
Binary files /dev/null and
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv.gz
differ
diff --git
a/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv
new file mode 100644
index 00000000000..0a504e467e9
--- /dev/null
+++
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv
@@ -0,0 +1,135 @@
+1,Alice,30,New York
+w,w,w,w
+w,w, w,w
+10,abc,ttt,def
+2,Bob,25,Los An geles
+3,Charlie,35 , Chicago
+4,abc,def,sss
+w,w,w,w
+5,tt t,def,ss s
+w,w,w,w
+ wrwe qreqe r , 23 4,32 323,3 232
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w, w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w, w
+w,w,w,w
+w, w ,w,w
+w, w, w ,w
+w,w,w,w
+w,w,w,w
+w,w,w, w
+w,w, w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w, w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+ w ,w,w,w
+w,w, w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w, w,w
+w,w,w,w
+w,w,w,w
+w, w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w, w,w,w
+w,w,w,w
+w,w,w,w
+w,w, w,w
+w,w,w,w
+w,w, w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
diff --git
a/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv.gz
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv.gz
new file mode 100644
index 00000000000..f24bb5b3d3e
Binary files /dev/null and
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv.gz
differ
diff --git
a/regression-test/data/external_table_p0/tvf/test_tvf_csv_line_end.out
b/regression-test/data/external_table_p0/tvf/test_tvf_csv_line_end.out
new file mode 100644
index 00000000000..b7658c0a93f
--- /dev/null
+++ b/regression-test/data/external_table_p0/tvf/test_tvf_csv_line_end.out
@@ -0,0 +1,201 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !csv_2 --
+\rw\r w w w\r
+\rwrwe\rqreqe\rr\r \r23\r4 32\r323 3\r232
+1 Alice 30 New York\r
+10 abc ttt def
+2 Bob 25 Los\r An\rgeles\r
+3 Charlie 35\r
\r\r\rChicago\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r
+4 abc def sss
+5 tt\rt def ss\r\r\r\rs
+w \r\r\r\r\r\r\rw w w\r
+w \rw \rw\r w\r
+w \rw w w\r
+w \rw\r w w\r
+w w \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw w\r
+w w \rw w\r
+w w \rw w\r
+w w \rw w\r
+w w \rw w\r
+w w \rw w\r
+w w \rw w\r
+w w w \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w w w \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r
+w w w \rw
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w
w\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r
+
+-- !csv_4 --
+\rwrwe\rqreqe\rr\r \r23\r4 32\r323 3\r232
+1 Alice 30 New York
+10 abc ttt def
+2 Bob 25 Los\r An\rgeles
+3 Charlie 35\r
\r\r\rChicago\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r
+4 abc def sss
+5 tt\rt def ss\r\r\r\rs
+w w w \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w w w \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w w w \rw
+w w w
w\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r
+
+-- !csv_6 --
+\r\r\r\r\r\r\r\r\r5\r\r \rttt\r \rd\re\rf\r sss\r\r\r\r
+\rwrweqreqer\r 234\r \r32323\r "3232"\r\r\r\r
+""w w w "w\r
+""w w w "w\r
+""w w w "w\r
+""w w w "w\r
+""w w w "w\r
+1\r\r\r Alice\r\r\r \r\r30\r New York\r
+10 "abc"\r\r ttt def
+12 34 abc\r def
+2 Bob 25 Los Angeles\r
+3 Ch\r\ra\rrlie\r\r\r\r\r\r 35 Chicago\r\r\r\r\r\r\r\r\r
+4 abc def sss
+w \rw \rw w\r
+w w \r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r\r\r\r
\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r\r\r\r\r\r\r\r\r\r\r\r
+w w w \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w w w ""w\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w "w"\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+w w w w\r
+
+-- !csv_8 --
+\r\r\r\r\r\r\r\r\r5\r\r \rttt\r \rd\re\rf\r sss\r\r\r
+\rwrweqreqer\r 234\r \r32323\r "3232"\r\r\r
+""w w w "w
+""w w w "w
+""w w w "w
+""w w w "w
+""w w w "w
+1\r\r\r Alice\r\r\r \r\r30\r New York
+10 "abc"\r\r ttt def
+12 34 abc\r def
+2 Bob 25 Los Angeles
+3 Ch\r\ra\rrlie\r\r\r\r\r\r 35 Chicago\r\r\r\r\r\r\r\r
+4 abc def sss
+w w \r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r\r\r\r
\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r\r\r\r\r\r\r\r\r\r\r
+w w w \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w w w ""w
+
diff --git
a/regression-test/suites/external_table_p0/tvf/test_tvf_csv_line_end.groovy
b/regression-test/suites/external_table_p0/tvf/test_tvf_csv_line_end.groovy
new file mode 100644
index 00000000000..cb2beb6f941
--- /dev/null
+++ b/regression-test/suites/external_table_p0/tvf/test_tvf_csv_line_end.groovy
@@ -0,0 +1,243 @@
+import org.junit.Assert
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_tvf_csv_line_end", "p0,tvf") {
+ List<List<Object>> backends = sql """ show backends """
+ assertTrue(backends.size() > 0)
+ def be_id = backends[0][0]
+ def dataFilePath = context.config.dataPath +
"/external_table_p0/tvf/test_csv_line_end_lf_crlf"
+
+ def outFilePath="/test_csv_line_end_lf_crlf"
+
+ for (List<Object> backend : backends) {
+ def be_host = backend[1]
+ scpFiles ("root", be_host, dataFilePath, outFilePath, false);
+ }
+
+ String filename = "lf_crlf_not_quotes.csv"
+
+ sql """set enable_nereids_planner=true"""
+ sql """set enable_fallback_to_original_planner=false"""
+
+ sql """ set keep_carriage_return = true; """
+ // qt_csv_1"""
+ // select * from local(
+ // "file_path" = "${outFilePath}/${filename}",
+ // "backend_id" = "${be_id}",
+ // "format" = "csv",
+ // "column_separator" = ","
+ // )
+ // order by c1,c2,c3,c4;
+ // """
+
+ qt_csv_2"""
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ","
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ List<List<String>> result1 = sql """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ","
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ List<List<String>> result2 = sql """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}.gz",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ",",
+ "compress_type"="gz"
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ log.info("result2 = ${result2}")
+ assertTrue(result1.size() == result2.size());
+    // Compare the plain and gzip results cell by cell. The inner bound is the
+    // column count of the current row, not the number of rows.
+    for (int i = 0; i < result1.size(); i++) {
+        assertTrue(result1[i].size() == result2[i].size());
+        for (int j = 0; j < result1[i].size(); j++) {
+            assertTrue(result1[i][j] == result2[i][j]);
+        }
+    }
+
+ sql """ set keep_carriage_return = false; """
+
+ // qt_csv_3 """
+ // select * from local(
+ // "file_path" = "${outFilePath}/${filename}",
+ // "backend_id" = "${be_id}",
+ // "format" = "csv",
+ // "column_separator" = ","
+ // )
+ // order by c1,c2,c3,c4;
+ // """
+
+ qt_csv_4 """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ","
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ result1 = sql """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ","
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ result2 = sql """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}.gz",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ",",
+ "compress_type"="gz"
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ log.info("result2 = ${result2}")
+ assertTrue(result1.size() == result2.size());
+    // Compare the plain and gzip results cell by cell. The inner bound is the
+    // column count of the current row, not the number of rows.
+    for (int i = 0; i < result1.size(); i++) {
+        assertTrue(result1[i].size() == result2[i].size());
+        for (int j = 0; j < result1[i].size(); j++) {
+            assertTrue(result1[i][j] == result2[i][j]);
+        }
+    }
+
+
+ filename = "lf_crlf_and_quotes.csv"
+
+ sql """ set keep_carriage_return = true; """
+ // qt_csv_5"""
+ // select * from local(
+ // "file_path" = "${outFilePath}/${filename}",
+ // "backend_id" = "${be_id}",
+ // "format" = "csv",
+ // "column_separator" = ",",
+ // "trim_double_quotes"="true"
+ // )
+ // order by c1,c2,c3,c4;
+ // """
+
+ qt_csv_6"""
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ",",
+ "trim_double_quotes"="true"
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+
+ result1 = sql """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ",",
+ "trim_double_quotes"="true"
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ result2 = sql """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}.gz",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ",",
+ "trim_double_quotes"="true",
+ "compress_type"="gz"
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ log.info("result2 = ${result2}")
+ assertTrue(result1.size() == result2.size());
+    // Compare the plain and gzip results cell by cell. The inner bound is the
+    // column count of the current row, not the number of rows.
+    for (int i = 0; i < result1.size(); i++) {
+        assertTrue(result1[i].size() == result2[i].size());
+        for (int j = 0; j < result1[i].size(); j++) {
+            assertTrue(result1[i][j] == result2[i][j]);
+        }
+    }
+
+ sql """ set keep_carriage_return = false; """
+
+ // qt_csv_7 """
+ // select * from local(
+ // "file_path" = "${outFilePath}/${filename}",
+ // "backend_id" = "${be_id}",
+ // "format" = "csv",
+ // "column_separator" = ",",
+ // "trim_double_quotes"="true"
+ // )
+ // order by c1,c2,c3,c4;
+ // """
+
+ qt_csv_8 """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ",",
+ "trim_double_quotes"="true"
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ result1 = sql """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ",",
+ "trim_double_quotes"="true"
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ result2 = sql """
+ select * from local(
+ "file_path" = "${outFilePath}/${filename}.gz",
+ "backend_id" = "${be_id}",
+ "format" = "csv",
+ "column_separator" = ",",
+ "trim_double_quotes"="true",
+ "compress_type"="gz"
+ ) where length(c4) >= 2
+ order by c1,c2,c3,c4;
+ """
+ log.info("result2 = ${result2}")
+ assertTrue(result1.size() == result2.size());
+    // Compare the plain and gzip results cell by cell. The inner bound is the
+    // column count of the current row, not the number of rows.
+    for (int i = 0; i < result1.size(); i++) {
+        assertTrue(result1[i].size() == result2[i].size());
+        for (int j = 0; j < result1[i].size(); j++) {
+            assertTrue(result1[i][j] == result2[i][j]);
+        }
+    }
+
+
+
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]