(doris) branch branch-3.0 updated: [feature](csv)Supports reading CSV data using LF and CRLF as line separators. (#37687) (#39980)

morningman Tue, 27 Aug 2024 23:36:06 -0700

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 50015d76f0d [feature](csv)Supports reading CSV data using LF and CRLF 
as line separators. (#37687) (#39980)
50015d76f0d is described below

commit 50015d76f0d19d4effa5f6c013cd2fdcca24a86b
Author: daidai <[email protected]>
AuthorDate: Wed Aug 28 14:35:17 2024 +0800

    [feature](csv)Supports reading CSV data using LF and CRLF as line 
separators. (#37687) (#39980)
    
    bp #37687
---
 be/src/vec/exec/format/csv/csv_reader.cpp          |  20 +-
 be/src/vec/exec/format/csv/csv_reader.h            |   1 +
 .../file_reader/new_plain_text_line_reader.cpp     |   9 +-
 .../file_reader/new_plain_text_line_reader.h       | 106 ++++++++-
 be/src/vec/exec/format/json/new_json_reader.cpp    |   5 +-
 .../java/org/apache/doris/qe/SessionVariable.java  |  17 ++
 .../lf_crlf_and_quotes.csv                         | 134 ++++++++++++
 .../lf_crlf_and_quotes.csv.gz                      | Bin 0 -> 337 bytes
 .../lf_crlf_not_quotes.csv                         | 135 ++++++++++++
 .../lf_crlf_not_quotes.csv.gz                      | Bin 0 -> 264 bytes
 .../tvf/test_tvf_csv_line_end.out                  | 201 +++++++++++++++++
 .../tvf/test_tvf_csv_line_end.groovy               | 243 +++++++++++++++++++++
 12 files changed, 849 insertions(+), 22 deletions(-)

diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 46e5f176b21..cee1c99eefb 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -368,17 +368,21 @@ Status CsvReader::init_reader(bool is_load) {
     _options.converted_from_string = _trim_double_quotes;
     _not_trim_enclose = (!_trim_double_quotes && _enclose == '\"');
 
+    if (_state != nullptr) {
+        _keep_cr = _state->query_options().keep_carriage_return;
+    }
+
     std::shared_ptr<TextLineReaderContextIf> text_line_reader_ctx;
     if (_enclose == 0) {
-        text_line_reader_ctx =
-                std::make_shared<PlainTextLineReaderCtx>(_line_delimiter, 
_line_delimiter_length);
+        text_line_reader_ctx = std::make_shared<PlainTextLineReaderCtx>(
+                _line_delimiter, _line_delimiter_length, _keep_cr);
 
         _fields_splitter = std::make_unique<PlainCsvTextFieldSplitter>(
                 _trim_tailing_spaces, false, _value_separator, 
_value_separator_length, -1);
     } else {
         text_line_reader_ctx = std::make_shared<EncloseCsvLineReaderContext>(
                 _line_delimiter, _line_delimiter_length, _value_separator, 
_value_separator_length,
-                _file_slot_descs.size() - 1, _enclose, _escape);
+                _file_slot_descs.size() - 1, _enclose, _escape, _keep_cr);
 
         _fields_splitter = std::make_unique<EncloseCsvTextFieldSplitter>(
                 _trim_tailing_spaces, !_not_trim_enclose,
@@ -881,20 +885,24 @@ Status CsvReader::_prepare_parse(size_t* read_line, bool* 
is_parse_name) {
         _options.map_key_delim = 
_params.file_attributes.text_params.mapkv_delimiter[0];
     }
 
+    if (_state != nullptr) {
+        _keep_cr = _state->query_options().keep_carriage_return;
+    }
+
     // create decompressor.
     // _decompressor may be nullptr if this is not a compressed file
     RETURN_IF_ERROR(_create_decompressor());
     std::shared_ptr<TextLineReaderContextIf> text_line_reader_ctx;
     if (_enclose == 0) {
-        text_line_reader_ctx =
-                std::make_shared<PlainTextLineReaderCtx>(_line_delimiter, 
_line_delimiter_length);
+        text_line_reader_ctx = std::make_shared<PlainTextLineReaderCtx>(
+                _line_delimiter, _line_delimiter_length, _keep_cr);
         _fields_splitter = std::make_unique<PlainCsvTextFieldSplitter>(
                 _trim_tailing_spaces, _trim_double_quotes, _value_separator,
                 _value_separator_length);
     } else {
         text_line_reader_ctx = std::make_shared<EncloseCsvLineReaderContext>(
                 _line_delimiter, _line_delimiter_length, _value_separator, 
_value_separator_length,
-                _file_slot_descs.size() - 1, _enclose, _escape);
+                _file_slot_descs.size() - 1, _enclose, _escape, _keep_cr);
         _fields_splitter = std::make_unique<EncloseCsvTextFieldSplitter>(
                 _trim_tailing_spaces, false,
                 
std::static_pointer_cast<EncloseCsvLineReaderContext>(text_line_reader_ctx),
diff --git a/be/src/vec/exec/format/csv/csv_reader.h 
b/be/src/vec/exec/format/csv/csv_reader.h
index 7fb3b3a1231..5e2865e45cb 100644
--- a/be/src/vec/exec/format/csv/csv_reader.h
+++ b/be/src/vec/exec/format/csv/csv_reader.h
@@ -286,6 +286,7 @@ private:
     bool _trim_tailing_spaces = false;
     // `should_not_trim` is to manage the case that: user do not expect to 
trim double quotes but enclose is double quotes
     bool _not_trim_enclose = true;
+    bool _keep_cr = false;
 
     io::IOContext* _io_ctx = nullptr;
 
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp 
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index 75350890aee..9a09a90d1aa 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -21,6 +21,9 @@
 #include <glog/logging.h>
 #include <string.h>
 
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
 #include <algorithm>
 #include <cstddef>
 #include <cstring>
@@ -42,7 +45,6 @@
 // leave these 2 size small for debugging
 
 namespace doris {
-
 const uint8_t* EncloseCsvLineReaderContext::read_line_impl(const uint8_t* 
start,
                                                            const size_t 
length) {
     _total_len = length;
@@ -82,12 +84,11 @@ void EncloseCsvLineReaderContext::on_col_sep_found(const 
uint8_t* start,
 }
 
 size_t EncloseCsvLineReaderContext::update_reading_bound(const uint8_t* start) 
{
-    _result = (uint8_t*)memmem(start + _idx, _total_len - _idx, 
line_delimiter.c_str(),
-                               line_delimiter_len);
+    _result = call_find_line_sep(start + _idx, _total_len - _idx);
     if (_result == nullptr) {
         return _total_len;
     }
-    return _result - start + line_delimiter_len;
+    return _result - start + line_delimiter_length();
 }
 
 template <bool SingleChar>
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h 
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
index 0b0d9f133fa..c91b503cbe5 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
@@ -47,7 +47,6 @@ public:
     // info about the current line may be record to the ctx, like column 
seprator pos.
     /// @return line delimiter pos if found, otherwise return nullptr.
     virtual const uint8_t* read_line(const uint8_t* start, const size_t len) = 
0;
-
     /// @return length of line delimiter
     [[nodiscard]] virtual size_t line_delimiter_length() const = 0;
 
@@ -62,30 +61,117 @@ class BaseTextLineReaderContext : public 
TextLineReaderContextIf {
 
 public:
     explicit BaseTextLineReaderContext(const std::string& line_delimiter_,
-                                       const size_t line_delimiter_len_)
-            : line_delimiter(line_delimiter_), 
line_delimiter_len(line_delimiter_len_) {}
+                                       const size_t line_delimiter_len_, const 
bool keep_cr_)
+            : line_delimiter(line_delimiter_),
+              line_delimiter_len(line_delimiter_len_),
+              keep_cr(keep_cr_) {
+        use_memmem = line_delimiter_len != 1 || line_delimiter != "\n" || 
keep_cr;
+        if (use_memmem) {
+            find_line_delimiter_func = 
&BaseTextLineReaderContext::find_multi_char_line_sep;
+        } else {
+            find_line_delimiter_func = 
&BaseTextLineReaderContext::find_lf_crlf_line_sep;
+        }
+    }
 
     inline const uint8_t* read_line(const uint8_t* start, const size_t len) 
final {
         return static_cast<Ctx*>(this)->read_line_impl(start, len);
     }
 
-    [[nodiscard]] inline size_t line_delimiter_length() const final { return 
line_delimiter_len; }
+    [[nodiscard]] inline size_t line_delimiter_length() const final {
+        return line_delimiter_len + line_crlf;
+    }
 
     inline void refresh() final { return 
static_cast<Ctx*>(this)->refresh_impl(); };
 
+    inline const uint8_t* find_multi_char_line_sep(const uint8_t* start, const 
size_t length) {
+        return static_cast<uint8_t*>(
+                memmem(start, length, line_delimiter.c_str(), 
line_delimiter_len));
+    }
+
+    const uint8_t* find_lf_crlf_line_sep(const uint8_t* start, const size_t 
length) {
+        line_crlf = false;
+        if (start == nullptr || length == 0) {
+            return nullptr;
+        }
+        size_t i = 0;
+#ifdef __AVX2__
+        // const uint8_t* end = start + length;
+        const __m256i newline = _mm256_set1_epi8('\n');
+        const __m256i carriage_return = _mm256_set1_epi8('\r');
+
+        const size_t simd_width = 32;
+        // Process 32 bytes at a time using AVX2
+        for (; i + simd_width <= length; i += simd_width) {
+            __m256i data = _mm256_loadu_si256(reinterpret_cast<const 
__m256i*>(start + i));
+
+            // Compare with '\n' and '\r'
+            __m256i cmp_newline = _mm256_cmpeq_epi8(data, newline);
+            __m256i cmp_carriage_return = _mm256_cmpeq_epi8(data, 
carriage_return);
+
+            // Check if there is a match
+            int mask_newline = _mm256_movemask_epi8(cmp_newline);
+            int mask_carriage_return = 
_mm256_movemask_epi8(cmp_carriage_return);
+
+            if (mask_newline != 0 || mask_carriage_return != 0) {
+                int pos_lf = (mask_newline != 0) ? i + 
__builtin_ctz(mask_newline) : INT32_MAX;
+                int pos_cr = (mask_carriage_return != 0) ? i + 
__builtin_ctz(mask_carriage_return)
+                                                         : INT32_MAX;
+                if (pos_lf < pos_cr) {
+                    return start + pos_lf;
+                } else if (pos_cr < pos_lf) {
+                    if (pos_lf != INT32_MAX) {
+                        if (pos_lf - 1 >= 0 && start[pos_lf - 1] == '\r') {
+                            //check   xxx\r\r\r\nxxx
+                            line_crlf = true;
+                            return start + pos_lf - 1;
+                        }
+                        // xxx\rxxxx\nxx
+                        return start + pos_lf;
+                    } else if (i + simd_width < length && start[i + simd_width 
- 1] == '\r' &&
+                               start[i + simd_width] == '\n') {
+                        // check CRLF split across chunks: [\r\r\r\r\r\r\rxxx\r]  [\nxxxx]
+                        line_crlf = true;
+                        return start + i + simd_width - 1;
+                    }
+                }
+            }
+        }
+
+        // Process remaining bytes
+#endif
+        for (; i < length; ++i) {
+            if (start[i] == '\n') {
+                return &start[i];
+            }
+            if (start[i] == '\r' && (i + 1 < length) && start[i + 1] == '\n') {
+                line_crlf = true;
+                return &start[i];
+            }
+        }
+        return nullptr;
+    }
+    const uint8_t* call_find_line_sep(const uint8_t* start, const size_t 
length) {
+        return (this->*find_line_delimiter_func)(start, length);
+    }
+
 protected:
     const std::string line_delimiter;
     const size_t line_delimiter_len;
+    bool keep_cr = false;
+    bool line_crlf = false;
+    bool use_memmem = true;
+    using FindLineDelimiterFunc = const uint8_t* 
(BaseTextLineReaderContext::*)(const uint8_t*,
+                                                                               
 size_t);
+    FindLineDelimiterFunc find_line_delimiter_func;
 };
-
 class PlainTextLineReaderCtx final : public 
BaseTextLineReaderContext<PlainTextLineReaderCtx> {
 public:
     explicit PlainTextLineReaderCtx(const std::string& line_delimiter_,
-                                    const size_t line_delimiter_len_)
-            : BaseTextLineReaderContext(line_delimiter_, line_delimiter_len_) 
{}
+                                    const size_t line_delimiter_len_, const 
bool keep_cr_)
+            : BaseTextLineReaderContext(line_delimiter_, line_delimiter_len_, 
keep_cr_) {}
 
     inline const uint8_t* read_line_impl(const uint8_t* start, const size_t 
length) {
-        return (uint8_t*)memmem(start, length, line_delimiter.c_str(), 
line_delimiter_len);
+        return call_find_line_sep(start, length);
     }
 
     inline void refresh_impl() {}
@@ -119,8 +205,8 @@ public:
                                          const size_t line_delimiter_len_,
                                          const std::string& column_sep_,
                                          const size_t column_sep_len_, size_t 
col_sep_num,
-                                         const char enclose, const char escape)
-            : BaseTextLineReaderContext(line_delimiter_, line_delimiter_len_),
+                                         const char enclose, const char 
escape, const bool keep_cr_)
+            : BaseTextLineReaderContext(line_delimiter_, line_delimiter_len_, 
keep_cr_),
               _enclose(enclose),
               _escape(escape),
               _column_sep_len(column_sep_len_),
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp 
b/be/src/vec/exec/format/json/new_json_reader.cpp
index 3b5affa6fb6..2aff2cb4e7e 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -420,8 +420,9 @@ Status NewJsonReader::_open_line_reader() {
     }
     _line_reader = NewPlainTextLineReader::create_unique(
             _profile, _file_reader, _decompressor.get(),
-            std::make_shared<PlainTextLineReaderCtx>(_line_delimiter, 
_line_delimiter_length), size,
-            _current_offset);
+            std::make_shared<PlainTextLineReaderCtx>(_line_delimiter, 
_line_delimiter_length,
+                                                     false),
+            size, _current_offset);
     return Status::OK();
 }
 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 2239d585be3..60aff519443 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -593,6 +593,8 @@ public class SessionVariable implements Serializable, 
Writable {
 
     public static final String ENABLE_PUSHDOWN_MINMAX_ON_UNIQUE = 
"enable_pushdown_minmax_on_unique";
 
+    public static final String KEEP_CARRIAGE_RETURN = "keep_carriage_return";
+
     public static final String ENABLE_PUSHDOWN_STRING_MINMAX = 
"enable_pushdown_string_minmax";
 
     // When set use fix replica = true, the fixed replica maybe bad, try to 
use the health one if
@@ -1880,6 +1882,12 @@ public class SessionVariable implements Serializable, 
Writable {
                     "The maximum number of partitions created during table 
creation"})
     public int createTablePartitionMaxNum = 10000;
 
+
+    @VariableMgr.VarAttr(name = KEEP_CARRIAGE_RETURN,
+            description = {"在同时处理\r和\r\n作为CSV的行分隔符时，是否保留\r",
+                    "When processing both \\n and \\r\\n as CSV line 
separators, should \\r be retained?"})
+    public boolean keepCarriageReturn = false;
+
     @VariableMgr.VarAttr(name = FORCE_JNI_SCANNER,
             description = {"强制使用jni方式读取外表", "Force the use of jni mode to read 
external table"})
     private boolean forceJniScanner = false;
@@ -3355,6 +3363,14 @@ public class SessionVariable implements Serializable, 
Writable {
         this.enableUnicodeNameSupport = enableUnicodeNameSupport;
     }
 
+    public boolean isKeepCarriageReturn() {
+        return keepCarriageReturn;
+    }
+
+    public void setKeepCarriageReturn(boolean keepCarriageReturn) {
+        this.keepCarriageReturn = keepCarriageReturn;
+    }
+
     public boolean isDropTableIfCtasFailed() {
         return dropTableIfCtasFailed;
     }
@@ -3643,6 +3659,7 @@ public class SessionVariable implements Serializable, 
Writable {
         
tResult.setEnableFallbackOnMissingInvertedIndex(enableFallbackOnMissingInvertedIndex);
 
         tResult.setEnableSegmentCache(enableSegmentCache);
+        tResult.setKeepCarriageReturn(keepCarriageReturn);
         return tResult;
     }
 
diff --git 
a/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv
 
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv
new file mode 100644
index 00000000000..2564cafe1ad
--- /dev/null
+++ 
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv
@@ -0,0 +1,134 @@
+1   ,Alice   ,  30 ,New York
+w,w,w,"w"
+"w",w,"w","w"
+"10","abc"  ,"ttt","def"
+2,Bob,25,Los Angeles
+3,"Ch  a rlie      ",35,Chicago
+4,abc,def,sss
+w,w,w,w
+         5  , ttt , d e f ,sss
+w,w,w,w
+" wrweqreqer ","234 ", 32323 ,"3232"
+w, w, w,w
+w,"w",w,"w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,"w",w,w
+w,w,w,w
+w,w,w,                w
+w,w,              w    ,               w
+w,w,w,w
+w,"w","w",w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+"w","w",w,"w"
+""w,"w",w,"w
+""w,"w",w,"w
+""w,"w",w,"w
+""w,"w",w,"w
+""w,"w",w,"w
+w,w,w,w
+w,w,w,w
+w,"w","w","w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,""w
+w,w,w,w
+w,w,w,w
+w,w,"w",w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+"w","w","w","w"
+w,w,w,"w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+"w","w","w","w"
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+12,34,"abc ",def
+"w",w,w,"w"
+"w","w","w","w"
diff --git 
a/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv.gz
 
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv.gz
new file mode 100644
index 00000000000..84f73cb2100
Binary files /dev/null and 
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_and_quotes.csv.gz
 differ
diff --git 
a/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv
 
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv
new file mode 100644
index 00000000000..0a504e467e9
--- /dev/null
+++ 
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv
@@ -0,0 +1,135 @@
+1,Alice,30,New York
+w,w,w,w
+w,w, w,w
+10,abc,ttt,def
+2,Bob,25,Los  An geles
+3,Charlie,35 ,   Chicago
+4,abc,def,sss
+w,w,w,w
+5,tt t,def,ss    s
+w,w,w,w
+ wrwe qreqe r , 23 4,32 323,3 232
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w, w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,                        w
+w,w,w,w
+w, w ,w,w
+w, w, w ,w
+w,w,w,w
+w,w,w,w
+w,w,w,                w
+w,w,                  w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w, w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+ w ,w,w,w
+w,w, w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w, w,w
+w,w,w,w
+w,w,w,w
+w,       w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w, w,w,w
+w,w,w,w
+w,w,w,w
+w,w, w,w
+w,w,w,w
+w,w, w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
+w,w,w,w
diff --git 
a/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv.gz
 
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv.gz
new file mode 100644
index 00000000000..f24bb5b3d3e
Binary files /dev/null and 
b/regression-test/data/external_table_p0/tvf/test_csv_line_end_lf_crlf/lf_crlf_not_quotes.csv.gz
 differ
diff --git 
a/regression-test/data/external_table_p0/tvf/test_tvf_csv_line_end.out 
b/regression-test/data/external_table_p0/tvf/test_tvf_csv_line_end.out
new file mode 100644
index 00000000000..b7658c0a93f
--- /dev/null
+++ b/regression-test/data/external_table_p0/tvf/test_tvf_csv_line_end.out
@@ -0,0 +1,201 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !csv_2 --
+\rw\r  w       w       w\r
+\rwrwe\rqreqe\rr\r     \r23\r4 32\r323 3\r232
+1      Alice   30      New York\r
+10     abc     ttt     def
+2      Bob     25      Los\r An\rgeles\r
+3      Charlie 35\r    
\r\r\rChicago\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r
+4      abc     def     sss
+5      tt\rt   def     ss\r\r\r\rs
+w      \r\r\r\r\r\r\rw w       w\r
+w      \rw     \rw\r   w\r
+w      \rw     w       w\r
+w      \rw\r   w       w\r
+w      w       \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw   w\r
+w      w       \rw     w\r
+w      w       \rw     w\r
+w      w       \rw     w\r
+w      w       \rw     w\r
+w      w       \rw     w\r
+w      w       \rw     w\r
+w      w       w       \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w      w       w       \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r
+w      w       w       \rw
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       
w\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r
+
+-- !csv_4 --
+\rwrwe\rqreqe\rr\r     \r23\r4 32\r323 3\r232
+1      Alice   30      New York
+10     abc     ttt     def
+2      Bob     25      Los\r An\rgeles
+3      Charlie 35\r    
\r\r\rChicago\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r
+4      abc     def     sss
+5      tt\rt   def     ss\r\r\r\rs
+w      w       w       \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w      w       w       \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w      w       w       \rw
+w      w       w       
w\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r
+
+-- !csv_6 --
+\r\r\r\r\r\r\r\r\r5\r\r        \rttt\r \rd\re\rf\r     sss\r\r\r\r
+\rwrweqreqer\r 234\r   \r32323\r       "3232"\r\r\r\r
+""w    w       w       "w\r
+""w    w       w       "w\r
+""w    w       w       "w\r
+""w    w       w       "w\r
+""w    w       w       "w\r
+1\r\r\r        Alice\r\r\r     \r\r30\r        New York\r
+10     "abc"\r\r       ttt     def
+12     34      abc\r   def
+2      Bob     25      Los Angeles\r
+3      Ch\r\ra\rrlie\r\r\r\r\r\r       35      Chicago\r\r\r\r\r\r\r\r\r
+4      abc     def     sss
+w      \rw     \rw     w\r
+w      w       \r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r\r\r\r   
\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r\r\r\r\r\r\r\r\r\r\r\r
+w      w       w       \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w      w       w       ""w\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       "w"\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+w      w       w       w\r
+
+-- !csv_8 --
+\r\r\r\r\r\r\r\r\r5\r\r        \rttt\r \rd\re\rf\r     sss\r\r\r
+\rwrweqreqer\r 234\r   \r32323\r       "3232"\r\r\r
+""w    w       w       "w
+""w    w       w       "w
+""w    w       w       "w
+""w    w       w       "w
+""w    w       w       "w
+1\r\r\r        Alice\r\r\r     \r\r30\r        New York
+10     "abc"\r\r       ttt     def
+12     34      abc\r   def
+2      Bob     25      Los Angeles
+3      Ch\r\ra\rrlie\r\r\r\r\r\r       35      Chicago\r\r\r\r\r\r\r\r
+4      abc     def     sss
+w      w       \r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r\r\r\r   
\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw\r\r\r\r\r\r\r\r\r\r\r
+w      w       w       \r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\rw
+w      w       w       ""w
+
diff --git 
a/regression-test/suites/external_table_p0/tvf/test_tvf_csv_line_end.groovy 
b/regression-test/suites/external_table_p0/tvf/test_tvf_csv_line_end.groovy
new file mode 100644
index 00000000000..cb2beb6f941
--- /dev/null
+++ b/regression-test/suites/external_table_p0/tvf/test_tvf_csv_line_end.groovy
@@ -0,0 +1,243 @@
+import org.junit.Assert
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_tvf_csv_line_end", "p0,tvf") {
+    List<List<Object>> backends =  sql """ show backends """
+    assertTrue(backends.size() > 0)
+    def be_id = backends[0][0]
+    def dataFilePath = context.config.dataPath + 
"/external_table_p0/tvf/test_csv_line_end_lf_crlf"
+
+    def outFilePath="/test_csv_line_end_lf_crlf"
+
+    for (List<Object> backend : backends) {
+        def be_host = backend[1]
+        scpFiles ("root", be_host, dataFilePath, outFilePath, false);
+    }
+
+    String filename = "lf_crlf_not_quotes.csv"
+
+    sql """set enable_nereids_planner=true"""
+    sql """set enable_fallback_to_original_planner=false"""
+
+    sql """ set keep_carriage_return = true; """
+    // qt_csv_1"""
+    // select * from local(
+    //     "file_path" = "${outFilePath}/${filename}",
+    //     "backend_id" = "${be_id}",
+    //     "format" = "csv",
+    //     "column_separator" = ","
+    //     )
+    //     order by c1,c2,c3,c4;            
+    // """
+
+    qt_csv_2"""
+    select  * from local(
+        "file_path" = "${outFilePath}/${filename}",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ","
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;        
+    """
+    List<List<String>> result1 = sql """
+    select  * from local(
+        "file_path" = "${outFilePath}/${filename}",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ","
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;   
+    """
+    List<List<String>> result2 = sql """
+    select  * from local(
+        "file_path" = "${outFilePath}/${filename}.gz",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ",",
+        "compress_type"="gz"
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;   
+    """
+    log.info("result2 = ${result2}")
+    assertTrue(result1.size() == result2.size()); // plain and gzip reads must return the same number of rows
+    for (int i = 0; i < result1.size(); i++) {
+        for (int j = 0; j < result1[i].size(); j++) { // bound by the column count of row i, not by the row count
+            assertTrue(result1[i][j] == result2[i][j]);
+        }
+    }
+    
+    sql """ set keep_carriage_return = false; """
+
+    // qt_csv_3 """
+    // select * from local(
+    //     "file_path" = "${outFilePath}/${filename}",
+    //     "backend_id" = "${be_id}",
+    //     "format" = "csv",
+    //     "column_separator" = ","
+    //     )
+    //     order by c1,c2,c3,c4;            
+    // """
+
+    qt_csv_4 """
+    select * from local(
+        "file_path" = "${outFilePath}/${filename}",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ","
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;        
+    """
+    result1 = sql """
+    select  * from local(
+        "file_path" = "${outFilePath}/${filename}",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ","
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;   
+    """
+    result2 = sql """
+    select  * from local(
+        "file_path" = "${outFilePath}/${filename}.gz",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ",",
+        "compress_type"="gz"
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;   
+    """
+    log.info("result2 = ${result2}")
+    assertTrue(result1.size() == result2.size()); // plain and gzip reads must return the same number of rows
+    for (int i = 0; i < result1.size(); i++) {
+        for (int j = 0; j < result1[i].size(); j++) { // bound by the column count of row i, not by the row count
+            assertTrue(result1[i][j] == result2[i][j]);
+        }
+    }
+
+
+    filename = "lf_crlf_and_quotes.csv"
+
+    sql """ set keep_carriage_return = true; """
+    // qt_csv_5"""
+    // select * from local(
+    //     "file_path" = "${outFilePath}/${filename}",
+    //     "backend_id" = "${be_id}",
+    //     "format" = "csv",
+    //     "column_separator" = ",",
+    //     "trim_double_quotes"="true"
+    //     )
+    //     order by c1,c2,c3,c4;            
+    // """
+
+    qt_csv_6"""
+    select * from local(
+        "file_path" = "${outFilePath}/${filename}",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ",",
+        "trim_double_quotes"="true"
+        ) where length(c4) >= 2       
+        order by c1,c2,c3,c4;         
+    """
+
+    result1 = sql """
+    select  * from local(
+        "file_path" = "${outFilePath}/${filename}",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ",",
+        "trim_double_quotes"="true"
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;   
+    """
+    result2 = sql """
+    select  * from local(
+        "file_path" = "${outFilePath}/${filename}.gz",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ",",
+        "trim_double_quotes"="true",
+        "compress_type"="gz"
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;   
+    """
+    log.info("result2 = ${result2}")
+    assertTrue(result1.size() == result2.size()); // plain and gzip reads must return the same number of rows
+    for (int i = 0; i < result1.size(); i++) {
+        for (int j = 0; j < result1[i].size(); j++) { // bound by the column count of row i, not by the row count
+            assertTrue(result1[i][j] == result2[i][j]);
+        }
+    }
+    
+    sql """ set keep_carriage_return = false; """
+
+    // qt_csv_7 """
+    // select * from local(
+    //     "file_path" = "${outFilePath}/${filename}",
+    //     "backend_id" = "${be_id}",
+    //     "format" = "csv",
+    //     "column_separator" = ",",
+    //     "trim_double_quotes"="true"
+    //     )
+    //     order by c1,c2,c3,c4;            
+    // """
+
+    qt_csv_8 """
+    select * from local(
+        "file_path" = "${outFilePath}/${filename}",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ",",
+        "trim_double_quotes"="true"
+        ) where length(c4) >= 2         
+        order by c1,c2,c3,c4;      
+    """
+    result1 = sql """
+    select  * from local(
+        "file_path" = "${outFilePath}/${filename}",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ",",
+        "trim_double_quotes"="true"
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;   
+    """
+    result2 = sql """
+    select  * from local(
+        "file_path" = "${outFilePath}/${filename}.gz",
+        "backend_id" = "${be_id}",
+        "format" = "csv",
+        "column_separator" = ",",
+        "trim_double_quotes"="true",
+        "compress_type"="gz"
+        ) where length(c4) >= 2        
+        order by c1,c2,c3,c4;   
+    """
+    log.info("result2 = ${result2}")
+    assertTrue(result1.size() == result2.size()); // plain and gzip reads must return the same number of rows
+    for (int i = 0; i < result1.size(); i++) {
+        for (int j = 0; j < result1[i].size(); j++) { // bound by the column count of row i, not by the row count
+            assertTrue(result1[i][j] == result2[i][j]);
+        }
+    }
+
+
+
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch branch-3.0 updated: [feature](csv)Supports reading CSV data using LF and CRLF as line separators. (#37687) (#39980)

Reply via email to