[doris] branch master updated: [fix](csv-reader)fix bug of multi-char delimiter in csv reader

morningman Wed, 23 Aug 2023 00:19:29 -0700

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 2dda44d7b5 [fix](csv-reader)fix bug of multi-char delimiter in csv 
reader
2dda44d7b5 is described below

commit 2dda44d7b53a825f864ab9ec0743e18a113d3f68
Author: daidai <[email protected]>
AuthorDate: Wed Aug 23 15:19:13 2023 +0800

    [fix](csv-reader)fix bug of multi-char delimiter in csv reader
    
    Fix a bug in how csv_reader splits a line into column values when the
    column separator is a multi-character delimiter.
---
 be/src/vec/exec/format/csv/csv_reader.cpp          |  69 ++++++-----
 .../load_p0/stream_load/test_csv_split_line.out    |  14 +++
 .../load_p0/stream_load/test_csv_split_line1.csv   |   1 +
 .../load_p0/stream_load/test_csv_split_line2.csv   |   4 +
 .../load_p0/stream_load/test_csv_split_line3.csv   |   4 +
 .../load_p0/stream_load/test_csv_split_line.groovy | 130 +++++++++++++++++++++
 6 files changed, 194 insertions(+), 28 deletions(-)

diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp
index aed7864dd8..ba5d69cb73 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -100,38 +100,51 @@ void 
PlainCsvTextFieldSplitter::_split_field_single_char(const Slice& line,
 
 void PlainCsvTextFieldSplitter::_split_field_multi_char(const Slice& line,
                                                         std::vector<Slice>* 
splitted_values) {
-    const char* data = line.data;
     size_t start = 0;  // point to the start pos of next col value.
     size_t curpos = 0; // point to the start pos of separator matching 
sequence.
-    size_t p1 = 0;     // point to the current pos of separator matching 
sequence.
-
-    // Separator: AAAA
-    //
-    //    p1
-    //     ▼
-    //     AAAA
-    //   1000AAAA2000AAAA
-    //   ▲   ▲
-    // Start │
-    //     curpos
-    while (curpos < line.size) {
-        if (curpos + p1 == line.size || *(data + curpos + p1) != 
_value_sep[p1]) {
-            // Not match, move forward:
-            curpos += (p1 == 0 ? 1 : p1);
-            p1 = 0;
-        } else {
-            p1++;
-            if (p1 == value_sep_len) {
-                // Match a separator
-                process_value_func(data, start, curpos - start, trimming_char, 
splitted_values);
-                start = curpos + value_sep_len;
-                curpos = start;
-                p1 = 0;
-            }
+
+    // value_sep : AAAA
+    // line.data : 1234AAAA5678
+    // -> 1234,5678
+
+    //    start   start
+    //      ▼       ▼
+    //      1234AAAA5678\0
+    //          ▲       ▲
+    //      curpos     curpos
+
+    //kmp
+    vector<int> next(value_sep_len);
+    next[0] = -1;
+    for (int i = 1, j = -1; i < value_sep_len; i++) {
+        while (j > -1 && _value_sep[i] != _value_sep[j + 1]) {
+            j = next[j];
+        }
+        if (_value_sep[i] == _value_sep[j + 1]) {
+            j++;
+        }
+        next[i] = j;
+    }
+
+    for (int i = 0, j = -1; i < line.size; i++) {
+        // i : line
+        // j : _value_sep
+        while (j > -1 && line[i] != _value_sep[j + 1]) {
+            j = next[j];
+        }
+        if (line[i] == _value_sep[j + 1]) {
+            j++;
+        }
+        if (j == value_sep_len - 1) {
+            curpos = i - value_sep_len + 1;
+
+            process_value_func(line.data, start, curpos - start, 
trimming_char, splitted_values);
+
+            start = i + 1;
+            j = next[j];
         }
     }
-    CHECK(curpos == line.size) << curpos << " vs " << line.size;
-    process_value_func(data, start, curpos - start, trimming_char, 
splitted_values);
+    process_value_func(line.data, start, line.size - start, trimming_char, 
splitted_values);
 }
 
 void PlainCsvTextFieldSplitter::do_split(const Slice& line, 
std::vector<Slice>* splitted_values) {
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line.out 
b/regression-test/data/load_p0/stream_load/test_csv_split_line.out
new file mode 100644
index 0000000000..fe62ba6f12
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line.out
@@ -0,0 +1,14 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+000e124abc3a49b18b14424ebb6ee8b5       2715668347726333217     
352b88835f0a761888314515e4de5b18        000e124abc3a49b18b14424ebb6ee8b5        
1682897543355   2023-05-01      hips_product    hips_combo      
829aafbe9b59ae408b3fbf21d8d8fb797c7f2358        \N      
c:\\windows\\system32\\tasks\\lenovo\\imcontroller\\timebasedevents\\a4612416-67a7-48f1-9f87-e1e6dd7dd87e
       a4612416-67a7-48f1-9f87-e1e6dd7dd87e    \N      0       0       \N      
0       0       \N      1       10.0.19044.256.1.0      11.00.19041.1566 
(WinBuild.160101.0800) fdid:563        Lenovo  Lenovo.Modern.ImController      
Lenovo.Modern.Im [...]
+
+-- !sql --
+1000   worldhell       10000000        ello
+2000   wohellhell      200000  ellohell
+3000   worellohell     30000000        elloab
+4000   hellwohellhell  \N      abcdeeelhllo
+10001  helloword       114466  0000011445\r
+55555  \N      14455   7711445777\r
+66666  \N      \N      113355\r
+77777  0011455 8888    114545
+
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line1.csv 
b/regression-test/data/load_p0/stream_load/test_csv_split_line1.csv
new file mode 100644
index 0000000000..7e0a6f8144
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line1.csv
@@ -0,0 +1 @@
+000e124abc3a49b18b14424ebb6ee8b55b18511e27156683477263332175b18511e352b88835f0a761888314515e4de5b185b18511e000e124abc3a49b18b14424ebb6ee8b55b18511e16828975433555b18511e2023-05-015b18511ehips_product5b18511ehips_combo5b18511e829aafbe9b59ae408b3fbf21d8d8fb797c7f23585b18511e\N5b18511ec:\windows\system32\tasks\lenovo\imcontroller\timebasedevents\a4612416-67a7-48f1-9f87-e1e6dd7dd87e5b18511ea4612416-67a7-48f1-9f87-e1e6dd7dd87e5b18511e\N5b18511e05b18511e05b18511e\N5b18511e05b18511e05b18511e\N5b
 [...]
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv 
b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
new file mode 100644
index 0000000000..04ba509ae4
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
@@ -0,0 +1,4 @@
+1000helloworldhellhello10000000helloello
+2000hellowohellhellhello200000helloellohell
+3000helloworellohellhello30000000helloelloab
+4000hellohellwohellhellhello\Nhelloabcdeeelhllo
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv 
b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
new file mode 100644
index 0000000000..bb6949bacf
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
@@ -0,0 +1,4 @@
+10001114455helloword1144551144661144550000011445
+55555114455\N114455144551144557711445777
+66666114455\N114455\N114455113355
+7777711445500114551144558888114455114545
\ No newline at end of file
diff --git 
a/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy 
b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
new file mode 100644
index 0000000000..b22e8bb319
--- /dev/null
+++ b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
@@ -0,0 +1,130 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_csv_split_line", "p0") {
+    def tableName = "test_csv_split_line"
+       sql """ set enable_fallback_to_original_planner=false;"""
+    sql """ create database if not exists demo;""" 
+    sql """ DROP TABLE IF EXISTS ${tableName}1 """
+    sql """ CREATE TABLE ${tableName}1 (
+            `mid` varchar(255) NULL,
+            `ent_id` varchar(255) NULL,
+            `file_md5` varchar(255) NULL,
+            `m2` varchar(255) NULL,
+            `event_time` bigint(20) NULL,
+            `event_date` date NULL,
+            `product` varchar(255) NULL,
+            `combo` varchar(255) NULL,
+            `file_sha1` varchar(255) NULL,
+            `file_sha256` varchar(255) NULL,
+            `file_path` varchar(1000) NULL,
+            `file_name` varchar(1000) NULL,
+            `file_size` int(11) NULL,
+            `file_age` int(11) NULL,
+            `file_ispe` varchar(1000) NULL,
+            `file_isx64` int(11) NULL,
+            `file_level` int(11) NULL,
+            `file_sublevel` int(11) NULL,
+            `file_level_sublevel` varchar(255) NULL,
+            `client_iswin64` int(11) NULL,
+            `client_os_version` varchar(255) NULL,
+            `client_ie_version` varchar(255) NULL,
+            `rule_group_id` varchar(1000) NULL,
+            `process_sign` varchar(1000) NULL,
+            `process_product_name` varchar(1000) NULL,
+            `process_original_name` varchar(1000) NULL,
+            `process_internal_name` varchar(1000) NULL,
+            `process_pparent_path` varchar(10000) NULL,
+            `process_parent_path` varchar(10000) NULL,
+            `process_parent_command_line` varchar(60000) NULL,
+            `process_path` varchar(10000) NULL,
+            `process_command_line` varchar(10000) NULL,
+            `file_dna` varchar(1000) NULL,
+            `icon_dna` varchar(1000) NULL,
+            `client_ip` varchar(10000) NULL,
+            `assetid` varchar(255) NULL,
+            `product_ver` varchar(255) NULL,
+            `clientid` varchar(1000) NULL,
+            `process_file_size` int(11) NULL,
+            `client_id` varchar(65533) NULL,
+            `rule_hit_all` varchar(65533) NULL,
+            `__op` boolean NULL
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`mid`)
+        DISTRIBUTED BY HASH(`mid`) BUCKETS 10
+        PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1"
+        ); 
+    """
+
+    streamLoad {
+        table "${tableName}1"
+
+        set 'column_separator', '5b18511e'
+        set 'columns', """ 
mid,ent_id,file_md5,m2,event_time,event_date,product,combo,
+                    
file_sha1,file_sha256,file_path,file_name,file_size,file_age,file_ispe,
+                        
file_isx64,file_level,file_sublevel,file_level_sublevel,client_iswin64,
+                        
client_os_version,client_ie_version,rule_group_id,process_sign,
+                        
process_product_name,process_original_name,process_internal_name,
+                        
process_pparent_path,process_parent_path,process_parent_command_line,
+                        
process_path,process_command_line,file_dna,icon_dna,client_ip,assetid,
+                        
product_ver,clientid,process_file_size,client_id,rule_hit_all """ 
+                
+        file 'test_csv_split_line1.csv'
+    }
+
+    sql """sync"""
+
+    qt_sql """select * from ${tableName}1;"""
+    sql """ drop table ${tableName}1; """ 
+
+
+    sql """ DROP TABLE IF EXISTS ${tableName}2 """
+    sql """ create table ${tableName}2 (
+        a int ,
+        b varchar(30),
+        c int ,
+        d varchar(30),
+    )
+    DUPLICATE KEY(`a`)
+    DISTRIBUTED BY HASH(`a`) BUCKETS 10
+    PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1"
+    ); 
+    """
+    streamLoad {
+        table "${tableName}2"
+        set 'column_separator', 'hello'
+        file 'test_csv_split_line2.csv'
+    }
+    streamLoad {
+        table "${tableName}2"
+        set 'column_separator', '114455'
+        file 'test_csv_split_line3.csv'
+    }
+    
+    sql "sync"
+    qt_sql """select * from ${tableName}2 order by a;"""
+    
+    
+    
+    
+    
+
+    sql """ drop table ${tableName}2; """ 
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] branch master updated: [fix](csv-reader)fix bug of multi-char delimiter in csv reader

Reply via email to