This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 2dda44d7b5 [fix](csv-reader)fix bug of multi-char delimiter in csv
reader
2dda44d7b5 is described below
commit 2dda44d7b53a825f864ab9ec0743e18a113d3f68
Author: daidai <[email protected]>
AuthorDate: Wed Aug 23 15:19:13 2023 +0800
[fix](csv-reader)fix bug of multi-char delimiter in csv reader
Fix a bug in how csv_reader splits a line into columns when the column separator is a multi-character delimiter.
---
be/src/vec/exec/format/csv/csv_reader.cpp | 69 ++++++-----
.../load_p0/stream_load/test_csv_split_line.out | 14 +++
.../load_p0/stream_load/test_csv_split_line1.csv | 1 +
.../load_p0/stream_load/test_csv_split_line2.csv | 4 +
.../load_p0/stream_load/test_csv_split_line3.csv | 4 +
.../load_p0/stream_load/test_csv_split_line.groovy | 130 +++++++++++++++++++++
6 files changed, 194 insertions(+), 28 deletions(-)
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index aed7864dd8..ba5d69cb73 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -100,38 +100,51 @@ void
PlainCsvTextFieldSplitter::_split_field_single_char(const Slice& line,
void PlainCsvTextFieldSplitter::_split_field_multi_char(const Slice& line,
std::vector<Slice>*
splitted_values) {
- const char* data = line.data;
size_t start = 0; // point to the start pos of next col value.
size_t curpos = 0; // point to the start pos of separator matching
sequence.
- size_t p1 = 0; // point to the current pos of separator matching
sequence.
-
- // Separator: AAAA
- //
- // p1
- // ▼
- // AAAA
- // 1000AAAA2000AAAA
- // ▲ ▲
- // Start │
- // curpos
- while (curpos < line.size) {
- if (curpos + p1 == line.size || *(data + curpos + p1) !=
_value_sep[p1]) {
- // Not match, move forward:
- curpos += (p1 == 0 ? 1 : p1);
- p1 = 0;
- } else {
- p1++;
- if (p1 == value_sep_len) {
- // Match a separator
- process_value_func(data, start, curpos - start, trimming_char,
splitted_values);
- start = curpos + value_sep_len;
- curpos = start;
- p1 = 0;
- }
+
+ // value_sep : AAAA
+ // line.data : 1234AAAA5678
+ // -> 1234,5678
+
+ // start start
+ // ▼ ▼
+ // 1234AAAA5678\0
+ // ▲ ▲
+ // curpos curpos
+
+ //kmp
+ vector<int> next(value_sep_len);
+ next[0] = -1;
+ for (int i = 1, j = -1; i < value_sep_len; i++) {
+ while (j > -1 && _value_sep[i] != _value_sep[j + 1]) {
+ j = next[j];
+ }
+ if (_value_sep[i] == _value_sep[j + 1]) {
+ j++;
+ }
+ next[i] = j;
+ }
+
+ for (int i = 0, j = -1; i < line.size; i++) {
+ // i : line
+ // j : _value_sep
+ while (j > -1 && line[i] != _value_sep[j + 1]) {
+ j = next[j];
+ }
+ if (line[i] == _value_sep[j + 1]) {
+ j++;
+ }
+ if (j == value_sep_len - 1) {
+ curpos = i - value_sep_len + 1;
+
+ process_value_func(line.data, start, curpos - start,
trimming_char, splitted_values);
+
+ start = i + 1;
+ j = next[j];
}
}
- CHECK(curpos == line.size) << curpos << " vs " << line.size;
- process_value_func(data, start, curpos - start, trimming_char,
splitted_values);
+ process_value_func(line.data, start, line.size - start, trimming_char,
splitted_values);
}
void PlainCsvTextFieldSplitter::do_split(const Slice& line,
std::vector<Slice>* splitted_values) {
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line.out
b/regression-test/data/load_p0/stream_load/test_csv_split_line.out
new file mode 100644
index 0000000000..fe62ba6f12
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line.out
@@ -0,0 +1,14 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+000e124abc3a49b18b14424ebb6ee8b5 2715668347726333217
352b88835f0a761888314515e4de5b18 000e124abc3a49b18b14424ebb6ee8b5
1682897543355 2023-05-01 hips_product hips_combo
829aafbe9b59ae408b3fbf21d8d8fb797c7f2358 \N
c:\\windows\\system32\\tasks\\lenovo\\imcontroller\\timebasedevents\\a4612416-67a7-48f1-9f87-e1e6dd7dd87e
a4612416-67a7-48f1-9f87-e1e6dd7dd87e \N 0 0 \N
0 0 \N 1 10.0.19044.256.1.0 11.00.19041.1566
(WinBuild.160101.0800) fdid:563 Lenovo Lenovo.Modern.ImController
Lenovo.Modern.Im [...]
+
+-- !sql --
+1000 worldhell 10000000 ello
+2000 wohellhell 200000 ellohell
+3000 worellohell 30000000 elloab
+4000 hellwohellhell \N abcdeeelhllo
+10001 helloword 114466 0000011445\r
+55555 \N 14455 7711445777\r
+66666 \N \N 113355\r
+77777 0011455 8888 114545
+
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line1.csv
b/regression-test/data/load_p0/stream_load/test_csv_split_line1.csv
new file mode 100644
index 0000000000..7e0a6f8144
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line1.csv
@@ -0,0 +1 @@
+000e124abc3a49b18b14424ebb6ee8b55b18511e27156683477263332175b18511e352b88835f0a761888314515e4de5b185b18511e000e124abc3a49b18b14424ebb6ee8b55b18511e16828975433555b18511e2023-05-015b18511ehips_product5b18511ehips_combo5b18511e829aafbe9b59ae408b3fbf21d8d8fb797c7f23585b18511e\N5b18511ec:\windows\system32\tasks\lenovo\imcontroller\timebasedevents\a4612416-67a7-48f1-9f87-e1e6dd7dd87e5b18511ea4612416-67a7-48f1-9f87-e1e6dd7dd87e5b18511e\N5b18511e05b18511e05b18511e\N5b18511e05b18511e05b18511e\N5b
[...]
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
new file mode 100644
index 0000000000..04ba509ae4
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv
@@ -0,0 +1,4 @@
+1000helloworldhellhello10000000helloello
+2000hellowohellhellhello200000helloellohell
+3000helloworellohellhello30000000helloelloab
+4000hellohellwohellhellhello\Nhelloabcdeeelhllo
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
new file mode 100644
index 0000000000..bb6949bacf
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv
@@ -0,0 +1,4 @@
+10001114455helloword1144551144661144550000011445
+55555114455\N114455144551144557711445777
+66666114455\N114455\N114455113355
+7777711445500114551144558888114455114545
\ No newline at end of file
diff --git
a/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
new file mode 100644
index 0000000000..b22e8bb319
--- /dev/null
+++ b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy
@@ -0,0 +1,130 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_csv_split_line", "p0") {
+ def tableName = "test_csv_split_line"
+ sql """ set enable_fallback_to_original_planner=false;"""
+ sql """ create database if not exists demo;"""
+ sql """ DROP TABLE IF EXISTS ${tableName}1 """
+ sql """ CREATE TABLE ${tableName}1 (
+ `mid` varchar(255) NULL,
+ `ent_id` varchar(255) NULL,
+ `file_md5` varchar(255) NULL,
+ `m2` varchar(255) NULL,
+ `event_time` bigint(20) NULL,
+ `event_date` date NULL,
+ `product` varchar(255) NULL,
+ `combo` varchar(255) NULL,
+ `file_sha1` varchar(255) NULL,
+ `file_sha256` varchar(255) NULL,
+ `file_path` varchar(1000) NULL,
+ `file_name` varchar(1000) NULL,
+ `file_size` int(11) NULL,
+ `file_age` int(11) NULL,
+ `file_ispe` varchar(1000) NULL,
+ `file_isx64` int(11) NULL,
+ `file_level` int(11) NULL,
+ `file_sublevel` int(11) NULL,
+ `file_level_sublevel` varchar(255) NULL,
+ `client_iswin64` int(11) NULL,
+ `client_os_version` varchar(255) NULL,
+ `client_ie_version` varchar(255) NULL,
+ `rule_group_id` varchar(1000) NULL,
+ `process_sign` varchar(1000) NULL,
+ `process_product_name` varchar(1000) NULL,
+ `process_original_name` varchar(1000) NULL,
+ `process_internal_name` varchar(1000) NULL,
+ `process_pparent_path` varchar(10000) NULL,
+ `process_parent_path` varchar(10000) NULL,
+ `process_parent_command_line` varchar(60000) NULL,
+ `process_path` varchar(10000) NULL,
+ `process_command_line` varchar(10000) NULL,
+ `file_dna` varchar(1000) NULL,
+ `icon_dna` varchar(1000) NULL,
+ `client_ip` varchar(10000) NULL,
+ `assetid` varchar(255) NULL,
+ `product_ver` varchar(255) NULL,
+ `clientid` varchar(1000) NULL,
+ `process_file_size` int(11) NULL,
+ `client_id` varchar(65533) NULL,
+ `rule_hit_all` varchar(65533) NULL,
+ `__op` boolean NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`mid`)
+ DISTRIBUTED BY HASH(`mid`) BUCKETS 10
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ streamLoad {
+ table "${tableName}1"
+
+ set 'column_separator', '5b18511e'
+ set 'columns', """
mid,ent_id,file_md5,m2,event_time,event_date,product,combo,
+
file_sha1,file_sha256,file_path,file_name,file_size,file_age,file_ispe,
+
file_isx64,file_level,file_sublevel,file_level_sublevel,client_iswin64,
+
client_os_version,client_ie_version,rule_group_id,process_sign,
+
process_product_name,process_original_name,process_internal_name,
+
process_pparent_path,process_parent_path,process_parent_command_line,
+
process_path,process_command_line,file_dna,icon_dna,client_ip,assetid,
+
product_ver,clientid,process_file_size,client_id,rule_hit_all """
+
+ file 'test_csv_split_line1.csv'
+ }
+
+ sql """sync"""
+
+ qt_sql """select * from ${tableName}1;"""
+ sql """ drop table ${tableName}1; """
+
+
+ sql """ DROP TABLE IF EXISTS ${tableName}2 """
+ sql """ create table ${tableName}2 (
+ a int ,
+ b varchar(30),
+ c int ,
+ d varchar(30),
+ )
+ DUPLICATE KEY(`a`)
+ DISTRIBUTED BY HASH(`a`) BUCKETS 10
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+ streamLoad {
+ table "${tableName}2"
+ set 'column_separator', 'hello'
+ file 'test_csv_split_line2.csv'
+ }
+ streamLoad {
+ table "${tableName}2"
+ set 'column_separator', '114455'
+ file 'test_csv_split_line3.csv'
+ }
+
+ sql "sync"
+ qt_sql """select * from ${tableName}2 order by a;"""
+
+
+
+
+
+
+ sql """ drop table ${tableName}2; """
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]