This is an automated email from the ASF dual-hosted git repository.
Gabriel39 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new a1ac4dbed75 [fix](hive) Preserve empty text records (#64671)
a1ac4dbed75 is described below
commit a1ac4dbed757d6bb299c455ed8d943dc125a0e91
Author: Gabriel <[email protected]>
AuthorDate: Thu Jun 25 15:22:51 2026 +0800
[fix](hive) Preserve empty text records (#64671)
### What problem does this PR solve?
Issue Number: close #xxx
Problem Summary:
When scanning Hive TEXTFILE tables, Doris previously skipped empty
physical lines unless `read_csv_empty_line_as_null` was enabled. This is
inconsistent with Hive TEXTFILE semantics: an empty physical line is
still a record. For a single-column text table, it represents one empty
field; for multi-column text tables, the missing trailing fields should
be filled using the table's null format.
This can cause Doris to return fewer rows than Hive for text files
containing empty lines, especially when the table uses `LazySimpleSerDe`
with a custom or default `serialization.null.format`.
This PR fixes the behavior by adding a format-level hook for empty-line
handling:
- CSV keeps the existing default behavior: empty lines are not treated
as records unless `read_csv_empty_line_as_null` is enabled.
- Hive TEXT overrides the hook and treats empty physical lines as
records.
- Empty Hive text lines are passed through normal field deserialization
so string/null handling stays consistent with `null_format`.
The PR also adds Hive regression coverage for:
- a single-column text table with custom `serialization.null.format`;
- a multi-column text table using the default Hive null marker `\N`;
- preservation of empty records and correct NULL/empty-string
classification.
In addition, the upload order of the credit-data Hive fixture is now
refresh-safe. A refresh of the Hive regression module may rerun all
`data/regression` setup scripts, so `crdmm_data` now recreates the Hive
table before re-uploading its HDFS data; this way `DROP TABLE` cannot
remove freshly uploaded files.
### Release note
Fix Hive TEXTFILE scans to preserve empty physical lines as records,
matching Hive behavior.
### Check List (For Author)
- Test: Regression test
- Added/updated `external_table_p0/hive/test_hive_serde_prop`.
- Ran `./run-regression-test.sh --run -d external_table_p0/hive -s
test_hive_serde_prop`; local config had `enableHiveTest=false`, so the
Hive test body was skipped.
- Ran `./run-regression-test.sh --run -d external_table_p0/hive -s
test_external_credit_data`; local config had `enableHiveTest=false`, so
the Hive test body was skipped.
- Ran `bash -n
docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh`.
- Ran `git diff --check`.
- Behavior changed: Yes. Hive TEXTFILE scans now preserve empty physical
lines as records instead of skipping them.
- Does this need documentation: No
---
be/src/format/csv/csv_reader.cpp | 18 ++++++++++++----
be/src/format/csv/csv_reader.h | 1 +
be/src/format/text/text_reader.cpp | 6 ++++++
be/src/format/text/text_reader.h | 1 +
.../hive/scripts/data/regression/crdmm_data/run.sh | 7 +++---
.../hive/scripts/data/regression/serde_prop/run.sh | 25 ++++++++++++++++++++++
.../regression/serde_prop/some_serde_table.hql | 25 +++++++++++++++++++++-
.../hive/test_hive_serde_prop.out | 24 +++++++++++++++++++++
.../hive/test_hive_serde_prop.groovy | 20 +++++++++++++++++
9 files changed, 118 insertions(+), 9 deletions(-)
diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp
index c4837d65fb3..3d1e978ffe9 100644
--- a/be/src/format/csv/csv_reader.cpp
+++ b/be/src/format/csv/csv_reader.cpp
@@ -436,8 +436,10 @@ Status CsvReader::_do_get_next_block(Block* block, size_t*
read_rows, bool* eof)
continue;
}
if (size == 0) {
- if (!_line_reader_eof &&
_state->is_read_csv_empty_line_as_null()) {
- ++rows;
+ if (!_line_reader_eof) {
+ if (_empty_line_as_record() ||
_state->is_read_csv_empty_line_as_null()) {
+ ++rows;
+ }
}
// Read empty line, continue
continue;
@@ -475,8 +477,16 @@ Status CsvReader::_do_get_next_block(Block* block, size_t*
read_rows, bool* eof)
continue;
}
if (size == 0) {
- if (!_line_reader_eof &&
_state->is_read_csv_empty_line_as_null()) {
- RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
+ if (!_line_reader_eof) {
+ if (_empty_line_as_record()) {
+ Slice empty_line("", 0);
+ RETURN_IF_ERROR(_validate_line(empty_line, &success));
+ if (success) {
+ RETURN_IF_ERROR(_fill_dest_columns(empty_line,
columns, &rows));
+ }
+ } else if (_state->is_read_csv_empty_line_as_null()) {
+ RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
+ }
}
// Read empty line, continue
continue;
diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h
index 80938abd271..46b8ffd6718 100644
--- a/be/src/format/csv/csv_reader.h
+++ b/be/src/format/csv/csv_reader.h
@@ -207,6 +207,7 @@ protected:
virtual Status _create_line_reader();
virtual Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn&
column, Slice& slice);
virtual Status _deserialize_nullable_string(IColumn& column, Slice& slice);
+ virtual bool _empty_line_as_record() const { return false; }
// check the utf8 encoding of a line.
// return error status to stop processing.
// If return Status::OK but "success" is false, which means this is load
request
diff --git a/be/src/format/text/text_reader.cpp
b/be/src/format/text/text_reader.cpp
index c118c21adda..23501f94cd6 100644
--- a/be/src/format/text/text_reader.cpp
+++ b/be/src/format/text/text_reader.cpp
@@ -168,6 +168,12 @@ Status TextReader::_validate_line(const Slice& line, bool*
success) {
return Status::OK();
}
+bool TextReader::_empty_line_as_record() const {
+ // Hive TEXTFILE treats an empty physical line as a record. The splitter
maps it
+ // to one empty field and missing trailing fields are filled with
null_format.
+ return true;
+}
+
Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice)
{
// Hot path of hive text load, see
CsvReader::_deserialize_nullable_string. The
// column type was verified by the checked assert_cast in
diff --git a/be/src/format/text/text_reader.h b/be/src/format/text/text_reader.h
index c0cebf5da77..dff4159208d 100644
--- a/be/src/format/text/text_reader.h
+++ b/be/src/format/text/text_reader.h
@@ -67,6 +67,7 @@ private:
Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column,
Slice& slice) override;
Status _validate_line(const Slice& line, bool* success) override;
Status _deserialize_nullable_string(IColumn& column, Slice& slice)
override;
+ bool _empty_line_as_record() const override;
};
} // namespace doris
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
index f650ead89d7..5197e8b9276 100755
---
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
+++
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
@@ -4,10 +4,9 @@ set -x
CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
-hadoop fs -mkdir -p /user/doris/suites/regression/
-hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/regression/
-
# create table
hive -f "${CUR_DIR}"/create_table.hql
-
+hadoop fs -rm -r -f /user/doris/suites/regression/crdmm_data || true
+hadoop fs -mkdir -p /user/doris/suites/regression/crdmm_data
+hadoop fs -put "${CUR_DIR}"/data/crdmm_data/*
/user/doris/suites/regression/crdmm_data/
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
index ef6538563d5..c4f8e7c5d96 100755
---
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
+++
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
@@ -3,6 +3,31 @@ set -x
CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+SINGLE_COL_DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)"
+DEFAULT_MULTI_COL_DATA_FILE="$(mktemp
/tmp/test_default_null_format_multi_col_text.XXXXXX)"
+trap 'rm -f "${SINGLE_COL_DATA_FILE}" "${DEFAULT_MULTI_COL_DATA_FILE}"' EXIT
+cat > "${SINGLE_COL_DATA_FILE}" <<'EOF'
+null_value
+null_value
+non-null
+
+\N
+EOF
+
+{
+ printf 'a\tb\n'
+ printf '\n'
+ printf '\\N\t\\N\n'
+} > "${DEFAULT_MULTI_COL_DATA_FILE}"
+
+hadoop fs -rm -r -f
/user/doris/suites/regression/serde_prop/test_single_col_null_format_text ||
true
+hadoop fs -mkdir -p
/user/doris/suites/regression/serde_prop/test_single_col_null_format_text
+hadoop fs -put "${SINGLE_COL_DATA_FILE}"
/user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000
+
+hadoop fs -rm -r -f
/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text
|| true
+hadoop fs -mkdir -p
/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text
+hadoop fs -put "${DEFAULT_MULTI_COL_DATA_FILE}"
/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text/part-00000
+
# create table
hive -f "${CUR_DIR}"/some_serde_table.hql
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
index df03f36a8da..4625f0cbb35 100644
---
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
+++
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
@@ -226,4 +226,27 @@ STORED AS TEXTFILE;
INSERT INTO TABLE test_empty_null_defined_text VALUES
(1, 'Alice'),
(2, NULL),
- (3, '');
\ No newline at end of file
+ (3, '');
+
+drop table if exists test_single_col_null_format_text;
+
+create external table test_single_col_null_format_text (
+ name STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+WITH SERDEPROPERTIES (
+ "serialization.null.format"="null_value"
+)
+STORED AS TEXTFILE
+LOCATION
'/user/doris/suites/regression/serde_prop/test_single_col_null_format_text';
+
+drop table if exists test_default_null_format_multi_col_text;
+
+create external table test_default_null_format_multi_col_text (
+ c1 STRING,
+ c2 STRING
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+LOCATION
'/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text';
diff --git
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index cda92c0519a..36866613260 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
@@ -73,6 +73,18 @@ b 2.2
-- !test_empty_null_defined_text3 --
+-- !test_single_col_null_format_text_count --
+5
+
+-- !test_single_col_null_format_text_values --
+5 3 2 1 1 1
+
+-- !test_default_null_format_multi_col_text_count --
+3
+
+-- !test_default_null_format_multi_col_text_values --
+3 2 1 1 1 2 0 1
+
-- !1 --
a 1.1
b 2.2
@@ -147,3 +159,15 @@ b 2.2
-- !test_empty_null_defined_text3 --
+-- !test_single_col_null_format_text_count --
+5
+
+-- !test_single_col_null_format_text_values --
+5 3 2 1 1 1
+
+-- !test_default_null_format_multi_col_text_count --
+3
+
+-- !test_default_null_format_multi_col_text_values --
+3 2 1 1 1 2 0 1
+
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
index 70306968852..24efc34f448 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
@@ -63,6 +63,26 @@ suite("test_hive_serde_prop", "p0,external") {
qt_test_empty_null_defined_text """select * from
${catalog_name}.regression.test_empty_null_defined_text order by id;"""
qt_test_empty_null_defined_text2 """select * from
${catalog_name}.regression.test_empty_null_defined_text where name is null
order by id;"""
qt_test_empty_null_defined_text3 """select * from
${catalog_name}.regression.test_empty_null_defined_text where name = '' order
by id;"""
+
+ qt_test_single_col_null_format_text_count """select count(*) from
${catalog_name}.regression.test_single_col_null_format_text;"""
+ qt_test_single_col_null_format_text_values """
+ select count(*), count(name), count(case when name is null then 1
end),
+ count(case when name = '' then 1 end),
+ count(case when name = 'non-null' then 1 end),
+ count(case when name is not null and name not in ('',
'non-null') then 1 end)
+ from ${catalog_name}.regression.test_single_col_null_format_text;
+ """
+
+ qt_test_default_null_format_multi_col_text_count """select count(*)
from ${catalog_name}.regression.test_default_null_format_multi_col_text;"""
+ qt_test_default_null_format_multi_col_text_values """
+ select count(*), count(c1), count(c2),
+ count(case when c1 is null then 1 end),
+ count(case when c1 = '' then 1 end),
+ count(case when c2 is null then 1 end),
+ count(case when c2 = '' then 1 end),
+ count(case when c1 = 'a' and c2 = 'b' then 1 end)
+ from
${catalog_name}.regression.test_default_null_format_multi_col_text;
+ """
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]