This is an automated email from the ASF dual-hosted git repository.

Gabriel39 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new a1ac4dbed75 [fix](hive) Preserve empty text records (#64671)
a1ac4dbed75 is described below

commit a1ac4dbed757d6bb299c455ed8d943dc125a0e91
Author: Gabriel <[email protected]>
AuthorDate: Thu Jun 25 15:22:51 2026 +0800

    [fix](hive) Preserve empty text records (#64671)
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Problem Summary:
    
    When scanning Hive TEXTFILE tables, Doris previously skipped empty
    physical lines unless `read_csv_empty_line_as_null` was enabled. This is
    inconsistent with Hive TEXTFILE semantics: an empty physical line is
    still a record. For a single-column text table it represents one empty
    field, and for multi-column text tables missing trailing fields should
    be filled using the table's null format.
    
    This can cause Doris to return fewer rows than Hive for text files
    containing empty lines, especially when the table uses `LazySimpleSerDe`
    and custom or default `serialization.null.format`.
    
    This PR fixes the behavior by adding a format-level hook for empty-line
    handling:
    
    - CSV keeps the existing default behavior and does not treat empty lines
    as records.
    - Hive TEXT overrides the hook and treats empty physical lines as
    records.
    - Empty Hive text lines are passed through normal field deserialization
    so string/null handling stays consistent with `null_format`.
    
    The PR also adds Hive regression coverage for:
    
    - a single-column text table with custom `serialization.null.format`;
    - a multi-column text table using the default Hive null marker `\N`;
    - preservation of empty records and correct NULL/empty-string
    classification.
    
    In addition, the credit-data Hive fixture upload order is made
    refresh-safe. The Hive regression module refresh may rerun all
    `data/regression` setup scripts; `crdmm_data` now recreates the Hive
    table before re-uploading its HDFS data so `DROP TABLE` cannot remove
    freshly uploaded files.
    
    ### Release note
    
    Fix Hive TEXTFILE scans to preserve empty physical lines as records,
    matching Hive behavior.
    
    ### Check List (For Author)
    
    - Test: Regression test
        - Added/updated `external_table_p0/hive/test_hive_serde_prop`.
    - Ran `./run-regression-test.sh --run -d external_table_p0/hive -s
    test_hive_serde_prop`; local config had `enableHiveTest=false`, so the
    Hive test body was skipped.
    - Ran `./run-regression-test.sh --run -d external_table_p0/hive -s
    test_external_credit_data`; local config had `enableHiveTest=false`, so
    the Hive test body was skipped.
    - Ran `bash -n docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh`.
        - Ran `git diff --check`.
    - Behavior changed: Yes. Hive TEXTFILE scans now preserve empty physical
    lines as records instead of skipping them.
    - Does this need documentation: No
---
 be/src/format/csv/csv_reader.cpp                   | 18 ++++++++++++----
 be/src/format/csv/csv_reader.h                     |  1 +
 be/src/format/text/text_reader.cpp                 |  6 ++++++
 be/src/format/text/text_reader.h                   |  1 +
 .../hive/scripts/data/regression/crdmm_data/run.sh |  7 +++---
 .../hive/scripts/data/regression/serde_prop/run.sh | 25 ++++++++++++++++++++++
 .../regression/serde_prop/some_serde_table.hql     | 25 +++++++++++++++++++++-
 .../hive/test_hive_serde_prop.out                  | 24 +++++++++++++++++++++
 .../hive/test_hive_serde_prop.groovy               | 20 +++++++++++++++++
 9 files changed, 118 insertions(+), 9 deletions(-)

diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp
index c4837d65fb3..3d1e978ffe9 100644
--- a/be/src/format/csv/csv_reader.cpp
+++ b/be/src/format/csv/csv_reader.cpp
@@ -436,8 +436,10 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* 
read_rows, bool* eof)
                 continue;
             }
             if (size == 0) {
-                if (!_line_reader_eof && 
_state->is_read_csv_empty_line_as_null()) {
-                    ++rows;
+                if (!_line_reader_eof) {
+                    if (_empty_line_as_record() || 
_state->is_read_csv_empty_line_as_null()) {
+                        ++rows;
+                    }
                 }
                 // Read empty line, continue
                 continue;
@@ -475,8 +477,16 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* 
read_rows, bool* eof)
                 continue;
             }
             if (size == 0) {
-                if (!_line_reader_eof && 
_state->is_read_csv_empty_line_as_null()) {
-                    RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
+                if (!_line_reader_eof) {
+                    if (_empty_line_as_record()) {
+                        Slice empty_line("", 0);
+                        RETURN_IF_ERROR(_validate_line(empty_line, &success));
+                        if (success) {
+                            RETURN_IF_ERROR(_fill_dest_columns(empty_line, 
columns, &rows));
+                        }
+                    } else if (_state->is_read_csv_empty_line_as_null()) {
+                        RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
+                    }
                 }
                 // Read empty line, continue
                 continue;
diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h
index 80938abd271..46b8ffd6718 100644
--- a/be/src/format/csv/csv_reader.h
+++ b/be/src/format/csv/csv_reader.h
@@ -207,6 +207,7 @@ protected:
     virtual Status _create_line_reader();
     virtual Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& 
column, Slice& slice);
     virtual Status _deserialize_nullable_string(IColumn& column, Slice& slice);
+    virtual bool _empty_line_as_record() const { return false; }
     // check the utf8 encoding of a line.
     // return error status to stop processing.
     // If return Status::OK but "success" is false, which means this is load 
request
diff --git a/be/src/format/text/text_reader.cpp 
b/be/src/format/text/text_reader.cpp
index c118c21adda..23501f94cd6 100644
--- a/be/src/format/text/text_reader.cpp
+++ b/be/src/format/text/text_reader.cpp
@@ -168,6 +168,12 @@ Status TextReader::_validate_line(const Slice& line, bool* 
success) {
     return Status::OK();
 }
 
+bool TextReader::_empty_line_as_record() const {
+    // Hive TEXTFILE treats an empty physical line as a record. The splitter 
maps it
+    // to one empty field and missing trailing fields are filled with 
null_format.
+    return true;
+}
+
 Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) 
{
     // Hot path of hive text load, see 
CsvReader::_deserialize_nullable_string. The
     // column type was verified by the checked assert_cast in
diff --git a/be/src/format/text/text_reader.h b/be/src/format/text/text_reader.h
index c0cebf5da77..dff4159208d 100644
--- a/be/src/format/text/text_reader.h
+++ b/be/src/format/text/text_reader.h
@@ -67,6 +67,7 @@ private:
     Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, 
Slice& slice) override;
     Status _validate_line(const Slice& line, bool* success) override;
     Status _deserialize_nullable_string(IColumn& column, Slice& slice) 
override;
+    bool _empty_line_as_record() const override;
 };
 
 } // namespace doris
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
 
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
index f650ead89d7..5197e8b9276 100755
--- 
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh
@@ -4,10 +4,9 @@ set -x
 CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
 
 
-hadoop fs -mkdir -p /user/doris/suites/regression/
-hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/regression/
-
 # create table
 hive -f "${CUR_DIR}"/create_table.hql
 
-
+hadoop fs -rm -r -f /user/doris/suites/regression/crdmm_data || true
+hadoop fs -mkdir -p /user/doris/suites/regression/crdmm_data
+hadoop fs -put "${CUR_DIR}"/data/crdmm_data/* 
/user/doris/suites/regression/crdmm_data/
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
 
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
index ef6538563d5..c4f8e7c5d96 100755
--- 
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh
@@ -3,6 +3,31 @@ set -x
 
 CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
 
+SINGLE_COL_DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)"
+DEFAULT_MULTI_COL_DATA_FILE="$(mktemp 
/tmp/test_default_null_format_multi_col_text.XXXXXX)"
+trap 'rm -f "${SINGLE_COL_DATA_FILE}" "${DEFAULT_MULTI_COL_DATA_FILE}"' EXIT
+cat > "${SINGLE_COL_DATA_FILE}" <<'EOF'
+null_value
+null_value
+non-null
+
+\N
+EOF
+
+{
+    printf 'a\tb\n'
+    printf '\n'
+    printf '\\N\t\\N\n'
+} > "${DEFAULT_MULTI_COL_DATA_FILE}"
+
+hadoop fs -rm -r -f 
/user/doris/suites/regression/serde_prop/test_single_col_null_format_text || 
true
+hadoop fs -mkdir -p 
/user/doris/suites/regression/serde_prop/test_single_col_null_format_text
+hadoop fs -put "${SINGLE_COL_DATA_FILE}" 
/user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000
+
+hadoop fs -rm -r -f 
/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text
 || true
+hadoop fs -mkdir -p 
/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text
+hadoop fs -put "${DEFAULT_MULTI_COL_DATA_FILE}" 
/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text/part-00000
+
 # create table
 hive -f "${CUR_DIR}"/some_serde_table.hql
 
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
 
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
index df03f36a8da..4625f0cbb35 100644
--- 
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
@@ -226,4 +226,27 @@ STORED AS TEXTFILE;
 INSERT INTO TABLE test_empty_null_defined_text VALUES
   (1, 'Alice'),
   (2, NULL),
-  (3, '');
\ No newline at end of file
+  (3, '');
+
+drop table if exists test_single_col_null_format_text;
+
+create external table test_single_col_null_format_text (
+  name STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+WITH SERDEPROPERTIES (
+  "serialization.null.format"="null_value"
+)
+STORED AS TEXTFILE
+LOCATION 
'/user/doris/suites/regression/serde_prop/test_single_col_null_format_text';
+
+drop table if exists test_default_null_format_multi_col_text;
+
+create external table test_default_null_format_multi_col_text (
+  c1 STRING,
+  c2 STRING
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+LOCATION 
'/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text';
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out 
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index cda92c0519a..36866613260 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
@@ -73,6 +73,18 @@ b    2.2
 
 -- !test_empty_null_defined_text3 --
 
+-- !test_single_col_null_format_text_count --
+5
+
+-- !test_single_col_null_format_text_values --
+5      3       2       1       1       1
+
+-- !test_default_null_format_multi_col_text_count --
+3
+
+-- !test_default_null_format_multi_col_text_values --
+3      2       1       1       1       2       0       1
+
 -- !1 --
 a      1.1
 b      2.2
@@ -147,3 +159,15 @@ b  2.2
 
 -- !test_empty_null_defined_text3 --
 
+-- !test_single_col_null_format_text_count --
+5
+
+-- !test_single_col_null_format_text_values --
+5      3       2       1       1       1
+
+-- !test_default_null_format_multi_col_text_count --
+3
+
+-- !test_default_null_format_multi_col_text_values --
+3      2       1       1       1       2       0       1
+
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy 
b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
index 70306968852..24efc34f448 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
@@ -63,6 +63,26 @@ suite("test_hive_serde_prop", "p0,external") {
         qt_test_empty_null_defined_text """select * from 
${catalog_name}.regression.test_empty_null_defined_text order by id;"""
         qt_test_empty_null_defined_text2 """select * from 
${catalog_name}.regression.test_empty_null_defined_text where name is null 
order by id;"""
         qt_test_empty_null_defined_text3 """select * from 
${catalog_name}.regression.test_empty_null_defined_text where name = '' order 
by id;"""
+
+        qt_test_single_col_null_format_text_count """select count(*) from 
${catalog_name}.regression.test_single_col_null_format_text;"""
+        qt_test_single_col_null_format_text_values """
+            select count(*), count(name), count(case when name is null then 1 
end),
+                   count(case when name = '' then 1 end),
+                   count(case when name = 'non-null' then 1 end),
+                   count(case when name is not null and name not in ('', 
'non-null') then 1 end)
+            from ${catalog_name}.regression.test_single_col_null_format_text;
+        """
+
+        qt_test_default_null_format_multi_col_text_count """select count(*) 
from ${catalog_name}.regression.test_default_null_format_multi_col_text;"""
+        qt_test_default_null_format_multi_col_text_values """
+            select count(*), count(c1), count(c2),
+                   count(case when c1 is null then 1 end),
+                   count(case when c1 = '' then 1 end),
+                   count(case when c2 is null then 1 end),
+                   count(case when c2 = '' then 1 end),
+                   count(case when c1 = 'a' and c2 = 'b' then 1 end)
+            from 
${catalog_name}.regression.test_default_null_format_multi_col_text;
+        """
     }
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to