(cloudberry) branch main updated: Fix COPY FROM encoding error double-counting and enable SREH for transcoding

avamingli Tue, 03 Mar 2026 02:57:17 -0800

This is an automated email from the ASF dual-hosted git repository.

avamingli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git



The following commit(s) were added to refs/heads/main by this push:
     new c5a298db737 Fix COPY FROM encoding error double-counting and enable 
SREH for transcoding
c5a298db737 is described below

commit c5a298db737fbefdb86fa99681b2e7732c90a39a
Author: Zhang Mingli <[email protected]>
AuthorDate: Tue Mar 3 08:56:25 2026 +0800

    Fix COPY FROM encoding error double-counting and enable SREH for transcoding
    
    COPY FROM with SEGMENT REJECT LIMIT had two bugs when encountering
    invalid multi-byte encoding sequences:
    
    1. Encoding errors were double-counted: HandleCopyError() incremented
       rejectcount, then RemoveInvalidDataInBuf() incremented it again for
       the same error. This caused the reject limit to be reached twice as
       fast as expected.
    
    2. SREH (Single Row Error Handling) was completely disabled when
       transcoding was required (file encoding != database encoding). Any
       encoding error during transcoding would raise an ERROR instead of
       skipping the bad row.
    
    Fix by removing the duplicate rejectcount++ from RemoveInvalidDataInBuf(),
    removing the !need_transcoding guard that blocked SREH for transcoding,
    and adding proper buffer cleanup for the transcoding case (advance
    raw_buf past the bad line using FindEolInUnverifyRawBuf).
    
    Add regression tests covering both non-transcoding (invalid UTF-8) and
    transcoding (invalid EUC_CN to UTF-8) cases with various reject limits.
    
    Fixes https://github.com/apache/cloudberry/issues/1425
---
 src/backend/commands/copyfromparse.c               |  46 ++++--
 src/test/regress/data/copy_enc_err_euccn.data      |   3 +
 .../regress/data/copy_enc_err_euccn_multi.data     |   5 +
 src/test/regress/data/copy_enc_err_utf8.data       |   3 +
 src/test/regress/data/copy_enc_err_utf8_multi.data |   5 +
 src/test/regress/expected/.gitignore               |   1 +
 src/test/regress/greenplum_schedule                |   1 +
 src/test/regress/input/copy_encoding_error.source  | 107 ++++++++++++++
 src/test/regress/output/copy_encoding_error.source | 161 +++++++++++++++++++++
 src/test/regress/sql/.gitignore                    |   1 +
 10 files changed, 318 insertions(+), 15 deletions(-)

diff --git a/src/backend/commands/copyfromparse.c 
b/src/backend/commands/copyfromparse.c
index 5d1ff1b53c8..6bf309478aa 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -649,8 +649,7 @@ sreh_conversion_error:
                 */
                if (cstate->input_reached_error)
                {
-                       /* so far, we only support no transcoding conversion 
error handling */
-                       if (cstate->cdbsreh && !cstate->need_transcoding)
+                       if (cstate->cdbsreh)
                        {
                                MemoryContext oldcontext = CurrentMemoryContext;
                                PG_TRY();
@@ -1788,7 +1787,6 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo 
*flinfo,
 void
 RemoveInvalidDataInBuf(CopyFromState cstate)
 {
-       int nbytes;
        int scanidx;
 
        if (cstate->errMode == ALL_OR_NOTHING)
@@ -1800,6 +1798,8 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
 
        if (!cstate->need_transcoding)
        {
+               int nbytes;
+
                /*
                 * According to `BeginCopyFrom`, if not need_transcoding these 
two
                 * pointer share one memory space.
@@ -1826,22 +1826,38 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
                        /* leave a hint to identify find eol after next raw 
page read */
                        cstate->find_eol_with_rawreading = true;
                }
-
-               /* reset input buf, so we can redo conversion/verification */
-               cstate->input_reached_error = false;
-               cstate->input_buf_index = 0;
-               cstate->input_buf_len = 0;
-
-               /* reset line_buf */
-               resetStringInfo(&cstate->line_buf);
-               cstate->line_buf_valid = false;
-               cstate->cdbsreh->rejectcount++;
        }
        else
        {
-               ereport(ERROR, (errmsg("Data validation error: since the source 
data "
-                                                          "need transcoding 
sreh can not handle yet.")));
+               /*
+                * Transcoding case: raw_buf and input_buf are separate buffers.
+                * Skip the bad line in raw_buf by finding the next EOL.  No 
need to
+                * memmove raw_buf here; CopyLoadRawBuf() will compact it when 
more
+                * raw data is needed.
+                */
+               if (FindEolInUnverifyRawBuf(cstate, &scanidx))
+               {
+                       cstate->raw_buf_index += scanidx;
+               }
+               else
+               {
+                       /* Current page can not find eol, to skip current raw 
buffer */
+                       cstate->raw_buf_len = 0;
+                       cstate->raw_buf_index = 0;
+
+                       /* leave a hint to identify find eol after next raw 
page read */
+                       cstate->find_eol_with_rawreading = true;
+               }
        }
+
+       /* reset input buf, so we can redo conversion/verification */
+       cstate->input_reached_error = false;
+       cstate->input_buf_index = 0;
+       cstate->input_buf_len = 0;
+
+       /* reset line_buf */
+       resetStringInfo(&cstate->line_buf);
+       cstate->line_buf_valid = false;
 }
 
 static bool
diff --git a/src/test/regress/data/copy_enc_err_euccn.data 
b/src/test/regress/data/copy_enc_err_euccn.data
new file mode 100644
index 00000000000..a73dd217cd3
--- /dev/null
+++ b/src/test/regress/data/copy_enc_err_euccn.data
@@ -0,0 +1,3 @@
+1|good1
+2|bad�
+3|good3
diff --git a/src/test/regress/data/copy_enc_err_euccn_multi.data 
b/src/test/regress/data/copy_enc_err_euccn_multi.data
new file mode 100644
index 00000000000..8a5a30ae25e
--- /dev/null
+++ b/src/test/regress/data/copy_enc_err_euccn_multi.data
@@ -0,0 +1,5 @@
+1|good1
+2|bad�
+3|good3
+4|bad�
+5|good5
diff --git a/src/test/regress/data/copy_enc_err_utf8.data 
b/src/test/regress/data/copy_enc_err_utf8.data
new file mode 100644
index 00000000000..e8297cc05ca
--- /dev/null
+++ b/src/test/regress/data/copy_enc_err_utf8.data
@@ -0,0 +1,3 @@
+1|good1
+2|bad�
+3|good3
diff --git a/src/test/regress/data/copy_enc_err_utf8_multi.data 
b/src/test/regress/data/copy_enc_err_utf8_multi.data
new file mode 100644
index 00000000000..9544e7d3c13
--- /dev/null
+++ b/src/test/regress/data/copy_enc_err_utf8_multi.data
@@ -0,0 +1,5 @@
+1|good1
+2|bad�
+3|good3
+4|bad�
+5|good5
diff --git a/src/test/regress/expected/.gitignore 
b/src/test/regress/expected/.gitignore
index 927156dbc3f..c837ca324d5 100644
--- a/src/test/regress/expected/.gitignore
+++ b/src/test/regress/expected/.gitignore
@@ -69,3 +69,4 @@
 /tag.out
 /ao_unique_index_partition.out
 /bfv_copy.out
+/copy_encoding_error.out
diff --git a/src/test/regress/greenplum_schedule 
b/src/test/regress/greenplum_schedule
index 039e8d7e9c4..3c8f7965b28 100755
--- a/src/test/regress/greenplum_schedule
+++ b/src/test/regress/greenplum_schedule
@@ -35,6 +35,7 @@ test: gp_dispatch_keepalives
 # copy command
 # copy form a file with different EOL
 test: copy_eol
+test: copy_encoding_error
 
 test: dedupset
 
diff --git a/src/test/regress/input/copy_encoding_error.source 
b/src/test/regress/input/copy_encoding_error.source
new file mode 100644
index 00000000000..6302442b761
--- /dev/null
+++ b/src/test/regress/input/copy_encoding_error.source
@@ -0,0 +1,107 @@
+--
+-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
+--
+-- Regression test for https://github.com/apache/cloudberry/issues/1425
+-- COPY FROM should correctly count encoding errors as single rejected rows,
+-- not double-count them.  Also, encoding error SREH should work when
+-- transcoding is required.
+--
+
+-- ===================================================================
+-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
+--
+-- The file has 3 lines:
+--   line 1: valid
+--   line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
+--   line 3: valid
+--
+-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
+-- and 1 < 2.  Before the fix, the error was double-counted (counted as 2),
+-- which would cause the reject limit to be reached on the next error check.
+-- ===================================================================
+
+CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+-- Verify that valid rows (lines 1 and 3) were imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Verify that exactly 1 error was logged (not 2).
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 2: Non-transcoding with multiple bad lines
+--
+-- The file has 5 lines: lines 2 and 4 are bad.
+-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' 
DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+
+-- All 3 valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Exactly 2 errors should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 3: Non-transcoding, reject limit reached correctly
+--
+-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
+-- rejectcount (2) >= rejectlimit (2).
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' 
DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+SELECT gp_truncate_error_log('copy_enc_err');
+
+-- ===================================================================
+-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
+--
+-- The file has 3 lines with data that claims to be EUC_CN:
+--   line 1: valid ASCII (valid in EUC_CN)
+--   line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
+--   line 3: valid ASCII (valid in EUC_CN)
+--
+-- Before the fix, this would error with:
+--   "Data validation error: since the source data need transcoding
+--    sreh can not handle yet."
+-- After the fix, it should skip line 2 and import lines 1 and 3.
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER 
'|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+-- Valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Exactly 1 error should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 5: Transcoding with multiple bad lines
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' 
DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+
+SELECT * FROM copy_enc_err ORDER BY a;
+
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+-- Cleanup
+SELECT gp_truncate_error_log('copy_enc_err');
+DROP TABLE copy_enc_err;
diff --git a/src/test/regress/output/copy_encoding_error.source 
b/src/test/regress/output/copy_encoding_error.source
new file mode 100644
index 00000000000..3beceb6b2f3
--- /dev/null
+++ b/src/test/regress/output/copy_encoding_error.source
@@ -0,0 +1,161 @@
+--
+-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
+--
+-- Regression test for https://github.com/apache/cloudberry/issues/1425
+-- COPY FROM should correctly count encoding errors as single rejected rows,
+-- not double-count them.  Also, encoding error SREH should work when
+-- transcoding is required.
+--
+-- ===================================================================
+-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
+--
+-- The file has 3 lines:
+--   line 1: valid
+--   line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
+--   line 3: valid
+--
+-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
+-- and 1 < 2.  Before the fix, the error was double-counted (counted as 2),
+-- which would cause the reject limit to be reached on the next error check.
+-- ===================================================================
+CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+NOTICE:  found 1 data formatting errors (1 or more input rows), rejected 
related input data
+-- Verify that valid rows (lines 1 and 3) were imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+(2 rows)
+
+-- Verify that exactly 1 error was logged (not 2).
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           1
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 2: Non-transcoding with multiple bad lines
+--
+-- The file has 5 lines: lines 2 and 4 are bad.
+-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' 
DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+NOTICE:  found 2 data formatting errors (2 or more input rows), rejected 
related input data
+-- All 3 valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+ 5 | good5
+(3 rows)
+
+-- Exactly 2 errors should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           2
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 3: Non-transcoding, reject limit reached correctly
+--
+-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
+-- rejectcount (2) >= rejectlimit (2).
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' 
DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+ERROR:  segment reject limit reached, aborting operation
+DETAIL:  Last error was: invalid byte sequence for encoding "UTF8": 0xfe
+CONTEXT:  COPY copy_enc_err, line 3
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+-- ===================================================================
+-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
+--
+-- The file has 3 lines with data that claims to be EUC_CN:
+--   line 1: valid ASCII (valid in EUC_CN)
+--   line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
+--   line 3: valid ASCII (valid in EUC_CN)
+--
+-- Before the fix, this would error with:
+--   "Data validation error: since the source data need transcoding
+--    sreh can not handle yet."
+-- After the fix, it should skip line 2 and import lines 1 and 3.
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER 
'|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+NOTICE:  found 1 data formatting errors (1 or more input rows), rejected 
related input data
+-- Valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+(2 rows)
+
+-- Exactly 1 error should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           1
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 5: Transcoding with multiple bad lines
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' 
DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+NOTICE:  found 2 data formatting errors (2 or more input rows), rejected 
related input data
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+ 5 | good5
+(3 rows)
+
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           2
+(1 row)
+
+-- Cleanup
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+DROP TABLE copy_enc_err;
diff --git a/src/test/regress/sql/.gitignore b/src/test/regress/sql/.gitignore
index 7cbb805e8f5..9b5f3660fa7 100644
--- a/src/test/regress/sql/.gitignore
+++ b/src/test/regress/sql/.gitignore
@@ -63,3 +63,4 @@
 /tag.sql
 /ao_unique_index_partition.sql
 /bfv_copy.sql
+/copy_encoding_error.sql


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry) branch main updated: Fix COPY FROM encoding error double-counting and enable SREH for transcoding

Reply via email to