This is an automated email from the ASF dual-hosted git repository.

djwang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry-pxf.git

commit e56511ca07c17efd5ef0c2090b595b02bd15e251
Author: Bradford D. Boyle <[email protected]>
AuthorDate: Mon Sep 19 13:27:48 2022 -0700

    Relax the requirement that C string length matches Java string length (#870)
    
    Commit 94f8ccad added a check that the length of received strings (as
    determined by strnlen) matched the length of the string as determined by
    the Java-based PXF service; if they did not, this was treated as a fatal
    error. Some users have reported that they have ORC/Parquet files with
    strings that contain ASCCII NUL-bytes. These strings would not have a
    strnlen calculate length that matches with the Java string lenght.
    
    This commit removes the requirements that the lengths be equal and
    instead logs a debug message when they do not match.
    
    Authored-by: Bradford D. Boyle <[email protected]>
---
 .../pxf/automation/features/orc/OrcReadTest.java   |  14 ++++++++++++
 automation/src/test/resources/data/orc/README.md   |   7 ++++++
 .../resources/data/orc/orc_null_in_string.json     |   6 +++++
 .../test/resources/data/orc/orc_null_in_string.orc | Bin 0 -> 719 bytes
 .../features/orc/read/null_in_string/__init__.py   |   0
 .../orc/read/null_in_string/expected/query01.ans   |  13 +++++++++++
 .../features/orc/read/null_in_string/runTest.py    |  11 +++++++++
 .../orc/read/null_in_string/sql/query01.sql        |   3 +++
 external-table/src/gpdbwritableformatter.c         |  25 ++++++++++++---------
 9 files changed, 69 insertions(+), 10 deletions(-)

diff --git 
a/automation/src/test/java/org/greenplum/pxf/automation/features/orc/OrcReadTest.java
 
b/automation/src/test/java/org/greenplum/pxf/automation/features/orc/OrcReadTest.java
index 1e0feed3..de41d36b 100644
--- 
a/automation/src/test/java/org/greenplum/pxf/automation/features/orc/OrcReadTest.java
+++ 
b/automation/src/test/java/org/greenplum/pxf/automation/features/orc/OrcReadTest.java
@@ -13,6 +13,7 @@ public class OrcReadTest extends BaseFeature {
     private static final String ORC_PRIMITIVE_TYPES_UNORDERED_SUBSET = 
"orc_types_unordered_subset.orc";
     private static final String ORC_LIST_TYPES = "orc_list_types.orc";
     private static final String ORC_MULTIDIM_LIST_TYPES = 
"orc_multidim_list_types.orc";
+    private static final String ORC_NULL_IN_STRING = "orc_null_in_string.orc";
 
     private static final String[] ORC_TABLE_COLUMNS = {
             "id      integer",
@@ -73,6 +74,12 @@ public class OrcReadTest extends BaseFeature {
             "varchar_arr  text[]"
     };
 
+    private static final String[] ORC_NULL_IN_STRING_COLUMNS = new String[]{
+            "id      int",
+            "context text",
+            "value   text"
+    };
+
     private String hdfsPath;
     private ProtocolEnum protocol;
 
@@ -87,6 +94,7 @@ public class OrcReadTest extends BaseFeature {
         hdfs.copyFromLocal(resourcePath + 
ORC_PRIMITIVE_TYPES_UNORDERED_SUBSET, hdfsPath + 
ORC_PRIMITIVE_TYPES_UNORDERED_SUBSET);
         hdfs.copyFromLocal(resourcePath + ORC_LIST_TYPES, hdfsPath + 
ORC_LIST_TYPES);
         hdfs.copyFromLocal(resourcePath + ORC_MULTIDIM_LIST_TYPES, hdfsPath + 
ORC_MULTIDIM_LIST_TYPES);
+        hdfs.copyFromLocal(resourcePath + ORC_NULL_IN_STRING, hdfsPath + 
ORC_NULL_IN_STRING);
 
         prepareReadableExternalTable(PXF_ORC_TABLE, ORC_TABLE_COLUMNS, 
hdfsPath + ORC_PRIMITIVE_TYPES);
     }
@@ -146,6 +154,12 @@ public class OrcReadTest extends BaseFeature {
         runTincTest("pxf.features.orc.read.multidim_list_types.runTest");
     }
 
+    @Test(groups = {"features", "gpdb", "security", "hcfs"})
+    public void orcReadStringsContainingNullByte() throws Exception {
+        prepareReadableExternalTable("pxf_orc_null_in_string", 
ORC_NULL_IN_STRING_COLUMNS, hdfsPath + ORC_NULL_IN_STRING);
+        runTincTest("pxf.features.orc.read.null_in_string.runTest");
+    }
+
     private void prepareReadableExternalTable(String name, String[] fields, 
String path) throws Exception {
         prepareReadableExternalTable(name, fields, path, false);
     }
diff --git a/automation/src/test/resources/data/orc/README.md 
b/automation/src/test/resources/data/orc/README.md
index 62e5ceb6..0b143878 100644
--- a/automation/src/test/resources/data/orc/README.md
+++ b/automation/src/test/resources/data/orc/README.md
@@ -103,3 +103,10 @@ If desired, you can copy down the CSV file by running the 
following command:
 mv "./orc_multidim_list_types/000000_0" "./${CSV_FILENAME}"
 ```
 
+## Generate the orc_null_in_string.orc file
+
+To generate this file, you will need to download a copy of the ORC Tools uber 
jar: 
<https://search.maven.org/remotecontent?filepath=org/apache/orc/orc-tools/1.8.0/orc-tools-1.8.0-uber.jar>
+
+```shell
+java -jar orc-tools-1.8.0-uber.jar convert --schema 
'struct<id:int,desc:string,value:string>' --output orc_null_in_string.orc 
orc_null_in_string.json
+```
diff --git a/automation/src/test/resources/data/orc/orc_null_in_string.json 
b/automation/src/test/resources/data/orc/orc_null_in_string.json
new file mode 100644
index 00000000..e918aad6
--- /dev/null
+++ b/automation/src/test/resources/data/orc/orc_null_in_string.json
@@ -0,0 +1,6 @@
+{"id": 1, "context": "simple string", "value": "hello"}
+{"id": 2, "context": "simple string with space", "value": "hello world"}
+{"id": 3, "context": "simple string with double quote", "value": "hello 
\"world\""}
+{"id": 4, "context": "NUL-byte in middle of string", "value": 
"hello\u0000world"}
+{"id": 5, "context": "NUL-byte at the beginning of string", "value": 
"\u0000hello world"}
+{"id": 6, "context": "NUL-byte at the end of string", "value": "hello 
world\u0000"}
diff --git a/automation/src/test/resources/data/orc/orc_null_in_string.orc 
b/automation/src/test/resources/data/orc/orc_null_in_string.orc
new file mode 100644
index 00000000..3cc1e6ae
Binary files /dev/null and 
b/automation/src/test/resources/data/orc/orc_null_in_string.orc differ
diff --git 
a/automation/tincrepo/main/pxf/features/orc/read/null_in_string/__init__.py 
b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git 
a/automation/tincrepo/main/pxf/features/orc/read/null_in_string/expected/query01.ans
 
b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/expected/query01.ans
new file mode 100644
index 00000000..bbfd641a
--- /dev/null
+++ 
b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/expected/query01.ans
@@ -0,0 +1,13 @@
+-- start_ignore
+-- end_ignore
+-- @description query01 for reading strings contain NUL-byte from ORC files
+SELECT * FROM pxf_orc_null_in_string ORDER BY id;
+ id |               context               |     value
+----+-------------------------------------+---------------
+  1 | simple string                       | hello
+  2 | simple string with space            | hello world
+  3 | simple string with double quote     | hello "world"
+  4 | NUL-byte in middle of string        | hello
+  5 | NUL-byte at the beginning of string |
+  6 | NUL-byte at the end of string       | hello world
+(6 rows)
\ No newline at end of file
diff --git 
a/automation/tincrepo/main/pxf/features/orc/read/null_in_string/runTest.py 
b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/runTest.py
new file mode 100644
index 00000000..916deefe
--- /dev/null
+++ b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/runTest.py
@@ -0,0 +1,11 @@
+from mpp.models import SQLConcurrencyTestCase
+
+class OrcNullInString(SQLConcurrencyTestCase):
+    """
+    @db_name pxfautomation
+    @concurrency 1
+    @gpdiff True
+    """
+    sql_dir = 'sql'
+    ans_dir = 'expected'
+    out_dir = 'output'
diff --git 
a/automation/tincrepo/main/pxf/features/orc/read/null_in_string/sql/query01.sql 
b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/sql/query01.sql
new file mode 100644
index 00000000..52e65cb3
--- /dev/null
+++ 
b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/sql/query01.sql
@@ -0,0 +1,3 @@
+-- @description query01 for reading strings contain NUL-byte from ORC files
+
+SELECT * FROM pxf_orc_null_in_string ORDER BY id;
diff --git a/external-table/src/gpdbwritableformatter.c 
b/external-table/src/gpdbwritableformatter.c
index 835c9a04..7db9f30b 100644
--- a/external-table/src/gpdbwritableformatter.c
+++ b/external-table/src/gpdbwritableformatter.c
@@ -960,19 +960,24 @@ gpdbwritableformatter_import(PG_FUNCTION_ARGS)
                                 * Read string value from data_buf and convert 
it to internal representation
                                 *
                                 * PXF service should send all strings with a 
terminating nul-byte, so we can
-                                * determine the length of the string and 
compare it with the length that PXF
-                                * service computed and sent. Since we've 
checked that outlen[i] is no larger
-                                * than the number of bytes left for this tuple 
in data_buf, we use it as max
-                                * number of bytes to scan.
+                                * determine the number of bytes that the input 
function will read and compare
+                                * it with the length that the PXF service 
computed and sent. Since we've
+                                * checked that outlen[i] is no larger than the 
number of bytes left for this
+                                * tuple in data_buf, we use it as max number 
of bytes to scan in the call to
+                                * strnlen
                                 */
                                size_t actual_len = strnlen(data_buf + bufidx, 
myData->outlen[i]);
 
-                               /* outlen[i] includes the terminating nul-byte 
*/
-                               if (actual_len != myData->outlen[i] - 1)
-                                       ereport(FATAL,
-                                                       
(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
-                                                                       
errmsg("data for column %d of row %d has invalid length", i + 1, 
myData->lineno),
-                                                                       
errdetail("expected column %d to have length %d, actual length is %ld", i + 1, 
myData->outlen[i] - 1, actual_len)));
+                               /*
+                                * Compare the length as returned by strnlen 
with the length as determined by
+                                * the PXF service. If the source data includes 
ASCII NUL-byte in the string,
+                                * then these two values will not match. It's 
possible that this will result
+                                * in the Postgres input function 
truncating/mis-reading the value. Rather
+                                * than treat this as an error here, we log a 
debug message.
+                                */
+                               if (actual_len != myData->outlen[i]-1)
+                                       ereport(DEBUG1,
+                                                       (errmsg("expected 
column %d of row %d to have length %d, actual length is %ld", i+1, 
myData->lineno, myData->outlen[i]-1, actual_len)));
 
                                myData->values[i] = InputFunctionCall(iofunc,
                                                                                
                          data_buf + bufidx,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to