This is an automated email from the ASF dual-hosted git repository. djwang pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry-pxf.git
commit e56511ca07c17efd5ef0c2090b595b02bd15e251 Author: Bradford D. Boyle <[email protected]> AuthorDate: Mon Sep 19 13:27:48 2022 -0700 Relax the requirement that C string length matches Java string length (#870) Commit 94f8ccad added a check that the length of received strings (as determined by strnlen) matched the length of the string as determined by the Java-based PXF service; if they did not, this was treated as a fatal error. Some users have reported that they have ORC/Parquet files with strings that contain ASCCII NUL-bytes. These strings would not have a strnlen calculate length that matches with the Java string lenght. This commit removes the requirements that the lengths be equal and instead logs a debug message when they do not match. Authored-by: Bradford D. Boyle <[email protected]> --- .../pxf/automation/features/orc/OrcReadTest.java | 14 ++++++++++++ automation/src/test/resources/data/orc/README.md | 7 ++++++ .../resources/data/orc/orc_null_in_string.json | 6 +++++ .../test/resources/data/orc/orc_null_in_string.orc | Bin 0 -> 719 bytes .../features/orc/read/null_in_string/__init__.py | 0 .../orc/read/null_in_string/expected/query01.ans | 13 +++++++++++ .../features/orc/read/null_in_string/runTest.py | 11 +++++++++ .../orc/read/null_in_string/sql/query01.sql | 3 +++ external-table/src/gpdbwritableformatter.c | 25 ++++++++++++--------- 9 files changed, 69 insertions(+), 10 deletions(-) diff --git a/automation/src/test/java/org/greenplum/pxf/automation/features/orc/OrcReadTest.java b/automation/src/test/java/org/greenplum/pxf/automation/features/orc/OrcReadTest.java index 1e0feed3..de41d36b 100644 --- a/automation/src/test/java/org/greenplum/pxf/automation/features/orc/OrcReadTest.java +++ b/automation/src/test/java/org/greenplum/pxf/automation/features/orc/OrcReadTest.java @@ -13,6 +13,7 @@ public class OrcReadTest extends BaseFeature { private static final String ORC_PRIMITIVE_TYPES_UNORDERED_SUBSET = "orc_types_unordered_subset.orc"; private static final String ORC_LIST_TYPES = "orc_list_types.orc"; private static final String ORC_MULTIDIM_LIST_TYPES = "orc_multidim_list_types.orc"; + private static final String ORC_NULL_IN_STRING = "orc_null_in_string.orc"; private static final String[] ORC_TABLE_COLUMNS = { "id integer", @@ -73,6 +74,12 @@ public class OrcReadTest extends BaseFeature { "varchar_arr text[]" }; + private static final String[] ORC_NULL_IN_STRING_COLUMNS = new String[]{ + "id int", + "context text", + "value text" + }; + private String hdfsPath; private ProtocolEnum protocol; @@ -87,6 +94,7 @@ public class OrcReadTest extends BaseFeature { hdfs.copyFromLocal(resourcePath + ORC_PRIMITIVE_TYPES_UNORDERED_SUBSET, hdfsPath + ORC_PRIMITIVE_TYPES_UNORDERED_SUBSET); hdfs.copyFromLocal(resourcePath + ORC_LIST_TYPES, hdfsPath + ORC_LIST_TYPES); hdfs.copyFromLocal(resourcePath + ORC_MULTIDIM_LIST_TYPES, hdfsPath + ORC_MULTIDIM_LIST_TYPES); + hdfs.copyFromLocal(resourcePath + ORC_NULL_IN_STRING, hdfsPath + ORC_NULL_IN_STRING); prepareReadableExternalTable(PXF_ORC_TABLE, ORC_TABLE_COLUMNS, hdfsPath + ORC_PRIMITIVE_TYPES); } @@ -146,6 +154,12 @@ public class OrcReadTest extends BaseFeature { runTincTest("pxf.features.orc.read.multidim_list_types.runTest"); } + @Test(groups = {"features", "gpdb", "security", "hcfs"}) + public void orcReadStringsContainingNullByte() throws Exception { + prepareReadableExternalTable("pxf_orc_null_in_string", ORC_NULL_IN_STRING_COLUMNS, hdfsPath + ORC_NULL_IN_STRING); + runTincTest("pxf.features.orc.read.null_in_string.runTest"); + } + private void prepareReadableExternalTable(String name, String[] fields, String path) throws Exception { prepareReadableExternalTable(name, fields, path, false); } diff --git a/automation/src/test/resources/data/orc/README.md b/automation/src/test/resources/data/orc/README.md index 62e5ceb6..0b143878 100644 --- a/automation/src/test/resources/data/orc/README.md +++ b/automation/src/test/resources/data/orc/README.md @@ -103,3 +103,10 @@ If desired, you can copy down the CSV file by running the following command: mv "./orc_multidim_list_types/000000_0" "./${CSV_FILENAME}" ``` +## Generate the orc_null_in_string.orc file + +To generate this file, you will need to download a copy of the ORC Tools uber jar: <https://search.maven.org/remotecontent?filepath=org/apache/orc/orc-tools/1.8.0/orc-tools-1.8.0-uber.jar> + +```shell +java -jar orc-tools-1.8.0-uber.jar convert --schema 'struct<id:int,desc:string,value:string>' --output orc_null_in_string.orc orc_null_in_string.json +``` diff --git a/automation/src/test/resources/data/orc/orc_null_in_string.json b/automation/src/test/resources/data/orc/orc_null_in_string.json new file mode 100644 index 00000000..e918aad6 --- /dev/null +++ b/automation/src/test/resources/data/orc/orc_null_in_string.json @@ -0,0 +1,6 @@ +{"id": 1, "context": "simple string", "value": "hello"} +{"id": 2, "context": "simple string with space", "value": "hello world"} +{"id": 3, "context": "simple string with double quote", "value": "hello \"world\""} +{"id": 4, "context": "NUL-byte in middle of string", "value": "hello\u0000world"} +{"id": 5, "context": "NUL-byte at the beginning of string", "value": "\u0000hello world"} +{"id": 6, "context": "NUL-byte at the end of string", "value": "hello world\u0000"} diff --git a/automation/src/test/resources/data/orc/orc_null_in_string.orc b/automation/src/test/resources/data/orc/orc_null_in_string.orc new file mode 100644 index 00000000..3cc1e6ae Binary files /dev/null and b/automation/src/test/resources/data/orc/orc_null_in_string.orc differ diff --git a/automation/tincrepo/main/pxf/features/orc/read/null_in_string/__init__.py b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/automation/tincrepo/main/pxf/features/orc/read/null_in_string/expected/query01.ans b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/expected/query01.ans new file mode 100644 index 00000000..bbfd641a --- /dev/null +++ b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/expected/query01.ans @@ -0,0 +1,13 @@ +-- start_ignore +-- end_ignore +-- @description query01 for reading strings contain NUL-byte from ORC files +SELECT * FROM pxf_orc_null_in_string ORDER BY id; + id | context | value +----+-------------------------------------+--------------- + 1 | simple string | hello + 2 | simple string with space | hello world + 3 | simple string with double quote | hello "world" + 4 | NUL-byte in middle of string | hello + 5 | NUL-byte at the beginning of string | + 6 | NUL-byte at the end of string | hello world +(6 rows) \ No newline at end of file diff --git a/automation/tincrepo/main/pxf/features/orc/read/null_in_string/runTest.py b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/runTest.py new file mode 100644 index 00000000..916deefe --- /dev/null +++ b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/runTest.py @@ -0,0 +1,11 @@ +from mpp.models import SQLConcurrencyTestCase + +class OrcNullInString(SQLConcurrencyTestCase): + """ + @db_name pxfautomation + @concurrency 1 + @gpdiff True + """ + sql_dir = 'sql' + ans_dir = 'expected' + out_dir = 'output' diff --git a/automation/tincrepo/main/pxf/features/orc/read/null_in_string/sql/query01.sql b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/sql/query01.sql new file mode 100644 index 00000000..52e65cb3 --- /dev/null +++ b/automation/tincrepo/main/pxf/features/orc/read/null_in_string/sql/query01.sql @@ -0,0 +1,3 @@ +-- @description query01 for reading strings contain NUL-byte from ORC files + +SELECT * FROM pxf_orc_null_in_string ORDER BY id; diff --git a/external-table/src/gpdbwritableformatter.c b/external-table/src/gpdbwritableformatter.c index 835c9a04..7db9f30b 100644 --- a/external-table/src/gpdbwritableformatter.c +++ b/external-table/src/gpdbwritableformatter.c @@ -960,19 +960,24 @@ gpdbwritableformatter_import(PG_FUNCTION_ARGS) * Read string value from data_buf and convert it to internal representation * * PXF service should send all strings with a terminating nul-byte, so we can - * determine the length of the string and compare it with the length that PXF - * service computed and sent. Since we've checked that outlen[i] is no larger - * than the number of bytes left for this tuple in data_buf, we use it as max - * number of bytes to scan. + * determine the number of bytes that the input function will read and compare + * it with the length that the PXF service computed and sent. Since we've + * checked that outlen[i] is no larger than the number of bytes left for this + * tuple in data_buf, we use it as max number of bytes to scan in the call to + * strnlen */ size_t actual_len = strnlen(data_buf + bufidx, myData->outlen[i]); - /* outlen[i] includes the terminating nul-byte */ - if (actual_len != myData->outlen[i] - 1) - ereport(FATAL, - (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), - errmsg("data for column %d of row %d has invalid length", i + 1, myData->lineno), - errdetail("expected column %d to have length %d, actual length is %ld", i + 1, myData->outlen[i] - 1, actual_len))); + /* + * Compare the length as returned by strnlen with the length as determined by + * the PXF service. If the source data includes ASCII NUL-byte in the string, + * then these two values will not match. It's possible that this will result + * in the Postgres input function truncating/mis-reading the value. Rather + * than treat this as an error here, we log a debug message. + */ + if (actual_len != myData->outlen[i]-1) + ereport(DEBUG1, + (errmsg("expected column %d of row %d to have length %d, actual length is %ld", i+1, myData->lineno, myData->outlen[i]-1, actual_len))); myData->values[i] = InputFunctionCall(iofunc, data_buf + bufidx, --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
