This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 85cd07a11e876f3d8773f2638f699c61a6b0dd4c Author: pranavyl <pranav.lo...@cloudera.com> AuthorDate: Fri Dec 8 19:11:28 2023 +0100 IMPALA-11499: Refactor UrlEncode function to handle special characters An error came from an issue with URL encoding, where certain Unicode characters were being incorrectly encoded due to their UTF-8 representation matching characters in the set of characters to escape. For example, the string '运', which consists of three bytes 0xe8 0xbf 0x90 was wrongly getting encoded into '\E8%FFFFFFBF\90', because the middle byte matched one of the two bytes that represented the "\u00FF" literal. Inclusion of "\u00FF" was likely a mistake from the beginning and it should have been '\x7F'. The patch makes three key changes: 1. Before the change, the set of characters that need to be escaped was stored as a string. The current patch uses an unordered_set instead. 2. '\xFF', which is an invalid UTF-8 byte and whose inclusion was erroneous from the beginning, is replaced with '\x7F', which is a control character for DELETE, ensuring consistency and correctness in URL encoding. 3. The list of characters to be escaped is extended to match the current list in Hive. Testing: Tests on both traditional Hive tables and Iceberg tables are included in unicode-column-name.test, insert.test, coding-util-test.cc and test_insert.py. Change-Id: I88c4aba5d811dfcec809583d0c16fcbc0ca730fb Reviewed-on: http://gerrit.cloudera.org:8080/21131 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- be/src/util/coding-util-test.cc | 14 ++++- be/src/util/coding-util.cc | 39 ++++++++------ .../functional-query/queries/QueryTest/insert.test | 18 +++++++ .../queries/QueryTest/unicode-column-name.test | 36 +++++++++++++ tests/query_test/test_insert.py | 63 ++++++++++++++++++++++ 5 files changed, 151 insertions(+), 19 deletions(-) diff --git a/be/src/util/coding-util-test.cc b/be/src/util/coding-util-test.cc index ad04222ae..7914de148 100644 --- a/be/src/util/coding-util-test.cc +++ b/be/src/util/coding-util-test.cc @@ -113,8 +113,18 @@ TEST(UrlCodingTest, BlankString) { } TEST(UrlCodingTest, PathSeparators) { - TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", false); - TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", true); + string test_path = "/home/impala/directory/"; + string encoded_test_path = "%2Fhome%2Fimpala%2Fdirectory%2F"; + TestUrl(test_path, encoded_test_path, false); + TestUrl(test_path, encoded_test_path, true); + string test = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r\x0E\x0F\x10" + "\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\"#%'" + "*/:=?\\{[]^"; + string encoded_test = "SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" + "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%7F%22%23%25" + "%27%2A%2F%3A%3D%3F%5C%7B%5B%5D%5E"; + TestUrl(test, encoded_test, false); + TestUrl(test, encoded_test, true); } TEST(Base64Test, Basic) { diff --git a/be/src/util/coding-util.cc b/be/src/util/coding-util.cc index a78a3791d..7a7d1f6ae 100644 --- a/be/src/util/coding-util.cc +++ b/be/src/util/coding-util.cc @@ -18,16 +18,17 @@ #include "util/coding-util.h" #include <cctype> +#include <iomanip> #include <limits> #include <sstream> +#include <unordered_set> -#include <boost/algorithm/string/classification.hpp> +#include <boost/algorithm/string.hpp> #include <boost/function.hpp> #include <sasl/sasl.h> #include "common/compiler-util.h" #include "common/logging.h" - #include "common/names.h" #include "sasl/saslutil.h" @@ -37,33 +38,37 @@ using std::uppercase; namespace impala { +// It is more convenient to maintain the complement of the set of +// characters to escape when not in Hive-compat mode. +static function<bool (char)> ShouldNotEscape = is_any_of("-_.~"); + // Hive selectively encodes characters. This is the whitelist of // characters it will encode. // See common/src/java/org/apache/hadoop/hive/common/FileUtils.java // in the Hive source code for the source of this list. -static function<bool (char)> HiveShouldEscape = is_any_of("\"#%\\*/:=?\u00FF"); - -// It is more convenient to maintain the complement of the set of -// characters to escape when not in Hive-compat mode. -static function<bool (char)> ShouldNotEscape = is_any_of("-_.~"); +static const std::unordered_set<char> SpecialCharacters = { + '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\b', '\t', '\n', + '\v', '\f', '\r', '\x0E', '\x0F', '\x10', '\x11', '\x12', '\x13', '\x14', + '\x15', '\x16', '\x17', '\x18', '\x19', '\x1A', '\x1B', '\x1C', '\x1D', '\x1E', + '\x1F', '\x7F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '{', '[', ']', + '^'}; static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) { - (*out).reserve(in_len); stringstream ss; - for (int i = 0; i < in_len; ++i) { - const char ch = in[i]; + // "uppercase" and "hex" only affect the insertion of integers, not that of char values. + ss << uppercase << hex << setfill('0'); + for (char ch : std::string_view(in, in_len)) { // Escape the character iff a) we are in Hive-compat mode and the - // character is in the Hive whitelist or b) we are not in - // Hive-compat mode, and the character is not alphanumeric or one - // of the four commonly excluded characters. - if ((hive_compat && HiveShouldEscape(ch)) || - (!hive_compat && !(isalnum(ch) || ShouldNotEscape(ch)))) { - ss << '%' << uppercase << hex << static_cast<uint32_t>(ch); + // character is in the Hive whitelist or b) we are not in Hive-compat mode and + // the character is not alphanumeric and it is not one of the characters specifically + // excluded from escaping (see ShouldNotEscape()). + if ((hive_compat && SpecialCharacters.count(ch) > 0) || (!hive_compat && + !isalnum(static_cast<unsigned char>(ch)) && !ShouldNotEscape(ch))) { + ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch)); } else { ss << ch; } } - (*out) = ss.str(); } diff --git a/testdata/workloads/functional-query/queries/QueryTest/insert.test b/testdata/workloads/functional-query/queries/QueryTest/insert.test index 88cec41bf..190fa1f36 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/insert.test +++ b/testdata/workloads/functional-query/queries/QueryTest/insert.test @@ -446,6 +446,24 @@ SELECT * FROM insert_string_partitioned WHERE s2 = "/\%."; string, string ==== ---- QUERY +truncate insert_string_partitioned; +INSERT INTO TABLE insert_string_partitioned PARTITION(s2="O'Reilly") values ('0'); +SELECT * FROM insert_string_partitioned where s2= "O'Reilly"; +---- RESULTS +'0','O''Reilly' +---- TYPES +STRING, STRING +==== +---- QUERY +truncate insert_string_partitioned; +INSERT INTO TABLE insert_string_partitioned PARTITION(s2="Impala'#%*/:=?{}[]^") values ('0'); +SELECT * FROM insert_string_partitioned where s2= "Impala'#%*/:=?{}[]^"; +---- RESULTS +'0','Impala''#%*/:=?{}[]^' +---- TYPES +STRING, STRING +==== +---- QUERY # static partition insert into string-partitioned table with non-escaped special characters # (Hive chooses not to escape + and ' ') truncate insert_string_partitioned; diff --git a/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test index 59bd14dda..9d9f304da 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test +++ b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test @@ -291,3 +291,39 @@ INT, STRING drop table testtbl_parquet; ---- RESULTS 'Table has been dropped.' +==== +---- QUERY +# Tests for IMPALA-11499 +create table unicode_partition_values (id int) partitioned by (p string) stored as parquet; +---- RESULTS +'Table has been created.' +==== +---- QUERY +insert into unicode_partition_values partition(p='运营业务数据') values (0); +insert into unicode_partition_values partition(p='运') values (0); +insert into unicode_partition_values partition(p='运营业务数据1234567890!@#$%^&*(){}[]') values (0); +select * from unicode_partition_values; +---- RESULTS: RAW_STRING +0,'运' +0,'运营业务数据' +0,'运营业务数据1234567890!@#$%^&*(){}[]' +---- TYPES +INT, STRING +==== +---- QUERY +create table unicode_partition_values_iceberg (id int, p string) partitioned by spec (identity(p)) stored by iceberg; +---- RESULTS +'Table has been created.' +==== +---- QUERY +insert into unicode_partition_values_iceberg values (0, '运营业务数据'); +insert into unicode_partition_values_iceberg values (0, '运'); +insert into unicode_partition_values_iceberg values (0, '运营业务数据1234567890!@#$%^&*(){}[]'); +select * from unicode_partition_values_iceberg; +---- RESULTS: RAW_STRING +0,'运' +0,'运营业务数据' +0,'运营业务数据1234567890!@#$%^&*(){}[]' +---- TYPES +INT, STRING +==== \ No newline at end of file diff --git a/tests/query_test/test_insert.py b/tests/query_test/test_insert.py index 0124ac932..81987a941 100644 --- a/tests/query_test/test_insert.py +++ b/tests/query_test/test_insert.py @@ -294,6 +294,69 @@ class TestInsertPartKey(ImpalaTestSuite): self.run_test_case('QueryTest/insert_part_key', vector, multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1) + def test_escaped_partition_values(self, unique_database): + """Test for special characters in partition values.""" + tbl = unique_database + ".tbl" + self.execute_query( + "create table {}(i int) partitioned by (p string) stored as parquet".format(tbl)) + # "\\'" is used to represent a single quote since the insert statement uses a single + # quote on the partition value. "\\\\" is used for backslash since it gets escaped + # again in parsing the insert statement. + special_characters = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r" \ + "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \ + "\x1C\x1D\x1E\x1F\"\x7F\\'%*/:=?\\\\{[]#^" + part_value = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r" \ + "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \ + "\x1C\x1D\x1E\x1F\"\x7F'%*/:=?\\{[]#^" + part_dir = "p=SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%" \ + "10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25%2A" \ + "%2F%3A%3D%3F%5C%7B%5B%5D%23%5E" + res = self.execute_query( + "insert into {} partition(p='{}') values (0)".format(tbl, special_characters)) + assert res.data[0] == part_dir + ": 1" + res = self.client.execute("select p from {}".format(tbl)) + assert res.data[0] == part_value + res = self.execute_query("show partitions " + tbl) + # There is a "\t" in the partition value making splitting of the result line + # difficult, hence we only verify that the value is present in string + # representation of the whole row. + assert part_value in res.data[0] + assert part_dir in res.data[0] + + def test_escaped_partition_values_iceberg(self, unique_database): + """Test for special characters in partition values for iceberg tables""" + tbl = unique_database + ".tbl" + self.execute_query("create table {} (id int, p string) partitioned by" + " spec (identity(p)) stored by iceberg".format(tbl)) + # "\\'" is used to represent a single quote since the insert statement uses a single + # quote on the partition value. "\\\\" is used for backslash since it gets escaped + # again in parsing the insert statement. + special_characters = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r" \ + "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \ + "\x1C\x1D\x1E\x1F\"\x7F\\'%*/:=?\\\\{[]#^" + part_value = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r" \ + "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \ + "\x1C\x1D\x1E\x1F\"\x7F'%*/:=?\\{[]#^" + part_dir = "p=SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%" \ + "10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25%2A" \ + "%2F%3A%3D%3F%5C%7B%5B%5D%23%5E" + show_part_value = "SpecialCharacters\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006" \ + "\\u0007\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F\\u0010" \ + "\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017" \ + "\\u0018\\u0019\\u001A\\u001B\\u001C\\u001D\\u001E" \ + "\\u001F\\\"\\u007F\'%*\\/:=?\\\\{[]#^" + res = self.execute_query( + "insert into {} values (0, '{}')".format(tbl, special_characters)) + assert res.data[0] == part_dir + ": 1" + res = self.client.execute("select p from {}".format(tbl)) + assert res.data[0] == part_value + res = self.execute_query("show partitions " + tbl) + # There is a "\t" in the partition value making splitting of the result line + # difficult, hence we only verify that the value is present in string + # representation of the whole row. + assert show_part_value in res.data[0] + + class TestInsertNullQueries(ImpalaTestSuite): @classmethod def get_workload(self):