(impala) 01/02: IMPALA-11499: Refactor UrlEncode function to handle special characters

joemcdonnell Thu, 09 May 2024 09:35:34 -0700

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 85cd07a11e876f3d8773f2638f699c61a6b0dd4c
Author: pranavyl <[email protected]>
AuthorDate: Fri Dec 8 19:11:28 2023 +0100

    IMPALA-11499: Refactor UrlEncode function to handle special characters
    
    An error came from an issue with URL encoding, where certain Unicode
    characters were being incorrectly encoded due to their UTF-8
    representation matching characters in the set of characters to escape.
    For example, the string '运', which consists of three bytes
    0xe8 0xbf 0x90 was wrongly getting encoded into '\E8%FFFFFFBF\90',
    because the middle byte matched one of the two bytes that
    represented the "\u00FF" literal. Inclusion of "\u00FF" was likely
    a mistake from the beginning and it should have been '\x7F'.
    
    The patch makes three key changes:
    1. Before the change, the set of characters that need to be escaped
    was stored as a string. The current patch uses an unordered_set
    instead.
    
    2. '\xFF', which is an invalid UTF-8 byte and whose inclusion was
    erroneous from the beginning, is replaced with '\x7F', which is a
    control character for DELETE, ensuring consistency and correctness in
    URL encoding.
    
    3. The list of characters to be escaped is extended to match the
    current list in Hive.
    
    Testing: Tests on both traditional Hive tables and Iceberg tables
    are included in unicode-column-name.test, insert.test,
    coding-util-test.cc and test_insert.py.
    
    Change-Id: I88c4aba5d811dfcec809583d0c16fcbc0ca730fb
    Reviewed-on: http://gerrit.cloudera.org:8080/21131
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/util/coding-util-test.cc                    | 14 ++++-
 be/src/util/coding-util.cc                         | 39 ++++++++------
 .../functional-query/queries/QueryTest/insert.test | 18 +++++++
 .../queries/QueryTest/unicode-column-name.test     | 36 +++++++++++++
 tests/query_test/test_insert.py                    | 63 ++++++++++++++++++++++
 5 files changed, 151 insertions(+), 19 deletions(-)

diff --git a/be/src/util/coding-util-test.cc b/be/src/util/coding-util-test.cc
index ad04222ae..7914de148 100644
--- a/be/src/util/coding-util-test.cc
+++ b/be/src/util/coding-util-test.cc
@@ -113,8 +113,18 @@ TEST(UrlCodingTest, BlankString) {
 }
 
 TEST(UrlCodingTest, PathSeparators) {
-  TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", false);
-  TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", true);
+  string test_path = "/home/impala/directory/";
+  string encoded_test_path = "%2Fhome%2Fimpala%2Fdirectory%2F";
+  TestUrl(test_path, encoded_test_path, false);
+  TestUrl(test_path, encoded_test_path, true);
+  string test = 
"SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r\x0E\x0F\x10"
+                
"\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\"#%'"
+                "*/:=?\\{[]^";
+  string encoded_test = 
"SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"
+                        
"%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%7F%22%23%25"
+                        "%27%2A%2F%3A%3D%3F%5C%7B%5B%5D%5E";
+  TestUrl(test, encoded_test, false);
+  TestUrl(test, encoded_test, true);
 }
 
 TEST(Base64Test, Basic) {
diff --git a/be/src/util/coding-util.cc b/be/src/util/coding-util.cc
index a78a3791d..7a7d1f6ae 100644
--- a/be/src/util/coding-util.cc
+++ b/be/src/util/coding-util.cc
@@ -18,16 +18,17 @@
 #include "util/coding-util.h"
 
 #include <cctype>
+#include <iomanip>
 #include <limits>
 #include <sstream>
+#include <unordered_set>
 
-#include <boost/algorithm/string/classification.hpp>
+#include <boost/algorithm/string.hpp>
 #include <boost/function.hpp>
 #include <sasl/sasl.h>
 
 #include "common/compiler-util.h"
 #include "common/logging.h"
-
 #include "common/names.h"
 #include "sasl/saslutil.h"
 
@@ -37,33 +38,37 @@ using std::uppercase;
 
 namespace impala {
 
+// It is more convenient to maintain the complement of the set of
+// characters to escape when not in Hive-compat mode.
+static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
+
 // Hive selectively encodes characters. This is the whitelist of
 // characters it will encode.
 // See common/src/java/org/apache/hadoop/hive/common/FileUtils.java
 // in the Hive source code for the source of this list.
-static function<bool (char)> HiveShouldEscape = is_any_of("\"#%\\*/:=?\u00FF");
-
-// It is more convenient to maintain the complement of the set of
-// characters to escape when not in Hive-compat mode.
-static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
+static const std::unordered_set<char> SpecialCharacters = {
+    '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\b', '\t', '\n',
+    '\v', '\f', '\r',  '\x0E', '\x0F', '\x10', '\x11', '\x12', '\x13', '\x14',
+    '\x15', '\x16', '\x17', '\x18', '\x19', '\x1A', '\x1B', '\x1C', '\x1D', 
'\x1E',
+    '\x1F', '\x7F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '{', 
'[', ']',
+    '^'};
 
 static inline void UrlEncode(const char* in, int in_len, string* out, bool 
hive_compat) {
-  (*out).reserve(in_len);
   stringstream ss;
-  for (int i = 0; i < in_len; ++i) {
-    const char ch = in[i];
+  // "uppercase" and "hex" only affect the insertion of integers, not that of 
char values.
+  ss << uppercase << hex << setfill('0');
+  for (char ch : std::string_view(in, in_len)) {
     // Escape the character iff a) we are in Hive-compat mode and the
-    // character is in the Hive whitelist or b) we are not in
-    // Hive-compat mode, and the character is not alphanumeric or one
-    // of the four commonly excluded characters.
-    if ((hive_compat && HiveShouldEscape(ch)) ||
-        (!hive_compat && !(isalnum(ch) || ShouldNotEscape(ch)))) {
-      ss << '%' << uppercase << hex << static_cast<uint32_t>(ch);
+    // character is in the Hive whitelist or b) we are not in Hive-compat mode 
and
+    // the character is not alphanumeric and it is not one of the characters 
specifically
+    // excluded from escaping (see ShouldNotEscape()).
+    if ((hive_compat && SpecialCharacters.count(ch) > 0) || (!hive_compat &&
+            !isalnum(static_cast<unsigned char>(ch)) && !ShouldNotEscape(ch))) 
{
+      ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned 
char>(ch));
     } else {
       ss << ch;
     }
   }
-
   (*out) = ss.str();
 }
 
diff --git a/testdata/workloads/functional-query/queries/QueryTest/insert.test 
b/testdata/workloads/functional-query/queries/QueryTest/insert.test
index 88cec41bf..190fa1f36 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/insert.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/insert.test
@@ -446,6 +446,24 @@ SELECT * FROM insert_string_partitioned WHERE s2 = "/\%.";
 string, string
 ====
 ---- QUERY
+truncate insert_string_partitioned;
+INSERT INTO TABLE insert_string_partitioned PARTITION(s2="O'Reilly") values 
('0');
+SELECT * FROM insert_string_partitioned where s2= "O'Reilly";
+---- RESULTS
+'0','O''Reilly'
+---- TYPES
+STRING, STRING
+====
+---- QUERY
+truncate insert_string_partitioned;
+INSERT INTO TABLE insert_string_partitioned 
PARTITION(s2="Impala'#%*/:=?{}[]^") values ('0');
+SELECT * FROM insert_string_partitioned where s2= "Impala'#%*/:=?{}[]^";
+---- RESULTS
+'0','Impala''#%*/:=?{}[]^'
+---- TYPES
+STRING, STRING
+====
+---- QUERY
 # static partition insert into string-partitioned table with non-escaped 
special characters
 # (Hive chooses not to escape + and ' ')
 truncate insert_string_partitioned;
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
 
b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
index 59bd14dda..9d9f304da 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
@@ -291,3 +291,39 @@ INT, STRING
 drop table testtbl_parquet;
 ---- RESULTS
 'Table has been dropped.'
+====
+---- QUERY
+# Tests for IMPALA-11499
+create table unicode_partition_values (id int) partitioned by (p string) 
stored as parquet;
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+insert into unicode_partition_values partition(p='运营业务数据') values (0);
+insert into unicode_partition_values partition(p='运') values (0);
+insert into unicode_partition_values 
partition(p='运营业务数据1234567890!@#$%^&*(){}[]') values (0);
+select * from unicode_partition_values;
+---- RESULTS: RAW_STRING
+0,'运'
+0,'运营业务数据'
+0,'运营业务数据1234567890!@#$%^&*(){}[]'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+create table unicode_partition_values_iceberg (id int, p string) partitioned 
by spec (identity(p)) stored by iceberg;
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+insert into unicode_partition_values_iceberg values (0, '运营业务数据');
+insert into unicode_partition_values_iceberg values (0, '运');
+insert into unicode_partition_values_iceberg values (0, 
'运营业务数据1234567890!@#$%^&*(){}[]');
+select * from unicode_partition_values_iceberg;
+---- RESULTS: RAW_STRING
+0,'运'
+0,'运营业务数据'
+0,'运营业务数据1234567890!@#$%^&*(){}[]'
+---- TYPES
+INT, STRING
+====
\ No newline at end of file
diff --git a/tests/query_test/test_insert.py b/tests/query_test/test_insert.py
index 0124ac932..81987a941 100644
--- a/tests/query_test/test_insert.py
+++ b/tests/query_test/test_insert.py
@@ -294,6 +294,69 @@ class TestInsertPartKey(ImpalaTestSuite):
     self.run_test_case('QueryTest/insert_part_key', vector,
         multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)
 
+  def test_escaped_partition_values(self, unique_database):
+    """Test for special characters in partition values."""
+    tbl = unique_database + ".tbl"
+    self.execute_query(
+        "create table {}(i int) partitioned by (p string) stored as 
parquet".format(tbl))
+    # "\\'" is used to represent a single quote since the insert statement 
uses a single
+    # quote on the partition value. "\\\\" is used for backslash since it gets 
escaped
+    # again in parsing the insert statement.
+    special_characters = 
"SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r" \
+                         
"\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \
+                         "\x1C\x1D\x1E\x1F\"\x7F\\'%*/:=?\\\\{[]#^"
+    part_value = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r" \
+                 "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \
+                 "\x1C\x1D\x1E\x1F\"\x7F'%*/:=?\\{[]#^"
+    part_dir = 
"p=SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%" \
+                
"10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25%2A" \
+                "%2F%3A%3D%3F%5C%7B%5B%5D%23%5E"
+    res = self.execute_query(
+        "insert into {} partition(p='{}') values (0)".format(tbl, 
special_characters))
+    assert res.data[0] == part_dir + ": 1"
+    res = self.client.execute("select p from {}".format(tbl))
+    assert res.data[0] == part_value
+    res = self.execute_query("show partitions " + tbl)
+    # There is a "\t" in the partition value making splitting of the result 
line
+    # difficult, hence we only verify that the value is present in string
+    # representation of the whole row.
+    assert part_value in res.data[0]
+    assert part_dir in res.data[0]
+
+  def test_escaped_partition_values_iceberg(self, unique_database):
+    """Test for special characters in partition values for iceberg tables"""
+    tbl = unique_database + ".tbl"
+    self.execute_query("create table {} (id int, p string) partitioned by"
+                       " spec (identity(p)) stored by iceberg".format(tbl))
+    # "\\'" is used to represent a single quote since the insert statement 
uses a single
+    # quote on the partition value. "\\\\" is used for backslash since it gets 
escaped
+    # again in parsing the insert statement.
+    special_characters = 
"SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r" \
+                         
"\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \
+                         "\x1C\x1D\x1E\x1F\"\x7F\\'%*/:=?\\\\{[]#^"
+    part_value = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r" \
+                 "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \
+                 "\x1C\x1D\x1E\x1F\"\x7F'%*/:=?\\{[]#^"
+    part_dir = 
"p=SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%" \
+               
"10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25%2A" \
+               "%2F%3A%3D%3F%5C%7B%5B%5D%23%5E"
+    show_part_value = 
"SpecialCharacters\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006" \
+                      "\\u0007\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F\\u0010" \
+                      "\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017" \
+                      "\\u0018\\u0019\\u001A\\u001B\\u001C\\u001D\\u001E" \
+                     "\\u001F\\\"\\u007F\'%*\\/:=?\\\\{[]#^"
+    res = self.execute_query(
+        "insert into {} values (0, '{}')".format(tbl, special_characters))
+    assert res.data[0] == part_dir + ": 1"
+    res = self.client.execute("select p from {}".format(tbl))
+    assert res.data[0] == part_value
+    res = self.execute_query("show partitions " + tbl)
+    # There is a "\t" in the partition value making splitting of the result 
line
+    # difficult, hence we only verify that the value is present in string
+    # representation of the whole row.
+    assert show_part_value in res.data[0]
+
+
 class TestInsertNullQueries(ImpalaTestSuite):
   @classmethod
   def get_workload(self):

(impala) 01/02: IMPALA-11499: Refactor UrlEncode function to handle special characters

Reply via email to