This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch branch-1.18.x
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/branch-1.18.x by this push:
new be8c57524 KUDU-3646 fix Base64Decode()
be8c57524 is described below
commit be8c57524f4c5fdbca51ac1730284ac520d59be5
Author: Alexey Serbin <[email protected]>
AuthorDate: Mon Mar 3 23:53:28 2025 -0800
KUDU-3646 fix Base64Decode()
Prior to this patch, Base64Decode() in src/kudu/util/url-coding.cc
had a bug that lead to truncating the output data by discarding
trailing null/zero bytes.
Originally imported from the Impala's codebase [1], the bug has been
lurking for 11+ years in the Kudu's codebase. Meanwhile, Impala fixed
the issue a long time ago [2].
This patch addresses the issue and adds corresponding test coverage.
In addition, this patch adds extra validation for the input of
Base64Decode() and provides test coverage for that as well.
Also, reference binding to null pointer in url-coding-test.cc has been
addressed as well.
[1] https://github.com/apache/kudu/commit/ce4d760ca
[2] https://issues.apache.org/jira/browse/IMPALA-2878
Change-Id: I7f5d41eb4e41897d520f00b12c6e487ecc5a00fc
Reviewed-on: http://gerrit.cloudera.org:8080/22573
Reviewed-by: Ashwani Raina <[email protected]>
Tested-by: Alexey Serbin <[email protected]>
Reviewed-by: Abhishek Chennaka <[email protected]>
(cherry picked from commit 34089941debf91a0d1de538a41da482fcb8dcf4a)
Reviewed-on: http://gerrit.cloudera.org:8080/22607
Tested-by: Kudu Jenkins
---
src/kudu/util/url-coding-test.cc | 66 ++++++++++++++++++++++++++++++++++-
src/kudu/util/url-coding.cc | 74 +++++++++++++++++++++++++++++++++-------
src/kudu/util/url-coding.h | 5 +++
3 files changed, 131 insertions(+), 14 deletions(-)
diff --git a/src/kudu/util/url-coding-test.cc b/src/kudu/util/url-coding-test.cc
index 9504f695c..b69c651da 100644
--- a/src/kudu/util/url-coding-test.cc
+++ b/src/kudu/util/url-coding-test.cc
@@ -23,6 +23,7 @@
#include <string>
#include <vector>
+#include <glog/logging.h>
#include <gtest/gtest.h>
using std::ostringstream;
@@ -65,7 +66,9 @@ void TestBase64(const string& input, const string&
expected_encoded) {
// Convert string to vector and try that also
vector<uint8_t> input_vector(input.size());
- memcpy(&input_vector[0], input.c_str(), input.size());
+ if (!input_vector.empty()) {
+ memcpy(&input_vector[0], input.c_str(), input.size());
+ }
string intermediate2;
Base64Encode(input_vector, &intermediate2);
EXPECT_EQ(intermediate, intermediate2);
@@ -94,12 +97,73 @@ TEST(UrlCodingTest, PathSeparators) {
}
TEST(Base64Test, Basic) {
+ TestBase64("", "");
TestBase64("a", "YQ==");
+ TestBase64(string("a\0", 2), "YQA=");
+ TestBase64(string("\0a", 2), "AGE=");
+ TestBase64(string("a\0\0", 3), "YQAA");
+ TestBase64(string("\0a\0", 3), "AGEA");
+ TestBase64(string("\0 a \0", 5), "ACBhIAA=");
+ TestBase64(string(" \0a\0 ", 5), "IABhACA=");
TestBase64("ab", "YWI=");
+ TestBase64(string("ab\0", 3), "YWIA");
+ TestBase64("a a", "YSBh");
TestBase64("abc", "YWJj");
+ TestBase64("a a ", "YSBhIA==");
+ TestBase64(string("a a \0", 5), "YSBhIAA=");
+ TestBase64(string("a a \0\0", 6), "YSBhIAAA");
+ TestBase64(string("\0a\0 \0a\0 \0\0", 10), "AGEAIABhACAAAA==");
TestBase64("abcd", "YWJjZA==");
+ TestBase64(string("abcd\0", 5), "YWJjZAA=");
TestBase64("abcde", "YWJjZGU=");
+ TestBase64(string("abcde\0", 6), "YWJjZGUA");
TestBase64("abcdef", "YWJjZGVm");
+ TestBase64(string("a\0b", 3), "YQBi");
+ TestBase64(string("a\0b\0", 4), "YQBiAA==");
+ TestBase64(string("a\0b\0\0", 5), "YQBiAAA=");
+ TestBase64(string("a\0b\0\0\0", 6), "YQBiAAAA");
+}
+
+TEST(Base64DecodeTest, InvalidInput) {
+ // These should fail to decode.
+ string out;
+ ASSERT_FALSE(Base64Decode("A", &out));
+ ASSERT_FALSE(Base64Decode("=", &out));
+ ASSERT_FALSE(Base64Decode("A=", &out));
+ ASSERT_FALSE(Base64Decode("==", &out));
+ ASSERT_FALSE(Base64Decode("A==", &out));
+ ASSERT_FALSE(Base64Decode("=A=", &out));
+ ASSERT_FALSE(Base64Decode("===", &out));
+ ASSERT_FALSE(Base64Decode("A===", &out));
+ ASSERT_FALSE(Base64Decode("====", &out));
+ ASSERT_FALSE(Base64Decode("A====", &out));
+ ASSERT_FALSE(Base64Decode("==A==", &out));
+ ASSERT_FALSE(Base64Decode("YQ=", &out));
+ ASSERT_FALSE(Base64Decode("=W=", &out));
+ ASSERT_FALSE(Base64Decode("YQW==", &out));
+ ASSERT_FALSE(Base64Decode("=YQW=", &out));
+ ASSERT_FALSE(Base64Decode("=Y=W=", &out));
+ ASSERT_FALSE(Base64Decode("ADCD=", &out));
+ ASSERT_FALSE(Base64Decode("ADCD==", &out));
+ ASSERT_FALSE(Base64Decode("ADCD===", &out));
+}
+
+TEST(Base64DecodeTest, InvalidInputMisplacedPaddingSymbols) {
+#if !DCHECK_IS_ON()
+ GTEST_SKIP() << "this test is only for DCHECK_IS_ON()";
+#endif
+ string out;
+ ASSERT_DEATH(({ Base64Decode("=a", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("==a", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("===a", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("=A==", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("==A=", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("YQA=YQA=", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("YQA=ABCD", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("YQA==ABCD", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("=QA=ABCD", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("YQ==YQ==", &out); }), "non-trailing '='");
+ ASSERT_DEATH(({ Base64Decode("YQ==YQA=", &out); }), "non-trailing '='");
}
TEST(HtmlEscapingTest, Basic) {
diff --git a/src/kudu/util/url-coding.cc b/src/kudu/util/url-coding.cc
index a80720dbb..9b5fce0a3 100644
--- a/src/kudu/util/url-coding.cc
+++ b/src/kudu/util/url-coding.cc
@@ -38,6 +38,10 @@ using boost::archive::iterators::transform_width;
using std::string;
using std::vector;
+// TODO(aserbin): consider replacing boost-based implementation of base64
+// encoding/decoding with https://github.com/tplgy/cppcodec
+// per list at https://en.cppreference.com/w/cpp/links/libs
+
namespace kudu {
// Hive selectively encodes characters. This is the whitelist of
@@ -165,23 +169,67 @@ void Base64Encode(const string& in, std::ostringstream*
out) {
bool Base64Decode(const string& in, string* out) {
typedef transform_width<binary_from_base64<string::const_iterator>, 8, 6>
base64_decode;
- string tmp = in;
- // Replace padding with base64 encoded NULL
- replace(tmp.begin(), tmp.end(), '=', 'A');
+
+ if (in.empty()) {
+ *out = "";
+ return true;
+ }
+
+ // Boost's binary_from_base64 translates '=' symbols into null/zero bytes
[1],
+ // so the corresponding trailing null/zero extra bytes must be removed after
+ // the transformation.
+ //
+ // [1]
https://www.boost.org/doc/libs/1_87_0/boost/archive/iterators/binary_from_base64.hpp
+ size_t padded_num = 0;
+ auto padding_it = in.crbegin();
+ for (; padding_it != in.crend(); ++padding_it) {
+ if (*padding_it != '=') {
+ break;
+ }
+ ++padded_num;
+ }
+ if (padded_num > 0) {
+ // If padding is present, be strict on the length of the input
+ // because of the following reasons, at least:
+ // * padding is to provide proper 'alignment' for 6-to-8 bit transcoding,
+ // and any length mismatch means corruption of the input data
+ // * trailing extra zero bytes are removed below with an assumption
+ // of proper 8-to-6 'alignment' to guarantee the consistency of output
+ if (in.size() % 4 != 0) {
+ return false;
+ }
+ if (padded_num > 2) {
+ // Invalid base64-encoded sequence.
+ return false;
+ }
+ }
+
+ // It's not enforced in release builds for backward compatibility reasons,
+ // but '=' symbols are expected to be present only in the end of the input
+ // string. This implementation doesn't provide functionality to handle
+ // a concatenation of base64-encoded sequences in a consistent manner.
+ DCHECK(!std::any_of(padding_it, in.crend(), [](char c) { return c == '='; }))
+ << "non-trailing '='";
+
+ string tmp;
try {
- *out = string(base64_decode(tmp.begin()), base64_decode(tmp.end()));
- } catch(std::exception& e) {
+ tmp = string(base64_decode(in.begin()), base64_decode(in.end()));
+ } catch (std::exception&) {
return false;
}
-
- // Remove trailing '\0' that were added as padding. Since \0 is special,
- // the boost functions get confused so do this manually.
- int num_padded_chars = 0;
- for (int i = out->size() - 1; i >= 0; --i) {
- if ((*out)[i] != '\0') break;
- ++num_padded_chars;
+ DCHECK_GT(tmp.size(), padded_num);
+#if DCHECK_IS_ON()
+ for (auto it = tmp.crbegin(); it != tmp.crbegin() + padded_num; ++it) {
+ // Make sure only null/zero bytes are removed when binary_from_base64 adds
+ // those for the padding '=' symbols in the input.
+ DCHECK_EQ('\0', *it);
}
- out->resize(out->size() - num_padded_chars);
+#endif
+ // Remove trailing zero bytes produced by binary_from_base64 for the padding.
+ tmp.erase(tmp.end() - padded_num, tmp.end());
+
+ DCHECK(padded_num == 0 || (tmp.size() % 3 == 3 - padded_num));
+ *out = std::move(tmp);
return true;
}
diff --git a/src/kudu/util/url-coding.h b/src/kudu/util/url-coding.h
index 3f667aa1b..5443b3a40 100644
--- a/src/kudu/util/url-coding.h
+++ b/src/kudu/util/url-coding.h
@@ -52,6 +52,11 @@ void Base64Encode(const std::string& in, std::ostringstream*
out);
// Utility method to decode base64 encoded strings. Also not extremely
// performant.
// Returns true unless the string could not be correctly decoded.
+//
+// NOTE: current implementation doesn't handle concatenation of base64-encoded
+// sequences in a consistent manner, converting all padding '=' symbols
+// in the beginning and in the middle of the input into zero/null bytes;
+// only single base64-encoded sequence is expected in the input
bool Base64Decode(const std::string& in, std::string* out);
// Replaces &, < and > with &, < and > respectively. This is