Repository: incubator-impala Updated Branches: refs/heads/master d8d3e2391 -> 2dad444c8
IMPALA-3732: handle string length overflow in avro files Avro string lengths are encoded as 64-bit integers. Impala can only handle up to 32-bit integers, so we need to be careful about handling out-of-range integers. Negative integers were already handled by a previous patch, but if a positive 64-bit integer is truncated to a 32-bit integer, the result can be a negative length. This patch fixes CHAR/VARCHAR behaviour, where we can just truncate the string, and STRING, where we can't truncate the string, so must return an error. Testing: Added unit tests for STRING, CHAR, and VARCHAR that exercise the string overflow handling. Change-Id: If6541e7c68255bf599b26386a55057c93e62af51 Reviewed-on: http://gerrit.cloudera.org:8080/3383 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2dad444c Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2dad444c Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2dad444c Branch: refs/heads/master Commit: 2dad444c8c1db09598d40ae34a81b4bc4276fbb4 Parents: d8d3e23 Author: Tim Armstrong <[email protected]> Authored: Tue Jun 14 14:57:19 2016 -0700 Committer: Tim Armstrong <[email protected]> Committed: Wed Jun 15 02:33:17 2016 -0700 ---------------------------------------------------------------------- be/src/exec/hdfs-avro-scanner-ir.cc | 18 +++++- be/src/exec/hdfs-avro-scanner-test.cc | 92 +++++++++++++++++++++++++----- be/src/exec/hdfs-avro-scanner.cc | 10 ++++ be/src/exec/hdfs-avro-scanner.h | 1 + common/thrift/generate_error_codes.py | 3 + 5 files changed, 108 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/be/src/exec/hdfs-avro-scanner-ir.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-avro-scanner-ir.cc b/be/src/exec/hdfs-avro-scanner-ir.cc index 6839fe1..eeaae9d 100644 --- a/be/src/exec/hdfs-avro-scanner-ir.cc +++ b/be/src/exec/hdfs-avro-scanner-ir.cc @@ -13,6 +13,7 @@ // limitations under the License. #include <algorithm> +#include <limits> #include "exec/hdfs-avro-scanner.h" #include "exec/read-write-util.h" @@ -21,6 +22,7 @@ using namespace impala; using namespace strings; +using std::numeric_limits; // Functions in this file are cross-compiled to IR with clang. @@ -183,8 +185,10 @@ bool HdfsAvroScanner::ReadAvroVarchar(PrimitiveType type, int max_len, uint8_t** if (write_slot) { DCHECK(type == TYPE_VARCHAR); StringValue* sv = reinterpret_cast<StringValue*>(slot); - int str_len = std::min(static_cast<int>(len.val), max_len); - DCHECK(str_len >= 0); + // 'max_len' is an int, so the result of min() should always be in [0, INT_MAX]. + // We need to be careful not to truncate the length before evaluating min(). + int str_len = static_cast<int>(std::min<int64_t>(len.val, max_len)); + DCHECK_GE(str_len, 0); sv->len = str_len; sv->ptr = reinterpret_cast<char*>(*data); } @@ -199,7 +203,10 @@ bool HdfsAvroScanner::ReadAvroChar(PrimitiveType type, int max_len, uint8_t** da if (write_slot) { DCHECK(type == TYPE_CHAR); ColumnType ctype = ColumnType::CreateCharType(max_len); - int str_len = std::min(static_cast<int>(len.val), max_len); + // 'max_len' is an int, so the result of min() should always be in [0, INT_MAX]. + // We need to be careful not to truncate the length before evaluating min(). + int str_len = static_cast<int>(std::min<int64_t>(len.val, max_len)); + DCHECK_GE(str_len, 0); if (ctype.IsVarLenStringType()) { StringValue* sv = reinterpret_cast<StringValue*>(slot); sv->ptr = reinterpret_cast<char*>(pool->TryAllocate(max_len)); @@ -227,6 +234,11 @@ bool HdfsAvroScanner::ReadAvroString(PrimitiveType type, uint8_t** data, if (UNLIKELY(!len.ok)) return false; if (write_slot) { DCHECK(type == TYPE_STRING); + if (UNLIKELY(len.val > numeric_limits<int>::max())) { + SetStatusValueOverflow(TErrorCode::SCANNER_STRING_LENGTH_OVERFLOW, len.val, + numeric_limits<int>::max()); + return false; + } StringValue* sv = reinterpret_cast<StringValue*>(slot); sv->len = len.val; sv->ptr = reinterpret_cast<char*>(*data); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/be/src/exec/hdfs-avro-scanner-test.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-avro-scanner-test.cc b/be/src/exec/hdfs-avro-scanner-test.cc index e5f77f3..1a8b759 100644 --- a/be/src/exec/hdfs-avro-scanner-test.cc +++ b/be/src/exec/hdfs-avro-scanner-test.cc @@ -23,7 +23,8 @@ #include "runtime/runtime-state.h" #include "runtime/string-value.inline.h" -// TODO: CHAR, VARCHAR (IMPALA-3658) +// TODO: IMPALA-3658: complete CHAR unit tests. +// TODO: IMPALA-3658: complete VARCHAR unit tests. namespace impala { @@ -51,6 +52,21 @@ class HdfsAvroScannerTest : public testing::Test { } } + template<typename T> + void CheckReadResult(T expected_val, int expected_encoded_len, + TErrorCode::type expected_error, T val, bool success, int actual_encoded_len) { + bool expect_success = expected_error == TErrorCode::OK; + EXPECT_EQ(success, expect_success); + + if (success) { + EXPECT_TRUE(scanner_.parse_status_.ok()); + EXPECT_EQ(val, expected_val); + EXPECT_EQ(expected_encoded_len, actual_encoded_len); + } else { + EXPECT_EQ(scanner_.parse_status_.code(), expected_error); + } + } + // Templated function for calling different ReadAvro* functions. // // PrimitiveType is a template parameter so we can pass in int 'slot_byte_size' to @@ -62,22 +78,12 @@ class HdfsAvroScannerTest : public testing::Test { TErrorCode::type expected_error = TErrorCode::OK) { // Reset parse_status_ scanner_.parse_status_ = Status::OK(); - uint8_t* new_data = data; T slot; - bool expect_success = expected_error == TErrorCode::OK; - bool success = (scanner_.*read_fn)(type, &new_data, data + data_len, true, &slot, NULL); - EXPECT_EQ(success, expect_success); - - if (success) { - EXPECT_TRUE(scanner_.parse_status_.ok()); - EXPECT_EQ(slot, expected_val); - EXPECT_EQ(new_data, data + expected_encoded_len); - } else { - EXPECT_EQ(scanner_.parse_status_.code(), expected_error); - } + CheckReadResult(expected_val, expected_encoded_len, expected_error, slot, success, + new_data - data); } void TestReadAvroBoolean(uint8_t* data, int64_t data_len, bool expected_val, @@ -131,12 +137,48 @@ class HdfsAvroScannerTest : public testing::Test { expected_val, 8, expected_error); } + void TestReadAvroChar(int max_len, uint8_t* data, int64_t data_len, + StringValue expected_val, int expected_encoded_len, + TErrorCode::type expected_error = TErrorCode::OK) { + // Reset parse_status_ + scanner_.parse_status_ = Status::OK(); + uint8_t* new_data = data; + char slot[max<int>(sizeof(StringValue), max_len)]; + bool success = scanner_.ReadAvroChar(TYPE_CHAR, max_len, &new_data, data + data_len, + true, slot, NULL); + StringValue value; + if (max_len <= ColumnType::MAX_CHAR_INLINE_LENGTH) { + // Convert to non-inlined string value for comparison. + value.len = max_len; + value.ptr = slot; + } else { + value = *reinterpret_cast<StringValue*>(slot); + } + CheckReadResult(expected_val, expected_encoded_len, expected_error, value, success, + new_data - data); + } + + void TestReadAvroVarchar(int max_len, uint8_t* data, int64_t data_len, + StringValue expected_val, int expected_encoded_len, + TErrorCode::type expected_error = TErrorCode::OK) { + // Reset parse_status_ + scanner_.parse_status_ = Status::OK(); + + uint8_t* new_data = data; + StringValue slot; + bool success = scanner_.ReadAvroVarchar(TYPE_VARCHAR, max_len, &new_data, + data + data_len, true, &slot, NULL); + CheckReadResult(expected_val, expected_encoded_len, expected_error, slot, success, + new_data - data); + } + void TestReadAvroString(uint8_t* data, int64_t data_len, StringValue expected_val, int expected_encoded_len, TErrorCode::type expected_error = TErrorCode::OK) { TestReadAvroType(&HdfsAvroScanner::ReadAvroString, TYPE_STRING, data, data_len, expected_val, expected_encoded_len, expected_error); } + template<typename T> void TestReadAvroDecimal(uint8_t* data, int64_t data_len, DecimalValue<T> expected_val, int expected_encoded_len, TErrorCode::type expected_error = TErrorCode::OK) { @@ -282,6 +324,30 @@ TEST_F(HdfsAvroScannerTest, DoubleTest) { TestReadAvroDouble(data, 7, d, TErrorCode::AVRO_TRUNCATED_BLOCK); } +TEST_F(HdfsAvroScannerTest, StringLengthOverflowTest) { + const char* s = "hello world"; + StringValue sv(s); + // Test handling of strings that would overflow the StringValue::len 32-bit integer. + // Test cases with 0 and 1 in the upper bit of the 32-bit integer. + vector<int64_t> large_string_lens({0x1FFFFFFFF, 0x0FFFFFFFF, 0x100000000}); + for (int64_t large_string_len: large_string_lens) { + LOG(INFO) << "Testing large string of length " << large_string_len; + vector<uint8_t> large_buffer(large_string_len + ReadWriteUtil::MAX_ZLONG_LEN); + int64_t zlong_len = ReadWriteUtil::PutZLong(large_string_len, large_buffer.data()); + memcpy(large_buffer.data() + zlong_len, s, strlen(s)); + + // CHAR and VARCHAR truncation should still work in this case. + TestReadAvroChar(sv.len, large_buffer.data(), large_buffer.size(), sv, + zlong_len + large_string_len); + TestReadAvroVarchar(sv.len, large_buffer.data(), large_buffer.size(), sv, + zlong_len + large_string_len); + + // Interpreting as a string would overflow length field. + TestReadAvroString(large_buffer.data(), large_buffer.size(), sv, -1, + TErrorCode::SCANNER_STRING_LENGTH_OVERFLOW); + } +} + TEST_F(HdfsAvroScannerTest, StringTest) { uint8_t data[100]; const char* s = "hello"; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/be/src/exec/hdfs-avro-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-avro-scanner.cc b/be/src/exec/hdfs-avro-scanner.cc index 70dff83..fdb0a96 100644 --- a/be/src/exec/hdfs-avro-scanner.cc +++ b/be/src/exec/hdfs-avro-scanner.cc @@ -655,6 +655,16 @@ void HdfsAvroScanner::SetStatusInvalidValue(TErrorCode::type error_code, int64_t } } +void HdfsAvroScanner::SetStatusValueOverflow(TErrorCode::type error_code, int64_t len, + int64_t limit) { + DCHECK(parse_status_.ok()); + if (TestInfo::is_test()) { + parse_status_ = Status(error_code, "test file", len, limit, 123); + } else { + parse_status_ = Status(error_code, stream_->filename(), len, limit, stream_->file_offset()); + } +} + // This function produces a codegen'd function equivalent to MaterializeTuple() but // optimized for the table schema. Via helper functions CodegenReadRecord() and // CodegenReadScalar(), it eliminates the conditionals necessary when interpreting the http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/be/src/exec/hdfs-avro-scanner.h ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-avro-scanner.h b/be/src/exec/hdfs-avro-scanner.h index 2069a9b..961b2da 100644 --- a/be/src/exec/hdfs-avro-scanner.h +++ b/be/src/exec/hdfs-avro-scanner.h @@ -284,6 +284,7 @@ class HdfsAvroScanner : public BaseSequenceScanner { /// contain exception handling code. void SetStatusCorruptData(TErrorCode::type error_code); void SetStatusInvalidValue(TErrorCode::type error_code, int64_t len); + void SetStatusValueOverflow(TErrorCode::type error_code, int64_t len, int64_t limit); /// Unit test constructor HdfsAvroScanner(); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/common/thrift/generate_error_codes.py ---------------------------------------------------------------------- diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py index 6365281..0b6f5cf 100755 --- a/common/thrift/generate_error_codes.py +++ b/common/thrift/generate_error_codes.py @@ -265,6 +265,9 @@ error_codes = ( ("AVRO_INVALID_METADATA_COUNT", 86, "File '$0' is corrupt: invalid metadata count $1 " "at offset $2"), + + ("SCANNER_STRING_LENGTH_OVERFLOW", 87, "File '$0' could not be read: string $1 was " + "longer than supported limit of $2 bytes at offset $3"), ) import sys
