Repository: incubator-impala Updated Branches: refs/heads/master 3b8a0891d -> 6d1a130d8
IMPALA-4387: validate decimal type in Avro file schema This patch prevents an invalid decimal type in an Avro file schema from crashing Impala. Most invalid Avro schemas are caught by the frontend, but file schemas still need to be validated by the backend. After this patch files with bad schemas are skipped. Testing: This was hit very rarely by the scanner fuzzing. Added a regression test that scans a file with a bad schema. Change-Id: I25a326ee2220bc14d3b5f887dc288b4adf859cfc Reviewed-on: http://gerrit.cloudera.org:8080/4876 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/6587c08f Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/6587c08f Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/6587c08f Branch: refs/heads/master Commit: 6587c08f701ea4a72d03a36e159f9074a84d5f8f Parents: 3b8a089 Author: Tim Armstrong <[email protected]> Authored: Thu Oct 27 13:13:02 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Sun Oct 30 00:12:58 2016 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-avro-scanner.cc | 15 ++++--- be/src/runtime/decimal-test.cc | 18 ++++++++ be/src/runtime/types.h | 10 +++-- be/src/util/avro-util.cc | 42 +++++++++++++------ be/src/util/avro-util.h | 7 +++- common/thrift/generate_error_codes.py | 7 +++- testdata/bad_avro_snap/README | 13 +++++- .../bad_avro_snap/invalid_decimal_schema.avro | Bin 0 -> 328 bytes .../functional/functional_schema_template.sql | 10 +++++ .../datasets/functional/schema_constraints.csv | 1 + .../queries/DataErrorsTest/avro-errors.test | 8 ++++ 11 files changed, 104 insertions(+), 27 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/exec/hdfs-avro-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-avro-scanner.cc b/be/src/exec/hdfs-avro-scanner.cc index 91a9d03..5b17fc1 100644 --- a/be/src/exec/hdfs-avro-scanner.cc +++ b/be/src/exec/hdfs-avro-scanner.cc @@ -385,12 +385,15 @@ Status HdfsAvroScanner::VerifyTypesMatch(const AvroSchemaElement& table_schema, return Status::OK(); } - ColumnType reader_type = AvroSchemaToColumnType(table_schema.schema); - ColumnType writer_type = AvroSchemaToColumnType(file_schema.schema); + ColumnType reader_type; + RETURN_IF_ERROR(AvroSchemaToColumnType(table_schema.schema, field_name, &reader_type)); + ColumnType writer_type; + RETURN_IF_ERROR(AvroSchemaToColumnType(file_schema.schema, field_name, &writer_type)); bool match = VerifyTypesMatch(reader_type, writer_type); if (match) return Status::OK(); return Status(TErrorCode::AVRO_SCHEMA_RESOLUTION_ERROR, field_name, - avro_type_name(table_schema.schema->type), avro_type_name(file_schema.schema->type)); + avro_type_name(table_schema.schema->type), + avro_type_name(file_schema.schema->type)); } Status HdfsAvroScanner::VerifyTypesMatch(SlotDescriptor* slot_desc, avro_obj_t* schema) { @@ -405,10 +408,12 @@ Status HdfsAvroScanner::VerifyTypesMatch(SlotDescriptor* slot_desc, avro_obj_t* // TODO: update if/when we have TYPE_STRUCT primitive type if (schema->type == AVRO_RECORD) { return Status(TErrorCode::AVRO_SCHEMA_METADATA_MISMATCH, col_name, - slot_desc->type().DebugString(), avro_type_name(schema->type)); + slot_desc->type().DebugString(), avro_type_name(schema->type)); } - bool match = VerifyTypesMatch(slot_desc->type(), AvroSchemaToColumnType(schema)); + ColumnType file_type; + RETURN_IF_ERROR(AvroSchemaToColumnType(schema, col_name, &file_type)); + bool match = VerifyTypesMatch(slot_desc->type(), file_type); if (match) return Status::OK(); return Status(TErrorCode::AVRO_SCHEMA_METADATA_MISMATCH, col_name, slot_desc->type().DebugString(), avro_type_name(schema->type)); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/runtime/decimal-test.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/decimal-test.cc b/be/src/runtime/decimal-test.cc index ab87a0b..f88c8f5 100644 --- a/be/src/runtime/decimal-test.cc +++ b/be/src/runtime/decimal-test.cc @@ -22,6 +22,7 @@ #include <boost/cstdint.hpp> #include <boost/lexical_cast.hpp> #include "runtime/decimal-value.inline.h" +#include "runtime/types.h" #include "testutil/gtest-util.h" #include "util/string-parser.h" @@ -796,6 +797,23 @@ TEST(DecimalArithmetic, RandTesting) { } } +TEST(DecimalValidation, PrecisionScaleValidation) { + // Valid precision and scale. + EXPECT_TRUE(ColumnType::ValidateDecimalParams(1, 0)); + EXPECT_TRUE(ColumnType::ValidateDecimalParams(1, 1)); + EXPECT_TRUE(ColumnType::ValidateDecimalParams(38, 38)); + EXPECT_TRUE(ColumnType::ValidateDecimalParams(38, 0)); + + // Out of range precision or scale. + EXPECT_FALSE(ColumnType::ValidateDecimalParams(3, -1)); + EXPECT_FALSE(ColumnType::ValidateDecimalParams(0, 0)); + EXPECT_FALSE(ColumnType::ValidateDecimalParams(39, 0)); + EXPECT_FALSE(ColumnType::ValidateDecimalParams(38, 39)); + + // Incompatible precision and scale. + EXPECT_FALSE(ColumnType::ValidateDecimalParams(15, 16)); +} + } IMPALA_TEST_MAIN(); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/runtime/types.h ---------------------------------------------------------------------- diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h index a30447c..9558792 100644 --- a/be/src/runtime/types.h +++ b/be/src/runtime/types.h @@ -124,11 +124,13 @@ struct ColumnType { return ret; } + static bool ValidateDecimalParams(int precision, int scale) { + return precision >= 1 && precision <= MAX_PRECISION && scale >= 0 + && scale <= MAX_SCALE && scale <= precision; + } + static ColumnType CreateDecimalType(int precision, int scale) { - DCHECK_LE(precision, MAX_PRECISION); - DCHECK_LE(scale, MAX_SCALE); - DCHECK_GE(precision, 0); - DCHECK_LE(scale, precision); + DCHECK(ValidateDecimalParams(precision, scale)) << precision << ", " << scale; ColumnType ret; ret.type = TYPE_DECIMAL; ret.precision = precision; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/util/avro-util.cc ---------------------------------------------------------------------- diff --git a/be/src/util/avro-util.cc b/be/src/util/avro-util.cc index be3e4a2..f03242c 100644 --- a/be/src/util/avro-util.cc +++ b/be/src/util/avro-util.cc @@ -103,23 +103,39 @@ ScopedAvroSchemaElement::~ScopedAvroSchemaElement() { avro_schema_decref(element_.schema); } -ColumnType AvroSchemaToColumnType(const avro_schema_t& schema) { +Status AvroSchemaToColumnType( + const avro_schema_t& schema, const string& column_name, ColumnType* column_type) { switch (schema->type) { case AVRO_BYTES: case AVRO_STRING: - return TYPE_STRING; - case AVRO_INT32: return TYPE_INT; - case AVRO_INT64: return TYPE_BIGINT; - case AVRO_FLOAT: return TYPE_FLOAT; - case AVRO_DOUBLE: return TYPE_DOUBLE; - case AVRO_BOOLEAN: return TYPE_BOOLEAN; - case AVRO_DECIMAL: - return ColumnType::CreateDecimalType( - avro_schema_decimal_precision(schema), avro_schema_decimal_scale(schema)); + *column_type = TYPE_STRING; + return Status::OK(); + case AVRO_INT32: + *column_type = TYPE_INT; + return Status::OK(); + case AVRO_INT64: + *column_type = TYPE_BIGINT; + return Status::OK(); + case AVRO_FLOAT: + *column_type = TYPE_FLOAT; + return Status::OK(); + case AVRO_DOUBLE: + *column_type = TYPE_DOUBLE; + return Status::OK(); + case AVRO_BOOLEAN: + *column_type = TYPE_BOOLEAN; + return Status::OK(); + case AVRO_DECIMAL: { + int precision = avro_schema_decimal_precision(schema); + int scale = avro_schema_decimal_scale(schema); + if (!ColumnType::ValidateDecimalParams(precision, scale)) { + return Status(TErrorCode::AVRO_INVALID_DECIMAL, column_name, precision, scale); + } + *column_type = ColumnType::CreateDecimalType(precision, scale); + return Status::OK(); + } default: - DCHECK(false) << "NYI: " << avro_type_name(schema->type); - return INVALID_TYPE; + return Status(TErrorCode::AVRO_UNSUPPORTED_TYPE, column_name, avro_type_name(schema->type)); } } - } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/util/avro-util.h ---------------------------------------------------------------------- diff --git a/be/src/util/avro-util.h b/be/src/util/avro-util.h index 1a9a354..3106279 100644 --- a/be/src/util/avro-util.h +++ b/be/src/util/avro-util.h @@ -80,8 +80,11 @@ struct ScopedAvroSchemaElement { DISALLOW_COPY_AND_ASSIGN(ScopedAvroSchemaElement); }; -ColumnType AvroSchemaToColumnType(const avro_schema_t& schema); - +/// Converts an Avro schema column to an Impala type. Returns an error status if the Avro +/// schema does not map to a valid Impala type. 'column_name' is used only for error +/// messages. +Status AvroSchemaToColumnType( + const avro_schema_t& schema, const std::string& column_name, ColumnType* column_type); } #endif http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/common/thrift/generate_error_codes.py ---------------------------------------------------------------------- diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py index f03b073..e9df146 100755 --- a/common/thrift/generate_error_codes.py +++ b/common/thrift/generate_error_codes.py @@ -297,7 +297,12 @@ error_codes = ( ("KUDU_KEY_NOT_FOUND", 96, "Key not found in Kudu table '$0'."), - ("KUDU_SESSION_ERROR", 97, "Error in Kudu table '$0': $1") + ("KUDU_SESSION_ERROR", 97, "Error in Kudu table '$0': $1"), + + ("AVRO_UNSUPPORTED_TYPE", 98, "Column '$0': unsupported Avro type '$1'"), + + ("AVRO_INVALID_DECIMAL", 99, + "Column '$0': invalid Avro decimal type with precision = '$1' scale = '$2'"), ) import sys http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/bad_avro_snap/README ---------------------------------------------------------------------- diff --git a/testdata/bad_avro_snap/README b/testdata/bad_avro_snap/README index a88d0ad..6271967 100644 --- a/testdata/bad_avro_snap/README +++ b/testdata/bad_avro_snap/README @@ -1,7 +1,7 @@ -These Avro files were created by modifying Impala's HdfsAvroTableWriter. - String Data ----------- +Created by modifying Impala's HdfsAvroTableWriter. + These files' schemas have a single nullable string column 's'. negative_string_len.avro: contains two values, but the second value has a negative length. @@ -14,6 +14,15 @@ truncated_string.avro: contains one value, which is missing the last byte. Float Data ---------- +Created by modifying Impala's HdfsAvroTableWriter. + These files' schemas have a single nullable float column 'c1'. truncated_float.avro: contains two float values. The second is missing the last byte. + +Bad Schema +---------- +Created by editing the schema of a valid Avro file with vim. + +invalid_decimal_schema.avro: two columns, name STRING and value DECIMAL(5,7). + The DECIMAL value is invalid. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/bad_avro_snap/invalid_decimal_schema.avro ---------------------------------------------------------------------- diff --git a/testdata/bad_avro_snap/invalid_decimal_schema.avro b/testdata/bad_avro_snap/invalid_decimal_schema.avro new file mode 100644 index 0000000..0141f99 Binary files /dev/null and b/testdata/bad_avro_snap/invalid_decimal_schema.avro differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/datasets/functional/functional_schema_template.sql ---------------------------------------------------------------------- diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index 7b929b7..bfe8f39 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -1509,6 +1509,16 @@ c1 FLOAT LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/truncated_float.avro' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== ---- DATASET +functional +---- BASE_TABLE_NAME +bad_avro_decimal_schema +---- COLUMNS +name STRING +value DECIMAL(5,2) +---- DEPENDENT_LOAD +LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/invalid_decimal_schema.avro' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +==== +---- DATASET -- IMPALA-694: uses data file produced by parquet-mr version 1.2.5-cdh4.5.0 -- (can't use LOAD DATA LOCAL with Impala so copied in create-load-data.sh) functional http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/datasets/functional/schema_constraints.csv ---------------------------------------------------------------------- diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index 4c9da6d..d6d1111 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -37,6 +37,7 @@ table_name:bad_text_gzip, constraint:restrict_to, table_format:text/gzip/block table_name:bad_seq_snap, constraint:restrict_to, table_format:seq/snap/block table_name:bad_avro_snap_strings, constraint:restrict_to, table_format:avro/snap/block table_name:bad_avro_snap_floats, constraint:restrict_to, table_format:avro/snap/block +table_name:bad_avro_decimal_schema, constraint:restrict_to, table_format:avro/snap/block table_name:bad_parquet, constraint:restrict_to, table_format:parquet/none/none table_name:bad_parquet_strings_negative_len, constraint:restrict_to, table_format:parquet/none/none table_name:bad_parquet_strings_out_of_bounds, constraint:restrict_to, table_format:parquet/none/none http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test index 6ca2af6..e396583 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test @@ -24,3 +24,11 @@ float Problem parsing file $NAMENODE/test-warehouse/bad_avro_snap_floats_avro_snap/truncated_float.avro at 159 File '$NAMENODE/test-warehouse/bad_avro_snap_floats_avro_snap/truncated_float.avro' is corrupt: truncated data block at offset 159 ==== +---- QUERY +select * from bad_avro_decimal_schema +---- TYPES +string,decimal +---- RESULTS +---- ERRORS +Column 'value': invalid Avro decimal type with precision = '5' scale = '7' +====
