IMPALA-3895, IMPALA-3859: Don't log file data on parse errors

Logging file or table data is a bad idea, and doing it by default is
particularly bad. This patch changes HdfsScanner::LogRowParseError() to log
only a file name and offset.
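For example, a parse failure is now reported with just the location of the
bad row, along the lines of the following (the path and offset shown here are
illustrative only):

  Error parsing row: file: hdfs://localhost:20500/test-warehouse/alltypeserror/year=2009/month=1/000001_0, before offset: 2048

rather than echoing the contents of the offending record into the error log.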
Testing: See rewritten tests. To support testing this change, we also fix
IMPALA-3895 by introducing a canonical string, __HDFS_FILENAME__, which all
Hadoop filenames in the ERRORS output are replaced with before comparing with
the expected results. This fixes a number of issues with the old way of
matching filenames, which purported to use a regex but really didn't. In
particular, we can now match the rest of an ERROR line after the filename,
which was not possible before.

In some cases we don't want to substitute filenames because the test is
looking for very specific ERROR output. In that case we can write
$NAMENODE/<filename>, and this patch will not perform _any_ filename
substitutions on ERRORS sections that contain the $NAMENODE string.

Finally, this patch fixes a bug where a test that had an ERRORS section but
no RESULTS section would silently pass without testing anything.

Change-Id: I5a604f8784a9ff7b4bf878f82ee7f56697df3272
Reviewed-on: http://gerrit.cloudera.org:8080/4020
Reviewed-by: Henry Robinson <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/34b5f1c4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/34b5f1c4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/34b5f1c4

Branch: refs/heads/master
Commit: 34b5f1c416148f95a34324d66c1ebbf9585d1845
Parents: 480efc9
Author: Henry Robinson <[email protected]>
Authored: Thu Jul 21 14:26:17 2016 -0700
Committer: Internal Jenkins <[email protected]>
Committed: Thu Aug 25 10:20:36 2016 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-scanner-ir.cc                  |   4 +-
 be/src/exec/hdfs-scanner.cc                     |  26 +-
 be/src/exec/hdfs-scanner.h                      |  11 +-
 be/src/exec/hdfs-sequence-scanner.cc            |   8 +-
 be/src/exec/hdfs-sequence-scanner.h             |   4 -
 be/src/exec/hdfs-text-scanner.cc                |  24 +-
 be/src/exec/hdfs-text-scanner.h                 |   4 -
 .../queries/DataErrorsTest/avro-errors.test     |  14 +-
 .../DataErrorsTest/hbase-scan-node-errors.test  | 132 +++++-----
 .../hdfs-rcfile-scan-node-errors.test           | 255 +++++++++++--------
 .../DataErrorsTest/hdfs-scan-node-errors.test   | 135 ++++------
 .../hdfs-sequence-scan-errors.test              |   4 +-
 .../QueryTest/parquet-continue-on-error.test    |  12 +-
 .../queries/QueryTest/strict-mode-abort.test    |  12 +-
 .../queries/QueryTest/strict-mode.test          |  31 +--
 tests/common/impala_test_suite.py               |  59 +++--
 tests/common/test_result_verifier.py            |  26 +-
 tests/util/filesystem_utils.py                  |   4 +
 tests/util/hdfs_util.py                         |  35 ++-
 19 files changed, 393 insertions(+), 407 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/be/src/exec/hdfs-scanner-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner-ir.cc b/be/src/exec/hdfs-scanner-ir.cc
index 098c8e5..2b053ca 100644
--- a/be/src/exec/hdfs-scanner-ir.cc
+++ b/be/src/exec/hdfs-scanner-ir.cc
@@ -64,9 +64,7 @@ int HdfsScanner::WriteAlignedTuples(MemPool* pool, TupleRow* tuple_row, int row_

     // Report parse errors
     if (UNLIKELY(error_in_row)) {
-      if (!ReportTupleParseError(fields, error, i + row_idx_start)) {
-        return -1;
-      }
+      if (!ReportTupleParseError(fields, error)) return -1;
     }

     // Advance to the start of the next tuple


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/be/src/exec/hdfs-scanner.cc
---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc index b0a30d9..ad3d834 100644 --- a/be/src/exec/hdfs-scanner.cc +++ b/be/src/exec/hdfs-scanner.cc @@ -619,8 +619,7 @@ Status HdfsScanner::UpdateDecompressor(const string& codec) { return Status::OK(); } -bool HdfsScanner::ReportTupleParseError(FieldLocation* fields, uint8_t* errors, - int row_idx) { +bool HdfsScanner::ReportTupleParseError(FieldLocation* fields, uint8_t* errors) { for (int i = 0; i < scan_node_->materialized_slots().size(); ++i) { if (errors[i]) { const SlotDescriptor* desc = scan_node_->materialized_slots()[i]; @@ -628,36 +627,25 @@ bool HdfsScanner::ReportTupleParseError(FieldLocation* fields, uint8_t* errors, errors[i] = false; } } - - // Call into subclass to log a more accurate error message. - if (state_->LogHasSpace()) { - stringstream ss; - ss << "file: " << stream_->filename() << endl << "record: "; - LogRowParseError(row_idx, &ss); - state_->LogError(ErrorMsg(TErrorCode::GENERAL, ss.str()), 2); - } + LogRowParseError(); if (state_->abort_on_error()) DCHECK(!parse_status_.ok()); return parse_status_.ok(); } -void HdfsScanner::LogRowParseError(int row_idx, stringstream* ss) { - // This is only called for text and seq files which should override this function. - DCHECK(false); +void HdfsScanner::LogRowParseError() { + const string& s = Substitute("Error parsing row: file: $0, before offset: $1", + stream_->filename(), stream_->file_offset()); + state_->LogError(ErrorMsg(TErrorCode::GENERAL, s)); } void HdfsScanner::ReportColumnParseError(const SlotDescriptor* desc, const char* data, int len) { - // len < 0 is used to indicate the data contains escape characters. We don't care - // about that here and can just output the raw string. - if (len < 0) len = -len; - if (state_->LogHasSpace() || state_->abort_on_error()) { stringstream ss; ss << "Error converting column: " << desc->col_pos() - scan_node_->num_partition_keys() - << " TO " << desc->type() - << " (Data is: " << string(data,len) << ")"; + << " to " << desc->type(); // When skipping multiple header lines we only try to skip them in the first scan // range. For subsequent scan ranges, it's impossible to determine how many lines http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/be/src/exec/hdfs-scanner.h ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h index b7bc89b..6245e5f 100644 --- a/be/src/exec/hdfs-scanner.h +++ b/be/src/exec/hdfs-scanner.h @@ -372,23 +372,18 @@ class HdfsScanner { /// Utility function to report parse errors for each field. /// If errors[i] is nonzero, fields[i] had a parse error. - /// row_idx is the idx of the row in the current batch that had the parse error /// Returns false if parsing should be aborted. In this case parse_status_ is set /// to the error. /// This is called from WriteAlignedTuples. - bool ReportTupleParseError(FieldLocation* fields, uint8_t* errors, int row_idx); + bool ReportTupleParseError(FieldLocation* fields, uint8_t* errors); /// Triggers debug action of the scan node. This is currently used by parquet column /// readers to exercise various failure paths in parquet scanner. Returns the status /// returned by the scan node's TriggerDebugAction(). Status TriggerDebugAction() { return scan_node_->TriggerDebugAction(); } - /// Utility function to append an error message for an invalid row. 
This is called - /// from ReportTupleParseError() - /// row_idx is the index of the row in the current batch. Subclasses should override - /// this function (i.e. text needs to join boundary rows). Since this is only in the - /// error path, vtable overhead is acceptable. - virtual void LogRowParseError(int row_idx, std::stringstream*); + /// Utility function to append an error message for an invalid row. + void LogRowParseError(); /// Writes out all slots for 'tuple' from 'fields'. 'fields' must be aligned /// to the start of the tuple (e.g. fields[0] maps to slots[0]). http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/be/src/exec/hdfs-sequence-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-sequence-scanner.cc b/be/src/exec/hdfs-sequence-scanner.cc index b5542f6..ec52901 100644 --- a/be/src/exec/hdfs-sequence-scanner.cc +++ b/be/src/exec/hdfs-sequence-scanner.cc @@ -332,7 +332,7 @@ Status HdfsSequenceScanner::ProcessRange() { template_tuple_, &errors[0], &error_in_row); if (UNLIKELY(error_in_row)) { - ReportTupleParseError(&field_locations_[0], errors, 0); + ReportTupleParseError(&field_locations_[0], errors); RETURN_IF_ERROR(parse_status_); } } else { @@ -505,9 +505,3 @@ Status HdfsSequenceScanner::ReadCompressedBlock() { return Status::OK(); } - -void HdfsSequenceScanner::LogRowParseError(int row_idx, stringstream* ss) { - DCHECK_LT(row_idx, record_locations_.size()); - *ss << string(reinterpret_cast<const char*>(record_locations_[row_idx].record), - record_locations_[row_idx].len); -} http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/be/src/exec/hdfs-sequence-scanner.h ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-sequence-scanner.h b/be/src/exec/hdfs-sequence-scanner.h index 1246b5a..e2dcee9 100644 --- a/be/src/exec/hdfs-sequence-scanner.h +++ b/be/src/exec/hdfs-sequence-scanner.h @@ -219,10 +219,6 @@ class HdfsSequenceScanner : public BaseSequenceScanner { /// record_len: length of the record Status GetRecord(uint8_t** record_ptr, int64_t *record_len); - /// Appends the current file and line to the RuntimeState's error log. - /// row_idx is 0-based (in current batch) where the parse error occurred. - virtual void LogRowParseError(int row_idx, std::stringstream*); - /// Helper class for picking fields and rows from delimited text. boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_; std::vector<FieldLocation> field_locations_; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/be/src/exec/hdfs-text-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc index 4be4b02..f1c80b0 100644 --- a/be/src/exec/hdfs-text-scanner.cc +++ b/be/src/exec/hdfs-text-scanner.cc @@ -722,25 +722,6 @@ Status HdfsTextScanner::Open(ScannerContext* context) { return Status::OK(); } -void HdfsTextScanner::LogRowParseError(int row_idx, stringstream* ss) { - DCHECK_LT(row_idx, row_end_locations_.size()); - char* row_end = row_end_locations_[row_idx]; - char* row_start; - if (row_idx == 0) { - row_start = batch_start_ptr_; - } else { - // Row start at 1 past the row end (i.e. the row delimiter) for the previous row - row_start = row_end_locations_[row_idx - 1] + 1; - } - - if (!boundary_row_.IsEmpty()) { - // Log the beginning of the line from the previous file buffer(s). 
- *ss << string(boundary_row_.buffer(), boundary_row_.len()); - } - // Log the erroneous line (or the suffix of a line if !boundary_line.empty()). - *ss << string(row_start, row_end - row_start); -} - // This function writes fields in 'field_locations_' to the row_batch. This function // deals with tuples that straddle batches. There are two cases: // 1. There is already a partial tuple in flight from the previous time around. @@ -776,10 +757,7 @@ int HdfsTextScanner::WriteFields(MemPool* pool, TupleRow* tuple_row, if (state_->abort_on_error()) { parse_status_ = Status(state_->ErrorLog()); } else { - stringstream ss; - ss << "file: " << stream_->filename() << endl << "record: "; - LogRowParseError(0, &ss); - state_->LogError(ErrorMsg(TErrorCode::GENERAL, ss.str())); + LogRowParseError(); } if (!parse_status_.ok()) return 0; error_in_row_ = false; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/be/src/exec/hdfs-text-scanner.h ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-text-scanner.h b/be/src/exec/hdfs-text-scanner.h index b355ac8..937626f 100644 --- a/be/src/exec/hdfs-text-scanner.h +++ b/be/src/exec/hdfs-text-scanner.h @@ -165,10 +165,6 @@ class HdfsTextScanner : public HdfsScanner { /// the boundary pool. void WritePartialTuple(FieldLocation*, int num_fields, bool copy_strings); - /// Appends the current file and line to the RuntimeState's error log. - /// row_idx is 0-based (in current batch) where the parse error occured. - virtual void LogRowParseError(int row_idx, std::stringstream*); - /// Mem pool for boundary_row_ and boundary_column_. boost::scoped_ptr<MemPool> boundary_pool_; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test index aaf59e9..6ca2af6 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test @@ -7,11 +7,11 @@ select * from bad_avro_snap_strings ---- TYPES string ---- ERRORS -row_regex: .*Problem parsing file.* -row_regex: .*File '.*/bad_avro_snap_strings_avro_snap/truncated_string.avro' is corrupt: truncated data block at offset 155.* -row_regex: .*File '.*/bad_avro_snap_strings_avro_snap/negative_string_len.avro' is corrupt: invalid length -7 at offset 164.* -row_regex: .*File '.*/bad_avro_snap_strings_avro_snap/invalid_union.avro' is corrupt: invalid union value 4 at offset 174.* -row_regex: .*File '.*/bad_avro_snap_strings_avro_snap/invalid_union.avro' is corrupt: invalid encoded integer at offset 191.* +row_regex: .*Problem parsing file $NAMENODE/.* +File '$NAMENODE/test-warehouse/bad_avro_snap_strings_avro_snap/truncated_string.avro' is corrupt: truncated data block at offset 155 +File '$NAMENODE/test-warehouse/bad_avro_snap_strings_avro_snap/negative_string_len.avro' is corrupt: invalid length -7 at offset 164 +File '$NAMENODE/test-warehouse/bad_avro_snap_strings_avro_snap/invalid_union.avro' is corrupt: invalid union value 4 at offset 174 +File '$NAMENODE/test-warehouse/bad_avro_snap_strings_avro_snap/invalid_union.avro' is corrupt: invalid encoded integer at offset 191 ==== ---- QUERY # Read from the corrupt files. We may get partial results. 
@@ -21,6 +21,6 @@ select * from bad_avro_snap_floats ---- TYPES float ---- ERRORS -row_regex: .*Problem parsing file.* -row_regex: .*File '.*/bad_avro_snap_floats_avro_snap/truncated_float.avro' is corrupt: truncated data block at offset 159.* +Problem parsing file $NAMENODE/test-warehouse/bad_avro_snap_floats_avro_snap/truncated_float.avro at 159 +File '$NAMENODE/test-warehouse/bad_avro_snap_floats_avro_snap/truncated_float.avro' is corrupt: truncated data block at offset 159 ==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/testdata/workloads/functional-query/queries/DataErrorsTest/hbase-scan-node-errors.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hbase-scan-node-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hbase-scan-node-errors.test index 45a8beb..7934b17 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/hbase-scan-node-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hbase-scan-node-errors.test @@ -2,86 +2,86 @@ ---- QUERY select * from hbasealltypeserror ---- ERRORS -Error converting column d:timestamp_col: '0' TO TIMESTAMP +Error converting column d:timestamp_col: '0' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserror row key: 0 -Error converting column d:bool_col: 'errfalse' TO BOOL +Error converting column d:bool_col: 'errfalse' to BOOL hbase table: functional_hbase.hbasealltypeserror row key: 1 -Error converting column d:double_col: 'xyz30.300000' TO DOUBLE -Error converting column d:float_col: 'xyz3.000000' TO FLOAT +Error converting column d:double_col: 'xyz30.300000' to DOUBLE +Error converting column d:float_col: 'xyz3.000000' to FLOAT hbase table: functional_hbase.hbasealltypeserror row key: 13 -Error converting column d:timestamp_col: '0009-01-01 00:00:00' TO TIMESTAMP -Error converting column d:tinyint_col: 'xyz5' TO TINYINT +Error converting column d:timestamp_col: '0009-01-01 00:00:00' to TIMESTAMP +Error converting column d:tinyint_col: 'xyz5' to TINYINT hbase table: functional_hbase.hbasealltypeserror row key: 15 -Error converting column d:timestamp_col: '0' TO TIMESTAMP +Error converting column d:timestamp_col: '0' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserror row key: 16 -Error converting column d:double_col: 'xyz70.700000' TO DOUBLE +Error converting column d:double_col: 'xyz70.700000' to DOUBLE hbase table: functional_hbase.hbasealltypeserror row key: 17 -Error converting column d:timestamp_col: '1999-10-10 90:10:10' TO TIMESTAMP -Error converting column d:tinyint_col: 'err2' TO TINYINT +Error converting column d:timestamp_col: '1999-10-10 90:10:10' to TIMESTAMP +Error converting column d:tinyint_col: 'err2' to TINYINT hbase table: functional_hbase.hbasealltypeserror row key: 2 -Error converting column d:timestamp_col: '2020-20-10 10:10:10.123' TO TIMESTAMP +Error converting column d:timestamp_col: '2020-20-10 10:10:10.123' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserror row key: 21 -Error converting column d:timestamp_col: '2020-10-40 10:10:10.123' TO TIMESTAMP +Error converting column d:timestamp_col: '2020-10-40 10:10:10.123' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserror row key: 22 -Error converting column d:smallint_col: 'abc3' TO SMALLINT -Error converting column d:timestamp_col: '2020-10-10 60:10:10.123' TO TIMESTAMP +Error converting column d:smallint_col: 'abc3' to SMALLINT +Error converting 
column d:timestamp_col: '2020-10-10 60:10:10.123' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserror row key: 23 -Error converting column d:timestamp_col: '2020-10-10 10:70:10.123' TO TIMESTAMP +Error converting column d:timestamp_col: '2020-10-10 10:70:10.123' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserror row key: 24 -Error converting column d:int_col: 'abc5' TO INT +Error converting column d:int_col: 'abc5' to INT hbase table: functional_hbase.hbasealltypeserror row key: 25 -Error converting column d:tinyint_col: 'abc7' TO TINYINT +Error converting column d:tinyint_col: 'abc7' to TINYINT hbase table: functional_hbase.hbasealltypeserror row key: 27 -Error converting column d:int_col: 'abc9' TO INT +Error converting column d:int_col: 'abc9' to INT hbase table: functional_hbase.hbasealltypeserror row key: 29 -Error converting column d:smallint_col: 'err3' TO SMALLINT -Error converting column d:timestamp_col: '2002-14-10 00:00:00' TO TIMESTAMP +Error converting column d:smallint_col: 'err3' to SMALLINT +Error converting column d:timestamp_col: '2002-14-10 00:00:00' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserror row key: 3 -Error converting column d:bigint_col: 'err300' TO BIGINT -Error converting column d:bool_col: 't\rue' TO BOOL -Error converting column d:double_col: 'err300.900000' TO DOUBLE -Error converting column d:float_col: 'err30..000000' TO FLOAT -Error converting column d:int_col: 'err30' TO INT -Error converting column d:smallint_col: 'err30' TO SMALLINT -Error converting column d:timestamp_col: '0000-01-01 00:00:00' TO TIMESTAMP -Error converting column d:tinyint_col: 'err30' TO TINYINT +Error converting column d:bigint_col: 'err300' to BIGINT +Error converting column d:bool_col: 't\rue' to BOOL +Error converting column d:double_col: 'err300.900000' to DOUBLE +Error converting column d:float_col: 'err30..000000' to FLOAT +Error converting column d:int_col: 'err30' to INT +Error converting column d:smallint_col: 'err30' to SMALLINT +Error converting column d:timestamp_col: '0000-01-01 00:00:00' to TIMESTAMP +Error converting column d:tinyint_col: 'err30' to TINYINT hbase table: functional_hbase.hbasealltypeserror row key: 30 -Error converting column d:int_col: 'err4' TO INT +Error converting column d:int_col: 'err4' to INT hbase table: functional_hbase.hbasealltypeserror row key: 4 -Error converting column d:bigint_col: 'err50' TO BIGINT +Error converting column d:bigint_col: 'err50' to BIGINT hbase table: functional_hbase.hbasealltypeserror row key: 5 -Error converting column d:float_col: 'err6.000000' TO FLOAT +Error converting column d:float_col: 'err6.000000' to FLOAT hbase table: functional_hbase.hbasealltypeserror row key: 6 -Error converting column d:double_col: 'err70.700000' TO DOUBLE +Error converting column d:double_col: 'err70.700000' to DOUBLE hbase table: functional_hbase.hbasealltypeserror row key: 7 -Error converting column d:bigint_col: 'err90' TO BIGINT -Error converting column d:bool_col: 'errtrue' TO BOOL -Error converting column d:double_col: 'err90.900000' TO DOUBLE -Error converting column d:float_col: 'err9.000000' TO FLOAT -Error converting column d:int_col: 'err9' TO INT -Error converting column d:smallint_col: 'err9' TO SMALLINT -Error converting column d:timestamp_col: '0000-01-01 00:00:00' TO TIMESTAMP -Error converting column d:tinyint_col: 'err9' TO TINYINT +Error converting column d:bigint_col: 'err90' to BIGINT +Error converting column d:bool_col: 'errtrue' to BOOL +Error converting column d:double_col: 
'err90.900000' to DOUBLE +Error converting column d:float_col: 'err9.000000' to FLOAT +Error converting column d:int_col: 'err9' to INT +Error converting column d:smallint_col: 'err9' to SMALLINT +Error converting column d:timestamp_col: '0000-01-01 00:00:00' to TIMESTAMP +Error converting column d:tinyint_col: 'err9' to TINYINT hbase table: functional_hbase.hbasealltypeserror row key: 9 ---- FILEERRORS @@ -124,64 +124,64 @@ int, bigint, boolean, string, double, float, int, smallint, string, timestamp, t ---- QUERY select * from hbasealltypeserrornonulls ---- ERRORS -Error converting column d:timestamp_col: '123456' TO TIMESTAMP +Error converting column d:timestamp_col: '123456' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserrornonulls row key: 0 -Error converting column d:bool_col: 'errfalse' TO BOOL -Error converting column d:timestamp_col: '1990-00-01 10:10:10' TO TIMESTAMP +Error converting column d:bool_col: 'errfalse' to BOOL +Error converting column d:timestamp_col: '1990-00-01 10:10:10' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserrornonulls row key: 1 -Error converting column d:double_col: 'xyz30.300000' TO DOUBLE -Error converting column d:float_col: 'xyz3.000000' TO FLOAT +Error converting column d:double_col: 'xyz30.300000' to DOUBLE +Error converting column d:float_col: 'xyz3.000000' to FLOAT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 13 -Error converting column d:tinyint_col: 'xyz5' TO TINYINT +Error converting column d:tinyint_col: 'xyz5' to TINYINT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 15 -Error converting column d:double_col: 'xyz70.700000' TO DOUBLE +Error converting column d:double_col: 'xyz70.700000' to DOUBLE hbase table: functional_hbase.hbasealltypeserrornonulls row key: 17 -Error converting column d:tinyint_col: 'err2' TO TINYINT +Error converting column d:tinyint_col: 'err2' to TINYINT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 2 -Error converting column d:smallint_col: 'abc3' TO SMALLINT +Error converting column d:smallint_col: 'abc3' to SMALLINT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 23 -Error converting column d:int_col: 'abc5' TO INT -Error converting column d:timestamp_col: '2012-Mar-22 11:20:01.123' TO TIMESTAMP +Error converting column d:int_col: 'abc5' to INT +Error converting column d:timestamp_col: '2012-Mar-22 11:20:01.123' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserrornonulls row key: 25 -Error converting column d:tinyint_col: 'abc7' TO TINYINT +Error converting column d:tinyint_col: 'abc7' to TINYINT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 27 -Error converting column d:timestamp_col: '11:20:01.123 2012-03-22 ' TO TIMESTAMP +Error converting column d:timestamp_col: '11:20:01.123 2012-03-22 ' to TIMESTAMP hbase table: functional_hbase.hbasealltypeserrornonulls row key: 28 -Error converting column d:int_col: 'abc9' TO INT +Error converting column d:int_col: 'abc9' to INT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 29 -Error converting column d:smallint_col: 'err3' TO SMALLINT +Error converting column d:smallint_col: 'err3' to SMALLINT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 3 -Error converting column d:int_col: 'err4' TO INT +Error converting column d:int_col: 'err4' to INT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 4 -Error converting column d:bigint_col: 'err50' TO BIGINT +Error converting column d:bigint_col: 'err50' to BIGINT 
hbase table: functional_hbase.hbasealltypeserrornonulls row key: 5 -Error converting column d:float_col: 'err6.000000' TO FLOAT +Error converting column d:float_col: 'err6.000000' to FLOAT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 6 -Error converting column d:double_col: 'err70.700000' TO DOUBLE +Error converting column d:double_col: 'err70.700000' to DOUBLE hbase table: functional_hbase.hbasealltypeserrornonulls row key: 7 -Error converting column d:bigint_col: 'err90' TO BIGINT -Error converting column d:bool_col: 'errtrue' TO BOOL -Error converting column d:double_col: 'err90.900000' TO DOUBLE -Error converting column d:float_col: 'err9.000000' TO FLOAT -Error converting column d:int_col: 'err9' TO INT -Error converting column d:smallint_col: 'err9' TO SMALLINT -Error converting column d:tinyint_col: 'err9' TO TINYINT +Error converting column d:bigint_col: 'err90' to BIGINT +Error converting column d:bool_col: 'errtrue' to BOOL +Error converting column d:double_col: 'err90.900000' to DOUBLE +Error converting column d:float_col: 'err9.000000' to FLOAT +Error converting column d:int_col: 'err9' to INT +Error converting column d:smallint_col: 'err9' to SMALLINT +Error converting column d:tinyint_col: 'err9' to TINYINT hbase table: functional_hbase.hbasealltypeserrornonulls row key: 9 ---- FILEERRORS http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-rcfile-scan-node-errors.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-rcfile-scan-node-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-rcfile-scan-node-errors.test index 4e06af9..17ab362 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-rcfile-scan-node-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-rcfile-scan-node-errors.test @@ -3,113 +3,158 @@ select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col from alltypeserror ---- ERRORS -Error converting column: 1 TO BOOL -file: alltypeserror_rc/year=2009/month=1/000001_0 -line: 1,errfalse,,1,1,10,1.000000,10.100000,01/01/09,1 -Error converting column: 2 TO TINYINT -file: alltypeserror_rc/year=2009/month=1/000001_0 -line: 2,true,err2,,2,20,2.000000,20.200000,01/01/09,2 -Error converting column: 3 TO SMALLINT -file: alltypeserror_rc/year=2009/month=1/000001_0 -line: 3,false,3,err3,,30,3.000000,30.300000,01/01/09,3 -Error converting column: 4 TO INT -file: alltypeserror_rc/year=2009/month=1/000001_0 -line: 4,true,4,4,err4,,4.000000,40.400000,01/01/09,4 -Error converting column: 5 TO BIGINT -file: alltypeserror_rc/year=2009/month=1/000001_0 -line: 5,false,5,5,5,err50,,50.500000,01/01/09,5 -Error converting column: 6 TO FLOAT -file: alltypeserror_rc/year=2009/month=1/000001_0 -line: 6,true,6,6,6,60,err6.000000,,01/01/09,6 -Error converting column: 7 TO DOUBLE -file: alltypeserror_rc/year=2009/month=1/000001_0 -line: 7,,,7,7,70,7.000000,err70.700000,01/01/09,7 -Error converting column: 1 TO BOOL -Error converting column: 2 TO TINYINT -Error converting column: 3 TO SMALLINT -Error converting column: 4 TO INT -Error converting column: 5 TO BIGINT -Error converting column: 6 TO FLOAT -Error converting column: 7 TO DOUBLE -file: alltypeserror_rc/year=2009/month=1/000001_0 -line: 9,errtrue,err9,err9,err9,err90,err9.000000,err90.900000,01/01/09,9 
-Error converting column: 6 TO FLOAT -Error converting column: 7 TO DOUBLE -file: alltypeserror_rc/year=2009/month=2/000002_0 -line: 13,false,3,3,,,xyz3.000000,xyz30.300000,02/01/09,3 -Error converting column: 2 TO TINYINT -file: alltypeserror_rc/year=2009/month=2/000002_0 -line: 15,false,xyz5,5,5,50,5.000000,50.500000,02/01/09,5 -Error converting column: 7 TO DOUBLE -file: alltypeserror_rc/year=2009/month=2/000002_0 -line: 17,false,7,7,7,70,7.000000,xyz70.700000,02/01/09,7 -Error converting column: 3 TO SMALLINT -file: alltypeserror_rc/year=2009/month=3/000000_0 -line: 23,false,3,abc3,3,30,3.000000,30.300000,03/01/09,3 -Error converting column: 4 TO INT -file: alltypeserror_rc/year=2009/month=3/000000_0 -line: 25,false,5,5,abc5,50,5.000000,50.500000,03/01/09,5 -Error converting column: 2 TO TINYINT -file: alltypeserror_rc/year=2009/month=3/000000_0 -line: 27,false,abc7,7,7,70,7.000000,70.700000,03/01/09,7 -Error converting column: 4 TO INT -file: alltypeserror_rc/year=2009/month=3/000000_0 -line: 29,false,9,9,abc9,90,9.000000,90.900000,03/01/09,9 +Error converting column: 3 to SMALLINT +file: __HDFS_FILENAME__ +Error converting column: 4 to INT +file: __HDFS_FILENAME__ +Error converting column: 2 to TINYINT +file: __HDFS_FILENAME__ +Error converting column: 4 to INT +file: __HDFS_FILENAME__ +Error converting column: 1 to BOOLEAN +Error converting column: 2 to TINYINT +Error converting column: 3 to SMALLINT +Error converting column: 4 to INT +Error converting column: 5 to BIGINT +Error converting column: 6 to FLOAT +Error converting column: 7 to DOUBLE +file: __HDFS_FILENAME__ +Error converting column: 6 to FLOAT +Error converting column: 7 to DOUBLE +file: __HDFS_FILENAME__ +Error converting column: 2 to TINYINT +file: __HDFS_FILENAME__ +Error converting column: 7 to DOUBLE +file: __HDFS_FILENAME__ +Error converting column: 1 to BOOLEAN +file: __HDFS_FILENAME__ +Error converting column: 2 to TINYINT +file: __HDFS_FILENAME__ +Error converting column: 3 to SMALLINT +file: __HDFS_FILENAME__ +Error converting column: 4 to INT +file: __HDFS_FILENAME__ +Error converting column: 5 to BIGINT +file: __HDFS_FILENAME__ +Error converting column: 6 to FLOAT +file: __HDFS_FILENAME__ +Error converting column: 7 to DOUBLE +file: __HDFS_FILENAME__ +Error converting column: 1 to BOOLEAN +Error converting column: 2 to TINYINT +Error converting column: 3 to SMALLINT +Error converting column: 4 to INT +Error converting column: 5 to BIGINT +Error converting column: 6 to FLOAT +Error converting column: 7 to DOUBLE +file: __HDFS_FILENAME__ +---- RESULTS +0,NULL,NULL,0,0,0,0,0,'01/01/09','0' +1,NULL,NULL,1,1,10,1,10.1,'01/01/09','1' +10,NULL,NULL,NULL,0,0,0,0,'02/01/09','0' +11,false,NULL,NULL,NULL,10,1,10.1,'02/01/09','1' +12,true,2,NULL,NULL,NULL,2,20.2,'02/01/09','2' +13,false,3,3,NULL,NULL,NULL,NULL,'02/01/09','3' +14,true,4,4,4,40,NULL,NULL,'02/01/09','4' +15,false,NULL,5,5,50,5,50.5,'02/01/09','5' +16,NULL,NULL,NULL,NULL,NULL,NULL,NULL,'02/01/09','6' +17,false,7,7,7,70,7,NULL,'02/01/09','7' +18,true,8,8,8,80,8,80.8,'02/01/09','8' +19,false,9,9,9,90,9,90.90000000000001,'02/01/09','9' +2,true,NULL,NULL,2,20,2,20.2,'01/01/09','2' +20,true,0,0,0,0,0,0,'03/01/09','0' +21,false,1,1,1,10,1,10.1,'03/01/09','1' +22,true,2,2,2,20,2,20.2,'03/01/09','2' +23,false,3,NULL,3,30,3,30.3,'03/01/09','3' +24,true,4,4,4,40,4,40.4,'03/01/09','4' +25,false,5,5,NULL,50,5,50.5,'03/01/09','5' +26,true,6,6,6,60,6,60.6,'03/01/09','6' +27,false,NULL,7,7,70,7,70.7,'03/01/09','7' +28,true,8,8,8,80,8,80.8,'03/01/09','8' 
+29,false,9,9,NULL,90,9,90.90000000000001,'03/01/09','9' +3,false,3,NULL,NULL,30,3,30.3,'01/01/09','3' +30,NULL,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/10','10' +4,true,4,4,NULL,NULL,4,40.4,'01/01/09','4' +5,false,5,5,5,NULL,NULL,50.5,'01/01/09','5' +6,true,6,6,6,60,NULL,NULL,'01/01/09','6' +7,NULL,NULL,7,7,70,7,NULL,'01/01/09','7' +8,false,NULL,NULL,8,80,8,80.8,'01/01/09','8' +9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/09','9' +---- TYPES +INT,BOOLEAN,TINYINT,SMALLINT,INT,BIGINT,FLOAT,DOUBLE,STRING,STRING ==== ---- QUERY select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col from alltypeserrornonulls +---- RESULTS +0,true,0,0,0,0,0,0,'01/01/09','0' +1,NULL,1,1,1,10,1,10.1,'01/01/09','1' +10,true,0,0,0,0,0,0,'02/01/09','0' +11,false,1,1,1,10,1,10.1,'02/01/09','1' +12,true,2,2,2,20,2,20.2,'02/01/09','2' +13,false,3,3,3,30,NULL,NULL,'02/01/09','3' +14,true,4,4,4,40,4,40.4,'02/01/09','4' +15,false,NULL,5,5,50,5,50.5,'02/01/09','5' +16,true,6,6,6,60,6,60.6,'02/01/09','6' +17,false,7,7,7,70,7,NULL,'02/01/09','7' +18,true,8,8,8,80,8,80.8,'02/01/09','8' +19,false,9,9,9,90,9,90.90000000000001,'02/01/09','9' +2,true,NULL,2,2,20,2,20.2,'01/01/09','2' +20,true,0,0,0,0,0,0,'03/01/09','0' +21,false,1,1,1,10,1,10.1,'03/01/09','1' +22,true,2,2,2,20,2,20.2,'03/01/09','2' +23,false,3,NULL,3,30,3,30.3,'03/01/09','3' +24,true,4,4,4,40,4,40.4,'03/01/09','4' +25,false,5,5,NULL,50,5,50.5,'03/01/09','5' +26,true,6,6,6,60,6,60.6,'03/01/09','6' +27,false,NULL,7,7,70,7,70.7,'03/01/09','7' +28,true,8,8,8,80,8,80.8,'03/01/09','8' +29,false,9,9,NULL,90,9,90.90000000000001,'03/01/09','9' +3,false,3,NULL,3,30,3,30.3,'01/01/09','3' +4,true,4,4,NULL,40,4,40.4,'01/01/09','4' +5,false,5,5,5,NULL,5,50.5,'01/01/09','5' +6,true,6,6,6,60,NULL,60.6,'01/01/09','6' +7,false,7,7,7,70,7,NULL,'01/01/09','7' +8,false,8,8,8,80,8,80.8,'01/01/09','8' +9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/09','9' +---- TYPES +INT,BOOLEAN,TINYINT,SMALLINT,INT,BIGINT,FLOAT,DOUBLE,STRING,STRING ---- ERRORS -Error converting column: 1 TO BOOL -file: alltypeserrornonulls_rc/year=2009/month=1/000001_0 -line: 1,errfalse,1,1,1,10,1.000000,10.100000,01/01/09,1 -Error converting column: 2 TO TINYINT -file: alltypeserrornonulls_rc/year=2009/month=1/000001_0 -line: 2,true,err2,2,2,20,2.000000,20.200000,01/01/09,2 -Error converting column: 3 TO SMALLINT -file: alltypeserrornonulls_rc/year=2009/month=1/000001_0 -line: 3,false,3,err3,3,30,3.000000,30.300000,01/01/09,3 -Error converting column: 4 TO INT -file: alltypeserrornonulls_rc/year=2009/month=1/000001_0 -line: 4,true,4,4,err4,40,4.000000,40.400000,01/01/09,4 -Error converting column: 5 TO BIGINT -file: alltypeserrornonulls_rc/year=2009/month=1/000001_0 -line: 5,false,5,5,5,err50,5.000000,50.500000,01/01/09,5 -Error converting column: 6 TO FLOAT -file: alltypeserrornonulls_rc/year=2009/month=1/000001_0 -line: 6,true,6,6,6,60,err6.000000,60.600000,01/01/09,6 -Error converting column: 7 TO DOUBLE -file: alltypeserrornonulls_rc/year=2009/month=1/000001_0 -line: 7,false,7,7,7,70,7.000000,err70.700000,01/01/09,7 -Error converting column: 1 TO BOOL -Error converting column: 2 TO TINYINT -Error converting column: 3 TO SMALLINT -Error converting column: 4 TO INT -Error converting column: 5 TO BIGINT -Error converting column: 6 TO FLOAT -Error converting column: 7 TO DOUBLE -file: alltypeserrornonulls_rc/year=2009/month=1/000001_0 -line: 9,errtrue,err9,err9,err9,err90,err9.000000,err90.900000,01/01/09,9 -Error converting column: 6 TO FLOAT -Error 
converting column: 7 TO DOUBLE -file: alltypeserrornonulls_rc/year=2009/month=2/000002_0 -line: 13,false,3,3,3,30,xyz3.000000,xyz30.300000,02/01/09,3 -Error converting column: 2 TO TINYINT -file: alltypeserrornonulls_rc/year=2009/month=2/000002_0 -line: 15,false,xyz5,5,5,50,5.000000,50.500000,02/01/09,5 -Error converting column: 7 TO DOUBLE -file: alltypeserrornonulls_rc/year=2009/month=2/000002_0 -line: 17,false,7,7,7,70,7.000000,xyz70.700000,02/01/09,7 -Error converting column: 3 TO SMALLINT -file: alltypeserrornonulls_rc/year=2009/month=3/000000_0 -line: 23,false,3,abc3,3,30,3.000000,30.300000,03/01/09,3 -Error converting column: 4 TO INT -file: alltypeserrornonulls_rc/year=2009/month=3/000000_0 -line: 25,false,5,5,abc5,50,5.000000,50.500000,03/01/09,5 -Error converting column: 2 TO TINYINT -file: alltypeserrornonulls_rc/year=2009/month=3/000000_0 -line: 27,false,abc7,7,7,70,7.000000,70.700000,03/01/09,7 -Error converting column: 4 TO INT -file: alltypeserrornonulls_rc/year=2009/month=3/000000_0 -line: 29,false,9,9,abc9,90,9.000000,90.900000,03/01/09,9 -==== \ No newline at end of file +Error converting column: 3 to SMALLINT +file: __HDFS_FILENAME__ +Error converting column: 4 to INT +file: __HDFS_FILENAME__ +Error converting column: 2 to TINYINT +file: __HDFS_FILENAME__ +Error converting column: 4 to INT +file: __HDFS_FILENAME__ +Error converting column: 6 to FLOAT +Error converting column: 7 to DOUBLE +file: __HDFS_FILENAME__ +Error converting column: 2 to TINYINT +file: __HDFS_FILENAME__ +Error converting column: 7 to DOUBLE +file: __HDFS_FILENAME__ +Error converting column: 1 to BOOLEAN +file: __HDFS_FILENAME__ +Error converting column: 2 to TINYINT +file: __HDFS_FILENAME__ +Error converting column: 3 to SMALLINT +file: __HDFS_FILENAME__ +Error converting column: 4 to INT +file: __HDFS_FILENAME__ +Error converting column: 5 to BIGINT +file: __HDFS_FILENAME__ +Error converting column: 6 to FLOAT +file: __HDFS_FILENAME__ +Error converting column: 7 to DOUBLE +file: __HDFS_FILENAME__ +Error converting column: 1 to BOOLEAN +Error converting column: 2 to TINYINT +Error converting column: 3 to SMALLINT +Error converting column: 4 to INT +Error converting column: 5 to BIGINT +Error converting column: 6 to FLOAT +Error converting column: 7 to DOUBLE +file: __HDFS_FILENAME__ +==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test index fe1b087..2357f9a 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test @@ -8,33 +8,25 @@ ## implemented). 
#select id, bool_col, tinyint_col, smallint_col from alltypeserror #---- ERRORS -#Error converting column: 3 TO SMALLINT (Data is: abc3) +#Error converting column: 3 to SMALLINT #file: hdfs://regex:.$ -#record: 23,false,3,abc3,3,30,3.000000,30.300000,03/01/09,3,2020-10-10 60:10:10.123 -#Error converting column: 2 TO TINYINT (Data is: abc7) +#Error converting column: 2 to TINYINT #file: hdfs://regex:.$ -#record: 27,false,abc7,7,7,70,7.000000,70.700000,03/01/09,7,2020-10-10 10:10:10.123 -#Error converting column: 2 TO TINYINT (Data is: err30) -#Error converting column: 3 TO SMALLINT (Data is: err30) +#Error converting column: 2 to TINYINT +#Error converting column: 3 to SMALLINT #file: hdfs://regex:.$ -#record: 30,t\rue,err30,err30,err30,err300,err30..000000,err300.900000,01/01/10,10,0000-01-01 00:00:00 -#Error converting column: 2 TO TINYINT (Data is: xyz5) +#Error converting column: 2 to TINYINT #file: hdfs://regex:.$ -#record: 15,false,xyz5,5,5,50,5.000000,50.500000,02/01/09,5,0009-01-01 00:00:00 -#Error converting column: 1 TO BOOLEAN (Data is: errfalse) +#Error converting column: 1 to BOOLEAN #file: hdfs://regex:.$ -#record: 1,errfalse,,1,1,10,1.000000,10.100000,01/01/09,1,1999-10-10 -#Error converting column: 2 TO TINYINT (Data is: err2) +#Error converting column: 2 to TINYINT #file: hdfs://regex:.$ -#record: 2,true,err2,,2,20,2.000000,20.200000,01/01/09,2,1999-10-10 90:10:10 -#Error converting column: 3 TO SMALLINT (Data is: err3) +#Error converting column: 3 to SMALLINT #file: hdfs://regex:.$ -#record: 3,false,3,err3,,30,3.000000,30.300000,01/01/09,3,2002-14-10 00:00:00 -#Error converting column: 1 TO BOOLEAN (Data is: errtrue) -#Error converting column: 2 TO TINYINT (Data is: err9) -#Error converting column: 3 TO SMALLINT (Data is: err9) +#Error converting column: 1 to BOOLEAN +#Error converting column: 2 to TINYINT +#Error converting column: 3 to SMALLINT #file: hdfs://regex:.$ -#record: 9,errtrue,err9,err9,err9,err90,err9.000000,err90.900000,01/01/09,9,0000-01-01 00:00:00 # #---- RESULTS #0,NULL,NULL,0 @@ -92,68 +84,49 @@ bigint ---- QUERY select * from alltypeserrornonulls ---- ERRORS - -Error converting column: 3 TO SMALLINT (Data is: abc3) -file: hdfs://regex:.$ -record: 23,false,3,abc3,3,30,3.000000,30.300000,03/01/09,3,2012-03-22 11:20:01.123 -Error converting column: 4 TO INT (Data is: abc5) -Error converting column: 10 TO TIMESTAMP (Data is: 2012-Mar-22 11:20:01.123) -file: hdfs://regex:.$ -record: 25,false,5,5,abc5,50,5.000000,50.500000,03/01/09,5,2012-Mar-22 11:20:01.123 -Error converting column: 2 TO TINYINT (Data is: abc7) -file: hdfs://regex:.$ -record: 27,false,abc7,7,7,70,7.000000,70.700000,03/01/09,7,2012-03-22 11:20:01.123 -Error converting column: 10 TO TIMESTAMP (Data is: 11:20:01.123 2012-03-22 ) -file: hdfs://regex:.$ -record: 28,true,8,8,8,80,8.000000,80.800000,03/01/09,8,11:20:01.123 2012-03-22 -Error converting column: 4 TO INT (Data is: abc9) -file: hdfs://regex:.$ -record: 29,false,9,9,abc9,90,9.000000,90.900000,03/01/09,9,2012-03-22 -Error converting column: 6 TO FLOAT (Data is: xyz3.000000) -Error converting column: 7 TO DOUBLE (Data is: xyz30.300000) -file: hdfs://regex:.$ -record: 13,false,3,3,3,30,xyz3.000000,xyz30.300000,02/01/09,3,2012-03-22 11:20:01.123 -Error converting column: 2 TO TINYINT (Data is: xyz5) -file: hdfs://regex:.$ -record: 15,false,xyz5,5,5,50,5.000000,50.500000,02/01/09,5,2012-03-22 11:20:01.123 -Error converting column: 7 TO DOUBLE (Data is: xyz70.700000) -file: hdfs://regex:.$ -record: 
17,false,7,7,7,70,7.000000,xyz70.700000,02/01/09,7,2012-03-22 11:20:01.123 -Error converting column: 10 TO TIMESTAMP (Data is: 123456) -file: hdfs://regex:.$ -record: 0,true,0,0,0,0,0.000000,0.000000,01/01/09,0,123456 -Error converting column: 1 TO BOOLEAN (Data is: errfalse) -Error converting column: 10 TO TIMESTAMP (Data is: 1990-00-01 10:10:10) -file: hdfs://regex:.$ -record: 1,errfalse,1,1,1,10,1.000000,10.100000,01/01/09,1,1990-00-01 10:10:10 -Error converting column: 2 TO TINYINT (Data is: err2) -file: hdfs://regex:.$ -record: 2,true,err2,2,2,20,2.000000,20.200000,01/01/09,2,2012-03-22 11:20:01.123 -Error converting column: 3 TO SMALLINT (Data is: err3) -file: hdfs://regex:.$ -record: 3,false,3,err3,3,30,3.000000,30.300000,01/01/09,3,2012-03-22 11:20:01.123 -Error converting column: 4 TO INT (Data is: err4) -file: hdfs://regex:.$ -record: 4,true,4,4,err4,40,4.000000,40.400000,01/01/09,4,2012-03-22 11:20:01.123 -Error converting column: 5 TO BIGINT (Data is: err50) -file: hdfs://regex:.$ -record: 5,false,5,5,5,err50,5.000000,50.500000,01/01/09,5,2012-03-22 11:20:01.123 -Error converting column: 6 TO FLOAT (Data is: err6.000000) -file: hdfs://regex:.$ -record: 6,true,6,6,6,60,err6.000000,60.600000,01/01/09,6,2012-03-22 11:20:01.123 -Error converting column: 7 TO DOUBLE (Data is: err70.700000) -file: hdfs://regex:.$ -record: 7,false,7,7,7,70,7.000000,err70.700000,01/01/09,7,2012-03-22 11:20:01.123 -Error converting column: 1 TO BOOLEAN (Data is: errtrue) -Error converting column: 2 TO TINYINT (Data is: err9) -Error converting column: 3 TO SMALLINT (Data is: err9) -Error converting column: 4 TO INT (Data is: err9) -Error converting column: 5 TO BIGINT (Data is: err90) -Error converting column: 6 TO FLOAT (Data is: err9.000000) -Error converting column: 7 TO DOUBLE (Data is: err90.900000) -file: hdfs://regex:.$ -record: 9,errtrue,err9,err9,err9,err90,err9.000000,err90.900000,01/01/09,9,2012-03-22 11:20:01.123 - +Error converting column: 3 to SMALLINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 4 to INT +Error converting column: 10 to TIMESTAMP +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 2 to TINYINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 10 to TIMESTAMP +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 4 to INT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 6 to FLOAT +Error converting column: 7 to DOUBLE +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 2 to TINYINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 7 to DOUBLE +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 10 to TIMESTAMP +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 1 to BOOLEAN +Error converting column: 10 to TIMESTAMP +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 2 to TINYINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 3 to SMALLINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 4 to INT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 5 to BIGINT +row_regex: .*Error 
parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 6 to FLOAT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 7 to DOUBLE +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 1 to BOOLEAN +Error converting column: 2 to TINYINT +Error converting column: 3 to SMALLINT +Error converting column: 4 to INT +Error converting column: 5 to BIGINT +Error converting column: 6 to FLOAT +Error converting column: 7 to DOUBLE +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ ---- RESULTS 0,true,0,0,0,0,0,0,'01/01/09','0',NULL,2009,1 1,NULL,1,1,1,10,1,10.1,'01/01/09','1',NULL,2009,1 http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test index abc578d..e65c06d 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test @@ -5,9 +5,9 @@ select count(*) from functional_seq_snap.bad_seq_snap Bad synchronization marker Expected: '6e 91 6 ec be 78 a0 ac 72 10 7e 41 b4 da 93 3c ' Actual: '6e 91 6 78 78 78 a0 ac 72 10 7e 41 b4 da 93 3c ' -Problem parsing file: hdfs://regex:.$ +Problem parsing file __HDFS_FILENAME__ at 899514 (1 of 5 similar) Decompressor: invalid compressed length. Data is likely corrupt. (1 of 3 similar) -Tried to read 896782 bytes but could only read 896777 bytes. This may indicate data file corruption. (file: hdfs://regex:.$ +Tried to read 896782 bytes but could only read 896777 bytes. This may indicate data file corruption. (file __HDFS_FILENAME__, byte offset: 2691508) ---- RESULTS 9434 http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test index 1a16d75..2952706 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test @@ -40,7 +40,7 @@ bigint,bigint 29,10 30,10 ---- ERRORS -Column metadata states there are 50 values, but read 100 values from column element. file: hdfs://regex:.$ +Column metadata states there are 50 values, but read 100 values from column element. file=__HDFS_FILENAME__ (1 of 2 similar) ==== ---- QUERY # Same as above but only selecting a single scalar column. @@ -81,7 +81,7 @@ bigint 29 30 ---- ERRORS -Column metadata states there are 11 values, but read 10 values from column id. file: hdfs://regex:.$ +Column metadata states there are 11 values, but read 10 values from column id. 
file=__HDFS_FILENAME__ ==== ---- QUERY SELECT * from bad_parquet_strings_negative_len @@ -89,8 +89,8 @@ SELECT * from bad_parquet_strings_negative_len STRING ---- RESULTS ---- ERRORS -row_regex: .*File '.*/plain-encoded-negative-len.parq' is corrupt: error decoding value of type STRING at offset 58.* -row_regex: .*File '.*/dict-encoded-negative-len.parq' is corrupt: error reading dictionary for data of type STRING: could not decode dictionary.* +File '$NAMENODE/test-warehouse/bad_parquet_strings_negative_len_parquet/plain-encoded-negative-len.parq' is corrupt: error decoding value of type STRING at offset 58 +File '$NAMENODE/test-warehouse/bad_parquet_strings_negative_len_parquet/dict-encoded-negative-len.parq' is corrupt: error reading dictionary for data of type STRING: could not decode dictionary ==== ---- QUERY SELECT * from bad_parquet_strings_out_of_bounds @@ -98,6 +98,6 @@ SELECT * from bad_parquet_strings_out_of_bounds STRING ---- RESULTS ---- ERRORS -row_regex: .*File '.*/plain-encoded-out-of-bounds.parq' is corrupt: error decoding value of type STRING at offset 58.* -row_regex: .*File '.*/dict-encoded-out-of-bounds.parq' is corrupt: error reading dictionary for data of type STRING: could not decode dictionary.* +File '$NAMENODE/test-warehouse/bad_parquet_strings_out_of_bounds_parquet/plain-encoded-out-of-bounds.parq' is corrupt: error decoding value of type STRING at offset 58 +File '$NAMENODE/test-warehouse/bad_parquet_strings_out_of_bounds_parquet/dict-encoded-out-of-bounds.parq' is corrupt: error reading dictionary for data of type STRING: could not decode dictionary ==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test b/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test index 8dccc07..808346f 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test +++ b/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test @@ -2,30 +2,30 @@ ---- QUERY select tinyint_col from overflow ---- CATCH -Error converting column: 0 TO TINYINT (Data is: 1000) +Error converting column: 0 to TINYINT ==== ---- QUERY select smallint_col from overflow ---- CATCH -Error converting column: 1 TO SMALLINT (Data is: 100000) +Error converting column: 1 to SMALLINT ==== ---- QUERY select int_col from overflow ---- CATCH -Error converting column: 2 TO INT (Data is: 10000000000000000) +Error converting column: 2 to INT ==== ---- QUERY select bigint_col from overflow ---- CATCH -Error converting column: 3 TO BIGINT (Data is: 10000000000000000000) +Error converting column: 3 to BIGINT ==== ---- QUERY select float_col from overflow ---- CATCH -Error converting column: 4 TO FLOAT (Data is: 1e1000000) +Error converting column: 4 to FLOAT ==== ---- QUERY select double_col from overflow ---- CATCH -Error converting column: 5 TO DOUBLE (Data is: 1e10000) +Error converting column: 5 to DOUBLE ==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test b/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test index 2d85a74..b70c272 100644 
--- a/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test +++ b/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test @@ -2,23 +2,20 @@ ---- QUERY select * from overflow ---- ERRORS -Error converting column: 0 TO TINYINT (Data is: 1000) -Error converting column: 1 TO SMALLINT (Data is: 100000) -Error converting column: 2 TO INT (Data is: 10000000000000000) -Error converting column: 3 TO BIGINT (Data is: 10000000000000000000) -Error converting column: 4 TO FLOAT (Data is: 1e1000000) -Error converting column: 5 TO DOUBLE (Data is: 1e10000) -file: hdfs://regex:.$ -record: 1000,100000,10000000000000000,10000000000000000000,1e1000000,1e10000 -Error converting column: 0 TO TINYINT (Data is: -1000) -Error converting column: 1 TO SMALLINT (Data is: -100000) -Error converting column: 2 TO INT (Data is: -10000000000000000) -Error converting column: 3 TO BIGINT (Data is: -10000000000000000000) -Error converting column: 4 TO FLOAT (Data is: -1e1000000) -Error converting column: 5 TO DOUBLE (Data is: -1e10000) -file: hdfs://regex:.$ -record: -1000,-100000,-10000000000000000,-10000000000000000000,-1e1000000,-1e10000 - +Error converting column: 0 to TINYINT +Error converting column: 1 to SMALLINT +Error converting column: 2 to INT +Error converting column: 3 to BIGINT +Error converting column: 4 to FLOAT +Error converting column: 5 to DOUBLE +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 0 to TINYINT +Error converting column: 1 to SMALLINT +Error converting column: 2 to INT +Error converting column: 3 to BIGINT +Error converting column: 4 to FLOAT +Error converting column: 5 to DOUBLE +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ ---- RESULTS 1,2,3,4,5.5,6.6 NULL,NULL,NULL,NULL,NULL,NULL http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/tests/common/impala_test_suite.py ---------------------------------------------------------------------- diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py index 36e7442..2c0f3c6 100644 --- a/tests/common/impala_test_suite.py +++ b/tests/common/impala_test_suite.py @@ -48,13 +48,17 @@ from tests.common.test_vector import TestDimension from tests.performance.query import Query from tests.performance.query_exec_functions import execute_using_jdbc from tests.performance.query_executor import JdbcQueryExecConfig -from tests.util.filesystem_utils import IS_S3, S3_BUCKET_NAME -from tests.util.hdfs_util import HdfsConfig, get_hdfs_client, get_hdfs_client_from_conf +from tests.util.filesystem_utils import IS_S3, S3_BUCKET_NAME, FILESYSTEM_PREFIX +from tests.util.hdfs_util import ( + HdfsConfig, + get_hdfs_client, + get_hdfs_client_from_conf, + NAMENODE) from tests.util.s3_util import S3Client from tests.util.test_file_parser import ( - QueryTestSectionReader, - parse_query_test_file, - write_test_file) + QueryTestSectionReader, + parse_query_test_file, + write_test_file) from tests.util.thrift_util import create_transport # Imports required for Hive Metastore Client @@ -72,19 +76,8 @@ IMPALAD_HS2_HOST_PORT =\ HIVE_HS2_HOST_PORT = pytest.config.option.hive_server2 WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR'] HDFS_CONF = HdfsConfig(pytest.config.option.minicluster_xml_conf) -CORE_CONF = HdfsConfig(os.path.join(os.environ['HADOOP_CONF_DIR'], "core-site.xml")) TARGET_FILESYSTEM = os.getenv("TARGET_FILESYSTEM") or "hdfs" IMPALA_HOME = os.getenv("IMPALA_HOME") -# FILESYSTEM_PREFIX is the path prefix that should be used in 
queries. When running -# the tests against the default filesystem (fs.defaultFS), FILESYSTEM_PREFIX is the -# empty string. When running against a secondary filesystem, it will be the scheme -# and authority porotion of the qualified path. -FILESYSTEM_PREFIX = os.getenv("FILESYSTEM_PREFIX") -# NAMENODE is the path prefix that should be used in results, since paths that come -# out of Impala have been qualified. When running against the default filesystem, -# this will be the same as fs.defaultFS. When running against a secondary filesystem, -# this will be the same as FILESYSTEM_PREFIX. -NAMENODE = FILESYSTEM_PREFIX or CORE_CONF.get('fs.defaultFS') # Match any SET statement. Assume that query options' names # only contain alphabets and underscores. SET_PATTERN = re.compile(r'\s*set\s*([a-zA-Z_]+)=*', re.I) @@ -220,6 +213,27 @@ class ImpalaTestSuite(BaseTestSuite): if expected_str in actual_str: return assert False, 'Unexpected exception string: %s' % actual_str + def __verify_results_and_errors(self, vector, test_section, result, use_db): + """Verifies that both results and error sections are as expected. Rewrites both + by replacing $NAMENODE, $DATABASE and $IMPALA_HOME with their actual values, and + optionally rewriting filenames with __HDFS_FILENAME__, to ensure that expected and + actual values are easily compared. + """ + replace_filenames_with_placeholder = True + for section_name in ('RESULTS', 'ERRORS'): + if section_name in test_section: + if "$NAMENODE" in test_section[section_name]: + replace_filenames_with_placeholder = False + test_section[section_name] = test_section[section_name] \ + .replace('$NAMENODE', NAMENODE) \ + .replace('$IMPALA_HOME', IMPALA_HOME) + if use_db: + test_section['RESULTS'] = test_section['RESULTS'].replace('$DATABASE', use_db) + verify_raw_results(test_section, result, vector.get_value('table_format').file_format, + pytest.config.option.update_results, + replace_filenames_with_placeholder) + + def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False, encoding=None, wait_secs_between_stmts=None): """ @@ -325,14 +339,11 @@ class ImpalaTestSuite(BaseTestSuite): if encoding: result.data = [row.decode(encoding) for row in result.data] # Replace $NAMENODE in the expected results with the actual namenode URI. if 'RESULTS' in test_section: - test_section['RESULTS'] = test_section['RESULTS'] \ - .replace('$NAMENODE', NAMENODE) \ - .replace('$IMPALA_HOME', IMPALA_HOME) - if use_db: - test_section['RESULTS'] = test_section['RESULTS'].replace('$DATABASE', use_db) - verify_raw_results(test_section, result, - vector.get_value('table_format').file_format, - pytest.config.option.update_results) + self.__verify_results_and_errors(vector, test_section, result, use_db) + else: + # TODO: Can't validate errors without expected results for now. + assert 'ERRORS' not in test_section,\ + "'ERRORS' sections must have accompanying 'RESULTS' sections" # If --update_results, then replace references to the namenode URI with $NAMENODE. 
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/tests/common/test_result_verifier.py
----------------------------------------------------------------------
diff --git a/tests/common/test_result_verifier.py b/tests/common/test_result_verifier.py
index 8130b4d..1410b92 100644
--- a/tests/common/test_result_verifier.py
+++ b/tests/common/test_result_verifier.py
@@ -24,6 +24,7 @@ import re
 from functools import wraps
 from tests.util.test_file_parser import (join_section_lines, remove_comments,
     split_section_lines)
+from tests.util.hdfs_util import NAMENODE
 
 logging.basicConfig(level=logging.INFO, format='%(threadName)s: %(message)s')
 LOG = logging.getLogger('test_result_verfier')
@@ -272,18 +273,20 @@ def verify_errors(expected_errors, actual_errors):
       ['DUMMY_LABEL'], order_matters=False)
   VERIFIER_MAP['VERIFY_IS_EQUAL'](expected, actual)
 
-def apply_error_match_filter(error_list):
+def apply_error_match_filter(error_list, replace_filenames=True):
   """Applies a filter to each entry in the given list of errors to ensure result
   matching is stable."""
-  updated_errors = list()
-  for row in error_list:
-    # The actual file path isn't very interesting and can vary. Filter it out.
-    row = re.sub(r'^file:.+$|file=.+$|file hdfs:.+$', 'file: hdfs://regex:.$', row)
+  file_regex = r'%s.*/[\w\.\-]+' % NAMENODE
+  def replace_fn(row):
+    # The actual file path isn't very interesting and can vary. Change it to a canonical
+    # string that allows result rows to sort in the same order as expected rows.
+    if replace_filenames: row = re.sub(file_regex, '__HDFS_FILENAME__', row)
     # The "Backend <id>" can also vary, so filter it out as well.
-    updated_errors.append(re.sub(r'Backend \d+:', '', row))
-  return updated_errors
+    return re.sub(r'Backend \d+:', '', row)
+  return [replace_fn(row) for row in error_list]
 
-def verify_raw_results(test_section, exec_result, file_format, update_section=False):
+def verify_raw_results(test_section, exec_result, file_format, update_section=False,
+    replace_filenames=True):
   """
   Accepts a raw exec_result object and verifies it matches the expected results.
   If update_section is true, updates test_section with the actual results
@@ -294,16 +297,16 @@ def verify_raw_results(test_section, exec_result, file_format, update_section=Fa
   result format used in the tests.
   """
   expected_results = None
-
   if 'RESULTS' in test_section:
     expected_results = remove_comments(test_section['RESULTS'])
   else:
+    assert 'ERRORS' not in test_section, "'ERRORS' section must have accompanying 'RESULTS' section"
     LOG.info("No results found. Skipping verification");
     return
-
   if 'ERRORS' in test_section:
     expected_errors = split_section_lines(remove_comments(test_section['ERRORS']))
-    actual_errors = apply_error_match_filter(exec_result.log.split('\n'))
+    actual_errors = apply_error_match_filter(exec_result.log.split('\n'),
+        replace_filenames)
     try:
       verify_errors(expected_errors, actual_errors)
     except AssertionError:
@@ -476,4 +479,3 @@ def verify_runtime_profile(expected, actual):
 
   assert len(unmatched_lines) == 0, ("Did not find matches for lines in runtime profile:"
       "\nEXPECTED LINES:\n%s\n\nACTUAL PROFILE:\n%s" % ('\n'.join(unmatched_lines), actual))
-
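A small self-contained example of the filename canonicalization that the reworked apply_error_match_filter() performs on each error row; the NAMENODE value and the error line are made up for illustration:

import re

# Made-up values; NAMENODE normally comes from tests.util.hdfs_util.
NAMENODE = "hdfs://localhost:20500"
actual = ("Backend 0:Error parsing row: file: "
          "%s/test-warehouse/overflow/data.txt before offset: 45" % NAMENODE)

# The same two rewrites that apply_error_match_filter applies to each error row.
file_regex = r'%s.*/[\w\.\-]+' % NAMENODE
row = re.sub(file_regex, '__HDFS_FILENAME__', actual)
row = re.sub(r'Backend \d+:', '', row)

print(row)  # Error parsing row: file: __HDFS_FILENAME__ before offset: 45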
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/tests/util/filesystem_utils.py
----------------------------------------------------------------------
diff --git a/tests/util/filesystem_utils.py b/tests/util/filesystem_utils.py
index 1adc055..d435720 100644
--- a/tests/util/filesystem_utils.py
+++ b/tests/util/filesystem_utils.py
@@ -18,6 +18,10 @@
 # Utilities for supporting different filesystems.
 import os
 
+# FILESYSTEM_PREFIX is the path prefix that should be used in queries. When running
+# the tests against the default filesystem (fs.defaultFS), FILESYSTEM_PREFIX is the
+# empty string. When running against a secondary filesystem, it will be the scheme
+# and authority portion of the qualified path.
 FILESYSTEM_PREFIX = os.getenv("FILESYSTEM_PREFIX") or str()
 SECONDARY_FILESYSTEM = os.getenv("SECONDARY_FILESYSTEM") or str()
 FILESYSTEM = os.getenv("TARGET_FILESYSTEM")
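To make the two prefixes concrete, an illustration with invented values: FILESYSTEM_PREFIX qualifies paths that go into queries, while NAMENODE (defined in hdfs_util.py in the next diff) is the prefix expected on paths that Impala reports back:

# Invented example values; in a real run these come from the environment and
# from core-site.xml.
FILESYSTEM_PREFIX = ""                    # default filesystem: queries use bare paths
NAMENODE = "hdfs://localhost:20500"       # fs.defaultFS: prefix seen in query output

query_path = "%s/test-warehouse/tpch.lineitem" % FILESYSTEM_PREFIX
reported_path = "%s/test-warehouse/tpch.lineitem/lineitem.tbl" % NAMENODE

print(query_path)     # /test-warehouse/tpch.lineitem
print(reported_path)  # hdfs://localhost:20500/test-warehouse/tpch.lineitem/lineitem.tbl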
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34b5f1c4/tests/util/hdfs_util.py
----------------------------------------------------------------------
diff --git a/tests/util/hdfs_util.py b/tests/util/hdfs_util.py
index 1c1080f..3c3a45e 100644
--- a/tests/util/hdfs_util.py
+++ b/tests/util/hdfs_util.py
@@ -26,6 +26,28 @@ from pywebhdfs.webhdfs import PyWebHdfsClient, errors, _raise_pywebhdfs_exceptio
 from xml.etree.ElementTree import parse
 
 from tests.util.filesystem_base import BaseFilesystem
+from tests.util.filesystem_utils import FILESYSTEM_PREFIX
+
+class HdfsConfig(object):
+  """Reads an XML configuration file (produced by a mini-cluster) into a dictionary
+  accessible via get()"""
+  def __init__(self, *filename):
+    self.conf = {}
+    for arg in filename:
+      tree = parse(arg)
+      for property in tree.getroot().getiterator('property'):
+        self.conf[property.find('name').text] = property.find('value').text
+
+  def get(self, key):
+    return self.conf.get(key)
+
+# Configuration object for the configuration that the minicluster will use.
+CORE_CONF = HdfsConfig(join_path(environ['HADOOP_CONF_DIR'], "core-site.xml"))
+# NAMENODE is the path prefix that should be used in results, since paths that come
+# out of Impala have been qualified. When running against the default filesystem,
+# this will be the same as fs.defaultFS. When running against a secondary filesystem,
+# this will be the same as FILESYSTEM_PREFIX.
+NAMENODE = FILESYSTEM_PREFIX or CORE_CONF.get('fs.defaultFS')
 
 class PyWebHdfsClientWithChmod(PyWebHdfsClient, BaseFilesystem):
   def chmod(self, path, permission):
@@ -118,19 +140,6 @@ class PyWebHdfsClientWithChmod(PyWebHdfsClient, BaseFilesystem):
       return False
     return True
 
-class HdfsConfig(object):
-  """Reads an XML configuration file (produced by a mini-cluster) into a dictionary
-  accessible via get()"""
-  def __init__(self, *filename):
-    self.conf = {}
-    for arg in filename:
-      tree = parse(arg)
-      for property in tree.getroot().getiterator('property'):
-        self.conf[property.find('name').text] = property.find('value').text
-
-  def get(self, key):
-    return self.conf.get(key)
-
 def get_hdfs_client_from_conf(conf):
   """Returns a new HTTP client for an HDFS cluster using an HdfsConfig object"""
   hostport = conf.get('dfs.namenode.http-address')
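A minimal usage sketch of the relocated HdfsConfig helper and the NAMENODE derivation, assuming HADOOP_CONF_DIR points at a directory containing the minicluster's core-site.xml; the printed value is only an example:

from os import environ
from os.path import join as join_path

from tests.util.filesystem_utils import FILESYSTEM_PREFIX
from tests.util.hdfs_util import HdfsConfig

# Parse the minicluster configuration and look up fs.defaultFS, mirroring how
# hdfs_util.py now computes NAMENODE at import time.
conf = HdfsConfig(join_path(environ['HADOOP_CONF_DIR'], "core-site.xml"))
namenode = FILESYSTEM_PREFIX or conf.get('fs.defaultFS')
print(namenode)  # e.g. hdfs://localhost:20500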
