Repository: incubator-impala Updated Branches: refs/heads/master c14a09040 -> ec957456d
IMPALA-5525 Extend TestScannersFuzzing to test uncompressed parquet test_scanners_fuzz.py currently tests compressed parquet but does not test uncompressed parquet. This fix adds a new test case for uncompressed parquet. Testing ------- Ran the query_test/test_scanners_fuzz.py in a loop (5 times) and there was no impalad crash seen. Change-Id: I760de7203a51cf82b16016fa8043cadc7c8325bc Reviewed-on: http://gerrit.cloudera.org:8080/8056 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/d40047aa Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/d40047aa Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/d40047aa Branch: refs/heads/master Commit: d40047aa9b6aff6ef33c10ce5f05ebdac9b37e63 Parents: c14a090 Author: Pranay <[email protected]> Authored: Wed Sep 13 09:54:11 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Fri Oct 6 00:32:17 2017 +0000 ---------------------------------------------------------------------- tests/query_test/test_scanners_fuzz.py | 66 ++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d40047aa/tests/query_test/test_scanners_fuzz.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners_fuzz.py b/tests/query_test/test_scanners_fuzz.py index 53fe348..c336a17 100644 --- a/tests/query_test/test_scanners_fuzz.py +++ b/tests/query_test/test_scanners_fuzz.py @@ -28,6 +28,7 @@ from subprocess import check_call from tests.common.test_dimensions import create_exec_option_dimension_from_dict from tests.common.impala_test_suite import ImpalaTestSuite, LOG from tests.util.filesystem_utils import WAREHOUSE, get_fs_path +from tests.util.test_file_parser import QueryTestSectionReader # Random fuzz testing of HDFS scanners. Existing tables for any HDFS file format # are corrupted in random ways to flush out bugs with handling of corrupted data. @@ -67,7 +68,10 @@ class TestScannersFuzzing(ImpalaTestSuite): def test_fuzz_alltypes(self, vector, unique_database): - self.run_fuzz_test(vector, unique_database, "alltypes") + table_format = vector.get_value('table_format') + src_db = QueryTestSectionReader.get_db_name(table_format) + table_name = "alltypes" + self.run_fuzz_test(vector, src_db, table_name, unique_database, table_name) def test_fuzz_decimal_tbl(self, vector, unique_database): table_format = vector.get_value('table_format') @@ -82,16 +86,46 @@ class TestScannersFuzzing(ImpalaTestSuite): # decimal_tbl is not present for these file formats pytest.skip() - self.run_fuzz_test(vector, unique_database, table_name, 10) + src_db = QueryTestSectionReader.get_db_name(table_format) + self.run_fuzz_test(vector, src_db, table_name, unique_database, table_name, 10) def test_fuzz_nested_types(self, vector, unique_database): table_format = vector.get_value('table_format') + table_name = "complextypestbl" + src_db = QueryTestSectionReader.get_db_name(table_format) + + if table_format.file_format != 'parquet': pytest.skip() + self.run_fuzz_test(vector, src_db, table_name, unique_database, table_name, 10) + + def test_fuzz_uncompressed_parquet(self, vector, unique_database): + """Parquet tables in default schema are compressed, so in order + to do the fuzz_test on an uncompressed parquet table, this test + clones from an existing parquet table into a new table with + no compression. + """ + table_format = vector.get_value('table_format') + if vector.get_value('table_format').compression_codec != 'none': pytest.skip() if table_format.file_format != 'parquet': pytest.skip() - self.run_fuzz_test(vector, unique_database, "complextypestbl", 10) + + """Even when the compression_codec is none, the default compression type is snappy + so compression codec is changed explicitly to be none. + """ + self.execute_query("set compression_codec=none") + + tbl_list = ["alltypes", "decimal_tbl"] + for orig_tbl_name in tbl_list: + src_table_name = "parquet_uncomp_src_" + orig_tbl_name + fuzz_table_name = "parquet_uncomp_dst_" + orig_tbl_name + fq_tbl_name = unique_database + "." + src_table_name + create_tbl = ("create table {0} stored as parquet as select * from" + " functional_parquet.{1}".format(fq_tbl_name, orig_tbl_name)) + self.execute_query(create_tbl) + self.run_fuzz_test(vector, unique_database, src_table_name, unique_database, + fuzz_table_name, 10) # TODO: add test coverage for additional data types like char and varchar - def run_fuzz_test(self, vector, unique_database, table, num_copies=1): + def run_fuzz_test(self, vector, src_db, src_table, fuzz_db, fuzz_table, num_copies=1): """ Do some basic fuzz testing: create a copy of an existing table with randomly corrupted files and make sure that we don't crash or behave in an unexpected way. 'unique_database' is used for the table, so it will be cleaned up automatically. @@ -106,27 +140,26 @@ class TestScannersFuzzing(ImpalaTestSuite): LOG.info("Using random seed %d", random_seed) rng.seed(long(random_seed)) - table_format = vector.get_value('table_format') - self.change_database(self.client, table_format) - - tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % table, + tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % fuzz_table, dir=os.path.join(os.environ['IMPALA_HOME'], "testdata")) - self.execute_query("create table %s.%s like %s" % (unique_database, table, table)) + self.execute_query("create table %s.%s like %s.%s" % (fuzz_db, fuzz_table, + src_db, src_table)) fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format( - unique_database, table)) + fuzz_db, fuzz_table)) LOG.info("Generating corrupted version of %s in %s. Local working directory is %s", - table, unique_database, tmp_table_dir) + fuzz_table, fuzz_db, tmp_table_dir) # Find the location of the existing table and get the full table directory structure. - table_loc = self._get_table_location(table, vector) + fq_table_name = src_db + "." + src_table + table_loc = self._get_table_location(fq_table_name, vector) check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir]) partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, rng) for partition in partitions: self.execute_query('alter table {0}.{1} add partition ({2})'.format( - unique_database, table, ','.join(partition))) + fuzz_db, fuzz_table, ','.join(partition))) # Copy all of the local files and directories to hdfs. to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir) @@ -137,14 +170,14 @@ class TestScannersFuzzing(ImpalaTestSuite): shutil.rmtree(tmp_table_dir) # Querying the corrupted files should not DCHECK or crash. - self.execute_query("refresh %s.%s" % (unique_database, table)) + self.execute_query("refresh %s.%s" % (fuzz_db, fuzz_table)) # Execute a query that tries to read all the columns and rows in the file. # Also execute a count(*) that materializes no columns, since different code # paths are exercised. queries = [ 'select count(*) from (select distinct * from {0}.{1}) q'.format( - unique_database, table), - 'select count(*) from {0}.{1} q'.format(unique_database, table)] + fuzz_db, fuzz_table), + 'select count(*) from {0}.{1} q'.format(fuzz_db, fuzz_table)] for query, batch_size, disable_codegen in \ itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES): @@ -164,6 +197,7 @@ class TestScannersFuzzing(ImpalaTestSuite): # Parquet and compressed text can fail the query for some parse errors. # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file # (IMPALA-4013). + table_format = vector.get_value('table_format') if table_format.file_format != 'parquet' \ and not (table_format.file_format == 'text' and table_format.compression_codec != 'none'):
