[1/2] incubator-impala git commit: IMPALA-5525 Extend TestScannersFuzzing to test uncompressed parquet

jrussell Thu, 05 Oct 2017 21:32:13 -0700

Repository: incubator-impala
Updated Branches:
  refs/heads/master c14a09040 -> ec957456d



IMPALA-5525 Extend TestScannersFuzzing to test uncompressed parquet

test_scanners_fuzz.py currently tests compressed parquet but
does not test uncompressed parquet. This fix adds a new test
case for uncompressed parquet.

Testing
-------
Ran query_test/test_scanners_fuzz.py in a loop (5 times);
no impalad crashes were observed.

Change-Id: I760de7203a51cf82b16016fa8043cadc7c8325bc
Reviewed-on: http://gerrit.cloudera.org:8080/8056
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/d40047aa
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/d40047aa
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/d40047aa

Branch: refs/heads/master
Commit: d40047aa9b6aff6ef33c10ce5f05ebdac9b37e63
Parents: c14a090
Author: Pranay <[email protected]>
Authored: Wed Sep 13 09:54:11 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Fri Oct 6 00:32:17 2017 +0000

----------------------------------------------------------------------
 tests/query_test/test_scanners_fuzz.py | 66 ++++++++++++++++++++++-------
 1 file changed, 50 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d40047aa/tests/query_test/test_scanners_fuzz.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners_fuzz.py 
b/tests/query_test/test_scanners_fuzz.py
index 53fe348..c336a17 100644
--- a/tests/query_test/test_scanners_fuzz.py
+++ b/tests/query_test/test_scanners_fuzz.py
@@ -28,6 +28,7 @@ from subprocess import check_call
 from tests.common.test_dimensions import create_exec_option_dimension_from_dict
 from tests.common.impala_test_suite import ImpalaTestSuite, LOG
 from tests.util.filesystem_utils import WAREHOUSE, get_fs_path
+from tests.util.test_file_parser import QueryTestSectionReader
 
 # Random fuzz testing of HDFS scanners. Existing tables for any HDFS file 
format
 # are corrupted in random ways to flush out bugs with handling of corrupted 
data.
@@ -67,7 +68,10 @@ class TestScannersFuzzing(ImpalaTestSuite):
 
 
   def test_fuzz_alltypes(self, vector, unique_database):
-    self.run_fuzz_test(vector, unique_database, "alltypes")
+    table_format = vector.get_value('table_format')
+    src_db = QueryTestSectionReader.get_db_name(table_format)
+    table_name = "alltypes"
+    self.run_fuzz_test(vector, src_db, table_name, unique_database, table_name)
 
   def test_fuzz_decimal_tbl(self, vector, unique_database):
     table_format = vector.get_value('table_format')
@@ -82,16 +86,46 @@ class TestScannersFuzzing(ImpalaTestSuite):
       # decimal_tbl is not present for these file formats
       pytest.skip()
 
-    self.run_fuzz_test(vector, unique_database, table_name, 10)
+    src_db = QueryTestSectionReader.get_db_name(table_format)
+    self.run_fuzz_test(vector, src_db, table_name, unique_database, 
table_name, 10)
 
   def test_fuzz_nested_types(self, vector, unique_database):
     table_format = vector.get_value('table_format')
+    table_name = "complextypestbl"
+    src_db = QueryTestSectionReader.get_db_name(table_format)
+
+    if table_format.file_format != 'parquet': pytest.skip()
+    self.run_fuzz_test(vector, src_db, table_name, unique_database, 
table_name, 10)
+
+  def test_fuzz_uncompressed_parquet(self, vector, unique_database):
+    """Parquet tables in default schema are compressed, so in order
+       to do the fuzz_test on an uncompressed parquet table, this test
+       clones from an existing parquet table into a new table with
+       no compression.
+    """
+    table_format = vector.get_value('table_format')
+    if vector.get_value('table_format').compression_codec != 'none': 
pytest.skip()
     if table_format.file_format != 'parquet': pytest.skip()
-    self.run_fuzz_test(vector, unique_database, "complextypestbl", 10)
+
+    """Even when the compression_codec is none, the default compression type 
is snappy
+       so compression codec is changed explicitly to be none.
+    """
+    self.execute_query("set compression_codec=none")
+
+    tbl_list = ["alltypes", "decimal_tbl"]
+    for orig_tbl_name in tbl_list:
+      src_table_name = "parquet_uncomp_src_" + orig_tbl_name
+      fuzz_table_name = "parquet_uncomp_dst_" + orig_tbl_name
+      fq_tbl_name = unique_database + "." + src_table_name
+      create_tbl = ("create table {0} stored as parquet as select * from"
+          " functional_parquet.{1}".format(fq_tbl_name, orig_tbl_name))
+      self.execute_query(create_tbl)
+      self.run_fuzz_test(vector, unique_database, src_table_name, 
unique_database,
+          fuzz_table_name, 10)
 
   # TODO: add test coverage for additional data types like char and varchar
 
-  def run_fuzz_test(self, vector, unique_database, table, num_copies=1):
+  def run_fuzz_test(self, vector, src_db, src_table, fuzz_db, fuzz_table, 
num_copies=1):
     """ Do some basic fuzz testing: create a copy of an existing table with 
randomly
     corrupted files and make sure that we don't crash or behave in an 
unexpected way.
     'unique_database' is used for the table, so it will be cleaned up 
automatically.
@@ -106,27 +140,26 @@ class TestScannersFuzzing(ImpalaTestSuite):
     LOG.info("Using random seed %d", random_seed)
     rng.seed(long(random_seed))
 
-    table_format = vector.get_value('table_format')
-    self.change_database(self.client, table_format)
-
-    tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % table,
+    tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % fuzz_table,
         dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
 
-    self.execute_query("create table %s.%s like %s" % (unique_database, table, 
table))
+    self.execute_query("create table %s.%s like %s.%s" % (fuzz_db, fuzz_table,
+        src_db, src_table))
     fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
-        unique_database, table))
+        fuzz_db, fuzz_table))
 
     LOG.info("Generating corrupted version of %s in %s. Local working 
directory is %s",
-        table, unique_database, tmp_table_dir)
+        fuzz_table, fuzz_db, tmp_table_dir)
 
     # Find the location of the existing table and get the full table directory 
structure.
-    table_loc = self._get_table_location(table, vector)
+    fq_table_name = src_db + "." + src_table
+    table_loc = self._get_table_location(fq_table_name, vector)
     check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", 
tmp_table_dir])
 
     partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, 
rng)
     for partition in partitions:
       self.execute_query('alter table {0}.{1} add partition ({2})'.format(
-          unique_database, table, ','.join(partition)))
+          fuzz_db, fuzz_table, ','.join(partition)))
 
     # Copy all of the local files and directories to hdfs.
     to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir)
@@ -137,14 +170,14 @@ class TestScannersFuzzing(ImpalaTestSuite):
       shutil.rmtree(tmp_table_dir)
 
     # Querying the corrupted files should not DCHECK or crash.
-    self.execute_query("refresh %s.%s" % (unique_database, table))
+    self.execute_query("refresh %s.%s" % (fuzz_db, fuzz_table))
     # Execute a query that tries to read all the columns and rows in the file.
     # Also execute a count(*) that materializes no columns, since different 
code
     # paths are exercised.
     queries = [
         'select count(*) from (select distinct * from {0}.{1}) q'.format(
-            unique_database, table),
-        'select count(*) from {0}.{1} q'.format(unique_database, table)]
+            fuzz_db, fuzz_table),
+        'select count(*) from {0}.{1} q'.format(fuzz_db, fuzz_table)]
 
     for query, batch_size, disable_codegen in \
         itertools.product(queries, self.BATCH_SIZES, 
self.DISABLE_CODEGEN_VALUES):
@@ -164,6 +197,7 @@ class TestScannersFuzzing(ImpalaTestSuite):
         # Parquet and compressed text can fail the query for some parse errors.
         # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
         # (IMPALA-4013).
+        table_format = vector.get_value('table_format')
         if table_format.file_format != 'parquet' \
             and not (table_format.file_format == 'text' and
             table_format.compression_codec != 'none'):

[1/2] incubator-impala git commit: IMPALA-5525 Extend TestScannersFuzzing to test uncompressed parquet

Reply via email to