Repository: drill Updated Branches: refs/heads/master ee399317a -> 34969583b
DRILL-4996: Parquet Date auto-correction is not working in auto-partitioned parquet files generated by drill-1.6 - Changed the detection approach for corrupted date values in the case where parquet files are generated by Drill: the corruption status is now determined by looking at the min/max values in the metadata; - Refactored TestCorruptParquetDateCorrection accordingly. This closes #687 Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/eef3b3fb Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/eef3b3fb Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/eef3b3fb Branch: refs/heads/master Commit: eef3b3fb6f4e76e95510253d155d0659e387fc99 Parents: ee39931 Author: Vitalii Diravka <[email protected]> Authored: Mon Dec 12 04:41:49 2016 +0000 Committer: Parth Chandra <[email protected]> Committed: Fri Jan 13 17:44:29 2017 -0800 ---------------------------------------------------------------------- .../store/parquet/ParquetReaderUtility.java | 18 +- .../TestCorruptParquetDateCorrection.java | 284 +++++++++---------- ...t_dates_and_old_drill_parquet_writer.parquet | Bin 0 -> 4241 bytes 3 files changed, 140 insertions(+), 162 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/drill/blob/eef3b3fb/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java index b22e666..a94e220 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java @@ -195,26 +195,26 @@ public class ParquetReaderUtility { 
String createdBy = footer.getFileMetaData().getCreatedBy(); String drillVersion = footer.getFileMetaData().getKeyValueMetaData().get(ParquetRecordWriter.DRILL_VERSION_PROPERTY); - String stringWriterVersion = footer.getFileMetaData().getKeyValueMetaData().get(ParquetRecordWriter.WRITER_VERSION_PROPERTY); + String writerVersionValue = footer.getFileMetaData().getKeyValueMetaData().get(ParquetRecordWriter.WRITER_VERSION_PROPERTY); // This flag can be present in parquet files which were generated with 1.9.0-SNAPSHOT and 1.9.0 drill versions. // If this flag is present it means that the version of the drill parquet writer is 2 final String isDateCorrectFlag = "is.date.correct"; String isDateCorrect = footer.getFileMetaData().getKeyValueMetaData().get(isDateCorrectFlag); if (drillVersion != null) { int writerVersion = 1; - if (stringWriterVersion != null) { - writerVersion = Integer.parseInt(stringWriterVersion); + if (writerVersionValue != null) { + writerVersion = Integer.parseInt(writerVersionValue); } else if (Boolean.valueOf(isDateCorrect)) { writerVersion = DRILL_WRITER_VERSION_STD_DATE_FORMAT; } return writerVersion >= DRILL_WRITER_VERSION_STD_DATE_FORMAT ? 
DateCorruptionStatus.META_SHOWS_NO_CORRUPTION - : DateCorruptionStatus.META_SHOWS_CORRUPTION; + // loop through parquet column metadata to find date columns, check for corrupt values + : checkForCorruptDateValuesInStatistics(footer, columns, autoCorrectCorruptDates); } else { // Possibly an old, un-migrated Drill file, check the column statistics to see if min/max values look corrupt // only applies if there is a date column selected if (createdBy == null || createdBy.equals("parquet-mr")) { - // loop through parquet column metadata to find date columns, check for corrupt values return checkForCorruptDateValuesInStatistics(footer, columns, autoCorrectCorruptDates); } else { // check the created by to see if it is a migrated Drill file @@ -226,7 +226,7 @@ public class ParquetReaderUtility { SemanticVersion semVer = parsedCreatedByVersion.getSemanticVersion(); String pre = semVer.pre + ""; if (semVer.major == 1 && semVer.minor == 8 && semVer.patch == 1 && pre.contains("drill")) { - return DateCorruptionStatus.META_SHOWS_CORRUPTION; + return checkForCorruptDateValuesInStatistics(footer, columns, autoCorrectCorruptDates); } } // written by a tool that wasn't Drill, the dates are not corrupted @@ -244,9 +244,9 @@ public class ParquetReaderUtility { * Detect corrupt date values by looking at the min/max values in the metadata. * * This should only be used when a file does not have enough metadata to determine if - * the data was written with an older version of Drill, or an external tool. Drill - * versions 1.3 and beyond should have enough metadata to confirm that the data was written - * by Drill. 
+ * the data was written with an external tool or an older version of Drill + * ({@link org.apache.drill.exec.store.parquet.ParquetRecordWriter#WRITER_VERSION_PROPERTY} < + * {@link org.apache.drill.exec.store.parquet.ParquetReaderUtility#DRILL_WRITER_VERSION_STD_DATE_FORMAT}) * * This method only checks the first Row Group, because Drill has only ever written * a single Row Group per file. http://git-wip-us.apache.org/repos/asf/drill/blob/eef3b3fb/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestCorruptParquetDateCorrection.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestCorruptParquetDateCorrection.java b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestCorruptParquetDateCorrection.java index 0ab247d..8cd1a85 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestCorruptParquetDateCorrection.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestCorruptParquetDateCorrection.java @@ -17,6 +17,8 @@ */ package org.apache.drill.exec.physical.impl.writer; +import static java.lang.String.format; + import org.apache.drill.PlanTestBase; import org.apache.drill.TestBuilder; import org.apache.drill.common.util.TestTools; @@ -37,10 +39,11 @@ import java.util.regex.Pattern; * Tests for compatibility reading old parquet files after date corruption * issue was fixed in DRILL-4203. * - * Drill was writing non-standard dates into parquet files for all releases - * before 1.9.0. The values have been read by Drill correctly by Drill, but - * external tools like Spark reading the files will see corrupted values for - * all dates that have been written by Drill. + * Drill could write non-standard dates into parquet files. 
This issue is related to + * all drill releases where {@link org.apache.drill.exec.store.parquet.ParquetRecordWriter#WRITER_VERSION_PROPERTY} < + * {@link org.apache.drill.exec.store.parquet.ParquetReaderUtility#DRILL_WRITER_VERSION_STD_DATE_FORMAT} + * The values have been read correctly by Drill, but external tools like Spark reading the files will see + * corrupted values for all dates that have been written by Drill. * * This change corrects the behavior of the Drill parquet writer to correctly * store dates in the format given in the parquet specification. @@ -59,8 +62,7 @@ import java.util.regex.Pattern; * While the old behavior was a consistent shift into an unlikely range * to be used in a modern database (over 10,000 years in the future), these are still * valid date values. In the case where these may have been written into - * files intentionally, and we cannot be certain from the metadata if Drill - * produced the files, an option is included to turn off the auto-correction. + * files intentionally, an option is included to turn off the auto-correction. * Use of this option is assumed to be extremely unlikely, but it is included * for completeness. 
*/ @@ -76,11 +78,8 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { // - one from the 0.6 version of Drill, before files had min/max statistics // - detecting corrupt values must be deferred to actual data page reading // - one from 1.4, where there is a proper created-by, but the corruption is present - private static final String MIXED_CORRUPTED_AND_CORRECTED_DATES_PATH = + private static final String MIXED_CORRUPTED_AND_CORRECT_DATES_PATH = "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/mixed_drill_versions"; - // partitioned with 1.4.0, date values are known to be corrupt - private static final String CORRUPTED_PARTITIONED_DATES_1_4_0_PATH = - "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/partitioned_with_corruption_4203"; // partitioned with 1.2.0, no certain metadata that these were written with Drill // the value will be checked to see that they look corrupt and they will be corrected // by default. Users can use the format plugin option autoCorrectCorruptDates to disable @@ -88,9 +87,13 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { // in the similar range as Drill's corrupt values private static final String CORRUPTED_PARTITIONED_DATES_1_2_PATH = "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/partitioned_with_corruption_4203_1_2"; + // partitioned with 1.4.0, no certain metadata regarding the date corruption status. 
+ // The same detection approach of the corrupt date values as for the files partitioned with 1.2.0 + private static final String CORRUPTED_PARTITIONED_DATES_1_4_0_PATH = + "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/partitioned_with_corruption_4203"; private static final String PARQUET_DATE_FILE_WITH_NULL_FILLED_COLS = "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/null_date_cols_with_corruption_4203.parquet"; - private static final String CORRECTED_PARTITIONED_DATES_1_9_PATH = + private static final String CORRECT_PARTITIONED_DATES_1_9_PATH = "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/1_9_0_partitioned_no_corruption"; private static final String VARCHAR_PARTITIONED = "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/fewtypes_varcharpartition"; @@ -98,11 +101,13 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/fewtypes_datepartition"; private static final String EXCEPTION_WHILE_PARSING_CREATED_BY_META = "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/hive1dot2_fewtypes_null"; + private static final String CORRECT_DATES_1_6_0_PATH = + "[WORKING_PATH]/src/test/resources/parquet/4203_corrupt_dates/correct_dates_and_old_drill_parquet_writer.parquet"; + private static final String PARTITIONED_1_2_FOLDER = "partitioned_with_corruption_4203_1_2"; + private static final String MIXED_CORRUPTED_AND_CORRECT_PARTITIONED_FOLDER = "mixed_partitioned"; private static FileSystem fs; private static Path path; - static String PARTITIONED_1_2_FOLDER = "partitioned_with_corruption_4203_1_2"; - static String MIXED_CORRUPTED_AND_CORRECTED_PARTITIONED_FOLDER = "mixed_partitioned"; @BeforeClass public static void initFs() throws Exception { @@ -116,9 +121,9 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { copyDirectoryIntoTempSpace(CORRUPTED_PARTITIONED_DATES_1_2_PATH); 
copyMetaDataCacheToTempReplacingInternalPaths("parquet/4203_corrupt_dates/drill.parquet.metadata_1_2.requires_replace.txt", PARTITIONED_1_2_FOLDER); - copyDirectoryIntoTempSpace(CORRUPTED_PARTITIONED_DATES_1_2_PATH, MIXED_CORRUPTED_AND_CORRECTED_PARTITIONED_FOLDER); - copyDirectoryIntoTempSpace(CORRECTED_PARTITIONED_DATES_1_9_PATH, MIXED_CORRUPTED_AND_CORRECTED_PARTITIONED_FOLDER); - copyDirectoryIntoTempSpace(CORRUPTED_PARTITIONED_DATES_1_4_0_PATH, MIXED_CORRUPTED_AND_CORRECTED_PARTITIONED_FOLDER); + copyDirectoryIntoTempSpace(CORRUPTED_PARTITIONED_DATES_1_2_PATH, MIXED_CORRUPTED_AND_CORRECT_PARTITIONED_FOLDER); + copyDirectoryIntoTempSpace(CORRECT_PARTITIONED_DATES_1_9_PATH, MIXED_CORRUPTED_AND_CORRECT_PARTITIONED_FOLDER); + copyDirectoryIntoTempSpace(CORRUPTED_PARTITIONED_DATES_1_4_0_PATH, MIXED_CORRUPTED_AND_CORRECT_PARTITIONED_FOLDER); } /** @@ -128,20 +133,20 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { * in the case where we are certain correction is NOT needed. For more info see DRILL-4203. 
*/ @Test - public void testReadPartitionedOnCorrectedDates() throws Exception { + public void testReadPartitionedOnCorrectDates() throws Exception { try { for (String selection : new String[]{"*", "date_col"}) { // for sanity, try reading all partitions without a filter TestBuilder builder = testBuilder() - .sqlQuery("select " + selection + " from table(dfs.`" + CORRECTED_PARTITIONED_DATES_1_9_PATH + "`" + - "(type => 'parquet', autoCorrectCorruptDates => false))") + .sqlQuery("select %s from table(dfs.`%s` (type => 'parquet', autoCorrectCorruptDates => false))", + selection, CORRECT_PARTITIONED_DATES_1_9_PATH) .unOrdered() .baselineColumns("date_col"); - addDateBaselineVals(builder); + addDateBaselineValues(builder); builder.go(); - String query = "select " + selection + " from table(dfs.`" + CORRECTED_PARTITIONED_DATES_1_9_PATH + "` " + - "(type => 'parquet', autoCorrectCorruptDates => false))" + " where date_col = date '1970-01-01'"; + String query = format("select %s from table(dfs.`%s` (type => 'parquet', autoCorrectCorruptDates => false))" + + " where date_col = date '1970-01-01'", selection, CORRECT_PARTITIONED_DATES_1_9_PATH); // verify that pruning is actually taking place testPlanMatchingPatterns(query, new String[]{"numFiles=1"}, null); @@ -161,9 +166,7 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { @Test public void testVarcharPartitionedReadWithCorruption() throws Exception { testBuilder() - .sqlQuery("select date_col from " + - "dfs.`" + VARCHAR_PARTITIONED + "`" + - "where length(varchar_col) = 12") + .sqlQuery("select date_col from dfs.`%s` where length(varchar_col) = 12", VARCHAR_PARTITIONED) .baselineColumns("date_col") .unOrdered() .baselineValues(new DateTime(2039, 4, 9, 0, 0)) @@ -174,24 +177,21 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { @Test public void testDatePartitionedReadWithCorruption() throws Exception { testBuilder() - .sqlQuery("select date_col from " + - "dfs.`" + 
DATE_PARTITIONED + "`" + - "where date_col = '1999-04-08'") + .sqlQuery("select date_col from dfs.`%s` where date_col = '1999-04-08'", DATE_PARTITIONED) .baselineColumns("date_col") .unOrdered() .baselineValues(new DateTime(1999, 4, 8, 0, 0)) .go(); - String sql = "select date_col from dfs.`" + DATE_PARTITIONED + "` where date_col > '1999-04-08'"; - testPlanMatchingPatterns(sql, new String[]{"numFiles=6"}, null); + String query = format("select date_col from dfs.`%s` where date_col > '1999-04-08'", DATE_PARTITIONED); + testPlanMatchingPatterns(query, new String[]{"numFiles=6"}, null); } @Test public void testCorrectDatesAndExceptionWhileParsingCreatedBy() throws Exception { testBuilder() - .sqlQuery("select date_col from " + - "dfs.`" + EXCEPTION_WHILE_PARSING_CREATED_BY_META + - "` where to_date(date_col, 'yyyy-mm-dd') < '1997-01-02'") + .sqlQuery("select date_col from dfs.`%s` where to_date(date_col, 'yyyy-mm-dd') < '1997-01-02'", + EXCEPTION_WHILE_PARSING_CREATED_BY_META) .baselineColumns("date_col") .unOrdered() .baselineValues(new DateTime(1996, 1, 29, 0, 0)) @@ -201,68 +201,34 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { .go(); } - /** - * Test reading a directory full of partitioned parquet files with dates, these files have a drill version - * number of 1.4.0 in their footers, so we can be certain they are corrupt. The option to disable the - * correction is passed, but it will not change the result in the case where we are certain correction - * is needed. For more info see DRILL-4203. 
- */ - @Test - public void testReadPartitionedOnCorruptedDates() throws Exception { - try { - for (String selection : new String[]{"*", "date_col"}) { - // for sanity, try reading all partitions without a filter - TestBuilder builder = testBuilder() - .sqlQuery("select " + selection + " from table(dfs.`" + CORRUPTED_PARTITIONED_DATES_1_4_0_PATH + "`" + - "(type => 'parquet', autoCorrectCorruptDates => false))") - .unOrdered() - .baselineColumns("date_col"); - addDateBaselineVals(builder); - builder.go(); - - String query = "select " + selection + " from table(dfs.`" + CORRUPTED_PARTITIONED_DATES_1_4_0_PATH + "` " + - "(type => 'parquet', autoCorrectCorruptDates => false))" + " where date_col = date '1970-01-01'"; - // verify that pruning is actually taking place - testPlanMatchingPatterns(query, new String[]{"numFiles=1"}, null); - - // read with a filter on the partition column - testBuilder() - .sqlQuery(query) - .unOrdered() - .baselineColumns("date_col") - .baselineValues(new DateTime(1970, 1, 1, 0, 0)) - .go(); - } - } finally { - test("alter session reset all"); - } - } @Test public void testReadPartitionedOnCorruptedDates_UserDisabledCorrection() throws Exception { try { for (String selection : new String[]{"*", "date_col"}) { - // for sanity, try reading all partitions without a filter - TestBuilder builder = testBuilder() - .sqlQuery("select " + selection + " from table(dfs.`" + CORRUPTED_PARTITIONED_DATES_1_2_PATH + "`" + - "(type => 'parquet', autoCorrectCorruptDates => false))") - .unOrdered() - .baselineColumns("date_col"); - addCorruptedDateBaselineVals(builder); - builder.go(); - - String query = "select " + selection + " from table(dfs.`" + CORRUPTED_PARTITIONED_DATES_1_2_PATH + "` " + - "(type => 'parquet', autoCorrectCorruptDates => false))" + " where date_col = cast('15334-03-17' as date)"; - // verify that pruning is actually taking place - testPlanMatchingPatterns(query, new String[]{"numFiles=1"}, null); - - // read with a filter on the 
partition column - testBuilder() - .sqlQuery(query) - .unOrdered() - .baselineColumns("date_col") - .baselineValues(new DateTime(15334, 03, 17, 0, 0)) - .go(); + for (String table : new String[]{CORRUPTED_PARTITIONED_DATES_1_2_PATH, CORRUPTED_PARTITIONED_DATES_1_4_0_PATH}) { + // for sanity, try reading all partitions without a filter + TestBuilder builder = testBuilder() + .sqlQuery("select %s from table(dfs.`%s` (type => 'parquet', autoCorrectCorruptDates => false))", + selection, table) + .unOrdered() + .baselineColumns("date_col"); + addCorruptedDateBaselineValues(builder); + builder.go(); + + String query = format("select %s from table(dfs.`%s` (type => 'parquet', " + + "autoCorrectCorruptDates => false)) where date_col = cast('15334-03-17' as date)", selection, table); + // verify that pruning is actually taking place + testPlanMatchingPatterns(query, new String[]{"numFiles=1"}, null); + + // read with a filter on the partition column + testBuilder() + .sqlQuery(query) + .unOrdered() + .baselineColumns("date_col") + .baselineValues(new DateTime(15334, 3, 17, 0, 0)) + .go(); + } } } finally { test("alter session reset all"); @@ -270,29 +236,31 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { } @Test - public void testCorruptValDetectionDuringPruning() throws Exception { + public void testCorruptValueDetectionDuringPruning() throws Exception { try { for (String selection : new String[]{"*", "date_col"}) { - // for sanity, try reading all partitions without a filter - TestBuilder builder = testBuilder() - .sqlQuery("select " + selection + " from dfs.`" + CORRUPTED_PARTITIONED_DATES_1_2_PATH + "`") - .unOrdered() - .baselineColumns("date_col"); - addDateBaselineVals(builder); - builder.go(); - - String query = "select " + selection + " from dfs.`" + CORRUPTED_PARTITIONED_DATES_1_2_PATH + "`" + - " where date_col = date '1970-01-01'"; - // verify that pruning is actually taking place - testPlanMatchingPatterns(query, new 
String[]{"numFiles=1"}, null); - - // read with a filter on the partition column - testBuilder() - .sqlQuery(query) - .unOrdered() - .baselineColumns("date_col") - .baselineValues(new DateTime(1970, 1, 1, 0, 0)) - .go(); + for (String table : new String[]{CORRUPTED_PARTITIONED_DATES_1_2_PATH, CORRUPTED_PARTITIONED_DATES_1_4_0_PATH}) { + // for sanity, try reading all partitions without a filter + TestBuilder builder = testBuilder() + .sqlQuery("select %s from dfs.`%s`", selection, table) + .unOrdered() + .baselineColumns("date_col"); + addDateBaselineValues(builder); + builder.go(); + + String query = format("select %s from dfs.`%s`" + + " where date_col = date '1970-01-01'", selection, table); + // verify that pruning is actually taking place + testPlanMatchingPatterns(query, new String[]{"numFiles=1"}, null); + + // read with a filter on the partition column + testBuilder() + .sqlQuery(query) + .unOrdered() + .baselineColumns("date_col") + .baselineValues(new DateTime(1970, 1, 1, 0, 0)) + .go(); + } } } finally { test("alter session reset all"); @@ -313,8 +281,8 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { @Test public void testReadCorruptDatesWithNullFilledColumns() throws Exception { testBuilder() - .sqlQuery("select null_dates_1, null_dates_2, non_existent_field, date_col from dfs.`" + - PARQUET_DATE_FILE_WITH_NULL_FILLED_COLS + "`") + .sqlQuery("select null_dates_1, null_dates_2, non_existent_field, date_col from dfs.`%s`", + PARQUET_DATE_FILE_WITH_NULL_FILLED_COLS) .unOrdered() .baselineColumns("null_dates_1", "null_dates_2", "non_existent_field", "date_col") .baselineValues(null, null, null, new DateTime(1970, 1, 1, 0, 0)) @@ -332,7 +300,7 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { readFilesWithUserDisabledAutoCorrection(); try { - test(String.format("alter session set %s = true", ExecConstants.PARQUET_NEW_RECORD_READER)); + test("alter session set %s = true", ExecConstants.PARQUET_NEW_RECORD_READER); 
// read all of the types with the complex reader readFilesWithUserDisabledAutoCorrection(); } finally { @@ -352,34 +320,34 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { @Test public void testReadMixedOldAndNewBothReaders() throws Exception { /// read once with the flat reader - readMixedCorruptedAndCorrectedDates(); + readMixedCorruptedAndCorrectDates(); try { // read all of the types with the complex reader - test(String.format("alter session set %s = true", ExecConstants.PARQUET_NEW_RECORD_READER)); - readMixedCorruptedAndCorrectedDates(); + test("alter session set %s = true", ExecConstants.PARQUET_NEW_RECORD_READER); + readMixedCorruptedAndCorrectDates(); } finally { - test(String.format("alter session set %s = false", ExecConstants.PARQUET_NEW_RECORD_READER)); + test("alter session set %s = false", ExecConstants.PARQUET_NEW_RECORD_READER); } } @Test public void testReadOldMetadataCacheFile() throws Exception { // for sanity, try reading all partitions without a filter - String query = "select date_col from dfs.`" + new Path(path, PARTITIONED_1_2_FOLDER) + "`"; + String query = format("select date_col from dfs.`%s`", new Path(path, PARTITIONED_1_2_FOLDER)); TestBuilder builder = testBuilder() .sqlQuery(query) .unOrdered() .baselineColumns("date_col"); - addDateBaselineVals(builder); + addDateBaselineValues(builder); builder.go(); testPlanMatchingPatterns(query, new String[]{"usedMetadataFile=true"}, null); } @Test public void testReadOldMetadataCacheFileWithPruning() throws Exception { - String query = "select date_col from dfs.`" + new Path(path, PARTITIONED_1_2_FOLDER) + "`" + - " where date_col = date '1970-01-01'"; + String query = format("select date_col from dfs.`%s` where date_col = date '1970-01-01'", + new Path(path, PARTITIONED_1_2_FOLDER)); // verify that pruning is actually taking place testPlanMatchingPatterns(query, new String[]{"numFiles=1", "usedMetadataFile=true"}, null); @@ -396,15 +364,16 @@ public class 
TestCorruptParquetDateCorrection extends PlanTestBase { public void testReadOldMetadataCacheFileOverrideCorrection() throws Exception { // for sanity, try reading all partitions without a filter TestBuilder builder = testBuilder() - .sqlQuery("select date_col from table(dfs.`" + new Path(path, PARTITIONED_1_2_FOLDER) + "`" + - "(type => 'parquet', autoCorrectCorruptDates => false))") + .sqlQuery("select date_col from table(dfs.`%s` (type => 'parquet', autoCorrectCorruptDates => false))", + new Path(path, PARTITIONED_1_2_FOLDER)) .unOrdered() .baselineColumns("date_col"); - addCorruptedDateBaselineVals(builder); + addCorruptedDateBaselineValues(builder); builder.go(); - String query = "select date_col from table(dfs.`" + new Path(path, PARTITIONED_1_2_FOLDER) + "` " + - "(type => 'parquet', autoCorrectCorruptDates => false))" + " where date_col = cast('15334-03-17' as date)"; + String query = format("select date_col from table(dfs.`%s` (type => 'parquet', " + + "autoCorrectCorruptDates => false)) where date_col = cast('15334-03-17' as date)", + new Path(path, PARTITIONED_1_2_FOLDER)); // verify that pruning is actually taking place testPlanMatchingPatterns(query, new String[]{"numFiles=1", "usedMetadataFile=true"}, null); @@ -413,27 +382,26 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { .sqlQuery(query) .unOrdered() .baselineColumns("date_col") - .baselineValues(new DateTime(15334, 03, 17, 0, 0)) + .baselineValues(new DateTime(15334, 3, 17, 0, 0)) .go(); } @Test public void testReadNewMetadataCacheFileOverOldAndNewFiles() throws Exception { - String table = "dfs.`" + new Path(path, MIXED_CORRUPTED_AND_CORRECTED_PARTITIONED_FOLDER) + "`"; + String table = format("dfs.`%s`", new Path(path, MIXED_CORRUPTED_AND_CORRECT_PARTITIONED_FOLDER)); copyMetaDataCacheToTempReplacingInternalPaths("parquet/4203_corrupt_dates/" + - "mixed_version_partitioned_metadata.requires_replace.txt", MIXED_CORRUPTED_AND_CORRECTED_PARTITIONED_FOLDER); + 
"mixed_version_partitioned_metadata.requires_replace.txt", MIXED_CORRUPTED_AND_CORRECT_PARTITIONED_FOLDER); // for sanity, try reading all partitions without a filter TestBuilder builder = testBuilder() .sqlQuery("select date_col from " + table) .unOrdered() .baselineColumns("date_col"); - addDateBaselineVals(builder); - addDateBaselineVals(builder); - addDateBaselineVals(builder); + addDateBaselineValues(builder); + addDateBaselineValues(builder); + addDateBaselineValues(builder); builder.go(); - String query = "select date_col from " + table + - " where date_col = date '1970-01-01'"; + String query = format("select date_col from %s where date_col = date '1970-01-01'", table); // verify that pruning is actually taking place testPlanMatchingPatterns(query, new String[]{"numFiles=3", "usedMetadataFile=true"}, null); @@ -448,28 +416,38 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { .go(); } + @Test + public void testCorrectDateValuesGeneratedByOldVersionOfDrill() throws Exception { + testBuilder() + .sqlQuery("select i_rec_end_date from dfs.`%s` limit 1", CORRECT_DATES_1_6_0_PATH) + .baselineColumns("i_rec_end_date") + .unOrdered() + .baselineValues(new DateTime(2000, 10, 26, 0, 0)) + .go(); + } + /** * Read a directory with parquet files where some have corrupted dates, see DRILL-4203. 
* @throws Exception */ - private void readMixedCorruptedAndCorrectedDates() throws Exception { + private void readMixedCorruptedAndCorrectDates() throws Exception { // ensure that selecting the date column explicitly or as part of a star still results // in checking the file metadata for date columns (when we need to check the statistics // for bad values) to set the flag that the values are corrupt for (String selection : new String[] {"*", "date_col"}) { TestBuilder builder = testBuilder() - .sqlQuery("select " + selection + " from dfs.`" + MIXED_CORRUPTED_AND_CORRECTED_DATES_PATH + "`") + .sqlQuery("select %s from dfs.`%s`", selection, MIXED_CORRUPTED_AND_CORRECT_DATES_PATH) .unOrdered() .baselineColumns("date_col"); for (int i = 0; i < 4; i++) { - addDateBaselineVals(builder); + addDateBaselineValues(builder); } builder.go(); } } - private void addDateBaselineVals(TestBuilder builder) { + private void addDateBaselineValues(TestBuilder builder) { builder .baselineValues(new DateTime(1970, 1, 1, 0, 0)) .baselineValues(new DateTime(1970, 1, 2, 0, 0)) @@ -480,16 +458,16 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { } /** - * These are the same values added in the addDateBaselineVals, shifted as corrupt values + * These are the same values added in the addDateBaselineValues, shifted as corrupt values */ - private void addCorruptedDateBaselineVals(TestBuilder builder) { + private void addCorruptedDateBaselineValues(TestBuilder builder) { builder - .baselineValues(new DateTime(15334, 03, 17, 0, 0)) - .baselineValues(new DateTime(15334, 03, 18, 0, 0)) - .baselineValues(new DateTime(15334, 03, 15, 0, 0)) - .baselineValues(new DateTime(15334, 03, 16, 0, 0)) - .baselineValues(new DateTime(15264, 03, 16, 0, 0)) - .baselineValues(new DateTime(15379, 03, 17, 0, 0)); + .baselineValues(new DateTime(15334, 3, 17, 0, 0)) + .baselineValues(new DateTime(15334, 3, 18, 0, 0)) + .baselineValues(new DateTime(15334, 3, 15, 0, 0)) + .baselineValues(new 
DateTime(15334, 3, 16, 0, 0)) + .baselineValues(new DateTime(15264, 3, 16, 0, 0)) + .baselineValues(new DateTime(15379, 3, 17, 0, 0)); } private void readFilesWithUserDisabledAutoCorrection() throws Exception { @@ -498,14 +476,14 @@ public class TestCorruptParquetDateCorrection extends PlanTestBase { // for bad values) to set the flag that the values are corrupt for (String selection : new String[] {"*", "date_col"}) { TestBuilder builder = testBuilder() - .sqlQuery("select " + selection + " from table(dfs.`" + MIXED_CORRUPTED_AND_CORRECTED_DATES_PATH + "`" + - "(type => 'parquet', autoCorrectCorruptDates => false))") + .sqlQuery("select %s from table(dfs.`%s` (type => 'parquet', autoCorrectCorruptDates => false))", + selection, MIXED_CORRUPTED_AND_CORRECT_DATES_PATH) .unOrdered() .baselineColumns("date_col"); - addDateBaselineVals(builder); - addDateBaselineVals(builder); - addCorruptedDateBaselineVals(builder); - addCorruptedDateBaselineVals(builder); + addDateBaselineValues(builder); + addCorruptedDateBaselineValues(builder); + addCorruptedDateBaselineValues(builder); + addCorruptedDateBaselineValues(builder); builder.go(); } } http://git-wip-us.apache.org/repos/asf/drill/blob/eef3b3fb/exec/java-exec/src/test/resources/parquet/4203_corrupt_dates/correct_dates_and_old_drill_parquet_writer.parquet ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/parquet/4203_corrupt_dates/correct_dates_and_old_drill_parquet_writer.parquet b/exec/java-exec/src/test/resources/parquet/4203_corrupt_dates/correct_dates_and_old_drill_parquet_writer.parquet new file mode 100644 index 0000000..6d81db0 Binary files /dev/null and b/exec/java-exec/src/test/resources/parquet/4203_corrupt_dates/correct_dates_and_old_drill_parquet_writer.parquet differ
