This is an automated email from the ASF dual-hosted git repository.

simhadrig pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new ea70fb82597 HIVE-28249: Parquet legacy timezone conversion converts
March 1st to 29th Feb and fails with not a leap year exception (#5241)
(Simhadri Govindappa, reviewed by Denys Kuzmenko and Stamatis Zampetakis)
ea70fb82597 is described below

commit ea70fb82597bf001da51e7b90446442904e83fb2
Author: Simhadri Govindappa <[email protected]>
AuthorDate: Thu May 16 12:27:49 2024 +0530

    HIVE-28249: Parquet legacy timezone conversion converts March 1st to 29th
Feb and fails with not a leap year exception (#5241) (Simhadri Govindappa,
reviewed by Denys Kuzmenko and Stamatis Zampetakis)
---
 .../apache/hadoop/hive/common/type/Timestamp.java  |   7 +-
 .../hadoop/hive/common/type/TimestampTZUtil.java   |  22 +++-
 data/files/legacyLeapYearInSingaporeTZ             | Bin 0 -> 365 bytes
 .../ql/io/parquet/timestamp/NanoTimeUtils.java     |  27 +++--
 .../TestParquetTimestampsHive2Compatibility.java   | 120 +++++++++++++++++++++
 .../clientpositive/parquet_historical_timestamp.q  |  14 ++-
 .../llap/parquet_historical_timestamp.q.out        |  37 ++++++-
 7 files changed, 214 insertions(+), 13 deletions(-)

diff --git a/common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java 
b/common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java
index 30b1074c934..c026f8a8e9e 100644
--- a/common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java
+++ b/common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java
@@ -191,12 +191,17 @@ public class Timestamp implements Comparable<Timestamp> {
       try {
         localDateTime = LocalDateTime.parse(s);
       } catch (DateTimeException e2) {
-        throw new IllegalArgumentException("Cannot create timestamp, parsing 
error " + s);
+        throw new IllegalArgumentException("Cannot create timestamp, parsing 
error " + s, e);
       }
     }
     return new Timestamp(localDateTime);
   }
 
+  public Timestamp minusDays(long days) {
+    localDateTime = localDateTime.minusDays(days);
+    return this;
+  }
+
   public static Timestamp getTimestampFromTime(String s) {
     return new Timestamp(LocalDateTime.of(LocalDate.now(),
         LocalTime.parse(s, DateTimeFormatter.ISO_LOCAL_TIME)));
diff --git 
a/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java 
b/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
index 7a15c4abad7..7d9208b89b4 100644
--- a/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
+++ b/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
@@ -25,6 +25,7 @@ import java.time.Instant;
 import java.time.LocalDate;
 import java.time.LocalDateTime;
 import java.time.LocalTime;
+import java.time.Month;
 import java.time.ZoneId;
 import java.time.ZoneOffset;
 import java.time.ZonedDateTime;
@@ -39,8 +40,7 @@ import java.time.temporal.TemporalQueries;
 import java.util.TimeZone;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-
-import org.apache.hive.common.util.DateUtils;
+import org.apache.commons.lang3.time.DateUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -50,6 +50,7 @@ import static java.time.temporal.ChronoField.MINUTE_OF_HOUR;
 import static java.time.temporal.ChronoField.MONTH_OF_YEAR;
 import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;
 import static java.time.temporal.ChronoField.YEAR;
+import static org.apache.hive.common.util.DateUtils.NANOS_PER_SEC;
 
 public class TimestampTZUtil {
 
@@ -58,6 +59,8 @@ public class TimestampTZUtil {
   private static final LocalTime DEFAULT_LOCAL_TIME = LocalTime.of(0, 0);
   private static final Pattern SINGLE_DIGIT_PATTERN = 
Pattern.compile("[\\+-]\\d:\\d\\d");
 
+  public static final String NOT_LEAP_YEAR = "not a leap year";
+
   private static final DateTimeFormatter TIMESTAMP_FORMATTER = new 
DateTimeFormatterBuilder()
       // Date and Time Parts
       .appendValue(YEAR, 4, 10, 
SignStyle.NORMAL).appendLiteral('-').appendValue(MONTH_OF_YEAR, 2, 2, 
SignStyle.NORMAL)
@@ -193,7 +196,18 @@ public class TimestampTZUtil {
         java.util.Date date = formatter.parse(ts.format(TIMESTAMP_FORMATTER));
         // Set the formatter to use a different timezone
         formatter.setTimeZone(TimeZone.getTimeZone(toZone));
-        Timestamp result = Timestamp.valueOf(formatter.format(date));
+        Timestamp result;
+        try {
+          result = Timestamp.valueOf(formatter.format(date));
+        } catch (IllegalArgumentException ex) {
+          if (ex.getCause().getMessage().contains(NOT_LEAP_YEAR)) {
+            int leapYearDateAdjustment = ts.getMonth() == 
Month.MARCH.getValue() ? 2 : -2;
+            result = 
Timestamp.valueOf(formatter.format(DateUtils.addDays(date, 
leapYearDateAdjustment)))
+                .minusDays(leapYearDateAdjustment);
+          } else {
+            throw ex;
+          }
+        }
         result.setNanos(ts.getNanos());
         return result;
       } catch (ParseException e) {
@@ -211,6 +225,6 @@ public class TimestampTZUtil {
   }
 
   public static double convertTimestampTZToDouble(TimestampTZ timestampTZ) {
-    return timestampTZ.getEpochSecond() + timestampTZ.getNanos() / 
DateUtils.NANOS_PER_SEC;
+    return timestampTZ.getEpochSecond() + timestampTZ.getNanos() / 
NANOS_PER_SEC;
   }
 }
diff --git a/data/files/legacyLeapYearInSingaporeTZ 
b/data/files/legacyLeapYearInSingaporeTZ
new file mode 100644
index 00000000000..d9476cedc23
Binary files /dev/null and b/data/files/legacyLeapYearInSingaporeTZ differ
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java 
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
index 78dce528842..05635ff9768 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
@@ -13,6 +13,8 @@
  */
 package org.apache.hadoop.hive.ql.io.parquet.timestamp;
 
+import java.time.DateTimeException;
+import java.time.LocalDateTime;
 import java.time.ZoneId;
 import java.time.ZoneOffset;
 import java.util.Calendar;
@@ -100,12 +102,23 @@ public class NanoTimeUtils {
       julianDay--;
     }
 
-    JulianDate jDateTime;
-    jDateTime = JulianDate.of((double) julianDay);
+    JulianDate jDateTime = JulianDate.of((double) julianDay);
+    LocalDateTime localDateTime;
+    int leapYearDateAdjustment = 0;
+    try {
+      localDateTime = jDateTime.toLocalDateTime();
+    } catch (DateTimeException e) {
+      if (e.getMessage().contains(TimestampTZUtil.NOT_LEAP_YEAR) && 
legacyConversion) {
+        leapYearDateAdjustment = 2;
+        localDateTime = 
jDateTime.add(leapYearDateAdjustment).toLocalDateTime();
+      } else {
+        throw e;
+      }
+    }
     Calendar calendar = getGMTCalendar();
-    calendar.set(Calendar.YEAR, jDateTime.toLocalDateTime().getYear());
-    calendar.set(Calendar.MONTH, 
jDateTime.toLocalDateTime().getMonth().getValue() - 1); //java calendar index 
starting at 1.
-    calendar.set(Calendar.DAY_OF_MONTH, 
jDateTime.toLocalDateTime().getDayOfMonth());
+    calendar.set(Calendar.YEAR, localDateTime.getYear());
+    calendar.set(Calendar.MONTH, localDateTime.getMonth().getValue() - 1); 
//java calendar index starting at 1.
+    calendar.set(Calendar.DAY_OF_MONTH, localDateTime.getDayOfMonth());
 
     int hour = (int) (remainder / (NANOS_PER_HOUR));
     remainder = remainder % (NANOS_PER_HOUR);
@@ -119,7 +132,9 @@ public class NanoTimeUtils {
     calendar.set(Calendar.SECOND, seconds);
 
     Timestamp ts = Timestamp.ofEpochMilli(calendar.getTimeInMillis(), (int) 
nanos);
-    ts = TimestampTZUtil.convertTimestampToZone(ts, ZoneOffset.UTC, 
targetZone, legacyConversion);
+    ts = TimestampTZUtil.convertTimestampToZone(ts, ZoneOffset.UTC, 
targetZone, legacyConversion)
+        .minusDays(leapYearDateAdjustment);
+
     return ts;
   }
 }
diff --git 
a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampsHive2Compatibility.java
 
b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampsHive2Compatibility.java
index db06e43d157..ac0ced65af1 100644
--- 
a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampsHive2Compatibility.java
+++ 
b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampsHive2Compatibility.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.hive.common.type.Timestamp;
 import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime;
 import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.MethodSource;
 
@@ -97,6 +98,69 @@ class TestParquetTimestampsHive2Compatibility {
     }
   }
 
+  /**
+   * Tests that timestamps written using Hive2 APIs on julian leap years are 
read correctly by Hive4 APIs when legacy
+   * conversion is on.
+   */
+  @ParameterizedTest(name = "{0}")
+  @MethodSource("generateTimestampsAndZoneIds")
+  void testWriteHive2ReadHive4UsingLegacyConversionWithJulianLeapYears(String 
timestampString, String zoneId) {
+    TimeZone original = TimeZone.getDefault();
+    try {
+      TimeZone.setDefault(TimeZone.getTimeZone(zoneId));
+      NanoTime nt = writeHive2(timestampString);
+      Timestamp ts = readHive4(nt, zoneId, true);
+      assertEquals(timestampString, ts.toString());
+    } finally {
+      TimeZone.setDefault(original);
+    }
+  }
+
+  @ParameterizedTest(name = "{0}")
+  @MethodSource("generateTimestampsAndZoneIds28thFeb")
+  void 
testWriteHive2ReadHive4UsingLegacyConversionWithJulianLeapYearsFor28thFeb(String
 timestampString,
+      String zoneId) {
+    TimeZone original = TimeZone.getDefault();
+    try {
+      TimeZone.setDefault(TimeZone.getTimeZone(zoneId));
+      NanoTime nt = writeHive2(timestampString);
+      Timestamp ts = readHive4(nt, zoneId, true);
+      assertEquals(timestampString, ts.toString());
+    } finally {
+      TimeZone.setDefault(original);
+    }
+  }
+
+  @ParameterizedTest(name = " - From: Zone {0}, timestamp: {2}, To: Zone:{1}, 
expected Timestamp {3}")
+  @MethodSource("julianLeapYearEdgeCases")
+  void 
testWriteHive2ReadHive4UsingLegacyConversionWithJulianLeapYearsEdgeCase(String 
fromZoneId, String toZoneId,
+      String timestampString, String expected) {
+    TimeZone original = TimeZone.getDefault();
+    try {
+      TimeZone.setDefault(TimeZone.getTimeZone(fromZoneId));
+      NanoTime nt = writeHive2(timestampString);
+      Timestamp ts = readHive4(nt, toZoneId, true);
+      assertEquals(expected, ts.toString());
+    } finally {
+      TimeZone.setDefault(original);
+    }
+  }
+
+  private static Stream<Arguments> julianLeapYearEdgeCases() {
+    return Stream.of(Arguments.of("GMT-12:00", "GMT+14:00", "0200-02-27 
22:00:00.000000001",
+            "0200-03-01 00:00:00.000000001"),
+        Arguments.of("GMT+14:00", "GMT-12:00", "0200-03-01 00:00:00.000000001",
+            "0200-02-27 22:00:00.000000001"),
+        Arguments.of("GMT+14:00", "GMT-12:00", "0200-03-02 00:00:00.000000001",
+            "0200-02-28 22:00:00.000000001"),
+        Arguments.of("GMT-12:00", "GMT+14:00", "0200-03-02 00:00:00.000000001",
+            "0200-03-03 02:00:00.000000001"),
+        Arguments.of("GMT-12:00", "GMT+12:00", "0200-02-28 
00:00:00.000000001", "0200-03-01 00:00:00.000000001"),
+        Arguments.of("GMT+12:00", "GMT-12:00", "0200-03-01 
00:00:00.000000001", "0200-02-28 00:00:00.000000001"),
+        Arguments.of("Asia/Singapore", "Asia/Singapore", "0200-03-01 
00:00:00.000000001",
+            "0200-03-01 00:00:00.000000001"));
+  }
+
   /**
    * Tests that timestamps written using Hive4 APIs are read correctly by 
Hive4 APIs when legacy conversion is on. 
    */
@@ -178,6 +242,62 @@ class TestParquetTimestampsHive2Compatibility {
     .limit(3000), Stream.of("9999-12-31 23:59:59.999"));
   }
 
+  /** Generates timestamps for different timezone. Here we are testing UTC+14 
: Pacific/Kiritimati ,
+   *  UTC-12 : Etc/GMT+12 along with few other zones
+   */
+  private static Stream<Arguments> generateTimestampsAndZoneIds() {
+    return generateJulianLeapYearTimestamps().flatMap(
+        timestampString -> Stream.of("Asia/Singapore", "Pacific/Kiritimati", 
"Etc/GMT+12", "Pacific/Niue")
+            .map(zoneId -> Arguments.of(timestampString, zoneId)));
+  }
+
+  private static Stream<Arguments> generateTimestampsAndZoneIds28thFeb() {
+    return generateJulianLeapYearTimestamps28thFeb().flatMap(
+        timestampString -> Stream.of("Asia/Singapore", "Pacific/Kiritimati", 
"Etc/GMT+12", "Pacific/Niue")
+            .map(zoneId -> Arguments.of(timestampString, zoneId)));
+  }
+
+  private static Stream<String> generateJulianLeapYearTimestamps() {
+    return Stream.concat(Stream.generate(new Supplier<String>() {
+          int i = 0;
+
+          @Override
+          public String get() {
+            StringBuilder sb = new StringBuilder(29);
+            int year = ((i % 9999) + 1) * 100;
+            sb.append(zeros(4 - digits(year)));
+            sb.append(year);
+            sb.append("-03-01 00:00:00.000000001");
+            i++;
+            return sb.toString();
+          }
+        })
+        // Exclude dates falling in the default Gregorian change date since 
legacy code does not handle that interval
+        // gracefully. It is expected that these do not work well when legacy 
APIs are in use. 0200-03-01 01:01:01.000000001
+        .filter(s -> !s.startsWith("1582-10")).limit(3000), 
Stream.of("9999-12-31 23:59:59.999"));
+  }
+
+  private static Stream<String> generateJulianLeapYearTimestamps28thFeb() {
+    return Stream.concat(Stream.generate(new Supplier<String>() {
+          int i = 0;
+
+          @Override
+          public String get() {
+            StringBuilder sb = new StringBuilder(29);
+            int year = ((i % 9999) + 1) * 100;
+            sb.append(zeros(4 - digits(year)));
+            sb.append(year);
+            sb.append("-02-28 00:00:00.000000001");
+            i++;
+            return sb.toString();
+          }
+        })
+        // Exclude dates falling in the default Gregorian change date since 
legacy code does not handle that interval
+        // gracefully. It is expected that these do not work well when legacy 
APIs are in use. 0200-03-01 01:01:01.000000001
+        .filter(s -> !s.startsWith("1582-10")).limit(3000), 
Stream.of("9999-12-31 23:59:59.999"));
+  }
+
+
   private static int digits(int number) {
     int digits = 0;
     do {
diff --git a/ql/src/test/queries/clientpositive/parquet_historical_timestamp.q 
b/ql/src/test/queries/clientpositive/parquet_historical_timestamp.q
index 3d2b382af74..7a4f7e44c7c 100644
--- a/ql/src/test/queries/clientpositive/parquet_historical_timestamp.q
+++ b/ql/src/test/queries/clientpositive/parquet_historical_timestamp.q
@@ -1,3 +1,4 @@
+--! qt:timezone:Asia/Singapore
 --These files were created by inserting timestamp '2019-01-01 
00:30:30.111111111' where writer time zone is Europe/Rome.
 
 --older writer: time zone dependent behavior. convert to reader time zone
@@ -13,4 +14,15 @@ create table new_table (t timestamp) stored as parquet;
 
 load data local inpath 
'../../data/files/parquet_historical_timestamp_new.parq' into table new_table;
 
-select * from new_table;
\ No newline at end of file
+select * from new_table;
+
+-- Inserted data in singapore timezone: insert into default.test_sgt700 values 
('0500-03-01 00:00:00'),('0600-03-01
+-- 00:00:00'),('0700-03-01 00:00:00'), ('0200-03-01 00:00:00'), ('0400-03-01 
00:00:00'), ('0800-03-02 00:00:00'),
+--('0800-03-03 00:00:00');
+
+CREATE EXTERNAL TABLE `TEST_SGT1`(`currtime` timestamp) ROW FORMAT SERDE 
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat';
+
+LOAD DATA LOCAL INPATH '../../data/files/legacyLeapYearInSingaporeTZ' INTO 
TABLE TEST_SGT1;
+SELECT * FROM TEST_SGT1;
diff --git 
a/ql/src/test/results/clientpositive/llap/parquet_historical_timestamp.q.out 
b/ql/src/test/results/clientpositive/llap/parquet_historical_timestamp.q.out
index 9d50b226777..2839d8eeaad 100644
--- a/ql/src/test/results/clientpositive/llap/parquet_historical_timestamp.q.out
+++ b/ql/src/test/results/clientpositive/llap/parquet_historical_timestamp.q.out
@@ -22,7 +22,7 @@ POSTHOOK: query: select * from legacy_table
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@legacy_table
 #### A masked pattern was here ####
-2018-12-31 16:30:30.111111111
+2019-01-01 08:30:30.111111111
 PREHOOK: query: create table new_table (t timestamp) stored as parquet
 PREHOOK: type: CREATETABLE
 PREHOOK: Output: database:default
@@ -48,3 +48,38 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@new_table
 #### A masked pattern was here ####
 2019-01-01 00:30:30.111111111
+PREHOOK: query: CREATE EXTERNAL TABLE `TEST_SGT1`(`currtime` timestamp) ROW 
FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@TEST_SGT1
+POSTHOOK: query: CREATE EXTERNAL TABLE `TEST_SGT1`(`currtime` timestamp) ROW 
FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@TEST_SGT1
+PREHOOK: query: LOAD DATA LOCAL INPATH 
'../../data/files/legacyLeapYearInSingaporeTZ' INTO TABLE TEST_SGT1
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@test_sgt1
+POSTHOOK: query: LOAD DATA LOCAL INPATH 
'../../data/files/legacyLeapYearInSingaporeTZ' INTO TABLE TEST_SGT1
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@test_sgt1
+PREHOOK: query: SELECT * FROM TEST_SGT1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_sgt1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM TEST_SGT1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_sgt1
+#### A masked pattern was here ####
+0500-03-01 00:00:00
+0600-03-01 00:00:00
+0700-03-01 00:00:00
+0200-03-01 00:00:00
+0400-03-01 00:00:00
+0800-03-02 00:00:00
+0800-03-03 00:00:00

Reply via email to