This is an automated email from the ASF dual-hosted git repository.
simhadrig pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new ea70fb82597 HIVE-28249: Parquet legacy timezone conversion converts
March 1st to 29th Feb and fails with not a leap year exception. (#5241)
(Simhadri Govindappa, reviewed by Denys Kuzmenko and Stamatis Zampetakis)
ea70fb82597 is described below
commit ea70fb82597bf001da51e7b90446442904e83fb2
Author: Simhadri Govindappa <[email protected]>
AuthorDate: Thu May 16 12:27:49 2024 +0530
HIVE-28249: Parquet legacy timezone conversion converts March 1st to 29th
Feb and fails with not a leap year exception. (#5241) (Simhadri Govindappa,
reviewed by Denys Kuzmenko and Stamatis Zampetakis)
---
.../apache/hadoop/hive/common/type/Timestamp.java | 7 +-
.../hadoop/hive/common/type/TimestampTZUtil.java | 22 +++-
data/files/legacyLeapYearInSingaporeTZ | Bin 0 -> 365 bytes
.../ql/io/parquet/timestamp/NanoTimeUtils.java | 27 +++--
.../TestParquetTimestampsHive2Compatibility.java | 120 +++++++++++++++++++++
.../clientpositive/parquet_historical_timestamp.q | 14 ++-
.../llap/parquet_historical_timestamp.q.out | 37 ++++++-
7 files changed, 214 insertions(+), 13 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java
b/common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java
index 30b1074c934..c026f8a8e9e 100644
--- a/common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java
+++ b/common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java
@@ -191,12 +191,17 @@ public class Timestamp implements Comparable<Timestamp> {
try {
localDateTime = LocalDateTime.parse(s);
} catch (DateTimeException e2) {
- throw new IllegalArgumentException("Cannot create timestamp, parsing
error " + s);
+ throw new IllegalArgumentException("Cannot create timestamp, parsing
error " + s, e);
}
}
return new Timestamp(localDateTime);
}
+ public Timestamp minusDays(long days) {
+ localDateTime = localDateTime.minusDays(days);
+ return this;
+ }
+
public static Timestamp getTimestampFromTime(String s) {
return new Timestamp(LocalDateTime.of(LocalDate.now(),
LocalTime.parse(s, DateTimeFormatter.ISO_LOCAL_TIME)));
diff --git
a/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
b/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
index 7a15c4abad7..7d9208b89b4 100644
--- a/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
+++ b/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
@@ -25,6 +25,7 @@ import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
+import java.time.Month;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
@@ -39,8 +40,7 @@ import java.time.temporal.TemporalQueries;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
-import org.apache.hive.common.util.DateUtils;
+import org.apache.commons.lang3.time.DateUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -50,6 +50,7 @@ import static java.time.temporal.ChronoField.MINUTE_OF_HOUR;
import static java.time.temporal.ChronoField.MONTH_OF_YEAR;
import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;
import static java.time.temporal.ChronoField.YEAR;
+import static org.apache.hive.common.util.DateUtils.NANOS_PER_SEC;
public class TimestampTZUtil {
@@ -58,6 +59,8 @@ public class TimestampTZUtil {
private static final LocalTime DEFAULT_LOCAL_TIME = LocalTime.of(0, 0);
private static final Pattern SINGLE_DIGIT_PATTERN =
Pattern.compile("[\\+-]\\d:\\d\\d");
+ public static final String NOT_LEAP_YEAR = "not a leap year";
+
private static final DateTimeFormatter TIMESTAMP_FORMATTER = new
DateTimeFormatterBuilder()
// Date and Time Parts
.appendValue(YEAR, 4, 10,
SignStyle.NORMAL).appendLiteral('-').appendValue(MONTH_OF_YEAR, 2, 2,
SignStyle.NORMAL)
@@ -193,7 +196,18 @@ public class TimestampTZUtil {
java.util.Date date = formatter.parse(ts.format(TIMESTAMP_FORMATTER));
// Set the formatter to use a different timezone
formatter.setTimeZone(TimeZone.getTimeZone(toZone));
- Timestamp result = Timestamp.valueOf(formatter.format(date));
+ Timestamp result;
+ try {
+ result = Timestamp.valueOf(formatter.format(date));
+ } catch (IllegalArgumentException ex) {
+ if (ex.getCause().getMessage().contains(NOT_LEAP_YEAR)) {
+ int leapYearDateAdjustment = ts.getMonth() ==
Month.MARCH.getValue() ? 2 : -2;
+ result =
Timestamp.valueOf(formatter.format(DateUtils.addDays(date,
leapYearDateAdjustment)))
+ .minusDays(leapYearDateAdjustment);
+ } else {
+ throw ex;
+ }
+ }
result.setNanos(ts.getNanos());
return result;
} catch (ParseException e) {
@@ -211,6 +225,6 @@ public class TimestampTZUtil {
}
public static double convertTimestampTZToDouble(TimestampTZ timestampTZ) {
- return timestampTZ.getEpochSecond() + timestampTZ.getNanos() /
DateUtils.NANOS_PER_SEC;
+ return timestampTZ.getEpochSecond() + timestampTZ.getNanos() /
NANOS_PER_SEC;
}
}
diff --git a/data/files/legacyLeapYearInSingaporeTZ
b/data/files/legacyLeapYearInSingaporeTZ
new file mode 100644
index 00000000000..d9476cedc23
Binary files /dev/null and b/data/files/legacyLeapYearInSingaporeTZ differ
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
index 78dce528842..05635ff9768 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
@@ -13,6 +13,8 @@
*/
package org.apache.hadoop.hive.ql.io.parquet.timestamp;
+import java.time.DateTimeException;
+import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.util.Calendar;
@@ -100,12 +102,23 @@ public class NanoTimeUtils {
julianDay--;
}
- JulianDate jDateTime;
- jDateTime = JulianDate.of((double) julianDay);
+ JulianDate jDateTime = JulianDate.of((double) julianDay);
+ LocalDateTime localDateTime;
+ int leapYearDateAdjustment = 0;
+ try {
+ localDateTime = jDateTime.toLocalDateTime();
+ } catch (DateTimeException e) {
+ if (e.getMessage().contains(TimestampTZUtil.NOT_LEAP_YEAR) &&
legacyConversion) {
+ leapYearDateAdjustment = 2;
+ localDateTime =
jDateTime.add(leapYearDateAdjustment).toLocalDateTime();
+ } else {
+ throw e;
+ }
+ }
Calendar calendar = getGMTCalendar();
- calendar.set(Calendar.YEAR, jDateTime.toLocalDateTime().getYear());
- calendar.set(Calendar.MONTH,
jDateTime.toLocalDateTime().getMonth().getValue() - 1); //java calendar index
starting at 1.
- calendar.set(Calendar.DAY_OF_MONTH,
jDateTime.toLocalDateTime().getDayOfMonth());
+ calendar.set(Calendar.YEAR, localDateTime.getYear());
+ calendar.set(Calendar.MONTH, localDateTime.getMonth().getValue() - 1);
//java calendar index starting at 1.
+ calendar.set(Calendar.DAY_OF_MONTH, localDateTime.getDayOfMonth());
int hour = (int) (remainder / (NANOS_PER_HOUR));
remainder = remainder % (NANOS_PER_HOUR);
@@ -119,7 +132,9 @@ public class NanoTimeUtils {
calendar.set(Calendar.SECOND, seconds);
Timestamp ts = Timestamp.ofEpochMilli(calendar.getTimeInMillis(), (int)
nanos);
- ts = TimestampTZUtil.convertTimestampToZone(ts, ZoneOffset.UTC,
targetZone, legacyConversion);
+ ts = TimestampTZUtil.convertTimestampToZone(ts, ZoneOffset.UTC,
targetZone, legacyConversion)
+ .minusDays(leapYearDateAdjustment);
+
return ts;
}
}
diff --git
a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampsHive2Compatibility.java
b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampsHive2Compatibility.java
index db06e43d157..ac0ced65af1 100644
---
a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampsHive2Compatibility.java
+++
b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampsHive2Compatibility.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
@@ -97,6 +98,69 @@ class TestParquetTimestampsHive2Compatibility {
}
}
+ /**
+ * Tests that timestamps written using Hive2 APIs on julian leap years are
read correctly by Hive4 APIs when legacy
+ * conversion is on.
+ */
+ @ParameterizedTest(name = "{0}")
+ @MethodSource("generateTimestampsAndZoneIds")
+ void testWriteHive2ReadHive4UsingLegacyConversionWithJulianLeapYears(String
timestampString, String zoneId) {
+ TimeZone original = TimeZone.getDefault();
+ try {
+ TimeZone.setDefault(TimeZone.getTimeZone(zoneId));
+ NanoTime nt = writeHive2(timestampString);
+ Timestamp ts = readHive4(nt, zoneId, true);
+ assertEquals(timestampString, ts.toString());
+ } finally {
+ TimeZone.setDefault(original);
+ }
+ }
+
+ @ParameterizedTest(name = "{0}")
+ @MethodSource("generateTimestampsAndZoneIds28thFeb")
+ void
testWriteHive2ReadHive4UsingLegacyConversionWithJulianLeapYearsFor28thFeb(String
timestampString,
+ String zoneId) {
+ TimeZone original = TimeZone.getDefault();
+ try {
+ TimeZone.setDefault(TimeZone.getTimeZone(zoneId));
+ NanoTime nt = writeHive2(timestampString);
+ Timestamp ts = readHive4(nt, zoneId, true);
+ assertEquals(timestampString, ts.toString());
+ } finally {
+ TimeZone.setDefault(original);
+ }
+ }
+
+ @ParameterizedTest(name = " - From: Zone {0}, timestamp: {2}, To: Zone:{1},
expected Timestamp {3}")
+ @MethodSource("julianLeapYearEdgeCases")
+ void
testWriteHive2ReadHive4UsingLegacyConversionWithJulianLeapYearsEdgeCase(String
fromZoneId, String toZoneId,
+ String timestampString, String expected) {
+ TimeZone original = TimeZone.getDefault();
+ try {
+ TimeZone.setDefault(TimeZone.getTimeZone(fromZoneId));
+ NanoTime nt = writeHive2(timestampString);
+ Timestamp ts = readHive4(nt, toZoneId, true);
+ assertEquals(expected, ts.toString());
+ } finally {
+ TimeZone.setDefault(original);
+ }
+ }
+
+ private static Stream<Arguments> julianLeapYearEdgeCases() {
+ return Stream.of(Arguments.of("GMT-12:00", "GMT+14:00", "0200-02-27
22:00:00.000000001",
+ "0200-03-01 00:00:00.000000001"),
+ Arguments.of("GMT+14:00", "GMT-12:00", "0200-03-01 00:00:00.000000001",
+ "0200-02-27 22:00:00.000000001"),
+ Arguments.of("GMT+14:00", "GMT-12:00", "0200-03-02 00:00:00.000000001",
+ "0200-02-28 22:00:00.000000001"),
+ Arguments.of("GMT-12:00", "GMT+14:00", "0200-03-02 00:00:00.000000001",
+ "0200-03-03 02:00:00.000000001"),
+ Arguments.of("GMT-12:00", "GMT+12:00", "0200-02-28
00:00:00.000000001", "0200-03-01 00:00:00.000000001"),
+ Arguments.of("GMT+12:00", "GMT-12:00", "0200-03-01
00:00:00.000000001", "0200-02-28 00:00:00.000000001"),
+ Arguments.of("Asia/Singapore", "Asia/Singapore", "0200-03-01
00:00:00.000000001",
+ "0200-03-01 00:00:00.000000001"));
+ }
+
/**
* Tests that timestamps written using Hive4 APIs are read correctly by
Hive4 APIs when legacy conversion is on.
*/
@@ -178,6 +242,62 @@ class TestParquetTimestampsHive2Compatibility {
.limit(3000), Stream.of("9999-12-31 23:59:59.999"));
}
+ /** Generates timestamps for different timezone. Here we are testing UTC+14
: Pacific/Kiritimati ,
+ * UTC-12 : Etc/GMT+12 along with few other zones
+ */
+ private static Stream<Arguments> generateTimestampsAndZoneIds() {
+ return generateJulianLeapYearTimestamps().flatMap(
+ timestampString -> Stream.of("Asia/Singapore", "Pacific/Kiritimati",
"Etc/GMT+12", "Pacific/Niue")
+ .map(zoneId -> Arguments.of(timestampString, zoneId)));
+ }
+
+ private static Stream<Arguments> generateTimestampsAndZoneIds28thFeb() {
+ return generateJulianLeapYearTimestamps28thFeb().flatMap(
+ timestampString -> Stream.of("Asia/Singapore", "Pacific/Kiritimati",
"Etc/GMT+12", "Pacific/Niue")
+ .map(zoneId -> Arguments.of(timestampString, zoneId)));
+ }
+
+ private static Stream<String> generateJulianLeapYearTimestamps() {
+ return Stream.concat(Stream.generate(new Supplier<String>() {
+ int i = 0;
+
+ @Override
+ public String get() {
+ StringBuilder sb = new StringBuilder(29);
+ int year = ((i % 9999) + 1) * 100;
+ sb.append(zeros(4 - digits(year)));
+ sb.append(year);
+ sb.append("-03-01 00:00:00.000000001");
+ i++;
+ return sb.toString();
+ }
+ })
+ // Exclude dates falling in the default Gregorian change date since
legacy code does not handle that interval
+ // gracefully. It is expected that these do not work well when legacy
APIs are in use. 0200-03-01 01:01:01.000000001
+ .filter(s -> !s.startsWith("1582-10")).limit(3000),
Stream.of("9999-12-31 23:59:59.999"));
+ }
+
+ private static Stream<String> generateJulianLeapYearTimestamps28thFeb() {
+ return Stream.concat(Stream.generate(new Supplier<String>() {
+ int i = 0;
+
+ @Override
+ public String get() {
+ StringBuilder sb = new StringBuilder(29);
+ int year = ((i % 9999) + 1) * 100;
+ sb.append(zeros(4 - digits(year)));
+ sb.append(year);
+ sb.append("-02-28 00:00:00.000000001");
+ i++;
+ return sb.toString();
+ }
+ })
+ // Exclude dates falling in the default Gregorian change date since
legacy code does not handle that interval
+ // gracefully. It is expected that these do not work well when legacy
APIs are in use. 0200-03-01 01:01:01.000000001
+ .filter(s -> !s.startsWith("1582-10")).limit(3000),
Stream.of("9999-12-31 23:59:59.999"));
+ }
+
+
private static int digits(int number) {
int digits = 0;
do {
diff --git a/ql/src/test/queries/clientpositive/parquet_historical_timestamp.q
b/ql/src/test/queries/clientpositive/parquet_historical_timestamp.q
index 3d2b382af74..7a4f7e44c7c 100644
--- a/ql/src/test/queries/clientpositive/parquet_historical_timestamp.q
+++ b/ql/src/test/queries/clientpositive/parquet_historical_timestamp.q
@@ -1,3 +1,4 @@
+--! qt:timezone:Asia/Singapore
--These files were created by inserting timestamp '2019-01-01
00:30:30.111111111' where writer time zone is Europe/Rome.
--older writer: time zone dependent behavior. convert to reader time zone
@@ -13,4 +14,15 @@ create table new_table (t timestamp) stored as parquet;
load data local inpath
'../../data/files/parquet_historical_timestamp_new.parq' into table new_table;
-select * from new_table;
\ No newline at end of file
+select * from new_table;
+
+-- Inserted data in singapore timezone: insert into default.test_sgt700 values
('0500-03-01 00:00:00'),('0600-03-01
+-- 00:00:00'),('0700-03-01 00:00:00'), ('0200-03-01 00:00:00'), ('0400-03-01
00:00:00'), ('0800-03-02 00:00:00'),
+--('0800-03-03 00:00:00');
+
+CREATE EXTERNAL TABLE `TEST_SGT1`(`currtime` timestamp) ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat';
+
+LOAD DATA LOCAL INPATH '../../data/files/legacyLeapYearInSingaporeTZ' INTO
TABLE TEST_SGT1;
+SELECT * FROM TEST_SGT1;
diff --git
a/ql/src/test/results/clientpositive/llap/parquet_historical_timestamp.q.out
b/ql/src/test/results/clientpositive/llap/parquet_historical_timestamp.q.out
index 9d50b226777..2839d8eeaad 100644
--- a/ql/src/test/results/clientpositive/llap/parquet_historical_timestamp.q.out
+++ b/ql/src/test/results/clientpositive/llap/parquet_historical_timestamp.q.out
@@ -22,7 +22,7 @@ POSTHOOK: query: select * from legacy_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@legacy_table
#### A masked pattern was here ####
-2018-12-31 16:30:30.111111111
+2019-01-01 08:30:30.111111111
PREHOOK: query: create table new_table (t timestamp) stored as parquet
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
@@ -48,3 +48,38 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@new_table
#### A masked pattern was here ####
2019-01-01 00:30:30.111111111
+PREHOOK: query: CREATE EXTERNAL TABLE `TEST_SGT1`(`currtime` timestamp) ROW
FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@TEST_SGT1
+POSTHOOK: query: CREATE EXTERNAL TABLE `TEST_SGT1`(`currtime` timestamp) ROW
FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@TEST_SGT1
+PREHOOK: query: LOAD DATA LOCAL INPATH
'../../data/files/legacyLeapYearInSingaporeTZ' INTO TABLE TEST_SGT1
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@test_sgt1
+POSTHOOK: query: LOAD DATA LOCAL INPATH
'../../data/files/legacyLeapYearInSingaporeTZ' INTO TABLE TEST_SGT1
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@test_sgt1
+PREHOOK: query: SELECT * FROM TEST_SGT1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_sgt1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM TEST_SGT1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_sgt1
+#### A masked pattern was here ####
+0500-03-01 00:00:00
+0600-03-01 00:00:00
+0700-03-01 00:00:00
+0200-03-01 00:00:00
+0400-03-01 00:00:00
+0800-03-02 00:00:00
+0800-03-03 00:00:00