[GitHub] [carbondata] akashrn5 commented on a change in pull request #3896: [CARBONDATA-3955] Fix load failures due to daylight saving time changes
akashrn5 commented on a change in pull request #3896: URL: https://github.com/apache/carbondata/pull/3896#discussion_r476487539 ## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ## @@ -1592,6 +1592,15 @@ private CarbonCommonConstants() { public static final String CARBON_LUCENE_INDEX_STOP_WORDS_DEFAULT = "false"; + // Property to enable parsing the timestamp/date data with setLenient = true in load + // flow if it fails with parse invalid timestamp data. (example: 1941-03-15 00:00:00 + // is valid time in Asia/Calcutta zone and is invalid and will fail to parse in Asia/Shanghai + // zone as DST is observed and clocks were turned forward 1 hour to 1941-03-15 01:00:00) + @CarbonProperty(dynamicConfigurable = true) public static final String Review comment: `move public static final String` to next line ## File path: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java ## @@ -444,9 +446,38 @@ private static Object parseTimestamp(String dimensionValue, String dateFormat) { dateFormatter = timestampFormatter.get(); } dateToStr = dateFormatter.parse(dimensionValue); - return dateToStr.getTime(); + timeValue = dateToStr.getTime(); + validateTimeStampRange(timeValue); + return timeValue; } catch (ParseException e) { - throw new NumberFormatException(e.getMessage()); + // If the parsing fails, try to parse again with setLenient to true if the property is set + if (CarbonProperties.getInstance().isSetLenientEnabled()) { +try { + dateFormatter.setLenient(true); + dateToStr = dateFormatter.parse(dimensionValue); + dateFormatter.setLenient(false); + timeValue = dateToStr.getTime(); + validateTimeStampRange(timeValue); + LOGGER.info("Parsed data with lenience as true, setting back to default mode"); + return timeValue; +} catch (ParseException ex) { + dateFormatter.setLenient(false); + LOGGER.info("Failed to parse data with lenience as true, setting back to default mode"); + throw new 
NumberFormatException(ex.getMessage()); +} + } else { +throw new NumberFormatException(e.getMessage()); + } +} + } + + private static void validateTimeStampRange(Long timeValue) { +if (timeValue < DateDirectDictionaryGenerator.MIN_VALUE +|| timeValue > DateDirectDictionaryGenerator.MAX_VALUE) { + throw new NumberFormatException( + "timestamp column data value: " + timeValue + "is not in valid " + "range of: " Review comment: `"is not in valid " + "range of: "` correct the formatting here, there are unnecessary concatenations ## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/util/BadRecordUtil.scala ## @@ -68,4 +71,32 @@ object BadRecordUtil { badRecordLocation = badRecordLocation + "/" + dbName + "/" + tableName FileFactory.deleteAllCarbonFilesOfDir(FileFactory.getCarbonFile(badRecordLocation)) } + + def createCSV(rows: ListBuffer[Array[String]], csvPath: String): Unit = { +val out = new BufferedWriter(new FileWriter(csvPath)) +val writer: CSVWriter = new CSVWriter(out) +try { + for (row <- rows) { +writer.writeNext(row) + } +} +catch { + case e: Exception => +Assert.fail(e.getMessage) +} +finally { + out.close() + writer.close() +} + } + + def deleteCSVFile(csvPath: String): Unit = { Review comment: i don't think this method is required, just call `FileUtils.forceDelete(new File(csvPath))` in each test case itself and no need of any Assert here, as its not doing any functional validation ## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/util/BadRecordUtil.scala ## @@ -68,4 +71,32 @@ object BadRecordUtil { badRecordLocation = badRecordLocation + "/" + dbName + "/" + tableName FileFactory.deleteAllCarbonFilesOfDir(FileFactory.getCarbonFile(badRecordLocation)) } + + def createCSV(rows: ListBuffer[Array[String]], csvPath: String): Unit = { +val out = new BufferedWriter(new FileWriter(csvPath)) Review comment: correct the formatting of this method and may be you can use try-with-resource here, you can check once This is 
an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3896: [CARBONDATA-3955] Fix load failures due to daylight saving time changes
akashrn5 commented on a change in pull request #3896: URL: https://github.com/apache/carbondata/pull/3896#discussion_r475716479 ## File path: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java ## @@ -435,18 +436,48 @@ public static Object getDataDataTypeForNoDictionaryColumn(String dimensionValue, private static Object parseTimestamp(String dimensionValue, String dateFormat) { Date dateToStr; -DateFormat dateFormatter; +DateFormat dateFormatter = null; +long timeValue; try { if (null != dateFormat && !dateFormat.trim().isEmpty()) { dateFormatter = new SimpleDateFormat(dateFormat); -dateFormatter.setLenient(false); } else { dateFormatter = timestampFormatter.get(); } + dateFormatter.setLenient(false); dateToStr = dateFormatter.parse(dimensionValue); - return dateToStr.getTime(); + timeValue = dateToStr.getTime(); + validateTimeStampRange(timeValue); + return timeValue; } catch (ParseException e) { - throw new NumberFormatException(e.getMessage()); + // If the parsing fails, try to parse again with setLenient to true if the property is set + if (CarbonProperties.getInstance().isSetLenientEnabled()) { +try { + LOGGER.info("Changing setLenient to true"); + dateFormatter.setLenient(true); + dateToStr = dateFormatter.parse(dimensionValue); + dateFormatter.setLenient(false); + LOGGER.info("Changing setLenient back to false"); Review comment: i think these logs not required here, may be you can add only one log say after line 462, saying you had set true as parsing failed for invalid data and parsing finished, something meaningful like this and in line 466, you can say `"failed to parse data with lenience as true, setting back to default"` like this , it will look clean and good ## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/badrecordloger/BadRecordActionTest.scala ## @@ -270,6 +274,53 @@ class BadRecordActionTest extends QueryTest { } } + test("test bad record FAIL with invalid timestamp range") { +val csvPath = 
s"$resourcesPath/badrecords/invalidTimeStampRange.csv" +val rows1 = new ListBuffer[Array[String]] +rows1 += Array("ID", "date", "time") +rows1 += Array("1", "2016-7-24", "342016-7-24 01:02:30") +createCSV(rows1, csvPath) +sql("DROP TABLE IF EXISTS test_time") +sql(""" + CREATE TABLE IF NOT EXISTS test_time + (ID Int, date Date, time Timestamp) + STORED AS carbondata TBLPROPERTIES('dateformat'='-MM-dd', + 'timestampformat'='-MM-dd HH:mm:ss') +""") +val exception = intercept[Exception] { + sql(s" LOAD DATA LOCAL INPATH '$resourcesPath/badrecords/invalidTimeStampRange.csv' " + + s"into table test_time options ('bad_records_action'='fail')") +} +assert(exception.getMessage + .contains( +"Data load failed due to bad record: The value with column name time and column data" + +" type TIMESTAMP is not a valid TIMESTAMP type.Please enable bad record logger to know" + +" the detail reason")) +sql("DROP TABLE IF EXISTS test_time") +deleteCSVFile(csvPath) + } + + def createCSV(rows: ListBuffer[Array[String]], csvPath: String): Unit = { +val out = new BufferedWriter(new FileWriter(csvPath)) +val writer: CSVWriter = new CSVWriter(out) + +for (row <- rows) { + writer.writeNext(row) +} +out.close() +writer.close() + } + + def deleteCSVFile(csvPath: String): Unit = { Review comment: no need to create new method only for delete, call the file delete API directly in test case as not used in many test case. 
## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestLoadDataWithDiffTimestampFormat.scala ## @@ -306,7 +315,107 @@ class TestLoadDataWithDiffTimestampFormat extends QueryTest with BeforeAndAfterA } } + test("test load, update data with setlenient carbon property for daylight " + + "saving time from different timezone") { +CarbonProperties.getInstance().addProperty( + CarbonCommonConstants.CARBON_LOAD_SETLENIENT_ENABLE, "true") +val defaultTimeZone = TimeZone.getDefault +TimeZone.setDefault(TimeZone.getTimeZone("Asia/Shanghai")) +sql("DROP TABLE IF EXISTS test_time") +sql( + """ + CREATE TABLE IF NOT EXISTS test_time + (ID Int, date Date, time Timestamp) + STORED AS carbondata TBLPROPERTIES('dateformat'='-MM-dd', + 'timestampformat'='-MM-dd HH:mm') +""") +sql(s" LOAD DATA LOCAL INPATH '$resourcesPath/differentZoneTimeStamp.csv' into table test_time") +sql(s"insert into test_time select 11, '2016-7-24', '1941-3-15 00:00:00' ") +sql("update test_time set (time) = (
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3896: [CARBONDATA-3955] Fix load failures due to daylight saving time changes
akashrn5 commented on a change in pull request #3896: URL: https://github.com/apache/carbondata/pull/3896#discussion_r475346485 ## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ## @@ -1592,6 +1592,13 @@ private CarbonCommonConstants() { public static final String CARBON_LUCENE_INDEX_STOP_WORDS_DEFAULT = "false"; + // Property to enable parsing the timestamp/date data with setLenient = true in load + // flow if it fails with parse invalid timestamp data. Review comment: Please give a timestamp example here and mention DST, so that in the future we will know why this property was added. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3896: [CARBONDATA-3955] Fix load failures due to daylight saving time changes
akashrn5 commented on a change in pull request #3896: URL: https://github.com/apache/carbondata/pull/3896#discussion_r474608789 ## File path: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java ## @@ -435,19 +436,48 @@ public static Object getDataDataTypeForNoDictionaryColumn(String dimensionValue, private static Object parseTimestamp(String dimensionValue, String dateFormat) { Date dateToStr; -DateFormat dateFormatter; +DateFormat dateFormatter = null; try { if (null != dateFormat && !dateFormat.trim().isEmpty()) { dateFormatter = new SimpleDateFormat(dateFormat); -dateFormatter.setLenient(false); } else { dateFormatter = timestampFormatter.get(); } + dateFormatter.setLenient(false); dateToStr = dateFormatter.parse(dimensionValue); - return dateToStr.getTime(); + return validateTimeStampRange(dateToStr.getTime()); } catch (ParseException e) { - throw new NumberFormatException(e.getMessage()); + // If the parsing fails, try to parse again with setLenient to true if the property is set + if (CarbonProperties.getInstance().isSetLenientEnabled()) { +try { + LOGGER.info("Changing setLenient to true for TimeStamp: " + dimensionValue); + dateFormatter.setLenient(true); + dateToStr = dateFormatter.parse(dimensionValue); + LOGGER.info("Changing " + dimensionValue + " to " + dateToStr); + dateFormatter.setLenient(false); + LOGGER.info("Changing setLenient back to false"); + return validateTimeStampRange(dateToStr.getTime()); +} catch (ParseException ex) { + dateFormatter.setLenient(false); + LOGGER.info("Changing setLenient back to false"); + throw new NumberFormatException(ex.getMessage()); +} + } else { +throw new NumberFormatException(e.getMessage()); + } +} + } + + private static Long validateTimeStampRange(Long timeValue) { +long minValue = DateDirectDictionaryGenerator.MIN_VALUE; +long maxValue = DateDirectDictionaryGenerator.MAX_VALUE; +if (timeValue < minValue || timeValue > maxValue) { + if (LOGGER.isDebugEnabled()) { +LOGGER.debug("Value for 
timestamp type column is not in valid range."); + } + throw new NumberFormatException("Value for timestamp type column is not in valid range."); Review comment: ```suggestion throw new NumberFormatException("timestamp column data is not in valid range of: " + DateDirectDictionaryGenerator.MIN_VALUE + " and " + DateDirectDictionaryGenerator.MAX_VALUE ); ``` ## File path: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java ## @@ -435,19 +436,48 @@ public static Object getDataDataTypeForNoDictionaryColumn(String dimensionValue, private static Object parseTimestamp(String dimensionValue, String dateFormat) { Date dateToStr; -DateFormat dateFormatter; +DateFormat dateFormatter = null; try { if (null != dateFormat && !dateFormat.trim().isEmpty()) { dateFormatter = new SimpleDateFormat(dateFormat); -dateFormatter.setLenient(false); } else { dateFormatter = timestampFormatter.get(); } + dateFormatter.setLenient(false); dateToStr = dateFormatter.parse(dimensionValue); - return dateToStr.getTime(); + return validateTimeStampRange(dateToStr.getTime()); } catch (ParseException e) { - throw new NumberFormatException(e.getMessage()); + // If the parsing fails, try to parse again with setLenient to true if the property is set + if (CarbonProperties.getInstance().isSetLenientEnabled()) { +try { + LOGGER.info("Changing setLenient to true for TimeStamp: " + dimensionValue); + dateFormatter.setLenient(true); + dateToStr = dateFormatter.parse(dimensionValue); + LOGGER.info("Changing " + dimensionValue + " to " + dateToStr); + dateFormatter.setLenient(false); + LOGGER.info("Changing setLenient back to false"); + return validateTimeStampRange(dateToStr.getTime()); +} catch (ParseException ex) { + dateFormatter.setLenient(false); + LOGGER.info("Changing setLenient back to false"); + throw new NumberFormatException(ex.getMessage()); +} + } else { +throw new NumberFormatException(e.getMessage()); + } +} + } + + private static Long validateTimeStampRange(Long timeValue) { 
+long minValue = DateDirectDictionaryGenerator.MIN_VALUE; +long maxValue = DateDirectDictionaryGenerator.MAX_VALUE; +if (timeValue < minValue || timeValue > maxValue) { + if (LOGGER.isDebugEnabled()) { Review comment: here debug log is not required, because always the exception is thrown here. ## File path: core/src/main/java/org/apache/carbondata/core/constants/Ca