[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3819: [CARBONDATA-3855]support carbon SDK to load data from different files
CarbonDataQA1 commented on pull request #3819: URL: https://github.com/apache/carbondata/pull/3819#issuecomment-678553514 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2094/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3819: [CARBONDATA-3855]support carbon SDK to load data from different files
CarbonDataQA1 commented on pull request #3819: URL: https://github.com/apache/carbondata/pull/3819#issuecomment-678551401 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/3835/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3876: TestingCI
CarbonDataQA1 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678404450 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2092/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3876: TestingCI
CarbonDataQA1 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678401075 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/3833/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] Karan980 commented on pull request #3876: TestingCI
Karan980 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678350769 retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3876: TestingCI
CarbonDataQA1 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678310007 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2091/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3876: TestingCI
CarbonDataQA1 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678306832 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/3832/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3896: [CARBONDATA-3955] Fix load failures due to daylight saving time changes
akashrn5 commented on a change in pull request #3896: URL: https://github.com/apache/carbondata/pull/3896#discussion_r474608789 ## File path: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java ## @@ -435,19 +436,48 @@ public static Object getDataDataTypeForNoDictionaryColumn(String dimensionValue, private static Object parseTimestamp(String dimensionValue, String dateFormat) { Date dateToStr; -DateFormat dateFormatter; +DateFormat dateFormatter = null; try { if (null != dateFormat && !dateFormat.trim().isEmpty()) { dateFormatter = new SimpleDateFormat(dateFormat); -dateFormatter.setLenient(false); } else { dateFormatter = timestampFormatter.get(); } + dateFormatter.setLenient(false); dateToStr = dateFormatter.parse(dimensionValue); - return dateToStr.getTime(); + return validateTimeStampRange(dateToStr.getTime()); } catch (ParseException e) { - throw new NumberFormatException(e.getMessage()); + // If the parsing fails, try to parse again with setLenient to true if the property is set + if (CarbonProperties.getInstance().isSetLenientEnabled()) { +try { + LOGGER.info("Changing setLenient to true for TimeStamp: " + dimensionValue); + dateFormatter.setLenient(true); + dateToStr = dateFormatter.parse(dimensionValue); + LOGGER.info("Changing " + dimensionValue + " to " + dateToStr); + dateFormatter.setLenient(false); + LOGGER.info("Changing setLenient back to false"); + return validateTimeStampRange(dateToStr.getTime()); +} catch (ParseException ex) { + dateFormatter.setLenient(false); + LOGGER.info("Changing setLenient back to false"); + throw new NumberFormatException(ex.getMessage()); +} + } else { +throw new NumberFormatException(e.getMessage()); + } +} + } + + private static Long validateTimeStampRange(Long timeValue) { +long minValue = DateDirectDictionaryGenerator.MIN_VALUE; +long maxValue = DateDirectDictionaryGenerator.MAX_VALUE; +if (timeValue < minValue || timeValue > maxValue) { + if (LOGGER.isDebugEnabled()) { +LOGGER.debug("Value for timestamp type column is not in valid range."); + } + throw new NumberFormatException("Value for timestamp type column is not in valid range."); Review comment: ```suggestion throw new NumberFormatException("timestamp column data is not in valid range of: " + DateDirectDictionaryGenerator.MIN_VALUE + " and " + DateDirectDictionaryGenerator.MAX_VALUE ); ``` ## File path: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java ## @@ -435,19 +436,48 @@ public static Object getDataDataTypeForNoDictionaryColumn(String dimensionValue, private static Object parseTimestamp(String dimensionValue, String dateFormat) { Date dateToStr; -DateFormat dateFormatter; +DateFormat dateFormatter = null; try { if (null != dateFormat && !dateFormat.trim().isEmpty()) { dateFormatter = new SimpleDateFormat(dateFormat); -dateFormatter.setLenient(false); } else { dateFormatter = timestampFormatter.get(); } + dateFormatter.setLenient(false); dateToStr = dateFormatter.parse(dimensionValue); - return dateToStr.getTime(); + return validateTimeStampRange(dateToStr.getTime()); } catch (ParseException e) { - throw new NumberFormatException(e.getMessage()); + // If the parsing fails, try to parse again with setLenient to true if the property is set + if (CarbonProperties.getInstance().isSetLenientEnabled()) { +try { + LOGGER.info("Changing setLenient to true for TimeStamp: " + dimensionValue); + dateFormatter.setLenient(true); + dateToStr = dateFormatter.parse(dimensionValue); + LOGGER.info("Changing " + dimensionValue + " to " + dateToStr); + dateFormatter.setLenient(false); + LOGGER.info("Changing setLenient back to false"); + return validateTimeStampRange(dateToStr.getTime()); +} catch (ParseException ex) { + dateFormatter.setLenient(false); + LOGGER.info("Changing setLenient back to false"); + throw new NumberFormatException(ex.getMessage()); +} + } else { +throw new NumberFormatException(e.getMessage()); + } +} + } + + private static Long validateTimeStampRange(Long timeValue) { +long minValue = DateDirectDictionaryGenerator.MIN_VALUE; +long maxValue = DateDirectDictionaryGenerator.MAX_VALUE; +if (timeValue < minValue || timeValue > maxValue) { + if (LOGGER.isDebugEnabled()) { Review comment: here debug log is not required, because always the exception is thrown here. ## File path:
[jira] [Resolved] (CARBONDATA-3928) Handle the Strings which length is greater than 32000 as a bad record.
[ https://issues.apache.org/jira/browse/CARBONDATA-3928?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Akash R Nilugal resolved CARBONDATA-3928. - Fix Version/s: 2.1.0 Resolution: Fixed > Handle the Strings which length is greater than 32000 as a bad record. > -- > > Key: CARBONDATA-3928 > URL: https://issues.apache.org/jira/browse/CARBONDATA-3928 > Project: CarbonData > Issue Type: Task >Reporter: Nihal kumar ojha >Priority: Major > Fix For: 2.1.0 > > Time Spent: 12.5h > Remaining Estimate: 0h > > Currently, when the string length exceeds 32000 then the load is failed. > Suggestion: > 1. Bad record can handle string length greater than 32000 and load should not > be failed because only a few records string length is greater than 32000. > 2. Include some more information in the log message like which record and > column have the problem. -- This message was sent by Atlassian Jira (v8.3.4#803005)
[GitHub] [carbondata] asfgit closed pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
asfgit closed pull request #3865: URL: https://github.com/apache/carbondata/pull/3865 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] Karan980 commented on pull request #3876: TestingCI
Karan980 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678256931 retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] VenuReddy2103 commented on pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
VenuReddy2103 commented on pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#issuecomment-678243983 LGTM This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
CarbonDataQA1 commented on pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#issuecomment-678224590 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2090/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
CarbonDataQA1 commented on pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#issuecomment-678220508 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/3831/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3876: TestingCI
CarbonDataQA1 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678214601 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/3830/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3876: TestingCI
CarbonDataQA1 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678191650 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2089/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3856: [CARBONDATA-3929]Improve CDC performance
CarbonDataQA1 commented on pull request #3856: URL: https://github.com/apache/carbondata/pull/3856#issuecomment-678168176 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2088/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3856: [CARBONDATA-3929]Improve CDC performance
CarbonDataQA1 commented on pull request #3856: URL: https://github.com/apache/carbondata/pull/3856#issuecomment-678164659 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/3829/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] ShreelekhyaG commented on a change in pull request #3896: [CARBONDATA-3955] Fix load failures due to daylight saving time changes
ShreelekhyaG commented on a change in pull request #3896: URL: https://github.com/apache/carbondata/pull/3896#discussion_r474557660 ## File path: integration/spark/src/main/scala/org/apache/carbondata/spark/rdd/CarbonDataRDDFactory.scala ## @@ -816,10 +816,20 @@ object CarbonDataRDDFactory { val partitionByRdd = keyRDD.partitionBy( new SegmentPartitioner(segmentIdIndex, segmentUpdateParallelism)) + val conf = SparkSQLUtil.broadCastHadoopConf(sqlContext.sparkSession.sparkContext, hadoopConf) + val carbonSessionInfo: CarbonSessionInfo = { Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] nihal0107 commented on a change in pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
nihal0107 commented on a change in pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#discussion_r474543420 ## File path: integration/spark/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala ## @@ -145,47 +152,153 @@ class TestLoadDataGeneral extends QueryTest with BeforeAndAfterEach { sql("drop table if exists carbon_table") } - test("test insert / update with data more than 32000 characters") { + private def createTableAndLoadData (badRecordAction: String): Unit = { +BadRecordUtil.cleanBadRecordPath("default", "longerthan32kchar") +sql("CREATE TABLE longerthan32kchar(dim1 String, dim2 String, mes1 int) STORED AS carbondata") +sql(s"LOAD DATA LOCAL INPATH '$testdata' into table longerThan32kChar OPTIONS('FILEHEADER'='dim1,dim2,mes1', " + + s"'BAD_RECORDS_ACTION'='${badRecordAction}','BAD_RECORDS_LOGGER_ENABLE'='TRUE')") + } + + test("test load / insert / update with data more than 32000 characters and bad record action as Redirect") { +createTableAndLoadData("REDIRECT") +var redirectCsvPath = BadRecordUtil + .getRedirectCsvPath("default", "longerthan32kchar", "0", "0") +assert(BadRecordUtil.checkRedirectedCsvContentAvailableInSource(testdata, redirectCsvPath)) +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "true") +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "REDIRECT"); +sql(s"insert into longerthan32kchar values('33000', '$longChar', 4)") +checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2))) +redirectCsvPath = BadRecordUtil.getRedirectCsvPath("default", "longerthan32kchar", "1", "0") +var redirectedFileLineList = FileUtils.readLines(redirectCsvPath) +var iterator = redirectedFileLineList.iterator() +while (iterator.hasNext) { + assert(iterator.next().equals("33000,"+longChar+",4")) +} + +// Update strings of length greater than 32000 +sql(s"update longerthan32kchar set(longerthan32kchar.dim2)=('$longChar') " + + "where longerthan32kchar.mes1=1").show() +checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("itsok", "hello", 2))) +redirectCsvPath = BadRecordUtil.getRedirectCsvPath("default", "longerthan32kchar", "0", "1") +redirectedFileLineList = FileUtils.readLines(redirectCsvPath) +iterator = redirectedFileLineList.iterator() +while (iterator.hasNext) { + assert(iterator.next().equals("ok,"+longChar+",1")) +} +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "false") + +// Insert longer string without converter step will throw exception +intercept[Exception] { + sql(s"insert into longerthan32kchar values('32000', '$longChar', 3)") +} +BadRecordUtil.cleanBadRecordPath("default", "longerthan32kchar") + } + + test("test load / insert / update with data more than 32000 characters and bad record action as Force") { +createTableAndLoadData("FORCE") +checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2), Row("32123", null, 3))) CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "true") -val testdata =s"$resourcesPath/32000char.csv" -sql("drop table if exists load32000chardata") -sql("drop table if exists load32000chardata_dup") -sql("CREATE TABLE load32000chardata(dim1 String, dim2 String, mes1 int) STORED AS carbondata") -sql("CREATE TABLE load32000chardata_dup(dim1 String, dim2 String, mes1 int) STORED AS carbondata") -sql(s"LOAD DATA LOCAL INPATH '$testdata' into table load32000chardata OPTIONS('FILEHEADER'='dim1,dim2,mes1')") +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FORCE"); +sql(s"insert into longerthan32kchar values('33000', '$longChar', 4)") +checkAnswer(sql("select * from longerthan32kchar"), + Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2), Row("32123", null, 3), Row("33000", null, 4))) + +// Update strings of length greater than 32000 +sql(s"update longerthan32kchar set(longerthan32kchar.dim2)=('$longChar') " + + "where longerthan32kchar.mes1=1").show() +checkAnswer(sql("select * from longerthan32kchar"), + Seq(Row("ok", null, 1), Row("itsok", "hello", 2), Row("32123", null, 3), Row("33000", null, 4))) +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "false") + +// Insert longer string without converter step will throw exception intercept[Exception] { - sql("insert into load32000chardata_dup select
[GitHub] [carbondata] nihal0107 commented on a change in pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
nihal0107 commented on a change in pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#discussion_r474540605 ## File path: streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala ## @@ -54,11 +54,12 @@ object FieldConverter { value match { case s: String => if (!isVarcharType && !isComplexType && s.length > CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) { - throw new IllegalArgumentException(stringLengthExceedErrorMsg + -CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT + " characters") -} else { - s + if (!CarbonProperties.getInstance.getProperty(CarbonCommonConstants Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] nihal0107 commented on a change in pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
nihal0107 commented on a change in pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#discussion_r474518906 ## File path: processing/src/main/java/org/apache/carbondata/processing/loading/converter/impl/RowConverterImpl.java ## @@ -130,9 +135,14 @@ public CarbonRow convert(CarbonRow row) throws CarbonDataLoadingException { } fieldConverters[i].convert(row, logHolder); if (!logHolder.isLogged() && logHolder.isBadRecordNotAdded()) { -badRecordLogger.addBadRecordsToBuilder(row.getRawData(), logHolder.getReason()); +String reason = logHolder.getReason(); Review comment: We have to set the record also in log and we don't pass the rawdata to `NonDictionaryFieldConverterImpl.convert()` . So we have to do formatting in `RowConverterImpl` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] Karan980 commented on pull request #3876: TestingCI
Karan980 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678114121 retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3856: [CARBONDATA-3929]Improve CDC performance
akashrn5 commented on a change in pull request #3856: URL: https://github.com/apache/carbondata/pull/3856#discussion_r474501846 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/mutation/merge/CarbonMergeDataSetCommand.scala ## @@ -194,29 +210,32 @@ case class CarbonMergeDataSetCommand( tuple._2.asJava) } } - Some(UpdateTableModel(true, trxMgr.getLatestTrx, -executorErrors, tuple._2, true)) + Some(UpdateTableModel(isUpdate = true, trxMgr.getLatestTrx, +executorErrors, tuple._2, loadAsNewSegment = true)) } else { None } -CarbonInsertIntoWithDf( - databaseNameOp = Some(carbonTable.getDatabaseName), +val dataFrame = loadDF.select(tableCols.map(col): _*) +CarbonInsertIntoCommand(databaseNameOp = Some(carbonTable.getDatabaseName), tableName = carbonTable.getTableName, - options = Map("fileheader" -> header, "sort_scope" -> "nosort"), + options = Map("fileheader" -> header, "sort_scope" -> "no_sort"), Review comment: if we focus on the cdc performance, existing no sort is fine. As your point, we cant blindly go with target table's sort scope, because it can happen that the target table has the sort scope has no sort. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3856: [CARBONDATA-3929]Improve CDC performance
akashrn5 commented on a change in pull request #3856: URL: https://github.com/apache/carbondata/pull/3856#discussion_r474501846 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/mutation/merge/CarbonMergeDataSetCommand.scala ## @@ -194,29 +210,32 @@ case class CarbonMergeDataSetCommand( tuple._2.asJava) } } - Some(UpdateTableModel(true, trxMgr.getLatestTrx, -executorErrors, tuple._2, true)) + Some(UpdateTableModel(isUpdate = true, trxMgr.getLatestTrx, +executorErrors, tuple._2, loadAsNewSegment = true)) } else { None } -CarbonInsertIntoWithDf( - databaseNameOp = Some(carbonTable.getDatabaseName), +val dataFrame = loadDF.select(tableCols.map(col): _*) +CarbonInsertIntoCommand(databaseNameOp = Some(carbonTable.getDatabaseName), tableName = carbonTable.getTableName, - options = Map("fileheader" -> header, "sort_scope" -> "nosort"), + options = Map("fileheader" -> header, "sort_scope" -> "no_sort"), Review comment: if we focus on the cdc performance, existing no sort is fine. As your point, we can't blindly go with target table's sort scope, because it can happen that the target table has the sort scope has no sort. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3856: [CARBONDATA-3929]Improve CDC performance
akashrn5 commented on a change in pull request #3856: URL: https://github.com/apache/carbondata/pull/3856#discussion_r474500865 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/mutation/merge/CarbonMergeDataSetCommand.scala ## @@ -106,18 +106,34 @@ case class CarbonMergeDataSetCommand( // decide join type based on match conditions val joinType = decideJoinType +val joinColumn = mergeMatches.joinExpr.expr.asInstanceOf[EqualTo].left + .asInstanceOf[UnresolvedAttribute].nameParts.tail.head +// repartition the the srsDs, if the target as bucketing and the bucketing column and join +// column are same +val repartitionedSrsDs = Review comment: done ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/mutation/merge/CarbonMergeDataSetCommand.scala ## @@ -106,18 +106,34 @@ case class CarbonMergeDataSetCommand( // decide join type based on match conditions val joinType = decideJoinType +val joinColumn = mergeMatches.joinExpr.expr.asInstanceOf[EqualTo].left + .asInstanceOf[UnresolvedAttribute].nameParts.tail.head +// repartition the the srsDs, if the target as bucketing and the bucketing column and join Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3856: [CARBONDATA-3929]Improve CDC performance
akashrn5 commented on a change in pull request #3856: URL: https://github.com/apache/carbondata/pull/3856#discussion_r474500538 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonInsertIntoCommand.scala ## @@ -439,6 +449,11 @@ case class CarbonInsertIntoCommand(databaseNameOp: Option[String], def insertData(loadParams: CarbonLoadParams): (Seq[Row], LoadMetadataDetails) = { var rows = Seq.empty[Row] +val loadDataFrame = if (updateModel.isDefined && !updateModel.get.loadAsNewSegment) { + Some(CommonLoadUtils.getDataFrameWithTupleID(Some(dataFrame))) Review comment: done ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonInsertIntoCommand.scala ## @@ -439,6 +449,11 @@ case class CarbonInsertIntoCommand(databaseNameOp: Option[String], def insertData(loadParams: CarbonLoadParams): (Seq[Row], LoadMetadataDetails) = { var rows = Seq.empty[Row] +val loadDataFrame = if (updateModel.isDefined && !updateModel.get.loadAsNewSegment) { + Some(CommonLoadUtils.getDataFrameWithTupleID(Some(dataFrame))) Review comment: > This InsertIntoCommand flow is not meant for update flow yet. Because update will have an implicit column and rearrange schema and all will fail. so, I suggest if `updateModel.get.loadAsNewSegment` is `false` throw unsupported exception now and handle this requirement later. > > Also when `updateModel.get.loadAsNewSegment = true` (which is our current cdc history data case), **this flow can be used** (as it is just a insert, no actual update flow used). only when `updateModel.get.loadAsNewSegment = false` we cannot use this flow. > > so someone might use it because of update model support. so, I suggest to throw an exception in the beginning of this function when `updateModel.get.loadAsNewSegment = false` done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3856: [CARBONDATA-3929]Improve CDC performance
akashrn5 commented on a change in pull request #3856: URL: https://github.com/apache/carbondata/pull/3856#discussion_r474500375 ## File path: integration/spark/src/main/spark2.3/com/databricks/spark/avro/AvroWriter.scala ## @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.databricks.spark.avro + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.datasources.OutputWriterFactory + +/** + * This class is to get the avro writer from databricks avro module, as its not present in spark2.3 + * and spark-avro module is included in spark project from spark-2.4. So for spark-2.4, we use Avro + * writer from spark project. + */ +object AvroWriter { + + def getWriter(spark: org.apache.spark.sql.SparkSession, + job: org.apache.hadoop.mapreduce.Job, + dataSchema: org.apache.spark.sql.types.StructType, + options: scala.Predef.Map[scala.Predef.String, scala.Predef.String] = Map.empty) + : OutputWriterFactory = { +new DefaultSource().prepareWrite(spark, job, + options, dataSchema) + } +} + +/** + * This reds the avro files from the given path and return the RDD[Row] Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] VenuReddy2103 commented on a change in pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
VenuReddy2103 commented on a change in pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#discussion_r474466612 ## File path: streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala ## @@ -54,11 +54,12 @@ object FieldConverter { value match { case s: String => if (!isVarcharType && !isComplexType && s.length > CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) { - throw new IllegalArgumentException(stringLengthExceedErrorMsg + -CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT + " characters") -} else { - s + if (!CarbonProperties.getInstance.getProperty(CarbonCommonConstants +.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT).toBoolean) { +throw new IllegalArgumentException(CarbonCommonConstants.STRING_LENGTH_EXCEEDED_MESSAGE) Review comment: `STRING_LENGTH_EXCEEDED_MESSAGE` has format specifiers("%s") in it. Instead of throwing with format specifiers here and then catching it in `carbonScalaUtil.getString()` and formatting it, suggest to throw the formatted string itself from here. Can cause security vulnerability issue if we for forget to catch such exception and modify it. Also suggest to add FMT prefix to string `STRING_LENGTH_EXCEEDED_MESSAGE` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (CARBONDATA-3954) Global sorting with array, if read from ORC format, write to carbon, error; If you use no_sort, success;
[ https://issues.apache.org/jira/browse/CARBONDATA-3954?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] xiaohui updated CARBONDATA-3954: Description: orc table sql test: create table array_orc(name string, col array,fee int) STORED AS orc; insert into array_orc values("xiao3",array('上呼吸道疾病 1','白内障1','胃溃疡1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1 ','白内障1','胃溃疡1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1','白内障 1','胃溃疡1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1','白内障1','胃溃疡 1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1','白内障1','胃溃疡1'),2); insert into array_orc values("xiao5",array(null,'白内障1','胃溃疡1'),2); insert into array_orc values("xiao5",null,2); insert into array_orc values("xiao3",array('j'),2); insert into array_orc values("xiao4",array('j','j'),2); insert into array_orc values("xiao4",NULL,2); 0: jdbc:hive2://localhost:1> use dict; +-+--+ | Result | +-+--+ +-+--+ No rows selected (0.391 seconds) 0: jdbc:hive2://localhost:1> select * from array_orc; ++---+--+--+ | name | col | fee | ++---+--+--+ | xiao3 | ["",null,"j"] | 3| | xiao2 | ["上呼吸道疾病1","白内障1","胃溃疡1"] | 2| | xiao3 | ["",null,"j"] | 3| | xiao1 | ["上呼吸道疾病","白内障","胃溃疡"]| 1| | xiao9 | NULL | 3| | xiao9 | NULL | 3| | xiao3 | NULL | 3| | xiao6 | NULL | 3| | xiao2 | ["上呼吸道疾病 1","白内障 1","胃溃疡 1"] | 2| | xiao1 | ["上呼吸道疾病 ","白内障 ","胃溃疡 "] | 1| | xiao3 | NULL | 3| | xiao3 | [null]| 3| | xiao3 | [""] | 3| ++---+--+--+ 13 rows selected (0.416 seconds) 0: jdbc:hive2://localhost:1> create table array_carbon4(name string, col array,fee int) STORED AS carbondata TBLPROPERTIES ('SORT_COLUMNS'='name', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKSIZE'='128', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKLET_SIZE'='128', 0: jdbc:hive2://localhost:1> 'SORT_SCOPE'='no_SORT'); +-+--+ | Result | +-+--+ +-+--+ No rows selected (1.04 seconds) 0: jdbc:hive2://localhost:1> insert overwrite table array_carbon4 select name,col,fee from array_orc; +-+--+ | Result | +-+--+ +-+--+ No rows selected (5.065 seconds) 0: jdbc:hive2://localhost:1> create table array_carbon5(name string, col array,fee int) STORED AS carbondata TBLPROPERTIES ('SORT_COLUMNS'='name', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKSIZE'='128', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKLET_SIZE'='128', 0: jdbc:hive2://localhost:1> 'SORT_SCOPE'='global_SORT'); +-+--+ | Result | +-+--+ +-+--+ No rows selected (0.098 seconds) 0: jdbc:hive2://localhost:1> insert overwrite table array_carbon5 select name,col,fee from array_orc; Error: java.lang.Exception: DataLoad failure (state=,code=0) was: orcdata create table array_orc(name string, col array,fee int) STORED AS orc; insert into array_orc values("xiao3",array('上呼吸道疾病 1','白内障1','胃溃疡1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1 ','白内障1','胃溃疡1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1','白内障 1','胃溃疡1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1','白内障1','胃溃疡 1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1','白内障1','胃溃疡1'),2); insert into array_orc values("xiao5",array(null,'白内障1','胃溃疡1'),2); insert into array_orc values("xiao5",null,2); insert into array_orc values("xiao3",array('j'),2); insert into array_orc values("xiao4",array('j','j'),2); insert into array_orc values("xiao4",NULL,2); 0: jdbc:hive2://localhost:1> use dict; +-+--+ | Result | +-+--+ +-+--+ No rows selected (0.391 seconds) 0: jdbc:hive2://localhost:1> select * from array_orc; ++---+--+--+ | name | col | fee | ++---+--+--+ | xiao3 | ["",null,"j"] | 3| | xiao2 | ["上呼吸道疾病1","白内障1","胃溃疡1"] | 2| | xiao3 | ["",null,"j"] | 3| | xiao1 | ["上呼吸道疾病","白内障","胃溃疡"]| 1| | xiao9 | NULL | 3| | xiao9 | NULL | 3| | xiao3 | NULL | 3| | xiao6 | NULL | 3| | xiao2 | ["上呼吸道疾病 1","白内障 1","胃溃疡 1"] | 2| | xiao1 | ["上呼吸道疾病 ","白内障 ","胃溃疡 "] | 1| | xiao3 | NULL | 3| | xiao3 | [null]| 3| | xiao3 | [""] | 3|
[jira] [Updated] (CARBONDATA-3954) Global sorting with array, if read from ORC format, write to carbon, error; If you use no_sort, success;
[ https://issues.apache.org/jira/browse/CARBONDATA-3954?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] xiaohui updated CARBONDATA-3954: Description: orcdata create table array_orc(name string, col array,fee int) STORED AS orc; insert into array_orc values("xiao3",array('上呼吸道疾病 1','白内障1','胃溃疡1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1 ','白内障1','胃溃疡1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1','白内障 1','胃溃疡1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1','白内障1','胃溃疡 1'),2); insert into array_orc values("xiao3",array('上呼吸道疾病1','白内障1','胃溃疡1'),2); insert into array_orc values("xiao5",array(null,'白内障1','胃溃疡1'),2); insert into array_orc values("xiao5",null,2); insert into array_orc values("xiao3",array('j'),2); insert into array_orc values("xiao4",array('j','j'),2); insert into array_orc values("xiao4",NULL,2); 0: jdbc:hive2://localhost:1> use dict; +-+--+ | Result | +-+--+ +-+--+ No rows selected (0.391 seconds) 0: jdbc:hive2://localhost:1> select * from array_orc; ++---+--+--+ | name | col | fee | ++---+--+--+ | xiao3 | ["",null,"j"] | 3| | xiao2 | ["上呼吸道疾病1","白内障1","胃溃疡1"] | 2| | xiao3 | ["",null,"j"] | 3| | xiao1 | ["上呼吸道疾病","白内障","胃溃疡"]| 1| | xiao9 | NULL | 3| | xiao9 | NULL | 3| | xiao3 | NULL | 3| | xiao6 | NULL | 3| | xiao2 | ["上呼吸道疾病 1","白内障 1","胃溃疡 1"] | 2| | xiao1 | ["上呼吸道疾病 ","白内障 ","胃溃疡 "] | 1| | xiao3 | NULL | 3| | xiao3 | [null]| 3| | xiao3 | [""] | 3| ++---+--+--+ 13 rows selected (0.416 seconds) 0: jdbc:hive2://localhost:1> create table array_carbon4(name string, col array,fee int) STORED AS carbondata TBLPROPERTIES ('SORT_COLUMNS'='name', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKSIZE'='128', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKLET_SIZE'='128', 0: jdbc:hive2://localhost:1> 'SORT_SCOPE'='no_SORT'); +-+--+ | Result | +-+--+ +-+--+ No rows selected (1.04 seconds) 0: jdbc:hive2://localhost:1> insert overwrite table array_carbon4 select name,col,fee from array_orc; +-+--+ | Result | +-+--+ +-+--+ No rows selected (5.065 seconds) 0: jdbc:hive2://localhost:1> create table array_carbon5(name string, col array,fee int) STORED AS carbondata TBLPROPERTIES ('SORT_COLUMNS'='name', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKSIZE'='128', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKLET_SIZE'='128', 0: jdbc:hive2://localhost:1> 'SORT_SCOPE'='global_SORT'); +-+--+ | Result | +-+--+ +-+--+ No rows selected (0.098 seconds) 0: jdbc:hive2://localhost:1> insert overwrite table array_carbon5 select name,col,fee from array_orc; Error: java.lang.Exception: DataLoad failure (state=,code=0) was: 0: jdbc:hive2://localhost:1> use dict; +-+--+ | Result | +-+--+ +-+--+ No rows selected (0.391 seconds) 0: jdbc:hive2://localhost:1> select * from array_orc; ++---+--+--+ | name | col | fee | ++---+--+--+ | xiao3 | ["",null,"j"] | 3| | xiao2 | ["上呼吸道疾病1","白内障1","胃溃疡1"] | 2| | xiao3 | ["",null,"j"] | 3| | xiao1 | ["上呼吸道疾病","白内障","胃溃疡"]| 1| | xiao9 | NULL | 3| | xiao9 | NULL | 3| | xiao3 | NULL | 3| | xiao6 | NULL | 3| | xiao2 | ["上呼吸道疾病 1","白内障 1","胃溃疡 1"] | 2| | xiao1 | ["上呼吸道疾病 ","白内障 ","胃溃疡 "] | 1| | xiao3 | NULL | 3| | xiao3 | [null]| 3| | xiao3 | [""] | 3| ++---+--+--+ 13 rows selected (0.416 seconds) 0: jdbc:hive2://localhost:1> create table array_carbon4(name string, col array,fee int) STORED AS carbondata TBLPROPERTIES ('SORT_COLUMNS'='name', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKSIZE'='128', 0: jdbc:hive2://localhost:1> 'TABLE_BLOCKLET_SIZE'='128', 0: jdbc:hive2://localhost:1> 'SORT_SCOPE'='no_SORT'); +-+--+ | Result | +-+--+ +-+--+ No rows selected (1.04 seconds) 0: jdbc:hive2://localhost:1> insert overwrite table array_carbon4 select name,col,fee from array_orc; +-+--+ | Result | +-+--+ +-+--+ No rows selected (5.065 seconds) 0: jdbc:hive2://localhost:1> create table
[GitHub] [carbondata] akashrn5 commented on a change in pull request #3856: [CARBONDATA-3929]Improve CDC performance
akashrn5 commented on a change in pull request #3856: URL: https://github.com/apache/carbondata/pull/3856#discussion_r474478394 ## File path: integration/spark/pom.xml ## @@ -153,6 +153,28 @@ + + com.databricks + spark-avro_${scala.binary.version} + 4.0.0 + + + org.apache.avro + avro + + + + + org.apache.spark + spark-avro_${scala.binary.version} Review comment: since we are integrated with spark, spark-avro is preferable, but as we know spark-2.3 does not have spark-avro, so anyway once in future we remove support for spark-2.3, so databricks spark-avro will be removed for our project. So here i'm avoiding out of spark project dependency as its possible in spark-2.4 ## File path: integration/spark/src/main/spark2.3/org/apache/spark/sql/avro/AvroFileFormatFactory.scala ## @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.avro + +import com.databricks.spark.avro.{AvroReader, AvroWriter} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.datasources.OutputWriterFactory + +object AvroFileFormatFactory { Review comment: replied above This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] VenuReddy2103 commented on a change in pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
VenuReddy2103 commented on a change in pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#discussion_r474476530 ## File path: streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala ## @@ -54,11 +54,12 @@ object FieldConverter { value match { case s: String => if (!isVarcharType && !isComplexType && s.length > CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) { - throw new IllegalArgumentException(stringLengthExceedErrorMsg + -CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT + " characters") -} else { - s + if (!CarbonProperties.getInstance.getProperty(CarbonCommonConstants Review comment: `CarbonProperties.getInstance().getProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT)` can return null. Please use `isBadRecordHandlingEnabledForInsert()` method instead of it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3876: TestingCI
CarbonDataQA1 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678092439 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2087/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA1 commented on pull request #3876: TestingCI
CarbonDataQA1 commented on pull request #3876: URL: https://github.com/apache/carbondata/pull/3876#issuecomment-678091678 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/3828/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] VenuReddy2103 commented on a change in pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
VenuReddy2103 commented on a change in pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#discussion_r474466612 ## File path: streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala ## @@ -54,11 +54,12 @@ object FieldConverter { value match { case s: String => if (!isVarcharType && !isComplexType && s.length > CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) { - throw new IllegalArgumentException(stringLengthExceedErrorMsg + -CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT + " characters") -} else { - s + if (!CarbonProperties.getInstance.getProperty(CarbonCommonConstants +.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT).toBoolean) { +throw new IllegalArgumentException(CarbonCommonConstants.STRING_LENGTH_EXCEEDED_MESSAGE) Review comment: `STRING_LENGTH_EXCEEDED_MESSAGE` has format specifiers("%s") in it. It is directly passed `IllegalArgumentException` without formatting. Can cause security vulnerability issue. Also suggest to add FMT prefix to `STRING_LENGTH_EXCEEDED_MESSAGE` so that we don't miss such issues. `public static final String STRING_LENGTH_EXCEEDED_MESSAGE = "Record %s of column %s exceeded " + MAX_CHARS_PER_COLUMN_DEFAULT + " characters. Please consider long string data type.";` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] VenuReddy2103 commented on a change in pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
VenuReddy2103 commented on a change in pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#discussion_r474446976 ## File path: processing/src/main/java/org/apache/carbondata/processing/loading/converter/impl/RowConverterImpl.java ## @@ -130,9 +135,14 @@ public CarbonRow convert(CarbonRow row) throws CarbonDataLoadingException { } fieldConverters[i].convert(row, logHolder); if (!logHolder.isLogged() && logHolder.isBadRecordNotAdded()) { -badRecordLogger.addBadRecordsToBuilder(row.getRawData(), logHolder.getReason()); +String reason = logHolder.getReason(); Review comment: If we had set the correct reason with row and column in `NonDictionaryFieldConverterImpl.convert()` instead of setting reason as `STRING_LENGTH_EXCEEDED_MESSAGE`, alone in it, we could have avoided changes in this file(RowConverterImpl) to fixup the the correct reason for `STRING_LENGTH_EXCEEDED_MESSAGE` again here at a common convert for all the columns of row. That looks better. right ? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] VenuReddy2103 commented on a change in pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.
VenuReddy2103 commented on a change in pull request #3865: URL: https://github.com/apache/carbondata/pull/3865#discussion_r474425224 ## File path: integration/spark/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala ## @@ -145,47 +152,153 @@ class TestLoadDataGeneral extends QueryTest with BeforeAndAfterEach { sql("drop table if exists carbon_table") } - test("test insert / update with data more than 32000 characters") { + private def createTableAndLoadData (badRecordAction: String): Unit = { +BadRecordUtil.cleanBadRecordPath("default", "longerthan32kchar") +sql("CREATE TABLE longerthan32kchar(dim1 String, dim2 String, mes1 int) STORED AS carbondata") +sql(s"LOAD DATA LOCAL INPATH '$testdata' into table longerThan32kChar OPTIONS('FILEHEADER'='dim1,dim2,mes1', " + + s"'BAD_RECORDS_ACTION'='${badRecordAction}','BAD_RECORDS_LOGGER_ENABLE'='TRUE')") + } + + test("test load / insert / update with data more than 32000 characters and bad record action as Redirect") { +createTableAndLoadData("REDIRECT") +var redirectCsvPath = BadRecordUtil + .getRedirectCsvPath("default", "longerthan32kchar", "0", "0") +assert(BadRecordUtil.checkRedirectedCsvContentAvailableInSource(testdata, redirectCsvPath)) +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "true") +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "REDIRECT"); +sql(s"insert into longerthan32kchar values('33000', '$longChar', 4)") +checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2))) +redirectCsvPath = BadRecordUtil.getRedirectCsvPath("default", "longerthan32kchar", "1", "0") +var redirectedFileLineList = FileUtils.readLines(redirectCsvPath) +var iterator = redirectedFileLineList.iterator() +while (iterator.hasNext) { + assert(iterator.next().equals("33000,"+longChar+",4")) +} + +// Update strings of length greater than 32000 +sql(s"update longerthan32kchar set(longerthan32kchar.dim2)=('$longChar') " + + "where longerthan32kchar.mes1=1").show() +checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("itsok", "hello", 2))) +redirectCsvPath = BadRecordUtil.getRedirectCsvPath("default", "longerthan32kchar", "0", "1") +redirectedFileLineList = FileUtils.readLines(redirectCsvPath) +iterator = redirectedFileLineList.iterator() +while (iterator.hasNext) { + assert(iterator.next().equals("ok,"+longChar+",1")) +} +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "false") + +// Insert longer string without converter step will throw exception +intercept[Exception] { + sql(s"insert into longerthan32kchar values('32000', '$longChar', 3)") +} +BadRecordUtil.cleanBadRecordPath("default", "longerthan32kchar") + } + + test("test load / insert / update with data more than 32000 characters and bad record action as Force") { +createTableAndLoadData("FORCE") +checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2), Row("32123", null, 3))) CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "true") -val testdata =s"$resourcesPath/32000char.csv" -sql("drop table if exists load32000chardata") -sql("drop table if exists load32000chardata_dup") -sql("CREATE TABLE load32000chardata(dim1 String, dim2 String, mes1 int) STORED AS carbondata") -sql("CREATE TABLE load32000chardata_dup(dim1 String, dim2 String, mes1 int) STORED AS carbondata") -sql(s"LOAD DATA LOCAL INPATH '$testdata' into table load32000chardata OPTIONS('FILEHEADER'='dim1,dim2,mes1')") +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FORCE"); +sql(s"insert into longerthan32kchar values('33000', '$longChar', 4)") +checkAnswer(sql("select * from longerthan32kchar"), + Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2), Row("32123", null, 3), Row("33000", null, 4))) + +// Update strings of length greater than 32000 +sql(s"update longerthan32kchar set(longerthan32kchar.dim2)=('$longChar') " + + "where longerthan32kchar.mes1=1").show() +checkAnswer(sql("select * from longerthan32kchar"), + Seq(Row("ok", null, 1), Row("itsok", "hello", 2), Row("32123", null, 3), Row("33000", null, 4))) +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "false") + +// Insert longer string without converter step will throw exception intercept[Exception] { - sql("insert into load32000chardata_dup select