Repository: spark Updated Branches: refs/heads/master 00e103a6e -> 993923c8f
[SPARK-16985] Change dataFormat from yyyyMMddHHmm to yyyyMMddHHmmss ## What changes were proposed in this pull request? In our cluster, sometimes the sql output maybe overrided. When I submit some sql, all insert into the same table, and the sql will cost less one minute, here is the detail, 1 sql1, 11:03 insert into table. 2 sql2, 11:04:11 insert into table. 3 sql3, 11:04:48 insert into table. 4 sql4, 11:05 insert into table. 5 sql5, 11:06 insert into table. The sql3's output file will override the sql2's output file. here is the log: ``` 16/05/04 11:04:11 INFO hive.SparkHiveHadoopWriter: XXfinalPath=hdfs://tl-sng-gdt-nn-tdw.tencent-distribute.com:54310/tmp/assorz/tdw-tdwadmin/20160504/04559505496526517_-1_1204544348/10000/_tmp.p_20160428/attempt_201605041104_0001_m_000000_1 16/05/04 11:04:48 INFO hive.SparkHiveHadoopWriter: XXfinalPath=hdfs://tl-sng-gdt-nn-tdw.tencent-distribute.com:54310/tmp/assorz/tdw-tdwadmin/20160504/04559505496526517_-1_212180468/10000/_tmp.p_20160428/attempt_201605041104_0001_m_000000_1 ``` The reason is the output file use SimpleDateFormat("yyyyMMddHHmm"), if two sql insert into the same table in the same minute, the output will be overrite. I think we should change dateFormat to "yyyyMMddHHmmss", in our cluster, we can't finished a sql in one second. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Author: hongshen <shenh062...@126.com> Closes #14574 from shenh062326/SPARK-16985. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/993923c8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/993923c8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/993923c8 Branch: refs/heads/master Commit: 993923c8f5ca719daf905285738b7fdcaf944d8c Parents: 00e103a Author: hongshen <shenh062...@126.com> Authored: Fri Aug 12 09:58:02 2016 +0100 Committer: Sean Owen <so...@cloudera.com> Committed: Fri Aug 12 09:58:02 2016 +0100 ---------------------------------------------------------------------- core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala | 4 ++-- core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala | 2 +- core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 2 +- core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/993923c8/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala index 17daac1..6550d70 100644 --- a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala @@ -67,7 +67,7 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable { def setup(jobid: Int, splitid: Int, attemptid: Int) { setIDs(jobid, splitid, attemptid) - HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmm").format(now), + HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss").format(now), jobid, splitID, attemptID, conf.value) } @@ -162,7 +162,7 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable { private[spark] object SparkHadoopWriter { def createJobID(time: Date, id: Int): JobID = { - val formatter = new SimpleDateFormat("yyyyMMddHHmm") + val formatter = new SimpleDateFormat("yyyyMMddHHmmss") val jobtrackerID = formatter.format(time) new JobID(jobtrackerID, id) } http://git-wip-us.apache.org/repos/asf/spark/blob/993923c8/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 99afe02..fd3a14b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -241,7 +241,7 @@ class HadoopRDD[K, V]( var reader: RecordReader[K, V] = null val inputFormat = getInputFormat(jobConf) - HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmm").format(createTime), + HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss").format(createTime), context.stageId, theSplit.index, context.attemptNumber, jobConf) reader = inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL) http://git-wip-us.apache.org/repos/asf/spark/blob/993923c8/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index b086baa..be919e6 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -77,7 +77,7 @@ class NewHadoopRDD[K, V]( // private val serializableConf = new SerializableWritable(_conf) private val jobTrackerId: String = { - val formatter = new SimpleDateFormat("yyyyMMddHHmm") + val formatter = new SimpleDateFormat("yyyyMMddHHmmss") formatter.format(new Date()) } http://git-wip-us.apache.org/repos/asf/spark/blob/993923c8/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 104e0cb..7d6a880 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -1079,7 +1079,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) // Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038). val hadoopConf = conf val job = NewAPIHadoopJob.getInstance(hadoopConf) - val formatter = new SimpleDateFormat("yyyyMMddHHmm") + val formatter = new SimpleDateFormat("yyyyMMddHHmmss") val jobtrackerID = formatter.format(new Date()) val stageId = self.id val jobConfiguration = job.getConfiguration --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org