[GitHub] [spark] iRakson commented on a change in pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
iRakson commented on a change in pull request #28485: URL: https://github.com/apache/spark/pull/28485#discussion_r422815037

## File path: sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala

## @@ -86,34 +86,36 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B
       case _: StreamingQueryException =>
     }

-    eventually(timeout(30.seconds), interval(100.milliseconds)) {
+    eventually(timeout(300.seconds), interval(100.milliseconds)) {

Review comment:
   This was modified accidentally, I will update this.
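For reference, ScalaTest's `eventually` retries the enclosed assertion block, with `timeout` bounding the total retry time and `interval` the pause between attempts, so an accidental 30s-to-300s change can stretch a failing test's runtime tenfold. A minimal sketch of the pattern (the assertion is illustrative):

```scala
import org.scalatest.concurrent.Eventually._
import org.scalatest.time.SpanSugar._

// Retries the block until it passes, for at most 30 seconds, probing every 100 ms.
eventually(timeout(30.seconds), interval(100.milliseconds)) {
  assert(System.currentTimeMillis() > 0)
}
```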
[GitHub] [spark] uncleGen commented on a change in pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
uncleGen commented on a change in pull request #28485: URL: https://github.com/apache/spark/pull/28485#discussion_r422801194

## File path: sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala

## @@ -35,11 +40,218 @@ private[ui] class StreamingQueryPage(parent: StreamingQueryTab)
     SparkUIUtils.headerSparkPage(request, "Streaming Query", content, parent)
   }

-  def generateDataRow(request: HttpServletRequest, queryActive: Boolean)
-    (query: StreamingQueryUIData): Seq[Node] = {
+  private def generateStreamingQueryTable(request: HttpServletRequest): Seq[Node] = {
+    val (activeQueries, inactiveQueries) = parent.statusListener.allQueryStatus
+      .partition(_.isActive)
+
+    val content = mutable.ListBuffer[Node]()
+    // show active queries table only if there is at least one active query
+    if (activeQueries.nonEmpty) {
+      // scalastyle:off
+      content ++=
+          Active Streaming Queries ({activeQueries.length})
+        ++
+          {queryTable(activeQueries, request, "active")}
+      // scalastyle:on
+    }
+    // show completed queries table only if there is at least one completed query
+    if (inactiveQueries.nonEmpty) {
+      // scalastyle:off
+      content ++=
+          Completed Streaming Queries ({inactiveQueries.length})
+        ++
+          {queryTable(inactiveQueries, request, "completed")}
+      // scalastyle:on
+    }
+    content
+  }
+
+  private def queryTable(data: Seq[StreamingQueryUIData], request: HttpServletRequest,
+    tableTag: String): Seq[Node] = {
+
+    val isActive = if (tableTag.contains("active")) true else false
+    val parameterOtherTable = request.getParameterMap.asScala
+      .filterNot(_._1.startsWith(tableTag))
+      .map { case (name, vals) =>
+        name + "=" + vals(0)
+      }
+
+    val parameterPage = request.getParameter(s"$tableTag.page")
+    val parameterSortColumn = request.getParameter(s"$tableTag.sort")
+    val parameterSortDesc = request.getParameter(s"$tableTag.desc")
+    val parameterPageSize = request.getParameter(s"$tableTag.pageSize")
+
+    val page = Option(parameterPage).map(_.toInt).getOrElse(1)
+    val sortColumn = Option(parameterSortColumn).map { sortColumn =>
+      SparkUIUtils.decodeURLParameter(sortColumn)
+    }.getOrElse("Start Time")
+    val sortDesc = Option(parameterSortDesc).map(_.toBoolean).getOrElse(sortColumn == "Start Time")
+    val pageSize = Option(parameterPageSize).map(_.toInt).getOrElse(100)
+
+    try {
+      new StreamingQueryPagedTable(
+        request,
+        parent,
+        data,
+        tableTag,
+        pageSize,
+        sortColumn,
+        sortDesc,
+        isActive,
+        parameterOtherTable,
+        SparkUIUtils.prependBaseUri(request, parent.basePath),
+        "StreamingQuery"
+      ).table(page)
+    } catch {
+      case e@(_: IllegalArgumentException | _: IndexOutOfBoundsException) =>
+          Error while rendering execution table:
+          {Utils.exceptionString(e)}
+    }
+  }
+}
+
+class StreamingQueryPagedTable(
+    request: HttpServletRequest,
+    parent: StreamingQueryTab,
+    data: Seq[StreamingQueryUIData],
+    tableTag: String,
+    pageSize: Int,
+    sortColumn: String,
+    sortDesc: Boolean,
+    isActive: Boolean,
+    parameterOtherTable: Iterable[String],
+    basePath: String,
+    subPath: String) extends PagedTable[StructuredStreamingRow] {
+
+  private val parameterPath = s"$basePath/$subPath/?${parameterOtherTable.mkString("&")}"
+  private val encodedSortColumn = URLEncoder.encode(sortColumn, UTF_8.name())
+
+  override def tableId: String = s"$tableTag-table"
+
+  override def tableCssClass: String =
+    "table table-bordered table-sm table-striped table-head-clickable table-cell-width-limited"
+
+  override def pageSizeFormField: String = s"$tableTag.pageSize"
+
+  override def pageNumberFormField: String = s"$tableTag.page"
+
+  override def pageLink(page: Int): String = {
+    parameterPath +
+    s"&$pageNumberFormField=$page" +
+    s"&$tableTag.sort=$encodedSortColumn" +
+    s"&$tableTag.desc=$sortDesc" +
+    s"&$pageSizeFormField=$pageSize" +
+    s"#$tableTag"

Review comment:
   nit: indent
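For clarity, the indentation the reviewer is asking for would presumably look like this (the same method, with the continuation lines indented one extra level):

```scala
override def pageLink(page: Int): String = {
  parameterPath +
    s"&$pageNumberFormField=$page" +
    s"&$tableTag.sort=$encodedSortColumn" +
    s"&$tableTag.desc=$sortDesc" +
    s"&$pageSizeFormField=$pageSize" +
    s"#$tableTag"
}
```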
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
AmplabJenkins removed a comment on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626498246
[GitHub] [spark] AmplabJenkins commented on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
AmplabJenkins commented on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626498246
[GitHub] [spark] SparkQA removed a comment on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
SparkQA removed a comment on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626432135

**[Test build #122485 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122485/testReport)** for PR 28485 at commit [`df61b7c`](https://github.com/apache/spark/commit/df61b7c61dbdbd977bec76cf22a5f269b418035b).
[GitHub] [spark] SparkQA commented on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
SparkQA commented on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626497339

**[Test build #122485 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122485/testReport)** for PR 28485 at commit [`df61b7c`](https://github.com/apache/spark/commit/df61b7c61dbdbd977bec76cf22a5f269b418035b).
 * This patch passes all tests.
 * This patch merges cleanly.
 * This patch adds the following public classes _(experimental)_:
   * `class StreamingQueryPagedTable(`
   * `case class StructuredStreamingRow(`
   * `class StreamingQueryDataSource(uiData: Seq[StreamingQueryUIData], sortColumn: String, desc: Boolean,`
[GitHub] [spark] SparkQA commented on pull request #28477: [SPARK-31405][SQL] Fail by default when reading/writing legacy datetime values from/to Parquet/Avro files
SparkQA commented on pull request #28477: URL: https://github.com/apache/spark/pull/28477#issuecomment-626495706

**[Test build #122489 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122489/testReport)** for PR 28477 at commit [`fc31eea`](https://github.com/apache/spark/commit/fc31eeaea1ec407a670f41709a709958d192563f).
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28477: [SPARK-31405][SQL] Fail by default when reading/writing legacy datetime values from/to Parquet/Avro files
AmplabJenkins removed a comment on pull request #28477: URL: https://github.com/apache/spark/pull/28477#issuecomment-626493276
[GitHub] [spark] AmplabJenkins commented on pull request #28477: [SPARK-31405][SQL] Fail by default when reading/writing legacy datetime values from/to Parquet/Avro files
AmplabJenkins commented on pull request #28477: URL: https://github.com/apache/spark/pull/28477#issuecomment-626493276
[GitHub] [spark] igreenfield commented on a change in pull request #26624: [SPARK-8981][CORE][test-hadoop3.2][test-java11] Add MDC support in Executor
igreenfield commented on a change in pull request #26624: URL: https://github.com/apache/spark/pull/26624#discussion_r422802737

## File path: core/src/main/scala/org/apache/spark/util/ThreadUtils.scala

## @@ -157,23 +259,23 @@ private[spark] object ThreadUtils {
    */
   def newDaemonFixedThreadPool(nThreads: Int, prefix: String): ThreadPoolExecutor = {
     val threadFactory = namedThreadFactory(prefix)
-    Executors.newFixedThreadPool(nThreads, threadFactory).asInstanceOf[ThreadPoolExecutor]
+    MDCAwareThreadPoolExecutor.newFixedThreadPool(nThreads, threadFactory)

Review comment:
   OK
[GitHub] [spark] MaxGekk commented on a change in pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
MaxGekk commented on a change in pull request #28481: URL: https://github.com/apache/spark/pull/28481#discussion_r422800558

## File path: sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala

## @@ -162,52 +162,66 @@ object RandomDataGenerator {
       })
     case BooleanType => Some(() => rand.nextBoolean())
     case DateType =>
-      val generator =
-        () => {
-          var milliseconds = rand.nextLong() % 253402329599999L
-          // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
-          // for "0001-01-01 00:00:00.000000". We need to find a
-          // number that is greater or equals to this number as a valid timestamp value.
-          while (milliseconds < -62135740800000L) {
-            // 253402329599999L is the number of milliseconds since
-            // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999".
-            milliseconds = rand.nextLong() % 253402329599999L
-          }
-          val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt)
-          // The generated `date` is based on the hybrid calendar Julian + Gregorian since
-          // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
-          // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to
-          // a local date in Proleptic Gregorian calendar to satisfy this requirement.
-          // Some years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
-          // As the consequence of that, 29 February of such years might not exist in Proleptic
-          // Gregorian calendar. When this happens, we shift the date by one day.
-          Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY))
+      def uniformDateRand(rand: Random): java.sql.Date = {
+        var milliseconds = rand.nextLong() % 253402329599999L
+        // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
+        // for "0001-01-01 00:00:00.000000". We need to find a
+        // number that is greater or equals to this number as a valid timestamp value.
+        while (milliseconds < -62135740800000L) {
+          // 253402329599999L is the number of milliseconds since
+          // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999".

Review comment:
   It is not related to this PR. If you believe we should change the lower and upper bounds, we should do that separately.
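The bounds under discussion can be sanity-checked with java.time. Note that these proleptic Gregorian values differ slightly from the hybrid Julian+Gregorian constants quoted in RandomDataGenerator, which is exactly the calendar mismatch SPARK-26651 deals with. An illustrative check, not Spark code:

```scala
import java.time.{LocalDateTime, ZoneOffset}

// Epoch milliseconds of the timestamp range endpoints in the proleptic Gregorian calendar.
val lower = LocalDateTime.of(1, 1, 1, 0, 0, 0)
  .toInstant(ZoneOffset.UTC).toEpochMilli        // -62135596800000
val upper = LocalDateTime.of(9999, 12, 31, 23, 59, 59)
  .toInstant(ZoneOffset.UTC).toEpochMilli + 999  // 253402300799999
```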
[GitHub] [spark] MaxGekk commented on a change in pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
MaxGekk commented on a change in pull request #28481: URL: https://github.com/apache/spark/pull/28481#discussion_r422799964

## File path: sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala

## @@ -162,52 +162,66 @@ object RandomDataGenerator {
+      def uniformDateRand(rand: Random): java.sql.Date = {
+        var milliseconds = rand.nextLong() % 253402329599999L
+        // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
+        // for "0001-01-01 00:00:00.000000". We need to find a
+        // number that is greater or equals to this number as a valid timestamp value.

Review comment:
   Officially, not. See descriptions of TimestampType and DateType.
[GitHub] [spark] Gschiavon removed a comment on pull request #27578: [SPARK-30828][SQL] Improving insertInto behaviour
Gschiavon removed a comment on pull request #27578: URL: https://github.com/apache/spark/pull/27578#issuecomment-593260363

ping @dongjoon-hyun Hi! what do you think about this?
[GitHub] [spark] Gschiavon removed a comment on pull request #27578: [SPARK-30828][SQL] Improving insertInto behaviour
Gschiavon removed a comment on pull request #27578: URL: https://github.com/apache/spark/pull/27578#issuecomment-626487699

@AmplabJenkins
[GitHub] [spark] Gschiavon commented on pull request #27578: [SPARK-30828][SQL] Improving insertInto behaviour
Gschiavon commented on pull request #27578: URL: https://github.com/apache/spark/pull/27578#issuecomment-626487699

@AmplabJenkins
[GitHub] [spark] maropu commented on pull request #28493: [SPARK-31673][SQL] QueryExection.debug.toFile() to take an addtional explain mode param.
maropu commented on pull request #28493: URL: https://github.com/apache/spark/pull/28493#issuecomment-626485974

The change looks useful to me. cc: @MaxGekk
[GitHub] [spark] Ngone51 commented on a change in pull request #27937: [SPARK-30127][SQL] Support case class parameter for typed Scala UDF
Ngone51 commented on a change in pull request #27937: URL: https://github.com/apache/spark/pull/27937#discussion_r422793783

## File path: sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala

## @@ -93,7 +93,7 @@ sealed abstract class UserDefinedFunction {
 private[spark] case class SparkUserDefinedFunction(
     f: AnyRef,
     dataType: DataType,
-    inputSchemas: Seq[Option[ScalaReflection.Schema]],

Review comment:
   > @Ngone51 Can you take a closer look and see how to make it complete?

   Yea, I'm looking into it.
[GitHub] [spark] cloud-fan commented on a change in pull request #26624: [SPARK-8981][CORE][test-hadoop3.2][test-java11] Add MDC support in Executor
cloud-fan commented on a change in pull request #26624: URL: https://github.com/apache/spark/pull/26624#discussion_r422791868

## File path: core/src/main/scala/org/apache/spark/util/ThreadUtils.scala

## @@ -157,23 +259,23 @@ private[spark] object ThreadUtils {
    */
   def newDaemonFixedThreadPool(nThreads: Int, prefix: String): ThreadPoolExecutor = {
     val threadFactory = namedThreadFactory(prefix)
-    Executors.newFixedThreadPool(nThreads, threadFactory).asInstanceOf[ThreadPoolExecutor]
+    MDCAwareThreadPoolExecutor.newFixedThreadPool(nThreads, threadFactory)

Review comment:
   We don't commit dead code that nothing uses. Please leave only the code that is effective.
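For context, the core idea behind an MDC-aware pool is to capture the submitting thread's MDC and reinstall it around each task. A minimal sketch of that wrapping, assuming slf4j's MDC API; this is not the PR's actual `MDCAwareThreadPoolExecutor` code:

```scala
import org.slf4j.MDC

// Wraps a task so it runs with the submitter's MDC, then restores the worker's previous MDC.
def mdcAware(task: Runnable): Runnable = {
  val captured = MDC.getCopyOfContextMap // may be null when the submitter has no MDC
  () => {
    val previous = MDC.getCopyOfContextMap
    if (captured != null) MDC.setContextMap(captured) else MDC.clear()
    try task.run()
    finally if (previous != null) MDC.setContextMap(previous) else MDC.clear()
  }
}
```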
[GitHub] [spark] dongjoon-hyun commented on pull request #28495: [SPARK-31674][CORE][DOCS] Make Prometheus metric endpoints experimental
dongjoon-hyun commented on pull request #28495: URL: https://github.com/apache/spark/pull/28495#issuecomment-626478958

Thank you so much, @dbtsai . Merged to master/3.0.
[GitHub] [spark] beliefer commented on pull request #28164: [SPARK-31393][SQL] Show the correct alias in schema for expression
beliefer commented on pull request #28164: URL: https://github.com/apache/spark/pull/28164#issuecomment-626477321

cc @gatorsmile
[GitHub] [spark] dbtsai commented on pull request #28495: [SPARK-31674][CORE][DOCS] Make Prometheus metric endpoints experimental
dbtsai commented on pull request #28495: URL: https://github.com/apache/spark/pull/28495#issuecomment-626475204

LGTM. Thanks.
[GitHub] [spark] dilipbiswal commented on pull request #28493: [SPARK-31673][SQL] QueryExection.debug.toFile() to take an addtional explain mode param.
dilipbiswal commented on pull request #28493: URL: https://github.com/apache/spark/pull/28493#issuecomment-626473070

cc @gatorsmile @maropu
[GitHub] [spark] cloud-fan commented on a change in pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
cloud-fan commented on a change in pull request #28481: URL: https://github.com/apache/spark/pull/28481#discussion_r422784030

## File path: sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala

## @@ -162,52 +162,66 @@ object RandomDataGenerator {
       })
     case BooleanType => Some(() => rand.nextBoolean())
     case DateType =>
-      val generator =
-        () => {
-          var milliseconds = rand.nextLong() % 253402329599999L
-          // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
-          // for "0001-01-01 00:00:00.000000". We need to find a
-          // number that is greater or equals to this number as a valid timestamp value.
-          while (milliseconds < -62135740800000L) {
-            // 253402329599999L is the number of milliseconds since
-            // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999".
-            milliseconds = rand.nextLong() % 253402329599999L
-          }
-          val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt)
-          // The generated `date` is based on the hybrid calendar Julian + Gregorian since
-          // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
-          // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to
-          // a local date in Proleptic Gregorian calendar to satisfy this requirement.
-          // Some years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
-          // As the consequence of that, 29 February of such years might not exist in Proleptic
-          // Gregorian calendar. When this happens, we shift the date by one day.
-          Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY))
+      def uniformDateRand(rand: Random): java.sql.Date = {
+        var milliseconds = rand.nextLong() % 253402329599999L
+        // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
+        // for "0001-01-01 00:00:00.000000". We need to find a
+        // number that is greater or equals to this number as a valid timestamp value.

Review comment:
   I'm a bit unsure about this. Spark does support negative years for now, and we should keep testing it.

## File path: sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala

## @@ -162,52 +162,66 @@ object RandomDataGenerator {
+        while (milliseconds < -62135740800000L) {
+          // 253402329599999L is the number of milliseconds since
+          // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999".

Review comment:
   ditto.
[GitHub] [spark] cloud-fan commented on pull request #28489: [SPARK-31672][SQL] Fix loading of timestamps before 1582-10-15 from dictionary encoded Parquet columns
cloud-fan commented on pull request #28489: URL: https://github.com/apache/spark/pull/28489#issuecomment-626469503

LGTM, merging to master/3.0!
[GitHub] [spark] cloud-fan commented on a change in pull request #27937: [SPARK-30127][SQL] Support case class parameter for typed Scala UDF
cloud-fan commented on a change in pull request #27937: URL: https://github.com/apache/spark/pull/27937#discussion_r422778641

## File path: sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala

## @@ -93,7 +93,7 @@ sealed abstract class UserDefinedFunction {
 private[spark] case class SparkUserDefinedFunction(
     f: AnyRef,
     dataType: DataType,
-    inputSchemas: Seq[Option[ScalaReflection.Schema]],

Review comment:
   `inputSchemas` is composable but it contains too little information; that's why the Spark UDF was so limited before. Our goal is to make Spark UDF powerful enough that people don't need to use internal APIs to build in-house UDFs. But you are right that the support is not complete. @Ngone51 Can you take a closer look and see how to make it complete?

   BTW, if you do need to keep your in-house UDFs for a while, there is a way to create `ExpressionEncoder` from `Seq[DataType]`, by calling `RowEncoder.apply`. It only supports standard Spark external types, i.e. `Row`, not `case class`, which is the same as older versions of Spark.
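For readers hitting the same limitation, a minimal sketch of the `RowEncoder.apply` workaround cloud-fan describes; the schema here is illustrative:

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types._

// Build an ExpressionEncoder from plain DataTypes; inputs then arrive as Rows, not case classes.
val schema = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType)))
val encoder: ExpressionEncoder[Row] = RowEncoder(schema)
```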
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
AmplabJenkins removed a comment on pull request #28481: URL: https://github.com/apache/spark/pull/28481#issuecomment-626465447
[GitHub] [spark] AmplabJenkins commented on pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
AmplabJenkins commented on pull request #28481: URL: https://github.com/apache/spark/pull/28481#issuecomment-626465447
[GitHub] [spark] SparkQA commented on pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
SparkQA commented on pull request #28481: URL: https://github.com/apache/spark/pull/28481#issuecomment-626465224

**[Test build #122488 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122488/testReport)** for PR 28481 at commit [`a78e18b`](https://github.com/apache/spark/commit/a78e18b2631ed29703e4791cc850d62e3c58faaa).
[GitHub] [spark] cloud-fan commented on pull request #28486: [SPARK-31669][SQL][TESTS] Fix RowEncoderSuite failures on non-existing dates/timestamps
cloud-fan commented on pull request #28486: URL: https://github.com/apache/spark/pull/28486#issuecomment-626464599

Can we generate `LocalDate`/`Instant`? Spark supports these 2 types when creating DataFrame/Dataset.
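A sketch of what generating the java.time types cloud-fan mentions might look like; the bounds and helper names are assumptions, not the suite's actual code:

```scala
import java.time.{Instant, LocalDate}
import scala.util.Random

// Uniform-ish random LocalDate in Spark's supported range; avoids calendar-dependent millis math.
def randomLocalDate(rand: Random): LocalDate = {
  val minDay = LocalDate.of(1, 1, 1).toEpochDay
  val maxDay = LocalDate.of(9999, 12, 31).toEpochDay
  LocalDate.ofEpochDay(minDay + (rand.nextDouble() * (maxDay - minDay)).toLong)
}

// A random Instant can be derived the same way, at second granularity.
def randomInstant(rand: Random): Instant =
  Instant.ofEpochSecond(randomLocalDate(rand).toEpochDay * 86400L + rand.nextInt(86400))
```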
[GitHub] [spark] MaxGekk commented on pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
MaxGekk commented on pull request #28481: URL: https://github.com/apache/spark/pull/28481#issuecomment-626464195

jenkins, retest this, please
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28495: [SPARK-31674][CORE][DOCS] Make Prometheus metric endpoints experimental
AmplabJenkins removed a comment on pull request #28495: URL: https://github.com/apache/spark/pull/28495#issuecomment-626463755
[GitHub] [spark] AmplabJenkins commented on pull request #28495: [SPARK-31674][CORE][DOCS] Make Prometheus metric endpoints experimental
AmplabJenkins commented on pull request #28495: URL: https://github.com/apache/spark/pull/28495#issuecomment-626463755
[GitHub] [spark] dongjoon-hyun commented on pull request #28495: [SPARK-31674][CORE][DOCS] Make Prometheus metric endpoints experimental
dongjoon-hyun commented on pull request #28495: URL: https://github.com/apache/spark/pull/28495#issuecomment-626463597

Could you review this PR when you have some time, @dbtsai ?
[GitHub] [spark] SparkQA commented on pull request #28495: [SPARK-31674][CORE][DOCS] Make Prometheus metric endpoints experimental
SparkQA commented on pull request #28495: URL: https://github.com/apache/spark/pull/28495#issuecomment-626463445

**[Test build #122487 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122487/testReport)** for PR 28495 at commit [`bad1068`](https://github.com/apache/spark/commit/bad1068ee6b2ef5e05dbdbdd3aa7a737f7bb9cd2).
[GitHub] [spark] dongjoon-hyun opened a new pull request #28495: [SPARK-31674][CORE][DOCS] Make Prometheus metric endpoints experimental
dongjoon-hyun opened a new pull request #28495: URL: https://github.com/apache/spark/pull/28495

### What changes were proposed in this pull request?

### Why are the changes needed?

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
[GitHub] [spark] huaxingao commented on pull request #28451: [SPARK-31636][SQL][DOCS] Remove HTML syntax in SQL reference
huaxingao commented on pull request #28451: URL: https://github.com/apache/spark/pull/28451#issuecomment-626460022

@gatorsmile @dilipbiswal Anything else you want to add in 3.0?
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28413: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving&loading bug and expose hashFunc property in HashingTF
AmplabJenkins removed a comment on pull request #28413: URL: https://github.com/apache/spark/pull/28413#issuecomment-626453008
[GitHub] [spark] AmplabJenkins commented on pull request #28413: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving&loading bug and expose hashFunc property in HashingTF
AmplabJenkins commented on pull request #28413: URL: https://github.com/apache/spark/pull/28413#issuecomment-626453008
[GitHub] [spark] SparkQA removed a comment on pull request #28413: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving&loading bug and expose hashFunc property in HashingTF
SparkQA removed a comment on pull request #28413: URL: https://github.com/apache/spark/pull/28413#issuecomment-626438326

**[Test build #122486 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122486/testReport)** for PR 28413 at commit [`63167fd`](https://github.com/apache/spark/commit/63167fd328c08564f2a2ab59f47f22fc33f09106).
[GitHub] [spark] SparkQA commented on pull request #28413: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving&loading bug and expose hashFunc property in HashingTF
SparkQA commented on pull request #28413: URL: https://github.com/apache/spark/pull/28413#issuecomment-626452744

**[Test build #122486 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122486/testReport)** for PR 28413 at commit [`63167fd`](https://github.com/apache/spark/commit/63167fd328c08564f2a2ab59f47f22fc33f09106).
 * This patch passes all tests.
 * This patch merges cleanly.
 * This patch adds no public classes.
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28493: [SPARK-31673][SQL] QueryExection.debug.toFile() to take an addtional explain mode param.
AmplabJenkins removed a comment on pull request #28493: URL: https://github.com/apache/spark/pull/28493#issuecomment-626450849
[GitHub] [spark] AmplabJenkins commented on pull request #28493: [SPARK-31673][SQL] QueryExection.debug.toFile() to take an addtional explain mode param.
AmplabJenkins commented on pull request #28493: URL: https://github.com/apache/spark/pull/28493#issuecomment-626450849
[GitHub] [spark] SparkQA removed a comment on pull request #28493: [SPARK-31673][SQL] QueryExection.debug.toFile() to take an addtional explain mode param.
SparkQA removed a comment on pull request #28493: URL: https://github.com/apache/spark/pull/28493#issuecomment-626403485

**[Test build #122484 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122484/testReport)** for PR 28493 at commit [`62dbc83`](https://github.com/apache/spark/commit/62dbc8320641e791b06ba989b029e4d20d724407).
[GitHub] [spark] SparkQA commented on pull request #28493: [SPARK-31673][SQL] QueryExection.debug.toFile() to take an addtional explain mode param.
SparkQA commented on pull request #28493: URL: https://github.com/apache/spark/pull/28493#issuecomment-626450430

**[Test build #122484 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122484/testReport)** for PR 28493 at commit [`62dbc83`](https://github.com/apache/spark/commit/62dbc8320641e791b06ba989b029e4d20d724407).
 * This patch passes all tests.
 * This patch merges cleanly.
 * This patch adds no public classes.
[GitHub] [spark] HeartSaVioR commented on pull request #28336: [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available
HeartSaVioR commented on pull request #28336: URL: https://github.com/apache/spark/pull/28336#issuecomment-626446309

Appreciate another round of reviews. Thanks in advance!
[GitHub] [spark] koertkuipers commented on a change in pull request #27937: [SPARK-30127][SQL] Support case class parameter for typed Scala UDF
koertkuipers commented on a change in pull request #27937: URL: https://github.com/apache/spark/pull/27937#discussion_r422699979

## File path: sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala

## @@ -93,7 +93,7 @@ sealed abstract class UserDefinedFunction {
 private[spark] case class SparkUserDefinedFunction(
     f: AnyRef,
     dataType: DataType,
-    inputSchemas: Seq[Option[ScalaReflection.Schema]],

Review comment:
   in my testing i also found the api now to be somewhat inconsistent/confusing. basically sometimes `CatalystTypeConverters.createToScalaConverter` is used and sometimes `ExpressionEncoder.fromRow`, depending solely on whether the argument is a top-level struct or not. but `CatalystTypeConverters.createToScalaConverter` and `ExpressionEncoder.fromRow` behave very differently, leading to inconsistent application.

   for example this (contrived) usage works:
   ```
   case class Person(name: String, age: Option[Int])

   Seq((1, Person("john", Some(55))), (2, Person("mary", None))).toDF("id", "person")
     .withColumn("age", udf{ p: Person => p.age }.apply(col("person")))
   ```
   but this does not:
   ```
   Seq((1, Seq(Person("john", Some(55)))), (2, Seq(Person("mary", None)))).toDF("id", "persons")
     .withColumn("ages", udf{ s: Seq[Person] => s.map(_.age) }.apply(col("persons")))
   ```
   and while Option works nicely in the Person case class (and also in tuples), Option does not work in a simple Seq:
   ```
   Seq(Seq(Some(1), None)).toDF.withColumn("value",
     udf{ s: Seq[Option[Int]] => s.map(_.map(_ + 1)) }.apply(col("value")))
   ```
   and Option also does not work for a function argument:
   ```
   Seq(None, Some(1), None).toDF.withColumn("value", udf{ o: Option[Int] => o.map(_ + 1) }.apply(col("value")))
   ```
   this inconsistency will be hard to understand. and this inconsistency is not limited to Options. it also applies to many other things. for example tuples inside maps will not work (you still have to use Row there), but tuples inside maps will work if the map is inside a case class. that is hard to explain to a user...

   finally let me give some background why i am a little nervous about this change... spark udfs have been somewhat limited for a long time: no support for case classes, tuples, options. so many libraries have worked around that by defining their own udfs on top of SparkUserDefinedFunction. we do this in-house too. it is easy to do this with type classes thanks to the composability of inputSchemas. so now you replaced inputSchemas with inputEncoders, but ExpressionEncoder and TypeTags are not composable. i do not see a way for us to build on top of this for our own in-house udfs.

   so then the alternative for us is to abandon our in-house udfs and start using spark's udfs again, which now support case classes and tuples, which is nice! but the inconsistency of the api and lack of support for Option makes that currently not viable to me.

   i realize this is a spark internal api and this is entirely my own problem. but i thought it was worth pointing out because i suspect i am not the only one who has done this. i think this is one of the more typical workarounds people have done using spark (and i am aware of multiple implementations of this workaround). sorry for the long post(s)
[GitHub] [spark] Ngone51 commented on a change in pull request #28444: [SPARK-31632][CORE][WEBUI] Make the ApplicationInfo always available when accessed
Ngone51 commented on a change in pull request #28444: URL: https://github.com/apache/spark/pull/28444#discussion_r422750454

## File path: core/src/main/scala/org/apache/spark/status/AppStatusStore.scala

## @@ -35,8 +36,24 @@ private[spark] class AppStatusStore(
     val store: KVStore,
     val listener: Option[AppStatusListener] = None) {

+  /**
+   * This method contains an automatic retry logic and tries to get a valid [[v1.ApplicationInfo]].
+   * See [SPARK-31632] The ApplicationInfo in KVStore may be accessed before it's prepared
+   */
   def applicationInfo(): v1.ApplicationInfo = {

Review comment:
   Also sounds reasonable to me.
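The "automatic retry logic" mentioned in the new doc comment could be factored as a generic helper along these lines. This is an illustrative sketch, not the PR's implementation; the exception type, attempt count, and delay are assumptions:

```scala
import scala.annotation.tailrec
import scala.util.{Failure, Success, Try}

// Re-evaluates `body` until it succeeds or the attempts run out, pausing between tries.
@tailrec
def withRetries[T](attempts: Int, delayMs: Long)(body: => T): T = {
  Try(body) match {
    case Success(value) => value
    case Failure(_: NoSuchElementException) if attempts > 1 =>
      // The KVStore entry may not be written yet; wait briefly and look again.
      Thread.sleep(delayMs)
      withRetries(attempts - 1, delayMs)(body)
    case Failure(e) => throw e
  }
}
```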
[GitHub] [spark] Ngone51 commented on a change in pull request #28258: [SPARK-31486] [CORE] spark.submit.waitAppCompletion flag to control spark-submit exit in Standalone Cluster Mode
Ngone51 commented on a change in pull request #28258: URL: https://github.com/apache/spark/pull/28258#discussion_r422744560

## File path: core/src/main/scala/org/apache/spark/deploy/Client.scala

## @@ -176,6 +190,25 @@ private class ClientEndpoint(
     } else if (!Utils.responseFromBackup(message)) {
       System.exit(-1)
     }
+
+    case DriverStatusResponse(found, state, _, _, _) =>
+      if (found) {
+        state.get match {
+          case DriverState.FINISHED | DriverState.FAILED |
+              DriverState.ERROR | DriverState.KILLED =>
+            logInfo(s"State of $submittedDriverID is ${state.get}, " +
+              s"exiting spark-submit JVM.")
+            System.exit(0)
+          case _ =>
+            Thread.sleep(REPORT_DRIVER_STATUS_INTERVAL)
+            logInfo(s"State of $submittedDriverID is ${state.get}, " +

Review comment (on the first `logInfo`):
   nit: s"State of **driver** $submittedDriverID ..."

Review comment (on the second `logInfo`):
   nit: s"State of driver $submittedDriverID ..."

Review comment (on the `Thread.sleep`):
   `Thread.sleep` could still have the same issue: imagine the network drop happening during the sleep. We should move the periodic sending logic out of `receive`. We could mimic `CoarseGrainedSchedulerBackend` to do the same work here:
   https://github.com/apache/spark/blob/9faad07ce706890008a8a3ce675fa95b0bdf7c14/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala#L137-L139

Review comment (on the second `logInfo`):
   Since status polling will happen every second, I'm afraid the logs can be too verbose. We could log only after a constant number of polls, e.g. every 60th.

## File path: docs/spark-standalone.md

## @@ -374,6 +374,25 @@ To run an interactive Spark shell against the cluster, run the following command
 You can also pass an option `--total-executor-cores <numCores>` to control the number of cores that spark-shell uses on the cluster.

+# Client Properties
+
+Spark applications support the following configuration properties specific to standalone mode:
+
+  Property Name: spark.standalone.submit.waitAppCompletion
+  Default Value: false
+  Meaning: In standalone cluster mode, controls whether the client waits to exit until the
+           application completes. If set to true, the client process will stay alive polling
+           the application's status.

Review comment:
   nit: `application's` or `driver`?

## File path: core/src/main/scala/org/apache/spark/deploy/Client.scala

## @@ -61,6 +61,10 @@ private class ClientEndpoint(
   private val lostMasters = new HashSet[RpcAddress]
   private var activeMasterEndpoint: RpcEndpointRef = null
+  private val waitAppCompletion = conf.getBoolean("spark.standalone.submit.waitAppCompletion",

Review comment:
   We usually use `ConfigEntry` for a new conf. Could you add it too?
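For reference, a sketch of the `ConfigEntry` the reviewer asks for, built with Spark's internal `ConfigBuilder`; the object name, placement, and `version` string are assumptions, not the PR's final code:

```scala
package org.apache.spark.internal.config

private[spark] object Standalone {
  // Hypothetical typed entry replacing the raw conf.getBoolean call in Client.scala.
  val STANDALONE_SUBMIT_WAIT_APP_COMPLETION =
    ConfigBuilder("spark.standalone.submit.waitAppCompletion")
      .doc("In standalone cluster mode, whether the client waits to exit until the " +
        "application completes.")
      .version("3.1.0") // assumed target release
      .booleanConf
      .createWithDefault(false)
}
```

Client.scala would then read it as `conf.get(STANDALONE_SUBMIT_WAIT_APP_COMPLETION)`, gaining type safety and a single place to document the default.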
[GitHub] [spark] sarutak commented on a change in pull request #28448: [SPARK-31638][WEBUI]Clean Pagination code for all webUI pages
sarutak commented on a change in pull request #28448: URL: https://github.com/apache/spark/pull/28448#discussion_r422748282

## File path: core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala

## @@ -523,13 +517,11 @@ private[ui] class JobPagedTable(
     store,
     data,
     basePath,
-    currentTime,

Review comment:
   If we can remove `currentTime` here, can we also remove `currentTime` from `JobPagedTable`?

## File path: core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala

## @@ -573,17 +563,9 @@
-      {
-        if (tooltip.nonEmpty) {
-            {header} {Unparsed(arrow)}
-        } else {
-            {header} {Unparsed(arrow)}
-        }
-      }

Review comment:
   Nice catch. Tooltips with zero-length titles are never displayed.

## File path: core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala

## @@ -501,25 +499,21 @@ private[ui] class TaskPagedTable(
   override val dataSource: TaskDataSource = new TaskDataSource(
     stage,
-    currentTime,

Review comment:
   Similar to the comment for AllJobsPage, can we remove `currentTime` from `TaskPagedTable`?
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28413: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving&loading bug and expose hashFunc property in HashingTF
AmplabJenkins removed a comment on pull request #28413: URL: https://github.com/apache/spark/pull/28413#issuecomment-626438622 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #28413: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving&loading bug and expose hashFunc property in HashingTF
AmplabJenkins commented on pull request #28413: URL: https://github.com/apache/spark/pull/28413#issuecomment-626438622 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #28413: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving&loading bug and expose hashFunc property in HashingTF
SparkQA commented on pull request #28413: URL: https://github.com/apache/spark/pull/28413#issuecomment-626438326 **[Test build #122486 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122486/testReport)** for PR 28413 at commit [`63167fd`](https://github.com/apache/spark/commit/63167fd328c08564f2a2ab59f47f22fc33f09106). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] WeichenXu123 commented on pull request #28413: [SPARK-31610][ML] Expose hashFunc property in HashingTF
WeichenXu123 commented on pull request #28413: URL: https://github.com/apache/spark/pull/28413#issuecomment-626436050 @huaxingao I will integrate the change into my PR. :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] uncleGen edited a comment on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
uncleGen edited a comment on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626431760 Seems like an unrelated, flaky failure that will be fixed by #28391 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
AmplabJenkins removed a comment on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626432365 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
AmplabJenkins commented on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626432365 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
SparkQA commented on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626432135 **[Test build #122485 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122485/testReport)** for PR 28485 at commit [`df61b7c`](https://github.com/apache/spark/commit/df61b7c61dbdbd977bec76cf22a5f269b418035b). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] uncleGen commented on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
uncleGen commented on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626431951 Jenkins retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] uncleGen edited a comment on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
uncleGen edited a comment on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626431760 Seems like an unrelated failure that will be fixed by #28391 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] uncleGen commented on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
uncleGen commented on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626431760 Retest this please. Seems like an unrelated failure that will be fixed by #28391 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] maropu commented on pull request #28451: [SPARK-31636][SQL][DOCS] Remove HTML syntax in SQL reference
maropu commented on pull request #28451: URL: https://github.com/apache/spark/pull/28451#issuecomment-626421386 Have all the documentation tasks for 3.0 been done? https://issues.apache.org/jira/browse/SPARK-28588 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] maropu commented on pull request #28368: [SPARK-31575][SQL] Synchronise global JVM security configuration modification
maropu commented on pull request #28368: URL: https://github.com/apache/spark/pull/28368#issuecomment-626420310 This feature (Kerberos support for JDBC connections) is only for 3.1, so I think this issue doesn't happen in 3.0/2.4. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] huaxingao commented on pull request #28413: [SPARK-31610][ML] Expose hashFunc property in HashingTF
huaxingao commented on pull request #28413: URL: https://github.com/apache/spark/pull/28413#issuecomment-626408391 @WeichenXu123 Do you want to integrate the change into your PR? Or do you want me to submit a PR after yours is merged? I am OK either way. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28482: [DONOTMERGE][DEBUG] Debug the test issues in SPARK-20732
AmplabJenkins removed a comment on pull request #28482: URL: https://github.com/apache/spark/pull/28482#issuecomment-626407379 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/122483/ Test FAILed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28482: [DONOTMERGE][DEBUG] Debug the test issues in SPARK-20732
AmplabJenkins removed a comment on pull request #28482: URL: https://github.com/apache/spark/pull/28482#issuecomment-626407376 Merged build finished. Test FAILed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #28482: [DONOTMERGE][DEBUG] Debug the test issues in SPARK-20732
SparkQA removed a comment on pull request #28482: URL: https://github.com/apache/spark/pull/28482#issuecomment-626388490 **[Test build #122483 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122483/testReport)** for PR 28482 at commit [`fa6caf0`](https://github.com/apache/spark/commit/fa6caf0f1b529bbec7043fe197583aa3768b7e1b). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #28482: [DONOTMERGE][DEBUG] Debug the test issues in SPARK-20732
AmplabJenkins commented on pull request #28482: URL: https://github.com/apache/spark/pull/28482#issuecomment-626407376 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #28482: [DONOTMERGE][DEBUG] Debug the test issues in SPARK-20732
SparkQA commented on pull request #28482: URL: https://github.com/apache/spark/pull/28482#issuecomment-626407268 **[Test build #122483 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122483/testReport)** for PR 28482 at commit [`fa6caf0`](https://github.com/apache/spark/commit/fa6caf0f1b529bbec7043fe197583aa3768b7e1b). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds no public classes. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] huaxingao commented on pull request #28451: [SPARK-31636][SQL][DOCS] Remove HTML syntax in SQL reference
huaxingao commented on pull request #28451: URL: https://github.com/apache/spark/pull/28451#issuecomment-626406578 Thanks all! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28488: [SPARK-29083][CORE] Prefetch elements in rdd.toLocalIterator
AmplabJenkins removed a comment on pull request #28488: URL: https://github.com/apache/spark/pull/28488#issuecomment-626404988 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/122482/ Test FAILed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28488: [SPARK-29083][CORE] Prefetch elements in rdd.toLocalIterator
AmplabJenkins removed a comment on pull request #28488: URL: https://github.com/apache/spark/pull/28488#issuecomment-626404987 Merged build finished. Test FAILed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #28488: [SPARK-29083][CORE] Prefetch elements in rdd.toLocalIterator
SparkQA removed a comment on pull request #28488: URL: https://github.com/apache/spark/pull/28488#issuecomment-626386553 **[Test build #122482 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122482/testReport)** for PR 28488 at commit [`d0957cb`](https://github.com/apache/spark/commit/d0957cb1e207d806b9ea6333b0deb4866dcecbbb). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #28488: [SPARK-29083][CORE] Prefetch elements in rdd.toLocalIterator
AmplabJenkins commented on pull request #28488: URL: https://github.com/apache/spark/pull/28488#issuecomment-626404987 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #28488: [SPARK-29083][CORE] Prefetch elements in rdd.toLocalIterator
SparkQA commented on pull request #28488: URL: https://github.com/apache/spark/pull/28488#issuecomment-626404817 **[Test build #122482 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122482/testReport)** for PR 28488 at commit [`d0957cb`](https://github.com/apache/spark/commit/d0957cb1e207d806b9ea6333b0deb4866dcecbbb). * This patch **fails PySpark unit tests**. * This patch merges cleanly. * This patch adds no public classes. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28494: SPARK-31456 Fix shutdown hook priority edge cases
AmplabJenkins removed a comment on pull request #28494: URL: https://github.com/apache/spark/pull/28494#issuecomment-626404335 Can one of the admins verify this patch? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #28494: SPARK-31456 Fix shutdown hook priority edge cases
AmplabJenkins commented on pull request #28494: URL: https://github.com/apache/spark/pull/28494#issuecomment-626404498 Can one of the admins verify this patch? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #28494: SPARK-31456 Fix shutdown hook priority edge cases
AmplabJenkins commented on pull request #28494: URL: https://github.com/apache/spark/pull/28494#issuecomment-626404335 Can one of the admins verify this patch? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28493: [SPARK-31673][SQL] QueryExecution.debug.toFile() to take an additional explain mode param.
AmplabJenkins removed a comment on pull request #28493: URL: https://github.com/apache/spark/pull/28493#issuecomment-626403637 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] oleg-smith opened a new pull request #28494: SPARK-31456 Fix shutdown hook priority edge cases
oleg-smith opened a new pull request #28494: URL: https://github.com/apache/spark/pull/28494 ### What changes were proposed in this pull request? Fix the application order of shutdown hooks for the priorities Int.MaxValue and Int.MinValue ### Why are the changes needed? The bug causes out-of-order execution of shutdown hooks if their priorities are Int.MinValue or Int.MaxValue ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a test covering the change. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
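For context on the kind of edge case #28494 describes: a comparator implemented with integer subtraction overflows at the extreme priorities, which silently breaks the hook ordering. The following is a minimal, hedged sketch of that failure mode; the actual code and fix in the PR may differ.

```scala
// Int.MaxValue - Int.MinValue wraps around to -1 on JVM Ints, so a
// subtraction-based compareTo would report a MaxValue-priority hook as
// *smaller* than a MinValue-priority one, inverting the execution order.
case class Hook(priority: Int) extends Comparable[Hook] {
  // Buggy variant (do not use): priority - other.priority
  // Safe variant: an overflow-free comparison.
  override def compareTo(other: Hook): Int =
    java.lang.Integer.compare(priority, other.priority)
}

object HookOrderingDemo extends App {
  println(Int.MaxValue - Int.MinValue)                      // -1 (overflow)
  println(Hook(Int.MaxValue).compareTo(Hook(Int.MinValue))) // 1 (correct)
}
```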
[GitHub] [spark] AmplabJenkins commented on pull request #28493: [SPARK-31673][SQL] QueryExecution.debug.toFile() to take an additional explain mode param.
AmplabJenkins commented on pull request #28493: URL: https://github.com/apache/spark/pull/28493#issuecomment-626403637 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #28493: [SPARK-31673][SQL] QueryExecution.debug.toFile() to take an additional explain mode param.
SparkQA commented on pull request #28493: URL: https://github.com/apache/spark/pull/28493#issuecomment-626403485 **[Test build #122484 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122484/testReport)** for PR 28493 at commit [`62dbc83`](https://github.com/apache/spark/commit/62dbc8320641e791b06ba989b029e4d20d724407). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] dilipbiswal opened a new pull request #28493: [SPARK-31673][SQL] QueryExecution.debug.toFile() to take an additional explain mode param.
dilipbiswal opened a new pull request #28493: URL: https://github.com/apache/spark/pull/28493 ### What changes were proposed in this pull request? Currently QueryExecution.debug.toFile dumps the query plan information in a fixed format. This PR adds an additional explain mode parameter that writes the debug information as per the user supplied format. ``` df.queryExecution.debug.toFile("/tmp/plan.txt", explainMode = ExplainMode.fromString("formatted")) ``` ``` == Physical Plan == * Filter (2) +- Scan hive default.s1 (1) (1) Scan hive default.s1 Output [2]: [c1#15, c2#16] Arguments: [c1#15, c2#16], HiveTableRelation `default`.`s1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [c1#15, c2#16] (2) Filter [codegen id : 1] Input [2]: [c1#15, c2#16] Condition : (isnotnull(c1#15) AND (c1#15 > 0)) == Whole Stage Codegen == Found 1 WholeStageCodegen subtrees. == Subtree 1 / 1 (maxMethodCodeSize:220; maxConstantPoolSize:105(0.16% used); numInnerClasses:0) == *(1) Filter (isnotnull(c1#15) AND (c1#15 > 0)) +- Scan hive default.s1 [c1#15, c2#16], HiveTableRelation `default`.`s1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [c1#15, c2#16] Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage1(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=1 /* 006 */ final class GeneratedIteratorForCodegenStage1 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private scala.collection.Iterator inputadapter_input_0; /* 010 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] filter_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[1]; /* 011 */ /* 012 */ public GeneratedIteratorForCodegenStage1(Object[] references) { /* 013 */ this.references = references; /* 014 */ } /* 015 */ /* 016 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 017 */ partitionIndex = index; /* 018 */ this.inputs = inputs; /* 019 */ inputadapter_input_0 = inputs[0]; /* 020 */ filter_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 021 */ /* 022 */ } /* 023 */ /* 024 */ protected void processNext() throws java.io.IOException { /* 025 */ while ( inputadapter_input_0.hasNext()) { /* 026 */ InternalRow inputadapter_row_0 = (InternalRow) inputadapter_input_0.next(); /* 027 */ /* 028 */ do { /* 029 */ boolean inputadapter_isNull_0 = inputadapter_row_0.isNullAt(0); /* 030 */ int inputadapter_value_0 = inputadapter_isNull_0 ? /* 031 */ -1 : (inputadapter_row_0.getInt(0)); /* 032 */ /* 033 */ boolean filter_value_2 = !inputadapter_isNull_0; /* 034 */ if (!filter_value_2) continue; /* 035 */ /* 036 */ boolean filter_value_3 = false; /* 037 */ filter_value_3 = inputadapter_value_0 > 0; /* 038 */ if (!filter_value_3) continue; /* 039 */ /* 040 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1); /* 041 */ /* 042 */ boolean inputadapter_isNull_1 = inputadapter_row_0.isNullAt(1); /* 043 */ int inputadapter_value_1 = inputadapter_isNull_1 ? 
/* 044 */ -1 : (inputadapter_row_0.getInt(1)); /* 045 */ filter_mutableStateArray_0[0].reset(); /* 046 */ /* 047 */ filter_mutableStateArray_0[0].zeroOutNullBytes(); /* 048 */ /* 049 */ filter_mutableStateArray_0[0].write(0, inputadapter_value_0); /* 050 */ /* 051 */ if (inputadapter_isNull_1) { /* 052 */ filter_mutableStateArray_0[0].setNullAt(1); /* 053 */ } else { /* 054 */ filter_mutableStateArray_0[0].write(1, inputadapter_value_1); /* 055 */ } /* 056 */ append((filter_mutableStateArray_0[0].getRow())); /* 057 */ /* 058 */ } while(false); /* 059 */ if (shouldStop()) return; /* 060 */ } /* 061 */ } /* 062 */ /* 063 */ } ``` ### Why are the changes needed? Hopefully enhances the usability of debug.toFile(..) ### Does this PR introduce any user-facing change? No ### How was this patch tested? Added a test in QueryExecutionSuite This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --
[GitHub] [spark] srowen commented on a change in pull request #28488: [SPARK-29083][CORE] Prefetch elements in rdd.toLocalIterator
srowen commented on a change in pull request #28488: URL: https://github.com/apache/spark/pull/28488#discussion_r422711668 ## File path: core/src/main/scala/org/apache/spark/rdd/RDD.scala ## @@ -1031,20 +1033,24 @@ abstract class RDD[T: ClassTag]( Array.concat(results: _*) } + def toLocalIterator : Iterator[T] = toLocalIterator(false) + /** * Return an iterator that contains all of the elements in this RDD. * * The iterator will consume as much memory as the largest partition in this RDD. + * With prefetch it may consume up to the memory of the 2 largest partitions. + * + * @param prefetchPartitions If Spark should pre-fetch the next partition before it is needed. * * @note This results in multiple Spark jobs, and if the input RDD is the result * of a wide transformation (e.g. join with different partitioners), to avoid * recomputing the input RDD should be cached first. */ - def toLocalIterator: Iterator[T] = withScope { -def collectPartition(p: Int): Array[T] = { - sc.runJob(this, (iter: Iterator[T]) => iter.toArray, Seq(p)).head -} -partitions.indices.iterator.flatMap(i => collectPartition(i)) + def toLocalIterator(prefetchPartitions: Boolean = false): Iterator[T] = withScope { +val iterator = new PrefetchingIterator(this, prefetchPartitions) +if (prefetchPartitions) iterator.hasNext Review comment: Agreed, I'd push evaluation until later, until hasNext or next is called. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
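To make the lazy-evaluation point above concrete, here is a self-contained sketch of a prefetching iterator that submits no work until `hasNext`/`next` is first called. This is not the PR's code: `fetch` is a stand-in for the per-partition `sc.runJob` call, and all names are illustrative.

```scala
import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration.Duration

// Holds at most the current partition plus one in-flight partition,
// matching the "up to 2 largest partitions" memory note in the scaladoc.
class PrefetchingIterator[T](
    fetch: Int => Array[T],   // stand-in for collecting one partition
    numPartitions: Int,
    prefetch: Boolean) extends Iterator[T] {

  private var nextPartition = 0
  private var inFlight: Option[Future[Array[T]]] = None
  private var current: Iterator[T] = Iterator.empty

  private def startFetch(): Unit = if (nextPartition < numPartitions) {
    val p = nextPartition
    nextPartition += 1
    inFlight = Some(Future(fetch(p)))
  }

  override def hasNext: Boolean = {
    while (!current.hasNext && (inFlight.nonEmpty || nextPartition < numPartitions)) {
      if (inFlight.isEmpty) startFetch()   // first touch triggers the first job
      current = Await.result(inFlight.get, Duration.Inf).iterator
      inFlight = None
      if (prefetch) startFetch()           // overlap fetching the next partition
    }
    current.hasNext
  }

  override def next(): T = {
    if (!hasNext) throw new NoSuchElementException("end of iterator")
    current.next()
  }
}
```

Constructing the iterator performs no work; the first fetch happens only once the caller asks for an element, which also sidesteps the eager `iterator.hasNext` call discussed above.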
[GitHub] [spark] felixcheung commented on pull request #28386: [SPARK-26199][SPARK-31517][R] fix strategy for handling ... names in mutate
felixcheung commented on pull request #28386: URL: https://github.com/apache/spark/pull/28386#issuecomment-626400503 Is this a problem with only the new R 4.0.0 release? Maybe Spark doesn’t support it for now, if there isn’t a clean (or software license compatible) way to do this? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
AmplabJenkins removed a comment on pull request #28481: URL: https://github.com/apache/spark/pull/28481#issuecomment-626400072 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/122481/ Test FAILed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
SparkQA removed a comment on pull request #28481: URL: https://github.com/apache/spark/pull/28481#issuecomment-626379984 **[Test build #122481 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122481/testReport)** for PR 28481 at commit [`a78e18b`](https://github.com/apache/spark/commit/a78e18b2631ed29703e4791cc850d62e3c58faaa). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
AmplabJenkins removed a comment on pull request #28481: URL: https://github.com/apache/spark/pull/28481#issuecomment-626400070 Merged build finished. Test FAILed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
AmplabJenkins commented on pull request #28481: URL: https://github.com/apache/spark/pull/28481#issuecomment-626400070 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #28481: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
SparkQA commented on pull request #28481: URL: https://github.com/apache/spark/pull/28481#issuecomment-626400020 **[Test build #122481 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122481/testReport)** for PR 28481 at commit [`a78e18b`](https://github.com/apache/spark/commit/a78e18b2631ed29703e4791cc850d62e3c58faaa). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds no public classes. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] koertkuipers commented on a change in pull request #27937: [SPARK-30127][SQL] Support case class parameter for typed Scala UDF
koertkuipers commented on a change in pull request #27937: URL: https://github.com/apache/spark/pull/27937#discussion_r422699979 ## File path: sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala ## @@ -93,7 +93,7 @@ sealed abstract class UserDefinedFunction { private[spark] case class SparkUserDefinedFunction( f: AnyRef, dataType: DataType, -inputSchemas: Seq[Option[ScalaReflection.Schema]], Review comment: in my testing i also found the api now to be somewhat inconsistent/confusing. basically sometimes `CatalystTypeConverters.createToScalaConverter` is used and sometimes `ExpressionEncoder.fromRow`, depending solely on whether the argument is a top-level struct or not. but `CatalystTypeConverters.createToScalaConverter` and `ExpressionEncoder.fromRow` behave very differently, leading to inconsistent application. for example this (contrived) usage works: ``` case class Person(name: String, age: Option[Int]) Seq((1, Person("john", Some(55))), (2, Person("mary", None))).toDF("id", "person").withColumn("age", udf{ p: Person => p.age }.apply(col("person"))) ``` but this does not: ``` Seq((1, Seq(Person("john", Some(55)))), (2, Seq(Person("mary", None)))).toDF("id", "persons").withColumn("ages", udf{ s: Seq[Person] => s.map(_.age) }.apply(col("persons"))) ``` and while Option works nicely in a Person case class (and also in tuples), Option does not work in a simple Seq: ``` Seq(Seq(Some(1), None)).toDF.withColumn("value", udf{ s: Seq[Option[Int]] => s.map(_.map(_ + 1)) }.apply(col("value"))) ``` and Option also does not work for a function argument: ``` Seq(None, Some(1), None).toDF.withColumn("value", udf{ o: Option[Int] => o.map(_ + 1) }.apply(col("value"))) ``` this inconsistency will be hard to understand. and this inconsistency is not limited to Options. it also applies to many other things. for example tuples inside maps will not work (you still have to use Row there), etc. finally let me give some background on why i am a little nervous about this change... spark udfs have been somewhat limited for a long time. no support for case classes, tuples, or options. so many libraries have worked around that by defining their own udfs on top of SparkUserDefinedFunction. we do this inhouse too. it is easy to do this with type classes thanks to the composability of inputSchemas. so now you replaced inputSchemas with inputEncoders. but ExpressionEncoder and TypeTags are not composable. i do not see a way for us to build on top of this for our own inhouse udfs. so then the alternative for us is to abandon our inhouse udfs and start using spark's udfs again, which now support case classes and tuples, which is nice! but the inconsistency of the api and the lack of support for Option make that currently not viable to me. i realize this is a spark internal api and this is entirely my own problem. but i thought it was worth pointing out because i suspect i am not the only one who has done this. i think this is one of the more typical workarounds people have done using spark (and i am aware of multiple implementations of this workaround). sorry for the long post(s) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
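To illustrate the composability point in the comment above, here is a hypothetical type-class sketch; the names are illustrative, not Spark APIs. An instance for a nested type such as `Seq[Option[Int]]` is assembled mechanically from the instances for its parts, which is exactly what a flat `TypeTag`/`ExpressionEncoder`-based signature cannot offer to library authors:

```scala
import org.apache.spark.sql.types._

// Hypothetical schema-derivation type class (not a Spark API).
trait SchemaFor[A] {
  def dataType: DataType
  def nullable: Boolean = false
}

object SchemaFor {
  implicit val intSchema: SchemaFor[Int] = new SchemaFor[Int] {
    val dataType: DataType = IntegerType
  }

  // Composition: instances for Option[A] and Seq[A] are derived from the
  // instance for A, so arbitrarily nested types resolve automatically.
  implicit def optionSchema[A](implicit s: SchemaFor[A]): SchemaFor[Option[A]] =
    new SchemaFor[Option[A]] {
      val dataType: DataType = s.dataType
      override def nullable: Boolean = true
    }

  implicit def seqSchema[A](implicit s: SchemaFor[A]): SchemaFor[Seq[A]] =
    new SchemaFor[Seq[A]] {
      val dataType: DataType = ArrayType(s.dataType, containsNull = s.nullable)
    }
}

// e.g. in a REPL:
//   implicitly[SchemaFor[Seq[Option[Int]]]].dataType
//   == ArrayType(IntegerType, containsNull = true)
```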
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
AmplabJenkins removed a comment on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626394270 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/122479/ Test FAILed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
AmplabJenkins commented on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626394264 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
AmplabJenkins removed a comment on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626394264 Merged build finished. Test FAILed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA removed a comment on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
SparkQA removed a comment on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626366090 **[Test build #122479 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122479/testReport)** for PR 28485 at commit [`df61b7c`](https://github.com/apache/spark/commit/df61b7c61dbdbd977bec76cf22a5f269b418035b). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #28485: [SPARK-31642] Add Pagination Support for Structured Streaming Page
SparkQA commented on pull request #28485: URL: https://github.com/apache/spark/pull/28485#issuecomment-626394163 **[Test build #122479 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122479/testReport)** for PR 28485 at commit [`df61b7c`](https://github.com/apache/spark/commit/df61b7c61dbdbd977bec76cf22a5f269b418035b). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * `class StreamingQueryPagedTable(` * `case class StructuredStreamingRow(` * `class StreamingQueryDataSource(uiData: Seq[StreamingQueryUIData], sortColumn: String, desc: Boolean,` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] koertkuipers commented on a change in pull request #27937: [SPARK-30127][SQL] Support case class parameter for typed Scala UDF
koertkuipers commented on a change in pull request #27937: URL: https://github.com/apache/spark/pull/27937#discussion_r422699979 ## File path: sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala ## @@ -93,7 +93,7 @@ sealed abstract class UserDefinedFunction { private[spark] case class SparkUserDefinedFunction( f: AnyRef, dataType: DataType, -inputSchemas: Seq[Option[ScalaReflection.Schema]], Review comment: in my testing i also found the api now to be somewhat inconsistent/confusing. basically sometimes `CatalystTypeConverters.createToScalaConverter` is used and sometimes `ExpressionEncoder.fromRow`, depending solely on whether the argument is a top-level struct or not. but `CatalystTypeConverters.createToScalaConverter` and `ExpressionEncoder.fromRow` behave very differently, leading to inconsistent application. for example this (contrived) usage works: ``` case class Person(name: String, age: Option[Int]) Seq((1, Person("john", Some(55))), (2, Person("mary", None))).toDF("id", "person").withColumn("age", udf{ p: Person => p.age }.apply(col("person"))) ``` but this does not: ``` Seq((1, Seq(Person("john", Some(55)))), (2, Seq(Person("mary", None)))).toDF("id", "persons").withColumn("ages", udf{ s: Seq[Person] => s.map(_.age) }.apply(col("persons"))) ``` and while Option works nicely in a Person case class (and also in tuples), Option does not work in a simple Seq: ``` Seq(Seq(Some(1), None)).toDF.withColumn("value", udf{ s: Seq[Option[Int]] => s.map(_.map(_ + 1)) }.apply(col("value"))) ``` this inconsistency will be hard to understand. finally let me give some background on why i am a little nervous about this change... spark udfs have been somewhat limited for a long time. no support for case classes, tuples, or options. so many libraries have worked around that by defining their own udfs on top of SparkUserDefinedFunction. we do this inhouse too. it is easy to do this with type classes thanks to the composability of inputSchemas. so now you replaced inputSchemas with inputEncoders. but ExpressionEncoder and TypeTags are not composable. i do not see a way for us to build on top of this for our own inhouse udfs. so then the alternative for us is to abandon our inhouse udfs and start using spark's udfs again, which now support case classes and tuples, which is nice! but the inconsistency of the api and the lack of support for Option make that currently not viable to me. i realize this is a spark internal api and this is entirely my own problem. but i thought it was worth pointing out because i suspect i am not the only one who has done this. i think this is one of the more typical workarounds people have done using spark (and i am aware of multiple implementations of this workaround). sorry for the long post(s) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins removed a comment on pull request #28482: [DONOTMERGE][DEBUG] Debug the test issues in SPARK-20732
AmplabJenkins removed a comment on pull request #28482: URL: https://github.com/apache/spark/pull/28482#issuecomment-626388652 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #28482: [DONOTMERGE][DEBUG] Debug the test issues in SPARK-20732
AmplabJenkins commented on pull request #28482: URL: https://github.com/apache/spark/pull/28482#issuecomment-626388652 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #28482: [DONOTMERGE][DEBUG] Debug the test issues in SPARK-20732
SparkQA commented on pull request #28482: URL: https://github.com/apache/spark/pull/28482#issuecomment-626388490 **[Test build #122483 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/122483/testReport)** for PR 28482 at commit [`fa6caf0`](https://github.com/apache/spark/commit/fa6caf0f1b529bbec7043fe197583aa3768b7e1b). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] oleg-smith commented on a change in pull request #28488: [SPARK-29083][CORE] Prefetch elements in rdd.toLocalIterator
oleg-smith commented on a change in pull request #28488: URL: https://github.com/apache/spark/pull/28488#discussion_r422698223 ## File path: core/src/main/scala/org/apache/spark/rdd/RDD.scala ## @@ -1031,20 +1033,24 @@ abstract class RDD[T: ClassTag]( Array.concat(results: _*) } + def toLocalIterator : Iterator[T] = toLocalIterator(false) + /** * Return an iterator that contains all of the elements in this RDD. * * The iterator will consume as much memory as the largest partition in this RDD. + * With prefetch it may consume up to the memory of the 2 largest partitions. + * + * @param prefetchPartitions If Spark should pre-fetch the next partition before it is needed. * * @note This results in multiple Spark jobs, and if the input RDD is the result * of a wide transformation (e.g. join with different partitioners), to avoid * recomputing the input RDD should be cached first. */ - def toLocalIterator: Iterator[T] = withScope { -def collectPartition(p: Int): Array[T] = { - sc.runJob(this, (iter: Iterator[T]) => iter.toArray, Seq(p)).head -} -partitions.indices.iterator.flatMap(i => collectPartition(i)) + def toLocalIterator(prefetchPartitions: Boolean = false): Iterator[T] = withScope { +val iterator = new PrefetchingIterator(this, prefetchPartitions) +if (prefetchPartitions) iterator.hasNext Review comment: @holdenk @srowen Should we not prefetch the very first element here? For a user, it might be better to get the iterator without side effects first, and to submit jobs or receive errors only when iteration is intentionally started. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org