[GitHub] spark pull request #19327: [WIP] Implement stream-stream outer joins.
Github user joseph-torres commented on a diff in the pull request: https://github.com/apache/spark/pull/19327#discussion_r140968754 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala --- @@ -87,70 +87,157 @@ class SymmetricHashJoinStateManager( } /** - * Remove using a predicate on keys. See class docs for more context and implement details. + * Remove using a predicate on keys. + * + * This produces an iterator over the (key, value) pairs satisfying condition(key), where the + * underlying store is updated as a side-effect of producing next. + * + * This implies the iterator must be consumed fully without any other operations on this manager + * or the underlying store being interleaved. */ - def removeByKeyCondition(condition: UnsafeRow => Boolean): Unit = { -val allKeyToNumValues = keyToNumValues.iterator - -while (allKeyToNumValues.hasNext) { - val keyToNumValue = allKeyToNumValues.next - if (condition(keyToNumValue.key)) { -keyToNumValues.remove(keyToNumValue.key) -keyWithIndexToValue.removeAllValues(keyToNumValue.key, keyToNumValue.numValue) + def removeByKeyCondition(condition: UnsafeRow => Boolean): Iterator[UnsafeRowPair] = { +new NextIterator[UnsafeRowPair] { + + private val allKeyToNumValues = keyToNumValues.iterator + + private var currentKeyToNumValue: KeyAndNumValues = null + private var currentValues: Iterator[KeyWithIndexAndValue] = null + + private def currentKey = currentKeyToNumValue.key + + private val reusedPair = new UnsafeRowPair() + + private def getAndRemoveValue() = { +val keyWithIndexAndValue = currentValues.next() +keyWithIndexToValue.remove(currentKey, keyWithIndexAndValue.valueIndex) +reusedPair.withRows(currentKey, keyWithIndexAndValue.value) + } + + override def getNext(): UnsafeRowPair = { +if (currentValues != null && currentValues.hasNext) { + return getAndRemoveValue() +} else { + while (allKeyToNumValues.hasNext) { +currentKeyToNumValue = allKeyToNumValues.next() 
+if (condition(currentKey)) { + currentValues = keyWithIndexToValue.getAll( +currentKey, currentKeyToNumValue.numValue) + keyToNumValues.remove(currentKey) + + if (currentValues.hasNext) { +return getAndRemoveValue() + } +} + } +} + +finished = true +null } + + override def close: Unit = {} } } /** - * Remove using a predicate on values. See class docs for more context and implementation details. + * Remove using a predicate on values. + * + * At a high level, this produces an iterator over the (key, value) pairs such that value + * satisfies the predicate, where producing an element removes the value from the state store + * and producing all elements with a given key updates it accordingly. + * + * This implies the iterator must be consumed fully without any other operations on this manager + * or the underlying store being interleaved. */ - def removeByValueCondition(condition: UnsafeRow => Boolean): Unit = { -val allKeyToNumValues = keyToNumValues.iterator + def removeByValueCondition(condition: UnsafeRow => Boolean): Iterator[UnsafeRowPair] = { +new NextIterator[UnsafeRowPair] { -while (allKeyToNumValues.hasNext) { - val keyToNumValue = allKeyToNumValues.next - val key = keyToNumValue.key + // Reuse this object to avoid creation+GC overhead. + private val reusedPair = new UnsafeRowPair() - var numValues: Long = keyToNumValue.numValue - var index: Long = 0L - var valueRemoved: Boolean = false - var valueForIndex: UnsafeRow = null + private val allKeyToNumValues = keyToNumValues.iterator - while (index < numValues) { -if (valueForIndex == null) { - valueForIndex = keyWithIndexToValue.get(key, index) + private var currentKey: UnsafeRow = null + private var numValues: Long = 0L + private var index: Long = 0L + private var valueRemoved: Boolean = false + + // Push the data for the current key to the numValues store, and reset the tracking variables + // to their empty state. 
+ private def storeCurrentKey(): Unit = { +if (valueRemoved) { + if (numValues >= 1) { +keyToNumValues.put(currentKey, nu
[GitHub] spark issue #19181: [SPARK-21907][CORE] oom during spill
Github user eyalfa commented on the issue: https://github.com/apache/spark/pull/19181 @hvanhovell ? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18747: [WIP][SPARK-20822][SQL] Generate code to directly get va...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/18747 **[Test build #82182 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82182/testReport)** for PR 18747 at commit [`2508593`](https://github.com/apache/spark/commit/25085934b443d5d47fd02fd6d84dae11eaa5bf8b). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19349: [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream format f...
Github user ueshin commented on the issue: https://github.com/apache/spark/pull/19349 The performance test I did in my local based on @BryanCutler's (https://github.com/apache/spark/pull/18659#issuecomment-315879173) is as follows: ```python from pyspark.sql.functions import * from pyspark.sql.types import * @udf(DoubleType()) def my_udf(p1, p2): from math import log, exp return exp(log(p1) + log(p2) - log(0.5)) @pandas_udf(DoubleType()) def my_pandas_udf(p1, p2): from numpy import log, exp return exp(log(p1) + log(p2) - log(0.5)) df = spark.range(1 << 24, numPartitions=16).toDF("id") \ .withColumn("p1", rand()).withColumn("p2", rand()) df_udf = df.withColumn("p", my_udf(col("p1"), col("p2"))) df_pandas_udf = df.withColumn("p", my_pandas_udf(col("p1"), col("p2"))) ``` ``` %timeit -n2 df_udf.select(sum(col('p'))).collect() 12.2 s ± 456 ms per loop (mean ± std. dev. of 7 runs, 2 loops each) ``` ``` spark.conf.set("spark.sql.execution.arrow.stream.enable", "false") %timeit -n2 df_pandas_udf.select(sum(col('p'))).collect() 1.91 s ± 195 ms per loop (mean ± std. dev. of 7 runs, 2 loops each) ``` ``` spark.conf.set("spark.sql.execution.arrow.stream.enable", "true") %timeit -n2 df_pandas_udf.select(sum(col('p'))).collect() 1.67 s ± 223 ms per loop (mean ± std. dev. of 7 runs, 2 loops each) ``` Environment: - Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz - Java HotSpot(TM) 64-Bit Server VM 1.8.0_144-b01 on Mac OS X 10.12.6 - Python 3.6.1 64bit [GCC 4.2.1 Compatible Apple LLVM 6.1.0 (clang-602.0.53)] - pandas 0.20.1 - pyarrow 0.4.1 --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19343: [SPARK-22121][CORE] Correct database location for nameno...
Github user gatorsmile commented on the issue: https://github.com/apache/spark/pull/19343 Spark SQL might not be deployed in the HDFS system. Conceptually, this HDFS-specific codes should not be part of our `HiveExternalCatalog` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19346: [SPARK-20785][WEB-UI][SQL] Spark should provide jump lin...
Github user jerryshao commented on the issue: https://github.com/apache/spark/pull/19346 LGTM. @gatorsmile , would you please take a look at this PR, is it good for you? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19175: [SPARK-21964][SQL]Enable splitting the Aggregate (on Exp...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19175 **[Test build #82181 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82181/testReport)** for PR 19175 at commit [`c87edeb`](https://github.com/apache/spark/commit/c87edeb74603846c7e7f0b4c36999a0bbf06be31). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19343: [SPARK-22121][CORE] Correct database location for nameno...
Github user gatorsmile commented on the issue: https://github.com/apache/spark/pull/19343 This is not `[CORE]`. The title should be updated to `[SQL]` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19346: [SPARK-20785][WEB-UI][SQL] Spark should provide jump lin...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19346 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19346: [SPARK-20785][WEB-UI][SQL] Spark should provide jump lin...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19346 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82171/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19346: [SPARK-20785][WEB-UI][SQL] Spark should provide jump lin...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19346 **[Test build #82171 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82171/testReport)** for PR 19346 at commit [`305689f`](https://github.com/apache/spark/commit/305689f1da70e86838796564cc4c53d80f96a1f2). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19349: [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream format f...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19349 **[Test build #82180 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82180/testReport)** for PR 19349 at commit [`14aa3b6`](https://github.com/apache/spark/commit/14aa3b641fd0c7f3a6feb6869508703b113b6ce6). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19338: [SPARK-22123][CORE] Add latest failure reason for task s...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19338 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82169/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19338: [SPARK-22123][CORE] Add latest failure reason for task s...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19338 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19338: [SPARK-22123][CORE] Add latest failure reason for task s...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19338 **[Test build #82169 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82169/testReport)** for PR 19338 at commit [`57190ef`](https://github.com/apache/spark/commit/57190ef25fc255755b0d50b8f5de402592f153d1). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19186: [SPARK-21972][ML] Add param handlePersistence
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19186 **[Test build #82179 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82179/testReport)** for PR 19186 at commit [`18f9903`](https://github.com/apache/spark/commit/18f9903707d029d40c5eb03dc8e856a6607ac723). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19175: [SPARK-21964][SQL]Enable splitting the Aggregate (on Exp...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19175 Merged build finished. Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19175: [SPARK-21964][SQL]Enable splitting the Aggregate (on Exp...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19175 **[Test build #82178 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82178/testReport)** for PR 19175 at commit [`44ecbca`](https://github.com/apache/spark/commit/44ecbcab68404d04ef9eb69262ed3421a8b8e920). * This patch **fails to build**. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19175: [SPARK-21964][SQL]Enable splitting the Aggregate (on Exp...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19175 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82178/ Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19345: [SPARK-22124][SQL] Sample and Limit should also defer in...
Github user viirya commented on the issue: https://github.com/apache/spark/pull/19345 cc @cloud-fan or @hvanhovell This should be a straightforward change. Please take a quick look. Thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19349: [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream format f...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19349 Merged build finished. Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19349: [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream format f...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19349 **[Test build #82175 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82175/testReport)** for PR 19349 at commit [`e62d619`](https://github.com/apache/spark/commit/e62d619e13f63af5af2f386c0d7ab554ad3c6336). * This patch **fails MiMa tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * `case class ExtractPythonUDFs(conf: SQLConf) extends Rule[SparkPlan] with PredicateHelper ` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19349: [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream format f...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19349 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82175/ Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19175: [SPARK-21964][SQL]Enable splitting the Aggregate (on Exp...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19175 **[Test build #82178 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82178/testReport)** for PR 19175 at commit [`44ecbca`](https://github.com/apache/spark/commit/44ecbcab68404d04ef9eb69262ed3421a8b8e920). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19175: [SPARK-21964][SQL]Enable splitting the Aggregate (on Exp...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19175 Merged build finished. Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19175: [SPARK-21964][SQL]Enable splitting the Aggregate (on Exp...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19175 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82177/ Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19175: [SPARK-21964][SQL]Enable splitting the Aggregate (on Exp...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19175 **[Test build #82177 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82177/testReport)** for PR 19175 at commit [`9448371`](https://github.com/apache/spark/commit/94483714e3008848d53fcf240bf4b07419ee7e77). * This patch **fails Scala style tests**. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19175: [SPARK-21964][SQL]Enable splitting the Aggregate (on Exp...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19175 **[Test build #82177 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82177/testReport)** for PR 19175 at commit [`9448371`](https://github.com/apache/spark/commit/94483714e3008848d53fcf240bf4b07419ee7e77). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19349: [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream format f...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19349 **[Test build #82175 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82175/testReport)** for PR 19349 at commit [`e62d619`](https://github.com/apache/spark/commit/e62d619e13f63af5af2f386c0d7ab554ad3c6336). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19287: [SPARK-22074][Core] Task killed by other attempt task sh...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19287 **[Test build #82176 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82176/testReport)** for PR 19287 at commit [`039591d`](https://github.com/apache/spark/commit/039591d4fb2cbe3292b4f0ce33ba605bed895453). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19349: [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream format f...
Github user ueshin commented on the issue: https://github.com/apache/spark/pull/19349 cc @BryanCutler @HyukjinKwon @viirya @cloud-fan --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19349: [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream f...
GitHub user ueshin opened a pull request: https://github.com/apache/spark/pull/19349 [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream format for vectorized UDF. ## What changes were proposed in this pull request? Currently we use Arrow File format to communicate with Python worker when invoking vectorized UDF but we can use Arrow Stream format. This pr adds a config `"spark.sql.execution.arrow.stream.enable"` to enable Arrow Stream format. ## How was this patch tested? Existing tests, and tests for vectorized UDF with the stream format enabled. You can merge this pull request into a Git repository by running: $ git pull https://github.com/ueshin/apache-spark issues/SPARK-22125 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19349.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19349 commit 3c45c5c132f91f32878dd52245a1beb55eca05e7 Author: Takuya UESHIN Date: 2017-09-21T05:33:01Z Extract PythonRunner from PythonRDD.scala file. commit 1cd832cb796bdb7e330e56a953274ed577dc8876 Author: Takuya UESHIN Date: 2017-09-21T06:33:43Z Extract writer thread. commit 919811d9ffacb8218acf7148b2f0918b255c4f3a Author: Takuya UESHIN Date: 2017-09-21T07:23:55Z Extract reader iterator. commit b2fed104ee00f5bf8235e21b01f89c98ec9400fc Author: Takuya UESHIN Date: 2017-09-21T09:00:42Z Introduce ArrowStreamPythonUDFRunner. commit 937292d0a2a2145be3dbc6314cf0da1b41e71b6e Author: Takuya UESHIN Date: 2017-09-22T11:03:07Z Add ArrowStreamPandasSerializer. commit 80167219abf98b8c019df3582a8c2b3ec6697753 Author: Takuya UESHIN Date: 2017-09-22T11:14:08Z Introduce ArrowStreamEvalPythonExec. commit e62d619e13f63af5af2f386c0d7ab554ad3c6336 Author: Takuya UESHIN Date: 2017-09-22T11:36:11Z Enable vectorized UDF via Arrow stream protocol. 
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19342: [MINOR][SparkR] minor fixes for CRAN compliance
Github user felixcheung commented on a diff in the pull request: https://github.com/apache/spark/pull/19342#discussion_r140958369 --- Diff: R/pkg/DESCRIPTION --- @@ -59,3 +59,4 @@ Collate: 'window.R' RoxygenNote: 5.0.1 VignetteBuilder: knitr +SystemRequirements: Apache Spark --- End diff -- And btw, the issue with vigenettes is already fixed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19342: [MINOR][SparkR] minor fixes for CRAN compliance
Github user felixcheung commented on a diff in the pull request: https://github.com/apache/spark/pull/19342#discussion_r140958315 --- Diff: R/pkg/DESCRIPTION --- @@ -59,3 +59,4 @@ Collate: 'window.R' RoxygenNote: 5.0.1 VignetteBuilder: knitr +SystemRequirements: Apache Spark --- End diff -- and FYI, this was the relevant part from the email ``` Apparently you expect some installed Hadoop or Spark software for running the vignettes? But there is no SystemRequirements field? ``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19342: [MINOR][SparkR] minor fixes for CRAN compliance
Github user felixcheung commented on a diff in the pull request: https://github.com/apache/spark/pull/19342#discussion_r140957541 --- Diff: R/pkg/DESCRIPTION --- @@ -59,3 +59,4 @@ Collate: 'window.R' RoxygenNote: 5.0.1 VignetteBuilder: knitr +SystemRequirements: Apache Spark --- End diff -- I'm not sure we should add this - my experience with SystemRequirements is that there is a fixed list of things that it will understand, and I'm reasonably sure "Apache Spark" is not one of them, and likely we cause build or submission failure by having it. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19342: [MINOR][SparkR] minor fixes for CRAN compliance
Github user felixcheung commented on a diff in the pull request: https://github.com/apache/spark/pull/19342#discussion_r140957870 --- Diff: R/pkg/R/DataFrame.R --- @@ -3250,6 +3250,7 @@ setMethod("attach", function(what, pos = 2, name = deparse(substitute(what)), warn.conflicts = TRUE) { newEnv <- assignNewEnv(what) attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) +on.exit(detach(newEnv, pos = pos, name = name)) --- End diff -- The reported point is a NOTE, as per my understanding, we should be able to release with a NOTE (in fact, there is another NOTE on an unknown maintainer...) Also, I'm not sure how this addition would address the NOTE? From what I can see, the NOTE says we should be careful about having an `attach` method... for which I see the only way to address it or to avoid the NOTE is... to remove the `attach` method? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19330: [SPARK-18134][SQL] Orderable MapType
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19330 **[Test build #82174 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82174/testReport)** for PR 19330 at commit [`0385487`](https://github.com/apache/spark/commit/0385487bbe713f14e523d8af68a36d4e5b83c690). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19345: [SPARK-22124][SQL] Sample and Limit should also defer in...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19345 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19345: [SPARK-22124][SQL] Sample and Limit should also defer in...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19345 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82168/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19345: [SPARK-22124][SQL] Sample and Limit should also defer in...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19345 **[Test build #82168 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82168/testReport)** for PR 19345 at commit [`0252663`](https://github.com/apache/spark/commit/0252663f998edd1deb75e99b9a1d7ab9ec806a29). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19330: Orderable MapType
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19330 **[Test build #82173 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82173/testReport)** for PR 19330 at commit [`33e532b`](https://github.com/apache/spark/commit/33e532b2d548bd482e0fe4ca645ad4261e900011). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19343: [SPARK-22121][CORE] Correct database location for nameno...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19343 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19343: [SPARK-22121][CORE] Correct database location for nameno...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19343 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82170/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19343: [SPARK-22121][CORE] Correct database location for nameno...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19343 **[Test build #82170 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82170/testReport)** for PR 19343 at commit [`c6acdf2`](https://github.com/apache/spark/commit/c6acdf213e1465384b11232ca3d963d819414dce). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19348: [BUILD] Close stale PRs
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19348 **[Test build #82172 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82172/testReport)** for PR 19348 at commit [`a71bb48`](https://github.com/apache/spark/commit/a71bb48243b28cbf97e144b94bc48881d2b8792e). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19344: [SPARK-22122][SQL] Respect WITH clauses to count input r...
Github user maropu commented on the issue: https://github.com/apache/spark/pull/19344 @gatorsmile if you get time, please check this. thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19348: [BUILD] Close stale PRs
GitHub user HyukjinKwon opened a pull request: https://github.com/apache/spark/pull/19348 [BUILD] Close stale PRs Closes #13794 Closes #18474 Closes #18897 Closes #18978 Closes #19152 Closes #19238 Closes #19295 Closes #19334 Closes #19335 Closes #19347 You can merge this pull request into a Git repository by running: $ git pull https://github.com/HyukjinKwon/spark stale-prs Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19348.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19348 commit a71bb48243b28cbf97e144b94bc48881d2b8792e Author: hyukjinkwon Date: 2017-09-26T04:12:06Z Close stale PRs --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18887: [SPARK-20642][core] Store FsHistoryProvider listing data...
Github user gatorsmile commented on the issue: https://github.com/apache/spark/pull/18887 LGTM --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18887: [SPARK-20642][core] Store FsHistoryProvider listing data...
Github user gatorsmile commented on the issue: https://github.com/apache/spark/pull/18887 Thanks for your work! --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19336: [SPARK-21947][SS] Check and report error when monotonica...
Github user viirya commented on the issue: https://github.com/apache/spark/pull/19336 cc @zsxwing for review. Thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18817: [SPARK-21612] Allow unicode strings in __getitem__ of St...
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/18817 ping @rik-coenders --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19344: [SPARK-22122][SQL] Respect WITH clauses to count input r...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19344 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19344: [SPARK-22122][SQL] Respect WITH clauses to count input r...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19344 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82167/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19344: [SPARK-22122][SQL] Respect WITH clauses to count input r...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19344 **[Test build #82167 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82167/testReport)** for PR 19344 at commit [`d9be37e`](https://github.com/apache/spark/commit/d9be37e9e3168d4adb340bb82a48006863c11636). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19347: Branch 2.2 sparkmlib's output of many algorithms is not ...
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/19347 @ithjz, If you'd like to ask a question, please ask this to the mailing list (see https://spark.apache.org/community.html). Could you close this please? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19347: Branch 2.2 sparkmlib's output of many algorithms is not ...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19347 Can one of the admins verify this patch? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19347: Branch 2.2 sparkmlib's output of many algorithms ...
GitHub user ithjz opened a pull request: https://github.com/apache/spark/pull/19347 Branch 2.2 sparkmlib'soutput of many algorithms is not clear What's the use of these **results?** JavaGradientBoostingRegressionExample Test Mean Squared Error: 0.12503 Learned regression GBT model: TreeEnsembleModel regressor with 3 trees Tree 0: If (feature 351 <= 15.0) Predict: 0.0 Else (feature 351 > 15.0) Predict: 1.0 Tree 1: Predict: 0.0 Tree 2: Predict: 0.0 You can merge this pull request into a Git repository by running: $ git pull https://github.com/apache/spark branch-2.2 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19347.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19347 commit e936a96badfeeb2051ee35dc4b0fbecefa9bf4cb Author: Peng Date: 2017-05-24T11:54:17Z [SPARK-20764][ML][PYSPARK][FOLLOWUP] Fix visibility discrepancy with numInstances and degreesOfFreedom in LR and GLR - Python version ## What changes were proposed in this pull request? Add test cases for PR-18062 ## How was this patch tested? The existing UT Author: Peng Closes #18068 from mpjlu/moreTest. (cherry picked from commit 9afcf127d31b5477a539dde6e5f01861532a1c4c) Signed-off-by: Yanbo Liang commit 1d107242f8ec842c009e0b427f6e4a8313d99aa2 Author: zero323 Date: 2017-05-24T11:57:44Z [SPARK-20631][FOLLOW-UP] Fix incorrect tests. ## What changes were proposed in this pull request? - Fix incorrect tests for `_check_thresholds`. - Move test to `ParamTests`. ## How was this patch tested? Unit tests. Author: zero323 Closes #18085 from zero323/SPARK-20631-FOLLOW-UP. 
(cherry picked from commit 1816eb3bef930407dc9e083de08f5105725c55d1) Signed-off-by: Yanbo Liang commit 83aeac9e0590e99010d0af8e067822d0ed0971fe Author: Bago Amirbekian Date: 2017-05-24T14:55:38Z [SPARK-20862][MLLIB][PYTHON] Avoid passing float to ndarray.reshape in LogisticRegressionModel ## What changes were proposed in this pull request? Fixed TypeError with python3 and numpy 1.12.1. Numpy's `reshape` no longer takes floats as arguments as of 1.12. Also, python3 uses float division for `/`, we should be using `//` to ensure that `_dataWithBiasSize` doesn't get set to a float. ## How was this patch tested? Existing tests run using python3 and numpy 1.12. Author: Bago Amirbekian Closes #18081 from MrBago/BF-py3floatbug. (cherry picked from commit bc66a77bbe2120cc21bd8da25194efca4cde13c3) Signed-off-by: Yanbo Liang commit c59ad420b5fda29567f4a06b5f71df76e70e269a Author: Liang-Chi Hsieh Date: 2017-05-24T16:35:40Z [SPARK-20848][SQL] Shutdown the pool after reading parquet files ## What changes were proposed in this pull request? From JIRA: On each call to spark.read.parquet, a new ForkJoinPool is created. One of the threads in the pool is kept in the WAITING state, and never stopped, which leads to unbounded growth in number of threads. We should shutdown the pool after reading parquet files. ## How was this patch tested? Added a test to ParquetFileFormatSuite. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Liang-Chi Hsieh Closes #18073 from viirya/SPARK-20848. (cherry picked from commit f72ad303f05a6d99513ea3b121375726b177199c) Signed-off-by: Wenchen Fan commit b7a2a16b1e01375292938fc48b0a333ec4e7cd30 Author: Reynold Xin Date: 2017-05-24T20:57:19Z [SPARK-20867][SQL] Move hints from Statistics into HintInfo class ## What changes were proposed in this pull request? 
This is a follow-up to SPARK-20857 to move the broadcast hint from Statistics into a new HintInfo class, so we can be more flexible in adding new hints in the future. ## How was this patch tested? Updated test cases to reflect the change. Author: Reynold Xin Closes #18087 from rxin/SPARK-20867. (cherry picked from commit a64746677bf09ef67e3fd538355a6ee9b5ce8cf4) Signed-off-by: Xiao Li commit 2405afce4e87c0486f2aef1d068f17aea2480b17 Author: Kris Mok Date: 2017-05-25T00:19:35Z [SPARK-20872][SQL] ShuffleExchange.nodeName should handle null coordinator ## What changes were proposed in this pull request? A one-liner change in `ShuffleExchange.nodeName` to cover the case when `coordinator` is `null`, so that the match expression is exhaustive. Please refer to [SPARK-20872](https://issues.apache.org/jira/browse/SPARK-20872) for
[GitHub] spark issue #19346: Spark should provide jump links and add (count) in the S...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19346 **[Test build #82171 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82171/testReport)** for PR 19346 at commit [`305689f`](https://github.com/apache/spark/commit/305689f1da70e86838796564cc4c53d80f96a1f2). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19346: Spark should provide jump links and add (count) in the S...
Github user jerryshao commented on the issue: https://github.com/apache/spark/pull/19346 Please fix the title. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19346: Spark should provide jump links and add (count) in the S...
Github user jerryshao commented on the issue: https://github.com/apache/spark/pull/19346 ok to test. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19346: Spark should provide jump links and add (count) i...
Github user guoxiaolongzte closed the pull request at: https://github.com/apache/spark/pull/19346 --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19346: Spark should provide jump links and add (count) in the S...
Github user guoxiaolongzte commented on the issue: https://github.com/apache/spark/pull/19346 @HyukjinKwon @jerryshao @ajbozarth In the new PR I fix the indentation problem. Thanks. The latest verification results are as follows: ![3](https://user-images.githubusercontent.com/26266482/30840889-57faf5e8-a2ac-11e7-8418-f879a841bc8d.png) --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18015: [SAPRK-20785][WEB-UI][SQL]Spark should provide ju...
Github user guoxiaolongzte closed the pull request at: https://github.com/apache/spark/pull/18015 --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19346: Spark should provide jump links and add (count) i...
GitHub user guoxiaolongzte reopened a pull request: https://github.com/apache/spark/pull/19346 Spark should provide jump links and add (count) in the SQL web ui. ## What changes were proposed in this pull request? propose: it provide links that jump to Running Queries,Completed Queries and Failed Queries. it add (count) about Running Queries,Completed Queries and Failed Queries. This is a small optimization in in the SQL web ui. fix before: ![1](https://user-images.githubusercontent.com/26266482/30840686-36025cc0-a2ab-11e7-8d8d-1de0122a84fb.png) fix after: ![2](https://user-images.githubusercontent.com/26266482/30840723-6cc67a52-a2ab-11e7-8002-9191a55895a6.png) ## How was this patch tested? manual tests Please review http://spark.apache.org/contributing.html before opening a pull request. You can merge this pull request into a Git repository by running: $ git pull https://github.com/guoxiaolongzte/spark SPARK-20785 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19346.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19346 commit 305689f1da70e86838796564cc4c53d80f96a1f2 Author: guoxiaolong Date: 2017-09-26T03:05:07Z Spark should provide jump links and add (count) in the SQL web ui. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19346: Spark should provide jump links and add (count) in the S...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19346 Can one of the admins verify this patch? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18015: [SAPRK-20785][WEB-UI][SQL]Spark should provide jump link...
Github user guoxiaolongzte commented on the issue: https://github.com/apache/spark/pull/18015 Sorry, I accidentally deleted the code branch. I'm going to close this PR. I created a new PR https://github.com/apache/spark/pull/19346, which was modified and created based on the latest code. In the new PR I fix the indentation problem. Thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19346: Spark should provide jump links and add (count) i...
GitHub user guoxiaolongzte opened a pull request: https://github.com/apache/spark/pull/19346 Spark should provide jump links and add (count) in the SQL web ui. ## What changes were proposed in this pull request? propose: it provides links that jump to Running Queries, Completed Queries and Failed Queries. it adds (count) about Running Queries, Completed Queries and Failed Queries. This is a small optimization in the SQL web ui. fix before: ![1](https://user-images.githubusercontent.com/26266482/30840686-36025cc0-a2ab-11e7-8d8d-1de0122a84fb.png) fix after: ![2](https://user-images.githubusercontent.com/26266482/30840723-6cc67a52-a2ab-11e7-8002-9191a55895a6.png) ## How was this patch tested? manual tests Please review http://spark.apache.org/contributing.html before opening a pull request. You can merge this pull request into a Git repository by running: $ git pull https://github.com/guoxiaolongzte/spark SPARK-20785 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19346.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19346 commit 305689f1da70e86838796564cc4c53d80f96a1f2 Author: guoxiaolong Date: 2017-09-26T03:05:07Z Spark should provide jump links and add (count) in the SQL web ui. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18576: [SPARK-21351][SQL] Update nullability based on children'...
Github user maropu commented on the issue: https://github.com/apache/spark/pull/18576 kindly ping --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18931: [SPARK-21717][SQL] Decouple consume functions of physica...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/18931 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82164/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18931: [SPARK-21717][SQL] Decouple consume functions of physica...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/18931 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18931: [SPARK-21717][SQL] Decouple consume functions of physica...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/18931 **[Test build #82164 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82164/testReport)** for PR 18931 at commit [`b75ce8f`](https://github.com/apache/spark/commit/b75ce8fdd539f3d80b3fb3edcfc72dce82227c87). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18931: [SPARK-21717][SQL] Decouple consume functions of physica...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/18931 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18931: [SPARK-21717][SQL] Decouple consume functions of physica...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/18931 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82165/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18931: [SPARK-21717][SQL] Decouple consume functions of physica...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/18931 **[Test build #82165 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82165/testReport)** for PR 18931 at commit [`e36ec3c`](https://github.com/apache/spark/commit/e36ec3c513c7da8500a703d7afa4860caa135e54). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19335: mapPartitions Api
Github user caneGuy commented on the issue: https://github.com/apache/spark/pull/19335 å¨spark-user listæé®å§ï¼ http://apache-spark-user-list.1001560.n3.nabble.com/ 2017-09-25 11:29 GMT+08:00 listenLearning : > æ¨å¥½ï¼æè¿æå¨å¼åçæ¶åéå°ä¸ä¸ªé®é¢ï¼å°±æ¯å¦ææç¨mappartitionsè¿ä¸ªapiå»åå¨æ°æ®å° > hbaseï¼ä¼åºç°ä¸ä¸ªæ¾ä¸å°partitionçé误ï¼ç¶åè·çå°±ä¼åºç°ä¸ä¸ªæ¾ä¸å°å¹¿æåéçé误ï¼è¯·é®è¿ä¸ªæ¯ä¸ºä»å¢ï¼ï¼ï¼ä¸ä¸æ¯ä»£ç 以åé误 > def ASpan(span: DataFrame, time: String): Unit = { > try { > span.mapPartitions(iter=>{ > iter.map(line => { > val put = new Put(Bytes.toBytes(CreateRowkey.Bit16(line.getString(0)) + > "_101301")) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_TIME1PER_30"), > Bytes.toBytes(line.getString(1))) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_TIME2PER_30"), > Bytes.toBytes(line.getString(2))) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_TIME3PER_30"), > Bytes.toBytes(line.getString(3))) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_TIME4PER_30"), > Bytes.toBytes(line.getString(4))) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_HASCALL_1"), > Bytes.toBytes(line.getLong(5).toString)) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_HASCALL_3"), > Bytes.toBytes(line.getLong(6).toString)) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_HASCALL_6"), > Bytes.toBytes(line.getLong(7).toString)) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_NOCALL_1"), > Bytes.toBytes(line.getLong(8).toString)) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_NOCALL_3"), > Bytes.toBytes(line.getLong(9).toString)) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("CALLDT_NOCALL_6"), > Bytes.toBytes(line.getLong(10).toString)) > put.addColumn(Bytes.toBytes("CF"), Bytes.toBytes("DB_TIME"), > Bytes.toBytes(time)) > (new ImmutableBytesWritable, put) > }) > }).saveAsNewAPIHadoopDataset(shuliStreaming.indexTable) > } catch { > case e: Exception => > shuliStreaming.WriteIn.writeLog("shuli", 
time, "éé»æ&è¿å ææ¯å¦éè¯å¨é误", e) > e.printStackTrace() > println("éé»æ&è¿å ææ¯å¦éè¯å¨é误" + e) > } > } > errorï¼ > 17/09/24 23:04:17 INFO spark.CacheManager: Partition rdd_11_1 not found, > computing it > 17/09/24 23:04:17 INFO rdd.HadoopRDD: Input split: > hdfs://nameservice1/data/input/common/phlibrary/OFFLINEPHONELIBRARY.dat: > 1146925+1146926 > 17/09/24 23:04:17 INFO broadcast.TorrentBroadcast: Started reading > broadcast variable 1 > 17/09/24 23:04:17 ERROR executor.Executor: Exception in task 1.0 in stage > 250804.0 (TID 3190467) > java.io.IOException: org.apache.spark.SparkException: Failed to get > broadcast_1_piece0 of broadcast_1 > at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1223) > at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock( > TorrentBroadcast.scala:165) > at org.apache.spark.broadcast.TorrentBroadcast._value$ > lzycompute(TorrentBroadcast.scala:64) > at org.apache.spark.broadcast.TorrentBroadcast._value( > TorrentBroadcast.scala:64) > at org.apache.spark.broadcast.TorrentBroadcast.getValue( > TorrentBroadcast.scala:88) > at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70) > at org.apache.spark.rdd.HadoopRDD.getJobConf(HadoopRDD.scala:144) > at org.apache.spark.rdd.HadoopRDD$$anon$1.(HadoopRDD.scala:212) > at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:208) > at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:101) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:270) > at org.apache.spark.rdd.MapPartitionsRDD.compute( > MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:270) > at org.apache.spark.rdd.MapPartitionsRDD.compute( > MapPartitionsRDD.scala:38) > -- > You can view, comment on, or merge this pull request online at: > > https://github.com/apache/spark/pull/19335 > Commit Summary > >- [SPARK-13969][ML] Add 
FeatureHasher transformer >- [SPARK-21656][CORE] spark dynamic allocation should not idle timeout >executors when tasks still to run >- [SPARK-21603][SQL] The wholestage codegen will be much slower then >that is closed when the function is too long >- [SPARK-21738] Thriftserver doesn't cancel jobs when session is closed >- [SPARK-21680][ML][MLLIB] optimize Vector compress >- [SPARK-3151][BLOCK MANAGER] DiskStore.getBy
[GitHub] spark issue #19343: [SPARK-22121][CORE] Correct database location for nameno...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19343 **[Test build #82170 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82170/testReport)** for PR 19343 at commit [`c6acdf2`](https://github.com/apache/spark/commit/c6acdf213e1465384b11232ca3d963d819414dce). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19327: [WIP] Implement stream-stream outer joins.
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19327 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19327: [WIP] Implement stream-stream outer joins.
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19327 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82163/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19327: [WIP] Implement stream-stream outer joins.
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19327 **[Test build #82163 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82163/testReport)** for PR 19327 at commit [`8d83155`](https://github.com/apache/spark/commit/8d8315570aa8b96d7c90ec1d6b631eb4abbde65f). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19338: [SPARK-22123][CORE] Add latest failure reason for task s...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19338 **[Test build #82169 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82169/testReport)** for PR 19338 at commit [`57190ef`](https://github.com/apache/spark/commit/57190ef25fc255755b0d50b8f5de402592f153d1). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19327: [WIP] Implement stream-stream outer joins.
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19327 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82162/ Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19327: [WIP] Implement stream-stream outer joins.
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19327 Merged build finished. Test PASSed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19327: [WIP] Implement stream-stream outer joins.
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19327 **[Test build #82162 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82162/testReport)** for PR 19327 at commit [`02b3352`](https://github.com/apache/spark/commit/02b3352b409c20146fc64418b1c8e2a12880c1c5). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19345: [SPARK-22124][SQL] Sample and Limit should also defer in...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19345 **[Test build #82168 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82168/testReport)** for PR 19345 at commit [`0252663`](https://github.com/apache/spark/commit/0252663f998edd1deb75e99b9a1d7ab9ec806a29). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19345: [SPARK-22124][SQL] Sample and Limit should also d...
GitHub user viirya opened a pull request: https://github.com/apache/spark/pull/19345 [SPARK-22124][SQL] Sample and Limit should also defer input evaluation under codegen ## What changes were proposed in this pull request? We can override `usedInputs` to claim that an operator defers input evaluation. `Sample` and `Limit` are two operators which should claim it but don't. We should do it. ## How was this patch tested? Existing tests. You can merge this pull request into a Git repository by running: $ git pull https://github.com/viirya/spark-1 SPARK-22124 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19345.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19345 commit 0252663f998edd1deb75e99b9a1d7ab9ec806a29 Author: Liang-Chi Hsieh Date: 2017-09-26T02:11:02Z Claim deferred input evaluation for Sample and Limit. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19325: [SPARK-22106][PYSPARK][SQL] Disable 0-parameter p...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/19325 --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19338: [SPARK-22123][CORE] Add latest failure reason for...
Github user caneGuy commented on a diff in the pull request: https://github.com/apache/spark/pull/19338#discussion_r140941633 --- Diff: core/src/main/scala/org/apache/spark/scheduler/TaskSetBlacklist.scala --- @@ -94,7 +96,9 @@ private[scheduler] class TaskSetBlacklist(val conf: SparkConf, val stageId: Int, private[scheduler] def updateBlacklistForFailedTask( host: String, exec: String, - index: Int): Unit = { + index: Int, + failureReason: Option[String] = None): Unit = { --- End diff -- Actually , you are right.For feature completion i should modify this. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19325: [SPARK-22106][PYSPARK][SQL] Disable 0-parameter p...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19325#discussion_r140941580 --- Diff: python/pyspark/sql/functions.py --- @@ -2183,14 +2187,28 @@ def pandas_udf(f=None, returnType=StringType()): :param f: python function if used as a standalone function :param returnType: a :class:`pyspark.sql.types.DataType` object -# TODO: doctest +>>> from pyspark.sql.types import IntegerType, StringType +>>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) +>>> @pandas_udf(returnType=StringType()) +... def to_upper(s): +... return s.str.upper() +... +>>> @pandas_udf(returnType="integer") +... def add_one(x): +... return x + 1 +... +>>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) +>>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ +... .show() # doctest: +SKIP --- End diff -- (D'oh, not a big deal but two spaces before inline comments..) --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19325: [SPARK-22106][PYSPARK][SQL] Disable 0-parameter pandas_u...
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/19325 Merged to master. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19325: [SPARK-22106][PYSPARK][SQL] Disable 0-parameter p...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19325#discussion_r140941134 --- Diff: python/pyspark/sql/functions.py --- @@ -2183,14 +2187,28 @@ def pandas_udf(f=None, returnType=StringType()): :param f: python function if used as a standalone function :param returnType: a :class:`pyspark.sql.types.DataType` object -# TODO: doctest +>>> from pyspark.sql.types import IntegerType, StringType +>>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) +>>> @pandas_udf(returnType=StringType()) +... def to_upper(s): +... return s.str.upper() +... +>>> @pandas_udf(returnType="integer") +... def add_one(x): +... return x + 1 +... +>>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) +>>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ +... .show() # doctest: +SKIP --- End diff -- I just double checked it passes ``` ./run-tests --python-executables=pypy --modules pyspark-sql ... Will test against the following Python executables: ['pypy'] Will test the following Python modules: ['pyspark-sql'] Starting test(pypy): pyspark.sql.functions ... Finished test(pypy): pyspark.sql.functions (74s) ... ``` Also, checked without ` # doctest: +SKIP`: ```diff diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 63e9a830bbc..3265ecc974b 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2199,7 +2199,7 @@ def pandas_udf(f=None, returnType=StringType()): ... >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ -... .show() # doctest: +SKIP +... .show() +--+--++ |slen(name)|to_upper(name)|add_one(age)| +--+--++ ``` ``` ./run-tests --python-executables=pypy --modules pyspark-sql ... Will test against the following Python executables: ['pypy'] Will test the following Python modules: ['pyspark-sql'] ... 
Starting test(pypy): pyspark.sql.functions ... Failed example: df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \ .show() Exception raised: Traceback (most recent call last): File "/usr/local/Cellar/pypy/5.8.0/libexec/lib-python/2.7/doctest.py", line 1315, in __run compileflags, 1) in test.globs File "", line 1, in df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \ File "/.../spark/python/pyspark/sql/dataframe.py", line 347, in show print(self._jdf.showString(n, 20, vertical)) File "/.../spark/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__ answer, self.gateway_client, self.target_id, self.name) File "/.../spark/python/pyspark/sql/utils.py", line 63, in deco return f(*a, **kw) File "/.../spark/python/lib/py4j-0.10.6-src.zip/py4j/protocol.py", line 320, in get_return_value format(target_id, ".", name), value) Py4JJavaError: An error occurred while calling o1373.showString. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 93.0 failed 1 times, most recent failure: Lost task 0.0 in stage 93.0 (TID 1093, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): File "/.../spark/python/lib/pyspark.zip/pyspark/worker.py", line 190, in main func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type) File "/.../spark/python/lib/pyspark.zip/pyspark/worker.py", line 112, in read_udfs arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type) File "/.../spark/python/lib/pyspark.zip/pyspark/worker.py", line 102, in read_single_udf return arg_offsets, wrap_pandas_udf(row_func, return_type) File "/.../spark/python/lib/pyspark.zip/pyspark/worker.py", line 77, in wrap_pandas_udf arrow_return_type = toArrowType(return_type) File "/.../spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1603, in toArrowType import pyarrow as pa ImportError: No module named pyarrow ``` --- - To 
unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19325: [SPARK-22106][PYSPARK][SQL] Disable 0-parameter p...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/19325#discussion_r140940569 --- Diff: python/pyspark/sql/functions.py --- @@ -2183,14 +2187,28 @@ def pandas_udf(f=None, returnType=StringType()): :param f: python function if used as a standalone function :param returnType: a :class:`pyspark.sql.types.DataType` object -# TODO: doctest +>>> from pyspark.sql.types import IntegerType, StringType +>>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) +>>> @pandas_udf(returnType=StringType()) +... def to_upper(s): +... return s.str.upper() +... +>>> @pandas_udf(returnType="integer") +... def add_one(x): +... return x + 1 +... +>>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) +>>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ +... .show() # doctest: +SKIP --- End diff -- Yeah. It is. :) --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19338: [SPARK-21539][CORE] Add latest failure reason for task s...
Github user caneGuy commented on the issue: https://github.com/apache/spark/pull/19338 Thanks for your time @squito. I will open another JIRA for this PR and update the code as soon as possible. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18015: [SPARK-20785][WEB-UI][SQL] Spark should provide jump link...
Github user jerryshao commented on the issue: https://github.com/apache/spark/pull/18015 There is still a comment left unaddressed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19344: [SPARK-22122][SQL] Respect WITH clauses to count input r...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19344 **[Test build #82167 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82167/testReport)** for PR 19344 at commit [`d9be37e`](https://github.com/apache/spark/commit/d9be37e9e3168d4adb340bb82a48006863c11636). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19344: [SPARK-22122][SQL] Respect WITH clauses to count ...
GitHub user maropu opened a pull request: https://github.com/apache/spark/pull/19344 [SPARK-22122][SQL] Respect WITH clauses to count input rows in TPCDSQueryBenchmark ## What changes were proposed in this pull request? Since the current code ignores WITH clauses to check input relations in TPCDS queries, this leads to inaccurate per-row processing time for benchmark results. For example, in `q2`, this fix could catch all the input relations: `web_sales`, `date_dim`, and `catalog_sales` (the current code catches `date_dim` only). About one-third of the TPCDS queries use WITH clauses, so I think it is worth fixing this. ## How was this patch tested? Manually checked. You can merge this pull request into a Git repository by running: $ git pull https://github.com/maropu/spark RespectWithInTPCDSBench Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19344.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19344 commit d9be37e9e3168d4adb340bb82a48006863c11636 Author: Takeshi Yamamuro Date: 2017-09-25T11:12:50Z Respect With in TPCDSQueryBenchmark --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19343: [SPARK-22121][CORE] Correct database location for nameno...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19343 **[Test build #82166 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/82166/testReport)** for PR 19343 at commit [`c2e125e`](https://github.com/apache/spark/commit/c2e125eacb48971ee72dd61859a95ca8ae6a9fc8). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds no public classes. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19343: [SPARK-22121][CORE] Correct database location for nameno...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19343 Merged build finished. Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19325: [SPARK-22106][PYSPARK][SQL] Disable 0-parameter p...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19325#discussion_r140938172 --- Diff: python/pyspark/sql/functions.py --- @@ -2183,14 +2187,28 @@ def pandas_udf(f=None, returnType=StringType()): :param f: python function if used as a standalone function :param returnType: a :class:`pyspark.sql.types.DataType` object -# TODO: doctest +>>> from pyspark.sql.types import IntegerType, StringType +>>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) +>>> @pandas_udf(returnType=StringType()) +... def to_upper(s): +... return s.str.upper() +... +>>> @pandas_udf(returnType="integer") +... def add_one(x): +... return x + 1 +... +>>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) +>>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ +... .show() # doctest: +SKIP --- End diff -- Looks actually we do :). Let me test this one for sure in my local before merging it, (I have `pypy` installed in my local that does not have `pyarrow` or `pandas`). --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19343: [SPARK-22121][CORE] Correct database location for nameno...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19343 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/82166/ Test FAILed. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18015: [SPARK-20785][WEB-UI][SQL] Spark should provide jump link...
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/18015 I think @jerryshao is an active committer who knows this one better than me. Since he is here, let me leave it to him. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org