[GitHub] [spark] itholic commented on pull request #37948: [SPARK-40327][PS][DOCS] Add resampling to API references
itholic commented on PR #37948: URL: https://github.com/apache/spark/pull/37948#issuecomment-1253238299 > ``` > Warning, treated as error: > /__w/spark/spark/python/docs/source/reference/pyspark.pandas/resampling.rst:2:Explicit markup ends without a blank line; unexpected unindent. > make: *** [Makefile:35: html] Error 2 > > Jekyll 4.2.1 Please append `--trace` to the `build` command > for any additional information or backtrace. > > ``` > > @itholic @HyukjinKwon @Yikun Do you have any ideas? It keep failing like this, even though I tried several changs Seems like this should be aligned. ```diff .. Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at ..http://www.apache.org/licenses/LICENSE-2.0 .. Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] sadikovi commented on a diff in pull request #37933: [SPARK-40474][SQL] Infer columns with mixed date and timestamp as String in CSV schema inference
sadikovi commented on code in PR #37933: URL: https://github.com/apache/spark/pull/37933#discussion_r976046176 ## docs/sql-data-sources-csv.md: ## @@ -111,7 +111,7 @@ Data source options of CSV can be set via: prefersDate false -During schema inference (inferSchema), attempts to infer string columns that contain dates or timestamps as Date if the values satisfy the dateFormat option and failed to be parsed by the respective formatter. With a user-provided schema, attempts to parse timestamp columns as dates using dateFormat if they fail to conform to timestampFormat, in this case the parsed values will be cast to timestamp type afterwards. +During schema inference (inferSchema), attempts to infer string columns that contain dates as Date if the values satisfy the dateFormat option or default date format. For columns that contain mixing dates and timestamps, infer them as StringType. Review Comment: nit: `... a mix/mixture of dates and timestamps ...`? ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala: ## @@ -233,7 +237,39 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { * is compatible with both input data types. */ private def compatibleType(t1: DataType, t2: DataType): Option[DataType] = { -TypeCoercion.findTightestCommonType(t1, t2).orElse(findCompatibleTypeForCSV(t1, t2)) +(t1, t2) match { + case (DateType, TimestampType) | (DateType, TimestampNTZType) | + (TimestampNTZType, DateType) | (TimestampType, DateType) => +// For a column containing mixing dates and timestamps Review Comment: Same here, just a bit of rewording. ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala: ## @@ -233,7 +237,39 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { * is compatible with both input data types. */ private def compatibleType(t1: DataType, t2: DataType): Option[DataType] = { -TypeCoercion.findTightestCommonType(t1, t2).orElse(findCompatibleTypeForCSV(t1, t2)) +(t1, t2) match { + case (DateType, TimestampType) | (DateType, TimestampNTZType) | + (TimestampNTZType, DateType) | (TimestampType, DateType) => +// For a column containing mixing dates and timestamps +// infer it as timestamp type if its dates can be inferred as timestamp type +// otherwise infer it as StringType +val dateFormat = options.dateFormatInRead.getOrElse(DateFormatter.defaultPattern) +t1 match { + case DateType if canParseDateAsTimestamp(dateFormat, t2) => +Some(t2) + case TimestampType | TimestampNTZType if canParseDateAsTimestamp(dateFormat, t1) => +Some(t1) + case _ => Some(StringType) +} + case _ => TypeCoercion.findTightestCommonType(t1, t2).orElse(findCompatibleTypeForCSV(t1, t2)) +} + } + + /** + * Return if strings of given date format can be parsed as timestamps Review Comment: nit: Returns `true` if strings ... 
## sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala: ## @@ -2819,51 +2819,68 @@ abstract class CSVSuite } } - test("SPARK-39469: Infer schema for date type") { -val options1 = Map( - "header" -> "true", - "inferSchema" -> "true", - "timestampFormat" -> "-MM-dd'T'HH:mm:ss", - "dateFormat" -> "-MM-dd", - "prefersDate" -> "true") -val options2 = Map( - "header" -> "true", - "inferSchema" -> "true", - "prefersDate" -> "true") - -// Error should be thrown when attempting to prefersDate with Legacy parser -if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { - checkError( -exception = intercept[SparkIllegalArgumentException] { - spark.read.format("csv").options(options1).load(testFile(dateInferSchemaFile)) -}, -errorClass = "CANNOT_INFER_DATE") -} else { - // 1. Specify date format and timestamp format - // 2. Date inference should work with default date format when dateFormat is not provided - Seq(options1, options2).foreach {options => + test("SPARK-39469: Infer schema for columns with only dates " + +"and columns with mixing date and timestamps correctly") { +def checkCSVReadDatetime( + options: Map[String, String], + expectedSchema: StructType, + expectedData: Seq[Seq[Any]]): Unit = { + + // Error should be thrown when attempting to prefersDate with Legacy parser Review Comment: nit: `to use prefersDate ...`. ## sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala: ## @@ -2819,51 +2819,68 @@ abstract class CSVSuite } } - test("SPARK-39469: Infer schema for date type") { -val options1 = Map( - "header" -> "true", -
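For context, here is a minimal PySpark sketch of the inference behavior this PR describes; the file path, sample rows, and option values below are assumptions for illustration, not taken from the PR.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Assume "/tmp/mixed.csv" contains:
#   a,b
#   2022-01-01,2022-01-01
#   2022-01-02,2022-01-02 12:34:56
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("prefersDate", "true")
    .csv("/tmp/mixed.csv")
)
df.printSchema()
# Column `a` (dates only) can be inferred as date. Column `b` mixes dates and
# timestamps, so per this change it is inferred as string unless its dates can
# themselves be parsed by the timestamp format (the `canParseDateAsTimestamp` check).
```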
[GitHub] [spark] itholic commented on a diff in pull request #37948: [SPARK-40327][PS][DOCS] Add resampling to API references
itholic commented on code in PR #37948: URL: https://github.com/apache/spark/pull/37948#discussion_r976058915 ## python/pyspark/pandas/resample.py: ## @@ -412,21 +412,267 @@ def _handle_output(self, psdf: DataFrame) -> FrameLike: pass def min(self) -> FrameLike: +""" +Compute max of resampled values. + +.. versionadded:: 3.4.0 + +See Also + +pyspark.pandas.Series.groupby +pyspark.pandas.DataFrame.groupby + +Examples + +>>> np.random.seed(22) +>>> dates = [ +...datetime(2022, 5, 1, 4, 5, 6), +...datetime(2022, 5, 3), +...datetime(2022, 5, 3, 23, 59, 59), +...datetime(2022, 5, 4), +...pd.NaT, +...datetime(2022, 5, 4, 0, 0, 1), +...datetime(2022, 5, 11), +... ] +>>> df = ps.DataFrame( +...np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"] +... ) +>>> df +A B +2022-05-01 04:05:06 0.208461 0.481681 +2022-05-03 00:00:00 0.420538 0.859182 +2022-05-03 23:59:59 0.171162 0.338864 +2022-05-04 00:00:00 0.270533 0.691041 +NaT 0.220405 0.811951 +2022-05-04 00:00:01 0.010527 0.561204 +2022-05-11 00:00:00 0.813726 0.745100 +>>> df.resample("3D").min().sort_index() + A B +2022-05-01 0.171162 0.338864 +2022-05-04 0.010527 0.561204 +2022-05-07 NaN NaN +2022-05-10 0.813726 0.745100 +""" return self._handle_output(self._downsample("min")) def max(self) -> FrameLike: +""" +Compute max of resampled values. + +.. versionadded:: 3.4.0 + +See Also + +pyspark.pandas.Series.groupby +pyspark.pandas.DataFrame.groupby + +Examples + +>>> np.random.seed(22) +>>> dates = [ +...datetime(2022, 5, 1, 4, 5, 6), +...datetime(2022, 5, 3), +...datetime(2022, 5, 3, 23, 59, 59), +...datetime(2022, 5, 4), +...pd.NaT, +...datetime(2022, 5, 4, 0, 0, 1), +...datetime(2022, 5, 11), +... ] +>>> df = ps.DataFrame( +...np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"] +... ) +>>> df +A B +2022-05-01 04:05:06 0.208461 0.481681 +2022-05-03 00:00:00 0.420538 0.859182 +2022-05-03 23:59:59 0.171162 0.338864 +2022-05-04 00:00:00 0.270533 0.691041 +NaT 0.220405 0.811951 +2022-05-04 00:00:01 0.010527 0.561204 +2022-05-11 00:00:00 0.813726 0.745100 +>>> df.resample("3D").max().sort_index() + A B +2022-05-01 0.420538 0.859182 +2022-05-04 0.270533 0.691041 +2022-05-07 NaN NaN +2022-05-10 0.813726 0.745100 +""" return self._handle_output(self._downsample("max")) def sum(self) -> FrameLike: +""" +Compute sum of resampled values. + +.. versionadded:: 3.4.0 + +See Also + +pyspark.pandas.Series.groupby +pyspark.pandas.DataFrame.groupby + +Examples + +>>> np.random.seed(22) +>>> dates = [ +...datetime(2022, 5, 1, 4, 5, 6), +...datetime(2022, 5, 3), +...datetime(2022, 5, 3, 23, 59, 59), +...datetime(2022, 5, 4), +...pd.NaT, +...datetime(2022, 5, 4, 0, 0, 1), +...datetime(2022, 5, 11), +... ] +>>> df = ps.DataFrame( +...np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"] +... ) +>>> df +A B +2022-05-01 04:05:06 0.208461 0.481681 +2022-05-03 00:00:00 0.420538 0.859182 +2022-05-03 23:59:59 0.171162 0.338864 +2022-05-04 00:00:00 0.270533 0.691041 +NaT 0.220405 0.811951 +2022-05-04 00:00:01 0.010527 0.561204 +2022-05-11 00:00:00 0.813726 0.745100 +>>> df.resample("3D").sum().sort_index() + A B +2022-05-01 0.800160 1.679727 +2022-05-04 0.281060 1.252245 +2022-05-07 0.00 0.00 +2022-05-10 0.813726 0.745100 +""" return self._handle_output(self._downsample("sum").fillna(0.0)) def mean(self) -> FrameLike: +""" +Compute mean of resampled values. + +.. versionadded:: 3.4.0 + +See Also + +
[GitHub] [spark] itholic commented on a diff in pull request #37945: [SPARK-40498][PS] Implement `kendall` and `min_periods` in `Series.corr`
itholic commented on code in PR #37945: URL: https://github.com/apache/spark/pull/37945#discussion_r975971640 ## python/pyspark/pandas/series.py: ## @@ -3312,16 +3318,25 @@ def autocorr(self, periods: int = 1) -> float: ) return np.nan if corr is None else corr -def corr(self, other: "Series", method: str = "pearson") -> float: +def corr( +self, other: "Series", method: str = "pearson", min_periods: Optional[int] = None +) -> float: """ Compute correlation with `other` Series, excluding missing values. +.. versionadded:: 3.3.0 + Parameters -- other : Series -method : {'pearson', 'spearman'} +method : {'pearson', 'spearman', 'kendall'} * pearson : standard correlation coefficient * spearman : Spearman rank correlation +* kendall : Kendall Tau correlation coefficient Review Comment: Maybe we should add `.. versionchanged:: 3.4.0` for mentioning about the `kendall` ?? ```python method : {'pearson', 'spearman', 'kendall'} * pearson : standard correlation coefficient * spearman : Spearman rank correlation * kendall : Kendall Tau correlation coefficient .. versionchanged:: 3.4.0 support 'kendall' for method parameter ``` ## python/pyspark/pandas/tests/test_stats.py: ## @@ -258,8 +258,6 @@ def test_skew_kurt_numerical_stability(self): self.assert_eq(psdf.kurt(), pdf.kurt(), almost=True) def test_dataframe_corr(self): -# existing 'test_corr' is mixed by df.corr and ser.corr, will delete 'test_corr' -# when we have separate tests for df.corr and ser.corr Review Comment: Nice! ## python/pyspark/pandas/series.py: ## @@ -,29 +3348,74 @@ def corr(self, other: "Series", method: str = "pearson") -> float: ...'s2': [.3, .6, .0, .1]}) >>> s1 = df.s1 >>> s2 = df.s2 ->>> s1.corr(s2, method='pearson') # doctest: +ELLIPSIS --0.851064... +>>> s1.corr(s2, method='pearson') +-0.85106... ->>> s1.corr(s2, method='spearman') # doctest: +ELLIPSIS --0.948683... +>>> s1.corr(s2, method='spearman') +-0.94868... -Notes -- -There are behavior differences between pandas-on-Spark and pandas. Review Comment: Oh, so now we can have the same behavior with pandas ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
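For readers following this thread, a small usage sketch of the two parameters under review, assuming the pandas-on-Spark API after this PR; the sample values are invented.

```python
import pyspark.pandas as ps

s1 = ps.Series([0.2, 0.0, 0.6, 0.2])
s2 = ps.Series([0.3, 0.6, 0.0, 0.1])

with ps.option_context("compute.ops_on_diff_frames", True):
    # Kendall Tau rank correlation, the method added by this PR.
    kendall = s1.corr(s2, method="kendall")
    # With only 4 non-NA pairs available, min_periods=5 should yield NaN.
    too_few = s1.corr(s2, min_periods=5)
```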
[GitHub] [spark] sadikovi commented on pull request #37942: [SPARK-40496][SQL] Fix configs to control "enableDateTimeParsingFallback"
sadikovi commented on PR #37942: URL: https://github.com/apache/spark/pull/37942#issuecomment-1253226110 Thank you . -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a diff in pull request #37931: [SPARK-40488] Do not wrap exceptions thrown when datasource write fails
cloud-fan commented on code in PR #37931: URL: https://github.com/apache/spark/pull/37931#discussion_r976048440 ## sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala: ## @@ -65,14 +61,7 @@ case class WriteToContinuousDataSourceExec(write: StreamingWrite, query: SparkPl } catch { case _: InterruptedException => // Interruption is how continuous queries are ended, so accept and ignore the exception. - case cause: Throwable => -cause match { - // Do not wrap interruption exceptions that will be handled by streaming specially. - case _ if StreamExecution.isInterruptionException(cause, sparkContext) => throw cause - // Only wrap non fatal exceptions. - case NonFatal(e) => throw QueryExecutionErrors.writingJobAbortedError(e) - case _ => throw cause -} + case cause: Throwable => throw cause Review Comment: since we can simply remove this line? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a diff in pull request #37931: [SPARK-40488] Do not wrap exceptions thrown when datasource write fails
cloud-fan commented on code in PR #37931: URL: https://github.com/apache/spark/pull/37931#discussion_r976048121 ## core/src/test/scala/org/apache/spark/SparkThrowableSuite.scala: ## @@ -209,22 +209,6 @@ class SparkThrowableSuite extends SparkFunSuite { } } - test("Try catching SparkError with error class") { -try { - throw new SparkException( -errorClass = "WRITING_JOB_ABORTED", Review Comment: we need to keep this test, but with a different error class -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan closed pull request #37942: [SPARK-40496][SQL] Fix configs to control "enableDateTimeParsingFallback"
cloud-fan closed pull request #37942: [SPARK-40496][SQL] Fix configs to control "enableDateTimeParsingFallback" URL: https://github.com/apache/spark/pull/37942 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on pull request #37942: [SPARK-40496][SQL] Fix configs to control "enableDateTimeParsingFallback"
cloud-fan commented on PR #37942: URL: https://github.com/apache/spark/pull/37942#issuecomment-1253216836 thanks, merging to master! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] xiaonanyang-db commented on a diff in pull request #37933: [SPARK-40474][SQL] Infer columns with mixed date and timestamp as String in CSV schema inference
xiaonanyang-db commented on code in PR #37933: URL: https://github.com/apache/spark/pull/37933#discussion_r976042644 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala: ## @@ -183,7 +180,9 @@ class CSVOptions( Some(parameters.getOrElse("timestampFormat", s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX")) } else { - parameters.get("timestampFormat") + // Use Iso8601TimestampFormatter (with strict timestamp parsing) to + // avoid parsing dates in timestamp columns as timestamp type Review Comment: Made some follow-up changes, please check the updated description for the behavior after changes and semantics. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a diff in pull request #36027: [SPARK-38717][SQL] Handle Hive's bucket spec case preserving behaviour
cloud-fan commented on code in PR #36027: URL: https://github.com/apache/spark/pull/36027#discussion_r976029766 ## sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala: ## @@ -1026,7 +1026,14 @@ private[hive] object HiveClientImpl extends Logging { } else { CharVarcharUtils.getRawTypeString(c.metadata).getOrElse(c.dataType.catalogString) } -new FieldSchema(c.name, typeString, c.getComment().orNull) +val name = if (lowerCase) { + // scalastyle:off caselocale + c.name.toLowerCase + // scalastyle:on caselocale +} else { + c.name Review Comment: We can use `Object` to store the raw hive table, so that we don't need to expose the Hive classes. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] zhengruifeng commented on pull request #37954: [SPARK-40332][PS][DOCS][FOLLOWUP] Fix wrong underline length
zhengruifeng commented on PR #37954: URL: https://github.com/apache/spark/pull/37954#issuecomment-1253185599 cc @Yikun -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] zhengruifeng commented on pull request #37948: [SPARK-40327][PS][DOCS] Add resampling to API references
zhengruifeng commented on PR #37948: URL: https://github.com/apache/spark/pull/37948#issuecomment-1253185396 ``` Warning, treated as error: /__w/spark/spark/python/docs/source/reference/pyspark.pandas/resampling.rst:2:Explicit markup ends without a blank line; unexpected unindent. make: *** [Makefile:35: html] Error 2 Jekyll 4.2.1 Please append `--trace` to the `build` command for any additional information or backtrace. ``` @itholic @HyukjinKwon @Yikun Do you have any ideas? It keeps failing like this, even though I tried several changes. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] zhengruifeng opened a new pull request, #37954: [SPARK-40332][PS][DOCS][FOLLOWUP] Fix wrong underline length
zhengruifeng opened a new pull request, #37954: URL: https://github.com/apache/spark/pull/37954 ### What changes were proposed in this pull request? Fix wrong underline length ### Why are the changes needed? there is a warning in doc build ``` reading sources... [ 68%] reference/pyspark.pandas/groupby /usr/local/lib/python3.9/dist-packages/numpydoc/docscrape.py:449: UserWarning: potentially wrong underline length... Notes --- in Return group values at the given quantile. ... in the docstring of quantile in /__w/spark/spark/python/pyspark/pandas/groupby.py. reading sources... [ 68%] reference/pyspark.pandas/index reading sources... [ 68%] reference/pyspark.pandas/indexing ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? existing suites -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
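To make the warning concrete, a minimal sketch of the docstring shape numpydoc expects; the function signature and docstring body here are placeholders, not the actual `groupby.quantile` code.

```python
def quantile(q=0.5):
    """
    Return group values at the given quantile.

    Notes
    -----
    The section underline above must be at least as long as the title
    ("Notes" is five characters, so at least five dashes); a shorter
    underline triggers numpydoc's "potentially wrong underline length" warning.
    """
```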
[GitHub] [spark] panbingkun commented on a diff in pull request #37941: [SPARK-40501][SQL] Enhance 'SpecialLimits' to support project(..., limit(...))
panbingkun commented on code in PR #37941: URL: https://github.com/apache/spark/pull/37941#discussion_r976007571 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala: ## @@ -830,6 +831,21 @@ object PushProjectionThroughUnion extends Rule[LogicalPlan] { } } +/** + * Pushes Project operator to Limit operator. + */ +object PushProjectionThroughLimit extends Rule[LogicalPlan] { Review Comment: I have solved the above scenario by adding a new match case in SpecialLimits. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] zhengruifeng opened a new pull request, #37953: [SPARK-40510][PS] Implement `ddof` in `Series.cov`
zhengruifeng opened a new pull request, #37953: URL: https://github.com/apache/spark/pull/37953 ### What changes were proposed in this pull request? Implement `ddof` in `Series.cov` by switching to `SF.covar` ### Why are the changes needed? For API coverage. ### Does this PR introduce _any_ user-facing change? Yes, `ddof` is supported now ``` >>> s1 = ps.Series([0.90010907, 0.13484424, 0.62036035]) >>> s2 = ps.Series([0.12528585, 0.26962463, 0.5198]) >>> with ps.option_context("compute.ops_on_diff_frames", True): ... s1.cov(s2) -0.016857... >>> with ps.option_context("compute.ops_on_diff_frames", True): ... s1.cov(s2, ddof=2) ``` ### How was this patch tested? Added a unit test. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
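For reference, the `ddof` (delta degrees of freedom) semantics sketched with plain pandas, reusing the values from the PR description: the summed co-deviations are divided by N - ddof, so with three points ddof=2 divides by 1 instead of 2.

```python
import pandas as pd

s1 = pd.Series([0.90010907, 0.13484424, 0.62036035])
s2 = pd.Series([0.12528585, 0.26962463, 0.5198])
# Same co-deviation sum, different divisor: N - 1 = 2 vs. N - 2 = 1.
print(s1.cov(s2))          # default ddof=1
print(s1.cov(s2, ddof=2))  # twice the magnitude of the ddof=1 result here
```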
[GitHub] [spark] Kwafoor commented on pull request #37951: [SPARK-40506]Spark Streaming metrics name doesn't need application name
Kwafoor commented on PR #37951: URL: https://github.com/apache/spark/pull/37951#issuecomment-1253163925 Hi, @HeartSaVioR could you please take a look whenever you have a chance? Thanks! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] mridulm commented on a diff in pull request #37855: [SPARK-40407][SQL] Fix the potential data skew caused by df.repartition
mridulm commented on code in PR #37855: URL: https://github.com/apache/spark/pull/37855#discussion_r976002343 ## sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala: ## @@ -299,7 +300,8 @@ object ShuffleExchangeExec { def getPartitionKeyExtractor(): InternalRow => Any = newPartitioning match { case RoundRobinPartitioning(numPartitions) => // Distributes elements evenly across output partitions, starting from a random partition. -var position = new Random(TaskContext.get().partitionId()).nextInt(numPartitions) Review Comment: This was fixed in SPARK-21782 for RDD - looks like the sql version did not leverage it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] LuciferYang commented on a diff in pull request #37940: [SPARK-40494][CORE][SQL][ML][MLLIB] Optimize the performance of `keys.zipWithIndex.toMap` code pattern
LuciferYang commented on code in PR #37940: URL: https://github.com/apache/spark/pull/37940#discussion_r976000233 ## core/src/main/scala/org/apache/spark/util/collection/Utils.scala: ## @@ -79,6 +79,20 @@ private[spark] object Utils { builder.result() } + /** + * Same function as `keys.zipWithIndex.toMap`, but has perf gain. + */ + def toMap[K](keys: Iterable[K]): Map[K, Int] = { Review Comment: [405c625](https://github.com/apache/spark/pull/37940/commits/405c6252893c0df6cdf57cea8da94d1ebe7f56c4) fix this, waiting ci -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] LuciferYang commented on a diff in pull request #37940: [SPARK-40494][CORE][SQL][ML][MLLIB] Optimize the performance of `keys.zipWithIndex.toMap` code pattern
LuciferYang commented on code in PR #37940: URL: https://github.com/apache/spark/pull/37940#discussion_r975997057 ## core/src/main/scala/org/apache/spark/util/collection/Utils.scala: ## @@ -79,6 +79,20 @@ private[spark] object Utils { builder.result() } + /** + * Same function as `keys.zipWithIndex.toMap`, but has perf gain. + */ + def toMap[K](keys: Iterable[K]): Map[K, Int] = { Review Comment: ok -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] sadikovi commented on pull request #37942: [SPARK-40496][SQL] Fix configs to control "enableDateTimeParsingFallback"
sadikovi commented on PR #37942: URL: https://github.com/apache/spark/pull/37942#issuecomment-1253154478 Actually, it is only in master, there is no `enableDateTimeParsingFallback` flag in branch-3.3. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a diff in pull request #37940: [SPARK-40494][CORE][SQL][ML][MLLIB] Optimize the performance of `keys.zipWithIndex.toMap` code pattern
cloud-fan commented on code in PR #37940: URL: https://github.com/apache/spark/pull/37940#discussion_r975996062 ## core/src/main/scala/org/apache/spark/util/collection/Utils.scala: ## @@ -79,6 +79,20 @@ private[spark] object Utils { builder.result() } + /** + * Same function as `keys.zipWithIndex.toMap`, but has perf gain. + */ + def toMap[K](keys: Iterable[K]): Map[K, Int] = { Review Comment: `toMapWithIndex`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] LuciferYang commented on pull request #37940: [SPARK-40494][CORE][SQL][ML][MLLIB] Optimize the performance of `keys.zipWithIndex.toMap` code pattern
LuciferYang commented on PR #37940: URL: https://github.com/apache/spark/pull/37940#issuecomment-1253151080 friendly ping @cloud-fan -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a diff in pull request #37855: [SPARK-40407][SQL] Fix the potential data skew caused by df.repartition
cloud-fan commented on code in PR #37855: URL: https://github.com/apache/spark/pull/37855#discussion_r975993118 ## sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala: ## @@ -299,7 +300,8 @@ object ShuffleExchangeExec { def getPartitionKeyExtractor(): InternalRow => Any = newPartitioning match { case RoundRobinPartitioning(numPartitions) => // Distributes elements evenly across output partitions, starting from a random partition. -var position = new Random(TaskContext.get().partitionId()).nextInt(numPartitions) Review Comment: OK, I tried `(1 to 200).foreach(partitionId => print(new Random(partitionId).nextInt(32) + " "))` and the result is very counterintuitive: a small change to the seed does not change the random result. Can we add some comments to explain why we add `hashing.byteswap32`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
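To illustrate why the result is counterintuitive, here is a small Python re-implementation sketch of `java.util.Random`'s first `nextInt(n)` call for a power-of-two bound. Neighbouring seeds scramble to nearly identical internal states, so most partition ids land on the same starting position, which is exactly what scrambling the seed with `hashing.byteswap32` is meant to break up.

```python
MASK = (1 << 48) - 1
MULT = 0x5DEECE66D

def first_next_int(seed: int, bound: int) -> int:
    state = (seed ^ MULT) & MASK         # initialScramble(seed)
    state = (state * MULT + 0xB) & MASK  # one LCG step, as in next(31)
    bits = state >> 17                   # top 31 bits of the new state
    return (bound * bits) >> 31          # power-of-two shortcut in nextInt

# Across 200 "partition ids" only a couple of distinct start positions show up.
print({first_next_int(pid, 32) for pid in range(200)})
```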
[GitHub] [spark] cloud-fan commented on a diff in pull request #37855: [SPARK-40407][SQL] Fix the potential data skew caused by df.repartition
cloud-fan commented on code in PR #37855: URL: https://github.com/apache/spark/pull/37855#discussion_r975989551 ## sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala: ## @@ -299,7 +300,8 @@ object ShuffleExchangeExec { def getPartitionKeyExtractor(): InternalRow => Any = newPartitioning match { case RoundRobinPartitioning(numPartitions) => // Distributes elements evenly across output partitions, starting from a random partition. -var position = new Random(TaskContext.get().partitionId()).nextInt(numPartitions) Review Comment: Sorry I may miss something. The original code should already produce different starting positions for different mapper tasks? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a diff in pull request #37855: [SPARK-40407][SQL] Fix the potential data skew caused by df.repartition
cloud-fan commented on code in PR #37855: URL: https://github.com/apache/spark/pull/37855#discussion_r975988849 ## sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala: ## @@ -2147,6 +2147,12 @@ class DatasetSuite extends QueryTest (2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)) } + + test("SPARK-40407: repartition should not result in severe data skew") { +val df = spark.range(0, 100, 1, 50).repartition(4) +val result = df.mapPartitions(iter => Iterator.single(iter.length)).collect() +assert(result.mkString(",") === "25,31,25,19") Review Comment: I'd do `assert(result.map(_.getInt(0)).sorted == Seq(19, 25, 25, 31))` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
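A PySpark analogue of the Scala assertion above, as a sketch (not part of the PR), for anyone who wants to eyeball the per-partition counts:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(0, 100, 1, 50).repartition(4)
sizes = df.rdd.mapPartitions(lambda it: [sum(1 for _ in it)]).collect()
# With the fix the sizes are roughly balanced (e.g. something like [19, 25, 25, 31]),
# rather than one partition receiving almost all of the rows.
print(sorted(sizes))
```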
[GitHub] [spark] cloud-fan commented on pull request #37942: [SPARK-40496][SQL] Fix configs to control "enableDateTimeParsingFallback"
cloud-fan commented on PR #37942: URL: https://github.com/apache/spark/pull/37942#issuecomment-1253142422 how far shall we backport it? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan closed pull request #37944: [SQL][MINOR] Re-generate equals/hashCode of IdentifierImpl with non-null optimization
cloud-fan closed pull request #37944: [SQL][MINOR] Re-generate equals/hashCode of IdentifierImpl with non-null optimization URL: https://github.com/apache/spark/pull/37944 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on pull request #37944: [SQL][MINOR] Re-generate equals/hashCode of IdentifierImpl with non-null optimization
cloud-fan commented on PR #37944: URL: https://github.com/apache/spark/pull/37944#issuecomment-1253140641 thanks for review, merging to master! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] zhengruifeng commented on pull request #37945: [SPARK-40498][PS] Implement `kendall` and `min_periods` in `Series.corr`
zhengruifeng commented on PR #37945: URL: https://github.com/apache/spark/pull/37945#issuecomment-1253139254 cc @HyukjinKwon -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] beliefer commented on pull request #37937: [SPARK-40491][SQL] Remove too old TODO for JdbcRDD
beliefer commented on PR #37937: URL: https://github.com/apache/spark/pull/37937#issuecomment-1253127562 @HyukjinKwon @cloud-fan Thank you all! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HyukjinKwon closed pull request #37937: [SPARK-40491][SQL] Remove too old TODO for JdbcRDD
HyukjinKwon closed pull request #37937: [SPARK-40491][SQL] Remove too old TODO for JdbcRDD URL: https://github.com/apache/spark/pull/37937 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on pull request #37937: [SPARK-40491][SQL] Remove too old TODO for JdbcRDD
HyukjinKwon commented on PR #37937: URL: https://github.com/apache/spark/pull/37937#issuecomment-1253126773 Merged to master. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] beliefer commented on a diff in pull request #37937: [SPARK-40491][SQL] Remove too old TODO for JdbcRDD
beliefer commented on code in PR #37937: URL: https://github.com/apache/spark/pull/37937#discussion_r975975156 ## core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala: ## @@ -182,8 +182,6 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { def textFile(path: String, minPartitions: Int): JavaRDD[String] = sc.textFile(path, minPartitions) - - Review Comment: OK -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] zhengruifeng commented on a diff in pull request #37929: [SPARK-40486][PS] Implement `spearman` and `kendall` in `DataFrame.corrwith`
zhengruifeng commented on code in PR #37929: URL: https://github.com/apache/spark/pull/37929#discussion_r975974274 ## python/pyspark/pandas/frame.py: ## @@ -1847,14 +1665,14 @@ def corrwith( -- other : DataFrame, Series Object with which to compute correlations. - +axis : int, default 0 or 'index' +Can only be set to 0 at the moment. drop : bool, default False Drop missing indices from result. - -method : str, default 'pearson' -Method of correlation, one of: - +method : {'pearson', 'spearman', 'kendall'} Review Comment: good question, I think it's a bit hard to support this `callable`: it takes two arrays, so should collect all values in the columns, but it's not scalable then maybe, we can support another `callable`: `Callable[[Column, Column], float]`, which is an aggregation function, this may make sense. I think we need more discussion/thoughs on it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
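As a quick usage sketch of the `method` values discussed in this thread (data invented, API as proposed by SPARK-40486):

```python
import pyspark.pandas as ps

df1 = ps.DataFrame({"a": [1, 2, 3, 4], "b": [4, 3, 2, 1]})
df2 = ps.DataFrame({"a": [2, 4, 6, 8], "b": [1, 1, 2, 2]})

with ps.option_context("compute.ops_on_diff_frames", True):
    # Spearman and Kendall are the methods this PR adds to corrwith.
    print(df1.corrwith(df2, method="spearman"))
    print(df1.corrwith(df2, method="kendall", drop=True))
```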
[GitHub] [spark] zhengruifeng commented on pull request #37947: [SPARK-40500][PS] Deprecate `iteritems` in DataFrame and Series
zhengruifeng commented on PR #37947: URL: https://github.com/apache/spark/pull/37947#issuecomment-1253118512 Thank you all! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HyukjinKwon closed pull request #37947: [SPARK-40500][PS] Deprecate `iteritems` in DataFrame and Series
HyukjinKwon closed pull request #37947: [SPARK-40500][PS] Deprecate `iteritems` in DataFrame and Series URL: https://github.com/apache/spark/pull/37947 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on pull request #37947: [SPARK-40500][PS] Deprecate `iteritems` in DataFrame and Series
HyukjinKwon commented on PR #37947: URL: https://github.com/apache/spark/pull/37947#issuecomment-1253116817 Merged to master. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on a diff in pull request #37947: [SPARK-40500][PS] Deprecate `iteritems` in DataFrame and Series
HyukjinKwon commented on code in PR #37947: URL: https://github.com/apache/spark/pull/37947#discussion_r975968699 ## python/pyspark/pandas/frame.py: ## @@ -2054,9 +2054,16 @@ def extract_kv_from_spark_row(row: Row) -> Tuple[Name, Any]: ): yield tuple(([k] if index else []) + list(v)) -def items(self) -> Iterator[Tuple[Name, "Series"]]: -"""This is an alias of ``iteritems``.""" -return self.iteritems() +def iteritems(self) -> Iterator[Tuple[Name, "Series"]]: +""" +This is an alias of ``items``. + +.. deprecated:: 3.4.0 +iteritems is deprecated and will be removed in a future version. +Use .items instead. +""" +warnings.warn("Deprecated in 3.4, Use DataFrame.items instead.", FutureWarning) Review Comment: ```suggestion warnings.warn("Deprecated in 3.4.0, Use DataFrame.items instead.", FutureWarning) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
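For completeness, a sketch of how the deprecation would surface to users once this lands; the DataFrame contents are invented.

```python
import warnings
import pyspark.pandas as ps

psdf = ps.DataFrame({"species": ["bear", "bear", "marsupial"],
                     "population": [1864, 22000, 80000]})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # iteritems() stays as an alias of items() but now emits a FutureWarning.
    for label, content in psdf.iteritems():
        pass

print(caught[0].category)  # <class 'FutureWarning'>
```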
[GitHub] [spark] HyukjinKwon commented on a diff in pull request #37937: [SPARK-40491][SQL] Remove too old TODO for JdbcRDD
HyukjinKwon commented on code in PR #37937: URL: https://github.com/apache/spark/pull/37937#discussion_r975962115 ## core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala: ## @@ -182,8 +182,6 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { def textFile(path: String, minPartitions: Int): JavaRDD[String] = sc.textFile(path, minPartitions) - - Review Comment: but it's not related changes. Let's exclude unrelated changes. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] beliefer commented on a diff in pull request #37937: [SPARK-40491][SQL] Remove too old TODO for JdbcRDD
beliefer commented on code in PR #37937: URL: https://github.com/apache/spark/pull/37937#discussion_r975960027 ## core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala: ## @@ -182,8 +182,6 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { def textFile(path: String, minPartitions: Int): JavaRDD[String] = sc.textFile(path, minPartitions) - - Review Comment: It seems these blanks are useless. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] bozhang2820 commented on pull request #37931: [SPARK-40488] Do not wrap exceptions thrown when datasource write fails
bozhang2820 commented on PR #37931: URL: https://github.com/apache/spark/pull/37931#issuecomment-1253098147 @cloud-fan, could you take a look? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HyukjinKwon commented on pull request #37947: [SPARK-40500][PS] Use `pd.items` instead of `pd.iteritems`
HyukjinKwon commented on PR #37947: URL: https://github.com/apache/spark/pull/37947#issuecomment-1253079820 Yeah let's match w/ pandas -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HeartSaVioR commented on a diff in pull request #37893: [SPARK-40434][SS][PYTHON] Implement applyInPandasWithState in PySpark
HeartSaVioR commented on code in PR #37893: URL: https://github.com/apache/spark/pull/37893#discussion_r975939807 ## python/pyspark/sql/pandas/group_ops.py: ## @@ -216,6 +218,104 @@ def applyInPandas( jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr()) return DataFrame(jdf, self.session) +def applyInPandasWithState( +self, +func: "PandasGroupedMapFunctionWithState", +outputStructType: Union[StructType, str], +stateStructType: Union[StructType, str], +outputMode: str, +timeoutConf: str, +) -> DataFrame: +""" +Applies the given function to each group of data, while maintaining a user-defined +per-group state. The result Dataset will represent the flattened record returned by the +function. + +For a streaming Dataset, the function will be invoked first for all input groups and then +for all timed out states where the input data is set to be empty. Updates to each group's +state will be saved across invocations. + +The function should take parameters (key, Iterator[`pandas.DataFrame`], state) and +returns another Iterator[`pandas.DataFrame`]. The grouping key(s) will be passed as a tuple +of numpy data types, e.g., `numpy.int32` and `numpy.float64`. The state will be passed as +:class:`pyspark.sql.streaming.state.GroupState`. + +For each group, all columns are passed together as `pandas.DataFrame` to the user-function, +and the returned `pandas.DataFrame` across all invocations are combined as a +:class:`DataFrame`. Note that the user function should loop through and process all +elements in the iterator. The user function should not make a guess of the number of +elements in the iterator. + +The `outputStructType` should be a :class:`StructType` describing the schema of all +elements in the returned value, `pandas.DataFrame`. The column labels of all elements in +returned `pandas.DataFrame` must either match the field names in the defined schema if +specified as strings, or match the field data types by position if not strings, +e.g. integer indices. + +The `stateStructType` should be :class:`StructType` describing the schema of the +user-defined state. The value of the state will be presented as a tuple, as well as the +update should be performed with the tuple. The corresponding Python types for +:class:DataType are supported. Please refer to the page +https://spark.apache.org/docs/latest/sql-ref-datatypes.html (python tab). + +The size of each DataFrame in both the input and output can be arbitrary. The number of +DataFrames in both the input and output can also be arbitrary. + +.. versionadded:: 3.4.0 + +Parameters +-- +func : function +a Python native function to be called on every group. It should take parameters +(key, Iterator[`pandas.DataFrame`], state) and return Iterator[`pandas.DataFrame`]. +Note that the type of the key is tuple and the type of the state is +:class:`pyspark.sql.streaming.state.GroupState`. +outputStructType : :class:`pyspark.sql.types.DataType` or str +the type of the output records. The value can be either a +:class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. +stateStructType : :class:`pyspark.sql.types.DataType` or str +the type of the user-defined state. The value can be either a +:class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. +outputMode : str +the output mode of the function. +timeoutConf : str +timeout configuration for groups that do not receive data for a while. valid values +are defined in :class:`pyspark.sql.streaming.state.GroupStateTimeout`. 
+ +# TODO: Examples Review Comment: https://issues.apache.org/jira/browse/SPARK-40509 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
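Since the examples section is still a TODO (tracked in SPARK-40509), here is a hedged sketch of how the API described above could be used; the streaming source `words`, the schema strings, and the per-key counting logic are assumptions for illustration, not taken from the PR.

```python
import pandas as pd
from pyspark.sql.streaming.state import GroupStateTimeout

output_schema = "word STRING, count LONG"
state_schema = "count LONG"

def count_words(key, pdf_iter, state):
    # State is read and written as a tuple matching `state_schema`.
    (count,) = state.get if state.exists else (0,)
    for pdf in pdf_iter:
        count += len(pdf)
    state.update((count,))
    # The grouping key arrives as a tuple, one element per grouping column.
    yield pd.DataFrame({"word": [key[0]], "count": [count]})

# `words` is assumed to be a streaming DataFrame with a single `word` column.
counts = words.groupBy("word").applyInPandasWithState(
    count_words, output_schema, state_schema, "update", GroupStateTimeout.NoTimeout
)
```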
[GitHub] [spark] zhengruifeng commented on pull request #37947: [SPARK-40500][PS] Use `pd.items` instead of `pd.iteritems`
zhengruifeng commented on PR #37947: URL: https://github.com/apache/spark/pull/37947#issuecomment-1253073004 The remaining `iteritems` occurrences in `frame.py` and `test_dataframe.py` are the definition and tests of PS's `iteritems` itself, so I think we should not modify them. As for deprecating PS's `iteritems`, I think we can deprecate it now. WDYT @itholic @HyukjinKwon @Yikun -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] itholic commented on a diff in pull request #37923: [SPARK-40334][PS] Implement `GroupBy.prod`
itholic commented on code in PR #37923: URL: https://github.com/apache/spark/pull/37923#discussion_r975910504 ## python/pyspark/pandas/groupby.py: ## @@ -993,6 +993,105 @@ def nth(self, n: int) -> FrameLike: return self._prepare_return(DataFrame(internal)) +def prod(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameLike: +""" +Compute prod of groups. + +.. versionadded:: 3.4.0 + +Parameters +-- +numeric_only : bool, default False +Include only float, int, boolean columns. If None, will attempt to use +everything, then use only numeric data. + +min_count: int, default 0 +The required number of valid values to perform the operation. +If fewer than min_count non-NA values are present the result will be NA. + +Returns +--- +Series or DataFrame + +See Also + +pyspark.pandas.Series.groupby +pyspark.pandas.DataFrame.groupby + +Examples + +>>> df = ps.DataFrame({'A': [1, 1, 2, 1, 2], +...'B': [np.nan, 2, 3, 4, 5], +...'C': [1, 2, 1, 1, 2], +...'D': [True, False, True, False, True]}) Review Comment: nit: formatting ```suggestion >>> df = ps.DataFrame( ... { ... "A": [1, 1, 2, 1, 2], ... "B": [np.nan, 2, 3, 4, 5], ... "C": [1, 2, 1, 1, 2], ... "D": [True, False, True, False, True], ... } ... ) ``` ## python/pyspark/pandas/groupby.py: ## @@ -993,6 +993,105 @@ def nth(self, n: int) -> FrameLike: return self._prepare_return(DataFrame(internal)) +def prod(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameLike: +""" +Compute prod of groups. + +.. versionadded:: 3.4.0 + +Parameters +-- +numeric_only : bool, default False +Include only float, int, boolean columns. If None, will attempt to use +everything, then use only numeric data. + +min_count: int, default 0 +The required number of valid values to perform the operation. +If fewer than min_count non-NA values are present the result will be NA. + +Returns +--- +Series or DataFrame + +See Also + +pyspark.pandas.Series.groupby +pyspark.pandas.DataFrame.groupby + +Examples + +>>> df = ps.DataFrame({'A': [1, 1, 2, 1, 2], +...'B': [np.nan, 2, 3, 4, 5], +...'C': [1, 2, 1, 1, 2], +...'D': [True, False, True, False, True]}) + +Groupby one column and return the prod of the remaining columns in +each group. + +>>> df.groupby('A').prod().sort_index() + B C D +A +1 8.0 2 0 +2 15.0 2 1 + +>>> df.groupby('A').prod(min_count=3).sort_index() + B C D +A +1 NaN 2.0 0.0 +2 NaN NaN NaN +""" + +self._validate_agg_columns(numeric_only=numeric_only, function_name="prod") + +groupkey_names = [SPARK_INDEX_NAME_FORMAT(i) for i in range(len(self._groupkeys))] +internal, agg_columns, sdf = self._prepare_reduce( +groupkey_names=groupkey_names, +accepted_spark_types=(NumericType, BooleanType), +bool_to_numeric=True, +) + +psdf: DataFrame = DataFrame(internal) +if len(psdf._internal.column_labels) > 0: + +stat_exprs = [] +for label in psdf._internal.column_labels: +tmp_count_column = verify_temp_column_name(sdf, "__tmp_%s_count_col__" % label[0]) +psser = psdf._psser_for(label) +column = psser._dtype_op.nan_to_null(psser).spark.column +data_type = psser.spark.data_type + +if isinstance(data_type, IntegralType): + stat_exprs.append(F.product(column).cast(data_type).alias(label[0])) +else: +stat_exprs.append(F.product(column).alias(label[0])) Review Comment: What about defining a `label[0]` as a variable since it's used in multiple places ? 
[GitHub] [spark] HeartSaVioR commented on a diff in pull request #37893: [SPARK-40434][SS][PYTHON] Implement applyInPandasWithState in PySpark
HeartSaVioR commented on code in PR #37893: URL: https://github.com/apache/spark/pull/37893#discussion_r975910639 ## python/pyspark/sql/pandas/group_ops.py: ## @@ -216,6 +218,104 @@ def applyInPandas( jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr()) return DataFrame(jdf, self.session) +def applyInPandasWithState( +self, +func: "PandasGroupedMapFunctionWithState", +outputStructType: Union[StructType, str], +stateStructType: Union[StructType, str], +outputMode: str, +timeoutConf: str, +) -> DataFrame: +""" +Applies the given function to each group of data, while maintaining a user-defined +per-group state. The result Dataset will represent the flattened record returned by the +function. + +For a streaming Dataset, the function will be invoked first for all input groups and then +for all timed out states where the input data is set to be empty. Updates to each group's +state will be saved across invocations. + +The function should take parameters (key, Iterator[`pandas.DataFrame`], state) and +returns another Iterator[`pandas.DataFrame`]. The grouping key(s) will be passed as a tuple +of numpy data types, e.g., `numpy.int32` and `numpy.float64`. The state will be passed as +:class:`pyspark.sql.streaming.state.GroupState`. + +For each group, all columns are passed together as `pandas.DataFrame` to the user-function, +and the returned `pandas.DataFrame` across all invocations are combined as a +:class:`DataFrame`. Note that the user function should loop through and process all +elements in the iterator. The user function should not make a guess of the number of +elements in the iterator. + +The `outputStructType` should be a :class:`StructType` describing the schema of all +elements in the returned value, `pandas.DataFrame`. The column labels of all elements in +returned `pandas.DataFrame` must either match the field names in the defined schema if +specified as strings, or match the field data types by position if not strings, +e.g. integer indices. + +The `stateStructType` should be :class:`StructType` describing the schema of the +user-defined state. The value of the state will be presented as a tuple, as well as the +update should be performed with the tuple. The corresponding Python types for +:class:DataType are supported. Please refer to the page +https://spark.apache.org/docs/latest/sql-ref-datatypes.html (python tab). + +The size of each DataFrame in both the input and output can be arbitrary. The number of +DataFrames in both the input and output can also be arbitrary. + +.. versionadded:: 3.4.0 + +Parameters +-- +func : function +a Python native function to be called on every group. It should take parameters +(key, Iterator[`pandas.DataFrame`], state) and return Iterator[`pandas.DataFrame`]. +Note that the type of the key is tuple and the type of the state is +:class:`pyspark.sql.streaming.state.GroupState`. +outputStructType : :class:`pyspark.sql.types.DataType` or str +the type of the output records. The value can be either a +:class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. +stateStructType : :class:`pyspark.sql.types.DataType` or str +the type of the user-defined state. The value can be either a +:class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. +outputMode : str +the output mode of the function. +timeoutConf : str +timeout configuration for groups that do not receive data for a while. valid values +are defined in :class:`pyspark.sql.streaming.state.GroupStateTimeout`. 
+ +# TODO: Examples Review Comment: I just added a simple example - let me come up with a full example in the examples directory. I'll file a new JIRA ticket for this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
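Until that follow-up example lands, here is a rough, hedged sketch of how the API described in the docstring above could be called from user code. The streaming source, the schemas, and the running-count logic are assumptions for illustration only, not code from this PR.

```python
import pandas as pd
from typing import Any, Iterator, Tuple

from pyspark.sql import SparkSession
from pyspark.sql.streaming.state import GroupState, GroupStateTimeout

spark = SparkSession.builder.getOrCreate()

# Hypothetical streaming source with a single string column named "value".
words = (
    spark.readStream.format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)


def count_words(
    key: Tuple[Any, ...], pdfs: Iterator[pd.DataFrame], state: GroupState
) -> Iterator[pd.DataFrame]:
    # The per-key state is a one-field tuple holding the running count.
    running = state.get[0] if state.exists else 0
    for pdf in pdfs:
        running += len(pdf)
    state.update((running,))
    yield pd.DataFrame({"word": [key[0]], "count": [running]})


counts = words.groupBy("value").applyInPandasWithState(
    count_words,
    outputStructType="word STRING, count LONG",
    stateStructType="count LONG",
    outputMode="update",
    timeoutConf=GroupStateTimeout.NoTimeout,
)

# Write the running counts to the console for inspection.
query = counts.writeStream.outputMode("update").format("console").start()
```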
[GitHub] [spark] LuciferYang commented on pull request #37938: [SPARK-40490][YARN][TESTS] Ensure `YarnShuffleIntegrationSuite` tests registeredExecFile reload scenarios
LuciferYang commented on PR #37938: URL: https://github.com/apache/spark/pull/37938#issuecomment-1253038381 GA passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] itholic commented on a diff in pull request #37929: [SPARK-40486][PS] Implement `spearman` and `kendall` in `DataFrame.corrwith`
itholic commented on code in PR #37929: URL: https://github.com/apache/spark/pull/37929#discussion_r975906358 ## python/pyspark/pandas/frame.py: ## @@ -1847,14 +1665,14 @@ def corrwith( -- other : DataFrame, Series Object with which to compute correlations. - +axis : int, default 0 or 'index' +Can only be set to 0 at the moment. drop : bool, default False Drop missing indices from result. - -method : str, default 'pearson' -Method of correlation, one of: - +method : {'pearson', 'spearman', 'kendall'} Review Comment: qq: do we also need to implement `callable` as pandas does? (screenshot: https://user-images.githubusercontent.com/44108233/191385853-2462b16c-84c4-4c37-8f71-9e3deb0bb4d6.png) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
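For reference, the three string-valued methods discussed here would be exercised roughly as below (a minimal sketch assuming the spearman/kendall support from this PR; whether a `callable` should also be accepted is exactly the open question above):

```python
import pyspark.pandas as ps

df1 = ps.DataFrame({"a": [1, 2, 3, 4], "b": [4.0, 3.0, 2.0, 1.0]})
df2 = ps.DataFrame({"a": [2, 3, 4, 6], "b": [1.0, 1.0, 2.0, 2.0]})

# Pairwise correlation of matching columns between the two frames.
for method in ("pearson", "spearman", "kendall"):
    print(method)
    print(df1.corrwith(df2, method=method))
```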
[GitHub] [spark] HeartSaVioR commented on a diff in pull request #37893: [SPARK-40434][SS][PYTHON] Implement applyInPandasWithState in PySpark
HeartSaVioR commented on code in PR #37893: URL: https://github.com/apache/spark/pull/37893#discussion_r975902646 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala: ## @@ -311,6 +323,56 @@ object UnsupportedOperationChecker extends Logging { } } +// applyInPandasWithState +case m: FlatMapGroupsInPandasWithState if m.isStreaming => + // Check compatibility with output modes and aggregations in query + val aggsInQuery = collectStreamingAggregates(plan) + + if (aggsInQuery.isEmpty) { +// applyInPandasWithState without aggregation: operation's output mode must Review Comment: Now I can imagine a case in which the current requirement of providing a separate output mode prevents unintentional behavior: - They implemented the user function for flatMapGroupsWithState with append mode. - They ran the query with append mode. - Later, they changed the output mode of the query to update mode for some reason. - The user function was not updated to account for the change to update mode. Today we do not allow such a query to run, but we would allow it if we dropped the parameter. PS. I'm not a believer that end users can implement their user function correctly based on the output mode, but that is a fundamental API design issue of the original flatMapGroupsWithState, which is a separate matter. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] tedyu commented on pull request #37952: [SPARK-40508][SQL] Treat unknown partitioning as UnknownPartitioning
tedyu commented on PR #37952: URL: https://github.com/apache/spark/pull/37952#issuecomment-1253011133 @sunchao https://github.com/tedyu/spark/runs/8459534296 shows that all tests have passed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] xiaonanyang-db commented on a diff in pull request #37933: [SPARK-40474][SQL] Infer columns with mixed date and timestamp as String in CSV schema inference
xiaonanyang-db commented on code in PR #37933: URL: https://github.com/apache/spark/pull/37933#discussion_r975634588 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala: ## @@ -183,7 +180,9 @@ class CSVOptions( Some(parameters.getOrElse("timestampFormat", s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX")) } else { - parameters.get("timestampFormat") + // Use Iso8601TimestampFormatter (with strict timestamp parsing) to + // avoid parsing dates in timestamp columns as timestamp type Review Comment: Totally agree with your concerns @cloud-fan @sadikovi. After a quick discussion within my team, we agreed not to change these lines, to avoid unnecessary regressions and other behavior changes. Thus, the behavior after this PR becomes: - If the user provides a `timestampFormat/timestampNTZFormat`, we strictly parse fields as timestamps according to that format, so columns with a mix of dates and timestamps will always be inferred as `StringType`. - If no `timestampFormat/timestampNTZFormat` is specified by the user, then for a column with a mix of dates and timestamps: - If date values come before timestamp values: if `prefersDate=true`, the column will be inferred as `StringType`; otherwise it could be inferred as timestamp or string type, depending on whether the date format is supported by the lenient timestamp formatter. - If timestamp values come before date values: the column could be inferred as timestamp or string type, depending on whether the date format is supported by the lenient timestamp formatter. There is no behavior change when `prefersDate=false`. Does this make sense to you? @sadikovi @cloud-fan cc @brkyvz @Yaohua628 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
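As a concrete illustration of the first bullet above (a user-supplied `timestampFormat` plus a column mixing dates and timestamps), here is a hedged sketch of what the resulting inference might look like; the file path and option values are assumptions, and exact defaults can differ between Spark versions.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# A single column that mixes a plain date with a full timestamp.
path = "/tmp/mixed_dates.csv"
with open(path, "w") as f:
    f.write("c\n2022-01-01\n2022-01-01 12:34:56\n")

df = (
    spark.read.option("header", True)
    .option("inferSchema", True)
    .option("prefersDate", True)
    .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
    .csv("file://" + path)
)

# With a user-provided timestampFormat, the mixed column is expected to fall
# back to string, per the behavior described in the comment above.
df.printSchema()
```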
[GitHub] [spark] wbo4958 commented on pull request #37855: [SPARK-40407][SQL] Fix the potential data skew caused by df.repartition
wbo4958 commented on PR #37855: URL: https://github.com/apache/spark/pull/37855#issuecomment-1252980054 @cloud-fan could you help to review it? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #37949: [SPARK-40504][YARN] Make yarn appmaster load config from client
AmplabJenkins commented on PR #37949: URL: https://github.com/apache/spark/pull/37949#issuecomment-1252898141 Can one of the admins verify this patch? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #37950: [SPARK-40505][K8S] Remove min heap setting for executor in entrypoint.sh
AmplabJenkins commented on PR #37950: URL: https://github.com/apache/spark/pull/37950#issuecomment-1252898093 Can one of the admins verify this patch? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #37951: [SPARK-40506]Spark Streaming metrics name doesn't need application name
AmplabJenkins commented on PR #37951: URL: https://github.com/apache/spark/pull/37951#issuecomment-1252898051 Can one of the admins verify this patch? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HeartSaVioR commented on a diff in pull request #37893: [SPARK-40434][SS][PYTHON] Implement applyInPandasWithState in PySpark
HeartSaVioR commented on code in PR #37893: URL: https://github.com/apache/spark/pull/37893#discussion_r975770234 ## sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasWithStateExec.scala: ## @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.python + +import org.apache.spark.TaskContext +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.{EventTimeTimeout, ProcessingTimeTimeout} +import org.apache.spark.sql.catalyst.plans.physical.Distribution +import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.python.PandasGroupUtils.resolveArgOffsets +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.GroupStateImpl.NO_TIMESTAMP +import org.apache.spark.sql.execution.streaming.state.FlatMapGroupsWithStateExecHelper.StateData +import org.apache.spark.sql.execution.streaming.state.StateStore +import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.util.CompletionIterator + +/** + * Physical operator for executing + * [[org.apache.spark.sql.catalyst.plans.logical.FlatMapGroupsInPandasWithState]] + * + * @param functionExpr function called on each group + * @param groupingAttributes used to group the data + * @param outAttributes used to define the output rows + * @param stateType used to serialize/deserialize state before calling `functionExpr` + * @param stateInfo `StatefulOperatorStateInfo` to identify the state store for a given operator. + * @param stateFormatVersion the version of state format. + * @param outputMode the output mode of `functionExpr` + * @param timeoutConf used to timeout groups that have not received data in a while + * @param batchTimestampMs processing timestamp of the current batch. + * @param eventTimeWatermark event time watermark for the current batch + * @param child logical plan of the underlying data + */ +case class FlatMapGroupsInPandasWithStateExec( Review Comment: We always have a separate exec implementation for Scala/Java vs Python since the constructor parameters are different. (We are leveraging case class for logical/physical plan, so difference of the constructor parameters warrants a new class.) So this is intentional. 
As a compromise, we refactored the code to extract FlatMapGroupsWithStateExecBase as a shared base class. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HeartSaVioR commented on a diff in pull request #37893: [SPARK-40434][SS][PYTHON] Implement applyInPandasWithState in PySpark
HeartSaVioR commented on code in PR #37893: URL: https://github.com/apache/spark/pull/37893#discussion_r975800900 ## sql/core/src/main/scala/org/apache/spark/sql/execution/python/ApplyInPandasWithStateWriter.scala: ## @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.python + +import scala.collection.JavaConverters._ + +import org.apache.arrow.vector.{FieldVector, VectorSchemaRoot} +import org.apache.arrow.vector.ipc.ArrowStreamWriter + +import org.apache.spark.sql.Row +import org.apache.spark.sql.api.python.PythonSQLUtils +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow} +import org.apache.spark.sql.execution.arrow.ArrowWriter +import org.apache.spark.sql.execution.arrow.ArrowWriter.createFieldWriter +import org.apache.spark.sql.execution.streaming.GroupStateImpl +import org.apache.spark.sql.types.{BinaryType, BooleanType, IntegerType, StringType, StructField, StructType} +import org.apache.spark.unsafe.types.UTF8String + +/** + * This class abstracts the complexity on constructing Arrow RecordBatches for data and state with + * bin-packing and chunking. The caller only need to call the proper public methods of this class + * `startNewGroup`, `writeRow`, `finalizeGroup`, `finalizeData` and this class will write the data + * and state into Arrow RecordBatches with performing bin-pack and chunk internally. + * + * This class requires that the parameter `root` has been initialized with the Arrow schema like + * below: + * - data fields + * - state field + * - nested schema (Refer ApplyInPandasWithStateWriter.STATE_METADATA_SCHEMA) + * + * Please refer the code comment in the implementation to see how the writes of data and state + * against Arrow RecordBatch work with consideration of bin-packing and chunking. + */ +class ApplyInPandasWithStateWriter( +root: VectorSchemaRoot, +writer: ArrowStreamWriter, +arrowMaxRecordsPerBatch: Int) { + + import ApplyInPandasWithStateWriter._ + + // Unlike applyInPandas (and other PySpark operators), applyInPandasWithState requires to produce + // the additional data `state`, along with the input data. + // + // ArrowStreamWriter supports only single VectorSchemaRoot, which means all Arrow RecordBatches + // being sent out from ArrowStreamWriter should have same schema. That said, we have to construct + // "an" Arrow schema to contain both types of data, and also construct Arrow RecordBatches to + // contain both data. + // + // To achieve this, we extend the schema for input data to have a column for state at the end. 
+ // But also, we logically group the columns by family (data vs state) and initialize writer + // separately, since it's lot more easier and probably performant to write the row directly + // rather than projecting the row to match up with the overall schema. + // + // Although Arrow RecordBatch enables to write the data as columnar, we figure out it gives + // strange outputs if we don't ensure that all columns have the same number of values. Since + // there are at least one data for a grouping key (we ensure this for the case of handling timed + // out state as well) whereas there is only one state for a grouping key, we have to fill up the + // empty rows in state side to ensure both have the same number of rows. + private val arrowWriterForData = createArrowWriter( +root.getFieldVectors.asScala.toSeq.dropRight(1)) + private val arrowWriterForState = createArrowWriter( +root.getFieldVectors.asScala.toSeq.takeRight(1)) + + // - Bin-packing + // + // We apply bin-packing the data from multiple groups into one Arrow RecordBatch to + // gain the performance. In many cases, the amount of data per grouping key is quite + // small, which does not seem to maximize the benefits of using Arrow. + // + // We have to split the record batch down to each group in Python worker to convert the + // data for group to Pandas, but hopefully, Arrow RecordBatch provides the way to split + // the range of data and give a view, say, "zero-copy". To help splitting the range for + // data,
[GitHub] [spark] sunchao commented on a diff in pull request #37952: [SPARK-40508][SQL] Treat unknown partitioning as UnknownPartitioning
sunchao commented on code in PR #37952: URL: https://github.com/apache/spark/pull/37952#discussion_r975791836 ## sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanPartitioningAndOrdering.scala: ## @@ -54,8 +55,9 @@ object V2ScanPartitioningAndOrdering extends Rule[LogicalPlan] with SQLConfHelpe } } case _: UnknownPartitioning => None -case p => throw new IllegalArgumentException("Unsupported data source V2 partitioning " + -"type: " + p.getClass.getSimpleName) +case p => + logWarning("Spark ignores the partitioning. Please use KeyGroupedPartitioning for better performance") Review Comment: nit: could we also log the class name here? e.g., ``` Spark ignores partitioning ${p.getClass.getSimpleName}. Please use KeyGroupedPartitioning for better performance ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] grundprinzip commented on a diff in pull request #37710: [SPARK-40448][CONNECT] Spark Connect build as Driver Plugin with Shaded Dependencies
grundprinzip commented on code in PR #37710: URL: https://github.com/apache/spark/pull/37710#discussion_r975782260 ## dev/deps/spark-deps-hadoop-3-hive-2.3: ## @@ -60,10 +62,20 @@ datanucleus-core/4.1.17//datanucleus-core-4.1.17.jar datanucleus-rdbms/4.1.19//datanucleus-rdbms-4.1.19.jar derby/10.14.2.0//derby-10.14.2.0.jar dropwizard-metrics-hadoop-metrics2-reporter/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +error_prone_annotations/2.10.0//error_prone_annotations-2.10.0.jar +failureaccess/1.0.1//failureaccess-1.0.1.jar flatbuffers-java/1.12.0//flatbuffers-java-1.12.0.jar gcs-connector/hadoop3-2.2.7/shaded/gcs-connector-hadoop3-2.2.7-shaded.jar generex/1.0.2//generex-1.0.2.jar gmetric4j/1.0.10//gmetric4j-1.0.10.jar +grpc-api/1.47.0//grpc-api-1.47.0.jar Review Comment: Yes, the dependencies are shaded, but the `dev/test-dependencies.sh` script fails if they are not listed here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] tedyu commented on pull request #37952: [SPARK-40508] Treat unknown partitioning as UnknownPartitioning
tedyu commented on PR #37952: URL: https://github.com/apache/spark/pull/37952#issuecomment-1252871488 @sunchao Please take another look. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] HeartSaVioR commented on a diff in pull request #37893: [SPARK-40434][SS][PYTHON] Implement applyInPandasWithState in PySpark
HeartSaVioR commented on code in PR #37893: URL: https://github.com/apache/spark/pull/37893#discussion_r975780795 ## sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasWithStateExec.scala: ## @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.python + +import org.apache.spark.TaskContext +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.{EventTimeTimeout, ProcessingTimeTimeout} +import org.apache.spark.sql.catalyst.plans.physical.Distribution +import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.python.PandasGroupUtils.resolveArgOffsets +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.GroupStateImpl.NO_TIMESTAMP +import org.apache.spark.sql.execution.streaming.state.FlatMapGroupsWithStateExecHelper.StateData +import org.apache.spark.sql.execution.streaming.state.StateStore +import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.util.CompletionIterator + +/** + * Physical operator for executing + * [[org.apache.spark.sql.catalyst.plans.logical.FlatMapGroupsInPandasWithState]] + * + * @param functionExpr function called on each group + * @param groupingAttributes used to group the data + * @param outAttributes used to define the output rows + * @param stateType used to serialize/deserialize state before calling `functionExpr` + * @param stateInfo `StatefulOperatorStateInfo` to identify the state store for a given operator. + * @param stateFormatVersion the version of state format. + * @param outputMode the output mode of `functionExpr` + * @param timeoutConf used to timeout groups that have not received data in a while + * @param batchTimestampMs processing timestamp of the current batch. 
+ * @param eventTimeWatermark event time watermark for the current batch + * @param child logical plan of the underlying data + */ +case class FlatMapGroupsInPandasWithStateExec( +functionExpr: Expression, +groupingAttributes: Seq[Attribute], +outAttributes: Seq[Attribute], +stateType: StructType, +stateInfo: Option[StatefulOperatorStateInfo], +stateFormatVersion: Int, +outputMode: OutputMode, +timeoutConf: GroupStateTimeout, +batchTimestampMs: Option[Long], +eventTimeWatermark: Option[Long], +child: SparkPlan) extends UnaryExecNode with FlatMapGroupsWithStateExecBase { + + // TODO(SPARK-40444): Add the support of initial state. + override protected val initialStateDeserializer: Expression = null + override protected val initialStateGroupAttrs: Seq[Attribute] = null + override protected val initialStateDataAttrs: Seq[Attribute] = null + override protected val initialState: SparkPlan = null + override protected val hasInitialState: Boolean = false + + override protected val stateEncoder: ExpressionEncoder[Any] = +RowEncoder(stateType).resolveAndBind().asInstanceOf[ExpressionEncoder[Any]] + + override def output: Seq[Attribute] = outAttributes + + private val sessionLocalTimeZone = conf.sessionLocalTimeZone + private val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf) + + private val pythonFunction = functionExpr.asInstanceOf[PythonUDF].func + private val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonFunction))) + private lazy val (dedupAttributes, argOffsets) = resolveArgOffsets( +groupingAttributes ++ child.output, groupingAttributes) + private lazy val unsafeProj = UnsafeProjection.create(dedupAttributes, child.output) + + override def requiredChildDistribution: Seq[Distribution] = +StatefulOperatorPartitioning.getCompatibleDistribution( + groupingAttributes, getStateInfo, conf) :: Nil + + override def requiredChildOrdering:
[GitHub] [spark] grundprinzip commented on a diff in pull request #37710: [SPARK-40448][CONNECT] Spark Connect build as Driver Plugin with Shaded Dependencies
grundprinzip commented on code in PR #37710: URL: https://github.com/apache/spark/pull/37710#discussion_r975731244 ## project/SparkBuild.scala: ## @@ -753,6 +815,7 @@ object OldDeps { } def oldDepsSettings() = Defaults.coreDefaultSettings ++ Seq( +PB.protocVersion := "3.21.1", Review Comment: Generalized the version into a variable. ## connect/src/main/scala/org/apache/spark/sql/sparkconnect/command/SparkConnectCommandPlanner.scala: ## @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sparkconnect.command + +import com.google.common.collect.{Lists, Maps} +import scala.collection.JavaConverters._ + +import org.apache.spark.annotation.Experimental +import org.apache.spark.api.python.{PythonEvalType, SimplePythonFunction} +import org.apache.spark.connect.{proto => proto} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.execution.python.UserDefinedPythonFunction +import org.apache.spark.sql.types.StringType + +@Experimental +class SparkConnectCommandPlanner(session: SparkSession, command: proto.Command) { Review Comment: Done, added `@Since("3.3.1")`, is this correct? ## project/SparkBuild.scala: ## @@ -357,7 +366,10 @@ object SparkBuild extends PomBuild { // To prevent intermittent compilation failures, see also SPARK-33297 // Apparently we can remove this when we use JDK 11. -Test / classLoaderLayeringStrategy := ClassLoaderLayeringStrategy.Flat +Test / classLoaderLayeringStrategy := ClassLoaderLayeringStrategy.Flat, + +// BUG fuck me Review Comment: Done. The SBT build was a major pain. Sorry for the leftover. 
## connect/pom.xml: ## @@ -0,0 +1,281 @@ + + + +http://maven.apache.org/POM/4.0.0; xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance; + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd;> +4.0.0 + +org.apache.spark +spark-parent_2.12 +3.4.0-SNAPSHOT +../pom.xml + + +spark-connect_2.12 +jar +Spark Project Connect +https://spark.apache.org/ + + + org.sparkproject.connect + +connect +3.21.1 +31.0.1-jre +1.47.0 +6.0.53 + + + + +org.apache.spark +spark-core_${scala.binary.version} +${project.version} +provided + + +com.google.guava +guava + + + + +org.apache.spark +spark-core_${scala.binary.version} +${project.version} +test-jar +test + + +org.apache.spark +spark-catalyst_${scala.binary.version} +${project.version} +provided + + +com.google.guava +guava + + + + +org.apache.spark +spark-sql_${scala.binary.version} +${project.version} +provided + + +com.google.guava +guava + + + + + +com.google.guava +guava +31.0.1-jre +compile + + +com.google.guava +failureaccess +1.0.1 + + +io.grpc +grpc-netty-shaded +${io.grpc.version} + + +io.grpc +grpc-protobuf +${io.grpc.version} + + +io.grpc +grpc-services +${io.grpc.version} + + +io.grpc +grpc-stub +${io.grpc.version} + + +org.apache.tomcat +annotations-api +${tomcat.annotations.api.version} +provided + + +org.scalacheck +scalacheck_${scala.binary.version} +
[GitHub] [spark] sunchao commented on pull request #37952: Treat unknown partitioning as UnknownPartitioning
sunchao commented on PR #37952: URL: https://github.com/apache/spark/pull/37952#issuecomment-1252841711 I guess this PR makes sense. @tedyu could you: - create a Spark JIRA for this issue and update the PR title to reflect it? - add a warning message too? Clients may expect Spark to use the partitioning they reported and could be surprised that Spark internally ignores it, so a warning message would help them debug. I think the best solution is for connectors such as Cassandra to adopt the new API; otherwise they could see severe performance penalties. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] kazuyukitanimura commented on a diff in pull request #37934: [SPARK-40477][SQL] Support `NullType` in `ColumnarBatchRow`
kazuyukitanimura commented on code in PR #37934: URL: https://github.com/apache/spark/pull/37934#discussion_r975762637 ## sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala: ## @@ -1461,10 +1462,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession vectorizedReader.initBatch(schema, partitionValues) vectorizedReader.nextKeyValue() val row = vectorizedReader.getCurrentValue.asInstanceOf[InternalRow] - - // Use `GenericMutableRow` by explicitly copying rather than `ColumnarBatch` - // in order to use get(...) method which is not implemented in `ColumnarBatch`. - val actual = row.copy().get(1, dt) Review Comment: Thanks, actually we decided to close this PR. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] kazuyukitanimura closed pull request #37934: [SPARK-40477][SQL] Support `NullType` in `ColumnarBatchRow`
kazuyukitanimura closed pull request #37934: [SPARK-40477][SQL] Support `NullType` in `ColumnarBatchRow` URL: https://github.com/apache/spark/pull/37934 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] kazuyukitanimura commented on pull request #37934: [SPARK-40477][SQL] Support `NullType` in `ColumnarBatchRow`
kazuyukitanimura commented on PR #37934: URL: https://github.com/apache/spark/pull/37934#issuecomment-1252840241 We gave this another thought and decided to close this PR as not-to-be-fixed. There is no natural code path that calls ColumnarBatchRow.get() for NullType columns; in particular, NullType cannot be stored as a partition value in a columnar format like Parquet. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] kazuyukitanimura commented on a diff in pull request #37934: [SPARK-40477][SQL] Support `NullType` in `ColumnarBatchRow`
kazuyukitanimura commented on code in PR #37934: URL: https://github.com/apache/spark/pull/37934#discussion_r975761631 ## sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala: ## @@ -1461,10 +1462,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession vectorizedReader.initBatch(schema, partitionValues) vectorizedReader.nextKeyValue() val row = vectorizedReader.getCurrentValue.asInstanceOf[InternalRow] - - // Use `GenericMutableRow` by explicitly copying rather than `ColumnarBatch` - // in order to use get(...) method which is not implemented in `ColumnarBatch`. Review Comment: I thought so initially, but now I cannot find `ColumnarBatchRow.get` usages on `NullType` columns -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] grundprinzip commented on a diff in pull request #37710: [SPARK-40448][CONNECT] Spark Connect build as Driver Plugin with Shaded Dependencies
grundprinzip commented on code in PR #37710: URL: https://github.com/apache/spark/pull/37710#discussion_r975725801 ## dev/infra/Dockerfile: ## @@ -65,3 +65,6 @@ RUN Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='ht # See more in SPARK-39735 ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" + +# Add Python Deps for Spark Connect. +RUN python3.9 -m pip install grpcio protobuf Review Comment: Done. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] alex-balikov commented on a diff in pull request #37893: [SPARK-40434][SS][PYTHON] Implement applyInPandasWithState in PySpark
alex-balikov commented on code in PR #37893: URL: https://github.com/apache/spark/pull/37893#discussion_r975687838 ## sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasWithStateExec.scala: ## @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.python + +import org.apache.spark.TaskContext +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.{EventTimeTimeout, ProcessingTimeTimeout} +import org.apache.spark.sql.catalyst.plans.physical.Distribution +import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.python.PandasGroupUtils.resolveArgOffsets +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.GroupStateImpl.NO_TIMESTAMP +import org.apache.spark.sql.execution.streaming.state.FlatMapGroupsWithStateExecHelper.StateData +import org.apache.spark.sql.execution.streaming.state.StateStore +import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.util.CompletionIterator + +/** + * Physical operator for executing + * [[org.apache.spark.sql.catalyst.plans.logical.FlatMapGroupsInPandasWithState]] + * + * @param functionExpr function called on each group + * @param groupingAttributes used to group the data + * @param outAttributes used to define the output rows + * @param stateType used to serialize/deserialize state before calling `functionExpr` + * @param stateInfo `StatefulOperatorStateInfo` to identify the state store for a given operator. + * @param stateFormatVersion the version of state format. + * @param outputMode the output mode of `functionExpr` + * @param timeoutConf used to timeout groups that have not received data in a while + * @param batchTimestampMs processing timestamp of the current batch. + * @param eventTimeWatermark event time watermark for the current batch + * @param child logical plan of the underlying data + */ +case class FlatMapGroupsInPandasWithStateExec( Review Comment: I wonder if this can be merged with the regular FlatMapGroupsWithStateExec. Maybe as a followup cleanup. ## sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasWithStateExec.scala: ## @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.python + +import org.apache.spark.TaskContext +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.{EventTimeTimeout, ProcessingTimeTimeout} +import org.apache.spark.sql.catalyst.plans.physical.Distribution +import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode} +import
[GitHub] [spark] ueshin commented on a diff in pull request #35391: [SPARK-38098][PYTHON] Add support for ArrayType of nested StructType to arrow-based conversion
ueshin commented on code in PR #35391: URL: https://github.com/apache/spark/pull/35391#discussion_r975708413 ## python/pyspark/sql/tests/test_dataframe.py: ## @@ -1107,6 +1107,29 @@ def test_to_pandas_from_mixed_dataframe(self): pdf_with_only_nulls = self.spark.sql(sql).filter("tinyint is null").toPandas() self.assertTrue(np.all(pdf_with_only_nulls.dtypes == pdf_with_some_nulls.dtypes)) +@unittest.skipIf( +not have_pandas or not have_pyarrow, +cast(str, pandas_requirement_message or pyarrow_requirement_message), +) +def test_to_pandas_for_array_of_struct(self): Review Comment: Could you mark to skip this test if the underlying PyArrow is less than `2.0.0`? ## python/pyspark/sql/tests/test_pandas_udf_scalar.py: ## @@ -134,6 +134,30 @@ def test_pandas_udf_nested_arrays(self): result = df.select(tokenize("vals").alias("hi")) self.assertEqual([Row(hi=[["hi", "boo"]]), Row(hi=[["bye", "boo"]])], result.collect()) +def test_pandas_array_struct(self): Review Comment: ditto. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] ueshin commented on a diff in pull request #35391: [SPARK-38098][PYTHON] Add support for ArrayType of nested StructType to arrow-based conversion
ueshin commented on code in PR #35391: URL: https://github.com/apache/spark/pull/35391#discussion_r975699563 ## python/pyspark/sql/pandas/types.py: ## @@ -86,9 +86,23 @@ def to_arrow_type(dt: DataType) -> "pa.DataType": elif type(dt) == DayTimeIntervalType: arrow_type = pa.duration("us") elif type(dt) == ArrayType: -if type(dt.elementType) in [StructType, TimestampType]: +if type(dt.elementType) == TimestampType: raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) -arrow_type = pa.list_(to_arrow_type(dt.elementType)) +elif type(dt.elementType) == StructType: +if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): +raise TypeError( +"Array of StructType is only supported with pyarrow 2.0.0 and above" +) +dt_nested = dt.elementType +if any(type(field.dataType) == StructType for field in dt_nested): +raise TypeError("Nested StructType not supported in conversion to Arrow") +fields = [ +pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable) +for field in dt_nested +] +arrow_type = pa.list_(pa.struct(fields)) Review Comment: We can just use `arrow_type = pa.list_(to_arrow_type(dt.elementType))` here? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] gengliangwang closed pull request #37840: [SPARK-40416][SQL] Move subquery expression CheckAnalysis error messages to use the new error framework
gengliangwang closed pull request #37840: [SPARK-40416][SQL] Move subquery expression CheckAnalysis error messages to use the new error framework URL: https://github.com/apache/spark/pull/37840 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] gengliangwang commented on pull request #37840: [SPARK-40416][SQL] Move subquery expression CheckAnalysis error messages to use the new error framework
gengliangwang commented on PR #37840: URL: https://github.com/apache/spark/pull/37840#issuecomment-1252746471 Thanks, merging to master -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] tedyu commented on pull request #37952: Treat unknown partitioning as UnknownPartitioning
tedyu commented on PR #37952: URL: https://github.com/apache/spark/pull/37952#issuecomment-1252722669 I have run the test using the Cassandra Spark connector and a modified Spark (with this patch). The test passes (without any modification to the Cassandra Spark connector or client code). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] warrenzhu25 commented on pull request #37822: [SPARK-40381][DEPLOY] Support standalone worker recommission
warrenzhu25 commented on PR #37822: URL: https://github.com/apache/spark/pull/37822#issuecomment-1252713078 > Sorry, but I don't use Standalone cluster. Any ideas on who the right person to review this would be? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] dongjoon-hyun commented on pull request #37822: [SPARK-40381][DEPLOY] Support standalone worker recommission
dongjoon-hyun commented on PR #37822: URL: https://github.com/apache/spark/pull/37822#issuecomment-1252708251 Sorry, but I don't use Standalone cluster. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] dongjoon-hyun commented on a diff in pull request #37943: [WIP][SPARK-40497][BUILD] Upgrade Scala to 2.13.9
dongjoon-hyun commented on code in PR #37943: URL: https://github.com/apache/spark/pull/37943#discussion_r975658409 ## sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala: ## @@ -1044,7 +1044,7 @@ trait ShowCreateTableCommandBase extends SQLConfHelper { metadata .comment .map("COMMENT '" + escapeSingleQuotedString(_) + "'\n") - .foreach(builder.append) + .foreach(s => builder.append(s)) Review Comment: Thank you for checking. +1 for skipping Scala 2.13.9 and reusing this JIRA for 2.13.10. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
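For illustration, a minimal standalone sketch of the shape of that diff. The builder and comment values are made up, and whether the commented-out bare method reference actually fails depends on the exact 2.13.x compiler, so it is shown only as the form the diff moves away from:
```scala
object AppendSketch {
  def main(args: Array[String]): Unit = {
    val builder = new StringBuilder
    val comment: Option[String] = Some("demo comment")

    // Form before the diff (bare reference to the overloaded append method):
    // comment.map("COMMENT '" + _ + "'\n").foreach(builder.append)

    // Form after the diff: the explicit lambda pins the overload to append(String).
    comment.map("COMMENT '" + _ + "'\n").foreach(s => builder.append(s))

    println(builder.toString)
  }
}
```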
[GitHub] [spark] xiaonanyang-db commented on a diff in pull request #37933: [SPARK-40474][SQL] Infer columns with mixed date and timestamp as String in CSV schema inference
xiaonanyang-db commented on code in PR #37933: URL: https://github.com/apache/spark/pull/37933#discussion_r975634588 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala: ## @@ -183,7 +180,9 @@ class CSVOptions( Some(parameters.getOrElse("timestampFormat", s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX")) } else { - parameters.get("timestampFormat") + // Use Iso8601TimestampFormatter (with strict timestamp parsing) to + // avoid parsing dates in timestamp columns as timestamp type Review Comment: Totally agree with your concerns @cloud-fan @sadikovi. After some quick discussion within my team, we agreed on not changing these lines to avoid unnecessary regressions and any other behavior changes. Thus, the behavior after this PR becomes:
- If the user provides a `timestampFormat/timestampNTZFormat`, we will strictly parse fields as timestamps according to that format. Thus, columns with mixed dates and timestamps will always be inferred as `StringType`.
- If no `timestampFormat/timestampNTZFormat` is specified by the user, then for a column with mixed dates and timestamps:
  - If date values appear before timestamp values:
    - If `prefersDate=true`, the column will be inferred as `StringType`
    - otherwise:
      - If the date format is supported by `Iso8601TimestampFormatter`, the column will be inferred as `TimestampType/TimestampNTZType`
      - otherwise, the column will be inferred as `StringType`
  - If timestamp values appear before date values:
    - If the date format is supported by `Iso8601TimestampFormatter`, the column will be inferred as `TimestampType/TimestampNTZType`
    - otherwise, the column will be inferred as `StringType`

There is no behavior change when `prefersDate=false`. Does this make sense to you? @sadikovi @cloud-fan cc @brkyvz @Yaohua628 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] warrenzhu25 commented on pull request #37822: [SPARK-40381][DEPLOY] Support standalone worker recommission
warrenzhu25 commented on PR #37822: URL: https://github.com/apache/spark/pull/37822#issuecomment-1252694429 @dongjoon-hyun Could you help take a look? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] bersprockets commented on pull request #37825: [SPARK-40382][SQL] Group distinct aggregate expressions by semantically equivalent children in `RewriteDistinctAggregates`
bersprockets commented on PR #37825: URL: https://github.com/apache/spark/pull/37825#issuecomment-1252637037 cc @beliefer @cloud-fan -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] grundprinzip commented on pull request #37710: [SPARK-40448][CONNECT] Spark Connect build as Driver Plugin with Shaded Dependencies
grundprinzip commented on PR #37710: URL: https://github.com/apache/spark/pull/37710#issuecomment-1252636546 @pan3793 thanks for the thorough review. I will address the comments shortly. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] tedyu commented on pull request #37952: Treat unknown partitioning as UnknownPartitioning
tedyu commented on PR #37952: URL: https://github.com/apache/spark/pull/37952#issuecomment-1252627778 If a custom partitioning that reports `UnknownPartitioning` to Spark can keep the 3.2.1 behavior, that means the current check is not desired. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] sunchao commented on pull request #37952: Treat unknown partitioning as UnknownPartitioning
sunchao commented on PR #37952: URL: https://github.com/apache/spark/pull/37952#issuecomment-1252620344 Can you directly report `UnknownPartitioning` to Spark? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] tedyu commented on pull request #37952: Treat unknown partitioning as UnknownPartitioning
tedyu commented on PR #37952: URL: https://github.com/apache/spark/pull/37952#issuecomment-1252616217 If I subclass `UnknownPartitioning` directly, I would get this compilation error:
```
[error] /nfusr/dev-server/zyu/spark-cassandra-connector/connector/src/main/scala/com/datastax/spark/connector/datasource/CassandraScanBuilder.scala:327:92: not enough arguments for constructor UnknownPartitioning: (x$1: Int)org.apache.spark.sql.connector.read.partitioning.UnknownPartitioning.
[error] Unspecified value parameter x$1.
[error] case class CassandraPartitioning(partitionKeys: Array[String], numPartitions: Int) extends UnknownPartitioning {
[error] ^
[error] one error found
```
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
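For reference, a minimal sketch of one way the subclassing could be made to compile, by forwarding the partition count to the superclass constructor that the error above complains about. The class and field names merely mirror the error message; this is an assumed connector-side change, not something proposed in this PR:
```scala
import org.apache.spark.sql.connector.read.partitioning.UnknownPartitioning

// A plain class is used so the constructor parameter does not clash with the
// numPartitions() member inherited from UnknownPartitioning.
class CassandraPartitioning(
    val partitionKeys: Array[String],
    partitionCount: Int) extends UnknownPartitioning(partitionCount)
```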
[GitHub] [spark] pan3793 commented on a diff in pull request #37710: [SPARK-40448][CONNECT] Spark Connect build as Driver Plugin with Shaded Dependencies
pan3793 commented on code in PR #37710: URL: https://github.com/apache/spark/pull/37710#discussion_r975588478 ## connect/pom.xml: ## @@ -0,0 +1,281 @@ + + + +http://maven.apache.org/POM/4.0.0; xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance; + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd;> +4.0.0 + +org.apache.spark +spark-parent_2.12 +3.4.0-SNAPSHOT +../pom.xml + + +spark-connect_2.12 +jar +Spark Project Connect +https://spark.apache.org/ + + + org.sparkproject.connect + +connect +3.21.1 +31.0.1-jre +1.47.0 +6.0.53 + + + + +org.apache.spark +spark-core_${scala.binary.version} +${project.version} +provided + + +com.google.guava +guava + + + + +org.apache.spark +spark-core_${scala.binary.version} +${project.version} +test-jar +test + + +org.apache.spark +spark-catalyst_${scala.binary.version} +${project.version} +provided + + +com.google.guava +guava + + + + +org.apache.spark +spark-sql_${scala.binary.version} +${project.version} +provided + + +com.google.guava +guava + + + + + +com.google.guava +guava +31.0.1-jre +compile + + +com.google.guava +failureaccess +1.0.1 + + +io.grpc +grpc-netty-shaded +${io.grpc.version} + + +io.grpc +grpc-protobuf +${io.grpc.version} + + +io.grpc +grpc-services +${io.grpc.version} + + +io.grpc +grpc-stub +${io.grpc.version} + + +org.apache.tomcat +annotations-api +${tomcat.annotations.api.version} +provided + + +org.scalacheck +scalacheck_${scala.binary.version} +test + + +org.mockito +mockito-core +test + + + + + + + +kr.motd.maven +os-maven-plugin +1.6.2 + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + + +org.apache.maven.plugins +maven-jar-plugin + + +prepare-test-jar +test-compile + +test-jar + + + + + +org.scalatest +scalatest-maven-plugin + +-ea -Xmx4g -Xss4m -XX:ReservedCodeCacheSize=${CodeCacheSize} ${extraJavaTestArgs} -Dio.netty.tryReflectionSetAccessible=true + + + +org.codehaus.mojo +build-helper-maven-plugin + + +add-sources +generate-sources + +add-source + + + + src/main/scala-${scala.binary.version} + + + + +add-scala-test-sources +generate-test-sources + +add-test-source + + + +src/test/gen-java + + + + + + + +org.apache.maven.plugins +maven-compiler-plugin + +1.6 +1.6 + + + +org.xolstice.maven.plugins +protobuf-maven-plugin +0.6.1 + +