This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 812a9ad3ea92 [MINOR] Fix some typos
812a9ad3ea92 is described below
commit 812a9ad3ea92b972fa8d966069ffd394b5c46ac1
Author: Kaz <[email protected]>
AuthorDate: Wed Dec 4 17:41:07 2024 +0100
[MINOR] Fix some typos
### What changes were proposed in this pull request?
This PR fixes typos in docstrings and comments (and a few functional bits
of code). The typos were found using the typos software:
https://github.com/crate-ci/typos
I've made a typo-fix-PR before, but haven't seen the result on the website
yet. Is there anything else I need to do for that?
### Why are the changes needed?
Nice to fix :)
### Does this PR introduce _any_ user-facing change?
yes, documentation was updated.
### How was this patch tested?
No tests added.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #48557 from KazMiddelhoek/master.
Lead-authored-by: Kaz <[email protected]>
Co-authored-by: Kaz <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.github/workflows/update_build_status.yml | 2 +-
R/pkg/R/functions.R | 4 ++--
R/pkg/R/serialize.R | 2 +-
.../apache/spark/network/shuffle/RemoteBlockPushResolver.java | 6 +++---
connector/connect/docs/client-connection-string.md | 4 ++--
docs/_plugins/include_example.rb | 4 ++--
docs/core-migration-guide.md | 2 +-
docs/running-on-yarn.md | 2 +-
docs/security.md | 2 +-
docs/spark-standalone.md | 2 +-
docs/sql-ref-syntax-ddl-declare-variable.md | 2 +-
python/docs/source/reference/pyspark.ss/index.rst | 2 +-
python/pyspark/ml/connect/io_utils.py | 2 +-
python/pyspark/ml/connect/tuning.py | 2 +-
python/pyspark/ml/deepspeed/deepspeed_distributor.py | 2 +-
python/pyspark/ml/dl_util.py | 2 +-
python/pyspark/ml/tests/connect/test_connect_function.py | 2 +-
python/pyspark/ml/tests/test_dl_util.py | 2 +-
python/pyspark/ml/tests/test_functions.py | 4 ++--
python/pyspark/ml/tests/test_param.py | 4 ++--
python/pyspark/ml/torch/distributor.py | 4 ++--
python/pyspark/pandas/accessors.py | 2 +-
python/pyspark/pandas/base.py | 2 +-
python/pyspark/pandas/frame.py | 4 ++--
python/pyspark/sql/connect/plan.py | 2 +-
python/pyspark/sql/dataframe.py | 4 ++--
python/pyspark/sql/functions/builtin.py | 10 +++++-----
python/pyspark/sql/readwriter.py | 2 +-
python/pyspark/sql/tests/connect/test_connect_function.py | 2 +-
.../sql/tests/streaming/test_streaming_foreach_batch.py | 2 +-
python/pyspark/sql/tests/test_udtf.py | 2 +-
python/pyspark/sql/udtf.py | 2 +-
python/pyspark/streaming/tests/test_dstream.py | 2 +-
python/pyspark/worker_util.py | 4 ++--
.../sql/connect/execution/ExecuteGrpcResponseSender.scala | 2 +-
35 files changed, 50 insertions(+), 50 deletions(-)
diff --git a/.github/workflows/update_build_status.yml
b/.github/workflows/update_build_status.yml
index d0a50b2b4aa7..542fa567dea6 100644
--- a/.github/workflows/update_build_status.yml
+++ b/.github/workflows/update_build_status.yml
@@ -72,7 +72,7 @@ jobs:
} catch (error) {
console.error(error)
// Run not found. This can happen when the PR author
removes GitHub Actions runs or
- // disalbes GitHub Actions.
+ // disables GitHub Actions.
continue
}
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 9c825a99be18..e320981783ec 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2922,7 +2922,7 @@ setClassUnion("characterOrstructTypeOrColumn",
c("character", "structType", "Col
#' @details
#' \code{from_json}: Parses a column containing a JSON string into a Column of
\code{structType}
#' with the specified \code{schema} or array of \code{structType} if
\code{as.json.array} is set
-#' to \code{TRUE}. If the string is unparseable, the Column will contain the
value NA.
+#' to \code{TRUE}. If the string is unparsable, the Column will contain the
value NA.
#'
#' @rdname column_collection_functions
#' @param as.json.array indicating if input string is JSON array of objects or
a single object.
@@ -3004,7 +3004,7 @@ setMethod("schema_of_json", signature(x =
"characterOrColumn"),
#' @details
#' \code{from_csv}: Parses a column containing a CSV string into a Column of
\code{structType}
#' with the specified \code{schema}.
-#' If the string is unparseable, the Column will contain the value NA.
+#' If the string is unparsable, the Column will contain the value NA.
#'
#' @rdname column_collection_functions
#' @aliases from_csv from_csv,Column,characterOrstructTypeOrColumn-method
diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R
index 61e174de9ac5..4ccec991bb07 100644
--- a/R/pkg/R/serialize.R
+++ b/R/pkg/R/serialize.R
@@ -60,7 +60,7 @@ writeObject <- function(con, object, writeType = TRUE) {
if (type %in% c("integer", "character", "logical", "double", "numeric")) {
if (is.na(object[[1]])) {
# Uses the first element for now to keep the behavior same as R before
- # 4.2.0. This is wrong because we should differenciate c(NA) from a
+ # 4.2.0. This is wrong because we should differentiate c(NA) from a
# single NA as the former means array(null) and the latter means null
# in Spark SQL. However, it requires non-trivial comparison to
distinguish
# both in R. We should ideally fix this.
diff --git
a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
index 02a38eac5b40..6e9bd548f532 100644
---
a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
+++
b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
@@ -251,17 +251,17 @@ public class RemoteBlockPushResolver implements
MergedShuffleFileManager {
// Higher shuffleMergeId seen for the shuffle ID meaning new stage
attempt is being
// run for the shuffle ID. Close and clean up old shuffleMergeId
files,
// happens in the indeterminate stage retries
- AppAttemptShuffleMergeId currrentAppAttemptShuffleMergeId =
+ AppAttemptShuffleMergeId currentAppAttemptShuffleMergeId =
new AppAttemptShuffleMergeId(appShuffleInfo.appId,
appShuffleInfo.attemptId,
shuffleId, latestShuffleMergeId);
logger.info("{}: creating a new shuffle merge metadata since
received " +
"shuffleMergeId {} is higher than latest shuffleMergeId {}",
MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$,
- currrentAppAttemptShuffleMergeId),
+ currentAppAttemptShuffleMergeId),
MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, shuffleMergeId),
MDC.of(LogKeys.LATEST_SHUFFLE_MERGE_ID$.MODULE$,
latestShuffleMergeId));
submitCleanupTask(() ->
-
closeAndDeleteOutdatedPartitions(currrentAppAttemptShuffleMergeId,
+
closeAndDeleteOutdatedPartitions(currentAppAttemptShuffleMergeId,
mergePartitionsInfo.shuffleMergePartitions));
return new AppShuffleMergePartitionsInfo(shuffleMergeId, false);
} else {
diff --git a/connector/connect/docs/client-connection-string.md
b/connector/connect/docs/client-connection-string.md
index 37b2956a5c44..df371c5beaaa 100644
--- a/connector/connect/docs/client-connection-string.md
+++ b/connector/connect/docs/client-connection-string.md
@@ -2,7 +2,7 @@
From the client perspective, Spark Connect mostly behaves as any other GRPC
client and can be configured as such. However, to make it easy to use from
-different programming languages and to have a homogenous connection surface
+different programming languages and to have a homogeneous connection surface
this document proposes what the user surface is for connecting to a
Spark Connect endpoint.
@@ -136,7 +136,7 @@ server_url =
"sc://myhost.com:443/;use_ssl=true;token=ABCDEFG"
As mentioned above, Spark Connect uses a regular GRPC client and the server
path
cannot be configured to remain compatible with the GRPC standard and HTTP. For
-example the following examles are invalid.
+example the following examples are invalid.
```python
server_url = "sc://myhost.com:443/mypathprefix/;token=AAAAAAA"
diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb
index 7d0e78738095..6fd14ce31a68 100644
--- a/docs/_plugins/include_example.rb
+++ b/docs/_plugins/include_example.rb
@@ -114,8 +114,8 @@ module Jekyll
range = Range.new(start + 1, endline - 1)
trimmed = trim_codeblock(lines[range])
# Filter out possible example tags of overlapped labels.
- taggs_filtered = trimmed.select { |l| !l.include? '$example ' }
- result += taggs_filtered.join
+ tags_filtered = trimmed.select { |l| !l.include? '$example ' }
+ result += tags_filtered.join
result += "\n"
end
result
diff --git a/docs/core-migration-guide.md b/docs/core-migration-guide.md
index 88bad6c5d1b9..958e442545dc 100644
--- a/docs/core-migration-guide.md
+++ b/docs/core-migration-guide.md
@@ -62,7 +62,7 @@ license: |
## Upgrading from Core 3.3 to 3.4
-- Since Spark 3.4, Spark driver will own `PersistentVolumnClaim`s and try to
reuse if they are not assigned to live executors. To restore the behavior
before Spark 3.4, you can set
`spark.kubernetes.driver.ownPersistentVolumeClaim` to `false` and
`spark.kubernetes.driver.reusePersistentVolumeClaim` to `false`.
+- Since Spark 3.4, Spark driver will own `PersistentVolumeClaim`s and try to
reuse if they are not assigned to live executors. To restore the behavior
before Spark 3.4, you can set
`spark.kubernetes.driver.ownPersistentVolumeClaim` to `false` and
`spark.kubernetes.driver.reusePersistentVolumeClaim` to `false`.
- Since Spark 3.4, Spark driver will track shuffle data when dynamic
allocation is enabled without shuffle service. To restore the behavior before
Spark 3.4, you can set `spark.dynamicAllocation.shuffleTracking.enabled` to
`false`.
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index aefa979946a6..b6f847ff533f 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -673,7 +673,7 @@ To use a custom metrics.properties for the application
master and executors, upd
<td>false</td>
<td>
Set to true for applications that have higher security requirements and
prefer that their
- secret is not saved in the db. The shuffle data of such applications wll
not be recovered after
+ secret is not saved in the db. The shuffle data of such applications will
not be recovered after
the External Shuffle Service restarts.
</td>
<td>3.5.0</td>
diff --git a/docs/security.md b/docs/security.md
index c7d3fd5f8c36..81173d5f01ce 100644
--- a/docs/security.md
+++ b/docs/security.md
@@ -72,7 +72,7 @@ secrets to be secure.
<td>false</td>
<td>
Set to true for applications that have higher security requirements and
prefer that their
- secret is not saved in the db. The shuffle data of such applications wll
not be recovered after
+ secret is not saved in the db. The shuffle data of such applications will
not be recovered after
the External Shuffle Service restarts.
</td>
<td>3.5.0</td>
diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md
index d828436e7734..4f8e0dc1a391 100644
--- a/docs/spark-standalone.md
+++ b/docs/spark-standalone.md
@@ -372,7 +372,7 @@ SPARK_MASTER_OPTS supports the following system properties:
<td>
The pattern for app ID generation based on Java `String.format` method.
The default value is `app-%s-%04d` which represents the existing app id
string, e.g.,
- `app-20231031224509-0008`. Plesae be careful to generate unique IDs.
+ `app-20231031224509-0008`. Please be careful to generate unique IDs.
</td>
<td>4.0.0</td>
</tr>
diff --git a/docs/sql-ref-syntax-ddl-declare-variable.md
b/docs/sql-ref-syntax-ddl-declare-variable.md
index ba9857bf1917..41ecba136436 100644
--- a/docs/sql-ref-syntax-ddl-declare-variable.md
+++ b/docs/sql-ref-syntax-ddl-declare-variable.md
@@ -83,7 +83,7 @@ DECLARE OR REPLACE five = 55;
-- Explicitly declare the default value of a variable using the keyword
`DEFAULT`
DECLARE VARIABLE size DEFAULT 6;
--- STRING variable initialialized to `NULL`
+-- STRING variable initialized to `NULL`
DECLARE some_var STRING;
```
diff --git a/python/docs/source/reference/pyspark.ss/index.rst
b/python/docs/source/reference/pyspark.ss/index.rst
index 2cb0b1216eff..440228134fac 100644
--- a/python/docs/source/reference/pyspark.ss/index.rst
+++ b/python/docs/source/reference/pyspark.ss/index.rst
@@ -20,7 +20,7 @@
Structured Streaming
====================
-This page gives an overview of all public Structed Streaming API.
+This page gives an overview of all public Structured Streaming API.
.. toctree::
:maxdepth: 2
diff --git a/python/pyspark/ml/connect/io_utils.py
b/python/pyspark/ml/connect/io_utils.py
index c401e3e76676..8d93426915d4 100644
--- a/python/pyspark/ml/connect/io_utils.py
+++ b/python/pyspark/ml/connect/io_utils.py
@@ -74,7 +74,7 @@ class ParamsReadWrite(Params):
def _get_extra_metadata(self) -> Any:
"""
- Returns exta metadata of the instance
+ Returns extra metadata of the instance
"""
return None
diff --git a/python/pyspark/ml/connect/tuning.py
b/python/pyspark/ml/connect/tuning.py
index cdb606048a59..190fc683acf7 100644
--- a/python/pyspark/ml/connect/tuning.py
+++ b/python/pyspark/ml/connect/tuning.py
@@ -170,7 +170,7 @@ def _parallelFitTasks(
if active_session is None:
raise RuntimeError(
- "An active SparkSession is required for running cross valiator fit
tasks."
+ "An active SparkSession is required for running cross validator
fit tasks."
)
def get_single_task(index: int, param_map: Any) -> Callable[[], Tuple[int,
float]]:
diff --git a/python/pyspark/ml/deepspeed/deepspeed_distributor.py
b/python/pyspark/ml/deepspeed/deepspeed_distributor.py
index 4ac5ff2fb420..3fd1d3bb3246 100644
--- a/python/pyspark/ml/deepspeed/deepspeed_distributor.py
+++ b/python/pyspark/ml/deepspeed/deepspeed_distributor.py
@@ -49,7 +49,7 @@ class DeepspeedTorchDistributor(TorchDistributor):
Parameters
----------
numGpus: int
- The number of GPUs to use per node (analagous to num_gpus in
deepspeed command).
+ The number of GPUs to use per node (analogous to num_gpus in
deepspeed command).
nnodes: int
The number of nodes that should be used for the run.
localMode: bool
diff --git a/python/pyspark/ml/dl_util.py b/python/pyspark/ml/dl_util.py
index 8ead529d7b72..3b87049ef277 100644
--- a/python/pyspark/ml/dl_util.py
+++ b/python/pyspark/ml/dl_util.py
@@ -27,7 +27,7 @@ class FunctionPickler:
This class provides a way to pickle a function and its arguments.
It also provides a way to create a script that can run a
function with arguments if they have them pickled to a file.
- It also provides a way of extracting the conents of a pickle file.
+ It also provides a way of extracting the contents of a pickle file.
"""
@staticmethod
diff --git a/python/pyspark/ml/tests/connect/test_connect_function.py
b/python/pyspark/ml/tests/connect/test_connect_function.py
index 393d38fdc426..7d3a115ab061 100644
--- a/python/pyspark/ml/tests/connect/test_connect_function.py
+++ b/python/pyspark/ml/tests/connect/test_connect_function.py
@@ -43,7 +43,7 @@ class SparkConnectMLFunctionTests(ReusedConnectTestCase,
PandasOnSparkTestUtils,
# Disable the shared namespace so pyspark.sql.functions, etc point the
regular
# PySpark libraries.
os.environ["PYSPARK_NO_NAMESPACE_SHARE"] = "1"
- cls.connect = cls.spark # Switch Spark Connect session and regular
PySpark sesion.
+ cls.connect = cls.spark # Switch Spark Connect session and regular
PySpark session.
cls.spark = PySparkSession._instantiatedSession
assert cls.spark is not None
diff --git a/python/pyspark/ml/tests/test_dl_util.py
b/python/pyspark/ml/tests/test_dl_util.py
index e5e2c6bc191d..c130cf1ff6b9 100644
--- a/python/pyspark/ml/tests/test_dl_util.py
+++ b/python/pyspark/ml/tests/test_dl_util.py
@@ -137,7 +137,7 @@ class TestFunctionPickler(unittest.TestCase):
"",
),
(
- "Check if it creates the correct file with only suffix +
boddy",
+ "Check if it creates the correct file with only suffix + body",
"",
"print('goodbye')",
),
diff --git a/python/pyspark/ml/tests/test_functions.py
b/python/pyspark/ml/tests/test_functions.py
index e67e46ded67b..7719b2b27e0a 100644
--- a/python/pyspark/ml/tests/test_functions.py
+++ b/python/pyspark/ml/tests/test_functions.py
@@ -265,14 +265,14 @@ class PredictBatchUDFTests(SparkSessionTestCase):
with self.assertRaisesRegex(Exception, "Model expected 3 inputs, but
received 4 columns"):
preds = self.df.withColumn("preds", sum_cols(*columns)).toPandas()
- # muliple scalar columns with one tensor_input_shape => single numpy
array
+ # multiple scalar columns with one tensor_input_shape => single numpy
array
sum_cols = predict_batch_udf(
array_sum_fn, return_type=DoubleType(), batch_size=5,
input_tensor_shapes=[[4]]
)
preds = self.df.withColumn("preds",
sum_cols(struct(*columns))).toPandas()
self.assertTrue(np.array_equal(np.sum(self.data, axis=1),
preds["preds"].to_numpy()))
- # muliple scalar columns with wrong tensor_input_shape => ERROR
+ # multiple scalar columns with wrong tensor_input_shape => ERROR
sum_cols = predict_batch_udf(
array_sum_fn, return_type=DoubleType(), batch_size=5,
input_tensor_shapes=[[3]]
)
diff --git a/python/pyspark/ml/tests/test_param.py
b/python/pyspark/ml/tests/test_param.py
index 8df50a5963e6..0aa982712495 100644
--- a/python/pyspark/ml/tests/test_param.py
+++ b/python/pyspark/ml/tests/test_param.py
@@ -368,12 +368,12 @@ class ParamTests(SparkSessionTestCase):
self.assertFalse(binarizer.isSet(binarizer.outputCol))
self.assertEqual(result[0][0], 1.0)
- def test_lr_evaluate_invaild_type(self):
+ def test_lr_evaluate_invalid_type(self):
lr = LinearRegressionModel()
invalid_type = ""
self.assertRaises(TypeError, lr.evaluate, invalid_type)
- def test_glr_evaluate_invaild_type(self):
+ def test_glr_evaluate_invalid_type(self):
glr = GeneralizedLinearRegressionModel()
invalid_type = ""
self.assertRaises(TypeError, glr.evaluate, invalid_type)
diff --git a/python/pyspark/ml/torch/distributor.py
b/python/pyspark/ml/torch/distributor.py
index 62a71c5a96af..ef86f38b716b 100644
--- a/python/pyspark/ml/torch/distributor.py
+++ b/python/pyspark/ml/torch/distributor.py
@@ -232,10 +232,10 @@ class Distributor:
def _validate_input_params(self) -> None:
if self.num_processes <= 0:
- raise ValueError("num_proccesses has to be a positive integer")
+ raise ValueError("num_processes has to be a positive integer")
def _check_encryption(self) -> None:
- """Checks to see if the user requires encrpytion of data.
+ """Checks to see if the user requires encryption of data.
If required, throw an exception since we don't support that.
Raises
diff --git a/python/pyspark/pandas/accessors.py
b/python/pyspark/pandas/accessors.py
index 4c36f7976af8..77757e4b6087 100644
--- a/python/pyspark/pandas/accessors.py
+++ b/python/pyspark/pandas/accessors.py
@@ -936,7 +936,7 @@ class PandasOnSparkSeriesMethods:
def pandas_concat(*series: pd.Series) -> pd.DataFrame:
# The input can only be a DataFrame for struct from Spark 3.0.
- # This works around makeing the input as a frame. See SPARK-27240
+ # This works around making the input as a frame. See SPARK-27240
pdf = pd.concat(series, axis=1)
pdf.columns = columns
return pdf
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index bc54d8b9b17c..01e23214d662 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -1123,7 +1123,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
Shift Series/Index by desired number of periods.
.. note:: the current implementation of shift uses Spark's Window
without
- specifying partition specification. This leads to moveing all data
into
+ specifying partition specification. This leads to moving all data
into
a single partition in a single machine and could cause serious
performance degradation. Avoid this method with very large
datasets.
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 49aa49f65e35..f315d59a4fe9 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -7686,7 +7686,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
if na_position not in ("first", "last"):
raise ValueError("invalid na_position: '{}'".format(na_position))
- # Mapper: Get a spark colum
+ # Mapper: Get a spark column
# n function for (ascending, na_position) combination
mapper = {
(True, "first"): PySparkColumn.asc_nulls_first,
@@ -9808,7 +9808,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
if is_all_string_type:
# Handling string type columns
- # We will retrive the `count`, `unique`, `top` and `freq`.
+ # We will retrieve the `count`, `unique`, `top` and `freq`.
internal = self._internal.resolved_copy
exprs_string = [
internal.spark_column_for(psser._column_label) for psser in
psser_string
diff --git a/python/pyspark/sql/connect/plan.py
b/python/pyspark/sql/connect/plan.py
index b387ca1d4e50..34f11768bcbc 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -205,7 +205,7 @@ class LogicalPlan:
try:
params[name] = getattr(self, "_" + name)
except AttributeError:
- pass # Simpy ignore
+ pass # Simply ignore
return params
def print(self, indent: int = 0) -> str:
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 085a1a629634..0ea0eef50c0f 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2549,7 +2549,7 @@ class DataFrame:
pyspark.errors.exceptions.captured.AnalysisException: Column name#0
are ambiguous...
A better approach is to assign aliases to the dataframes, and then
reference
- the ouptut columns from the join operation using these aliases:
+ the output columns from the join operation using these aliases:
>>> df.alias("a").join(
... df.alias("b"), sf.col("a.name") == sf.col("b.name"), "outer"
@@ -3907,7 +3907,7 @@ class DataFrame:
groupingSets : sequence of sequence of columns or str
Individual set of columns to group on.
cols : :class:`Column` or str
- Addional grouping columns specified by users.
+ Additional grouping columns specified by users.
Those columns are shown as the output columns after aggregation.
Returns
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 391bc3db7a86..4b4c164055ea 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -15832,7 +15832,7 @@ def split_part(src: "ColumnOrName", delimiter:
"ColumnOrName", partNum: "ColumnO
Parameters
----------
src : :class:`~pyspark.sql.Column` or column name
- A column of string to be splited.
+ A column of string to be split.
delimiter : :class:`~pyspark.sql.Column` or column name
A column of string, the delimiter used for split.
partNum : :class:`~pyspark.sql.Column` or column name
@@ -19618,7 +19618,7 @@ def from_json(
"""
Parses a column containing a JSON string into a :class:`MapType` with
:class:`StringType`
as keys type, :class:`StructType` or :class:`ArrayType` with
- the specified schema. Returns `null`, in the case of an unparseable string.
+ the specified schema. Returns `null`, in the case of an unparsable string.
.. versionadded:: 2.1.0
@@ -20230,7 +20230,7 @@ def from_xml(
) -> Column:
"""
Parses a column containing a XML string to a row with
- the specified schema. Returns `null`, in the case of an unparseable string.
+ the specified schema. Returns `null`, in the case of an unparsable string.
.. versionadded:: 4.0.0
@@ -22624,7 +22624,7 @@ def transform_keys(col: "ColumnOrName", f:
Callable[[Column, Column], Column]) -
Returns
-------
:class:`~pyspark.sql.Column`
- a new map of enties where new keys were calculated by applying given
function to
+ a new map of entries where new keys were calculated by applying given
function to
each key value argument.
Examples
@@ -22664,7 +22664,7 @@ def transform_values(col: "ColumnOrName", f:
Callable[[Column, Column], Column])
Returns
-------
:class:`~pyspark.sql.Column`
- a new map of enties where new values were calculated by applying given
function to
+ a new map of entries where new values were calculated by applying
given function to
each key value argument.
Examples
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 4744bdf861d3..2113f0707f91 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -215,7 +215,7 @@ class DataFrameReader(OptionUtils):
Parameters
----------
**options : dict
- The dictionary of string keys and prmitive-type values.
+ The dictionary of string keys and primitive-type values.
Examples
--------
diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py
b/python/pyspark/sql/tests/connect/test_connect_function.py
index e29873173cc3..b7a02efcd5e2 100644
--- a/python/pyspark/sql/tests/connect/test_connect_function.py
+++ b/python/pyspark/sql/tests/connect/test_connect_function.py
@@ -54,7 +54,7 @@ class SparkConnectFunctionTests(ReusedConnectTestCase,
PandasOnSparkTestUtils, S
# Disable the shared namespace so pyspark.sql.functions, etc point the
regular
# PySpark libraries.
os.environ["PYSPARK_NO_NAMESPACE_SHARE"] = "1"
- cls.connect = cls.spark # Switch Spark Connect session and regular
PySpark sesion.
+ cls.connect = cls.spark # Switch Spark Connect session and regular
PySpark session.
cls.spark = PySparkSession._instantiatedSession
assert cls.spark is not None
diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
index de8f30baebca..9db66aa252ee 100644
--- a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
+++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
@@ -146,7 +146,7 @@ class StreamingTestsForeachBatchMixin:
def my_test_function_2():
return 2
- def test_streaming_foreach_batch_fuction_calling(self):
+ def test_streaming_foreach_batch_function_calling(self):
def my_test_function_3():
return 3
diff --git a/python/pyspark/sql/tests/test_udtf.py
b/python/pyspark/sql/tests/test_udtf.py
index 206cfd7dc488..8447edfbbb15 100644
--- a/python/pyspark/sql/tests/test_udtf.py
+++ b/python/pyspark/sql/tests/test_udtf.py
@@ -1345,7 +1345,7 @@ class BaseUDTFTestsMixin:
assertSchemaEqual(df.schema, expected_schema)
assertDataFrameEqual(df, expected_results)
- def test_udtf_with_analyze_arbitary_number_arguments(self):
+ def test_udtf_with_analyze_arbitrary_number_arguments(self):
class TestUDTF:
@staticmethod
def analyze(*args: AnalyzeArgument) -> AnalyzeResult:
diff --git a/python/pyspark/sql/udtf.py b/python/pyspark/sql/udtf.py
index 5ce3e2dfd2a9..cf4f976fd93b 100644
--- a/python/pyspark/sql/udtf.py
+++ b/python/pyspark/sql/udtf.py
@@ -148,7 +148,7 @@ class AnalyzeResult:
The schema that the Python UDTF will return.
withSinglePartition: bool
If true, the UDTF is specifying for Catalyst to repartition all rows
of the input TABLE
- argument to one collection for consumption by exactly one instance of
the correpsonding
+ argument to one collection for consumption by exactly one instance of
the corresponding
UDTF class.
partitionBy: sequence of :class:`PartitioningColumn`
If non-empty, this is a sequence of expressions that the UDTF is
specifying for Catalyst to
diff --git a/python/pyspark/streaming/tests/test_dstream.py
b/python/pyspark/streaming/tests/test_dstream.py
index 046247763c0b..4c9633db311a 100644
--- a/python/pyspark/streaming/tests/test_dstream.py
+++ b/python/pyspark/streaming/tests/test_dstream.py
@@ -403,7 +403,7 @@ class BasicOperationTests(PySparkStreamingTestCase):
self.fail("a failed func should throw an error")
- def test_failed_func_with_reseting_failure(self):
+ def test_failed_func_with_resetting_failure(self):
input = [self.sc.parallelize([d], 1) for d in range(4)]
input_stream = self.ssc.queueStream(input)
diff --git a/python/pyspark/worker_util.py b/python/pyspark/worker_util.py
index 81c05ce94eb6..5c758d3f83fe 100644
--- a/python/pyspark/worker_util.py
+++ b/python/pyspark/worker_util.py
@@ -107,8 +107,8 @@ def setup_memory_limits(memory_limit_mb: int) -> None:
except (resource.error, OSError, ValueError) as e:
# not all systems support resource limits, so warn instead of
failing
- curent = currentframe()
- lineno = getframeinfo(curent).lineno + 1 if curent is not None
else 0
+ current = currentframe()
+ lineno = getframeinfo(current).lineno + 1 if current is not None
else 0
if "__file__" in globals():
print(
warnings.formatwarning(
diff --git
a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
index 051093fcad27..44b634af95ca 100644
---
a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
+++
b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
@@ -291,7 +291,7 @@ private[connect] class ExecuteGrpcResponseSender[T <:
Message](
assert(finished == false)
} else {
// If it wasn't sent, time deadline must have been reached before
stream became available,
- // or it was intterupted. Will exit in the next loop iterattion.
+ // or it was interrupted. Will exit in the next loop iteration.
assert(deadlineLimitReached || interrupted)
}
} else if (streamFinished) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]