This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 6fb417e [SPARK-37632][PYTHON] Drop code that targets Python < 3.7
6fb417e is described below
commit 6fb417e21c98283c40713698c58a73e68ea9614f
Author: zero323 <[email protected]>
AuthorDate: Thu Dec 16 10:53:13 2021 +0900
[SPARK-37632][PYTHON] Drop code that targets Python < 3.7
### What changes were proposed in this pull request?
This PR drops parts of the code that target Python < 3.7, including:
- Components required for type hint compatibility.
- Old style (Python 2) class declarations.
- Dictionary-related code that was intended to be dropped after the removal of Python 3.5 support.
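
For illustration, the bulk of the diff below corresponds to the second item: converting old-style class declarations. A minimal before/after sketch (the method body is hypothetical; runtime behavior is unchanged):

```python
# Before: Python 2 style declaration with an explicit `object` base.
class AccumulatorParam(object):
    def zero(self, value):
        return value


# After: plain Python 3 declaration; all classes are new-style, so this is equivalent.
class AccumulatorParam:
    def zero(self, value):
        return value
```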
### Why are the changes needed?
With SPARK-36145 we removed support for Python 3.6, so we should remove the related dead code as well to improve style and reduce maintenance overhead.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests.
Closes #34886 from zero323/SPARK-37632.
Authored-by: zero323 <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/_globals.py | 2 +-
python/pyspark/accumulators.py | 4 +--
python/pyspark/broadcast.py | 2 +-
python/pyspark/conf.py | 2 +-
python/pyspark/context.py | 7 +---
python/pyspark/files.py | 2 +-
python/pyspark/ml/base.py | 2 +-
python/pyspark/ml/image.py | 2 +-
python/pyspark/ml/linalg/__init__.py | 8 ++---
python/pyspark/ml/param/__init__.py | 4 +--
python/pyspark/ml/stat.py | 10 +++---
python/pyspark/ml/tuning.py | 2 +-
python/pyspark/ml/util.py | 10 +++---
python/pyspark/ml/wrapper.py | 2 +-
python/pyspark/mllib/classification.py | 8 ++---
python/pyspark/mllib/clustering.py | 12 +++----
python/pyspark/mllib/common.py | 2 +-
python/pyspark/mllib/feature.py | 14 ++++----
python/pyspark/mllib/fpm.py | 4 +--
python/pyspark/mllib/linalg/__init__.py | 10 +++---
python/pyspark/mllib/linalg/distributed.py | 6 ++--
python/pyspark/mllib/random.py | 2 +-
python/pyspark/mllib/recommendation.py | 2 +-
python/pyspark/mllib/regression.py | 14 ++++----
python/pyspark/mllib/stat/KernelDensity.py | 2 +-
python/pyspark/mllib/stat/_statistics.py | 2 +-
python/pyspark/mllib/tree.py | 6 ++--
python/pyspark/mllib/util.py | 8 ++---
python/pyspark/pandas/accessors.py | 4 +--
python/pyspark/pandas/categorical.py | 2 +-
python/pyspark/pandas/datetimes.py | 2 +-
python/pyspark/pandas/frame.py | 38 +++-------------------
python/pyspark/pandas/groupby.py | 15 +++------
python/pyspark/pandas/indexing.py | 2 +-
python/pyspark/pandas/internal.py | 2 +-
python/pyspark/pandas/missing/frame.py | 2 +-
python/pyspark/pandas/missing/groupby.py | 4 +--
python/pyspark/pandas/missing/indexes.py | 4 +--
python/pyspark/pandas/missing/series.py | 2 +-
python/pyspark/pandas/missing/window.py | 8 ++---
python/pyspark/pandas/mlflow.py | 2 +-
python/pyspark/pandas/series.py | 17 ----------
python/pyspark/pandas/spark/accessors.py | 2 +-
python/pyspark/pandas/sql_processor.py | 2 +-
python/pyspark/pandas/strings.py | 2 +-
.../pandas/tests/data_type_ops/testing_utils.py | 2 +-
python/pyspark/pandas/tests/test_dataframe.py | 14 ++++----
python/pyspark/pandas/typedef/typehints.py | 30 ++++++-----------
.../pyspark/pandas/usage_logging/usage_logger.py | 2 +-
python/pyspark/profiler.py | 4 +--
python/pyspark/rdd.py | 10 +++---
python/pyspark/rddsampler.py | 2 +-
python/pyspark/resource/information.py | 2 +-
python/pyspark/resource/profile.py | 4 +--
python/pyspark/resource/requests.py | 8 ++---
python/pyspark/serializers.py | 6 ++--
python/pyspark/shuffle.py | 10 +++---
python/pyspark/sql/catalog.py | 2 +-
python/pyspark/sql/column.py | 2 +-
python/pyspark/sql/conf.py | 2 +-
python/pyspark/sql/context.py | 2 +-
python/pyspark/sql/dataframe.py | 4 +--
python/pyspark/sql/pandas/conversion.py | 4 +--
python/pyspark/sql/pandas/functions.py | 2 +-
python/pyspark/sql/pandas/group_ops.py | 4 +--
python/pyspark/sql/pandas/map_ops.py | 2 +-
python/pyspark/sql/pandas/serializers.py | 2 +-
python/pyspark/sql/pandas/typehints.py | 2 +-
python/pyspark/sql/readwriter.py | 4 +--
python/pyspark/sql/session.py | 2 +-
python/pyspark/sql/streaming.py | 6 ++--
python/pyspark/sql/tests/test_streaming.py | 2 +-
python/pyspark/sql/tests/test_types.py | 2 +-
python/pyspark/sql/tests/test_udf.py | 2 +-
python/pyspark/sql/types.py | 10 +++---
python/pyspark/sql/udf.py | 4 +--
python/pyspark/sql/utils.py | 2 +-
python/pyspark/sql/window.py | 4 +--
python/pyspark/statcounter.py | 2 +-
python/pyspark/status.py | 2 +-
python/pyspark/storagelevel.py | 2 +-
python/pyspark/streaming/context.py | 2 +-
python/pyspark/streaming/dstream.py | 2 +-
python/pyspark/streaming/kinesis.py | 4 +--
python/pyspark/streaming/listener.py | 2 +-
python/pyspark/streaming/util.py | 4 +--
python/pyspark/taskcontext.py | 4 +--
python/pyspark/testing/pandasutils.py | 2 +-
python/pyspark/testing/sqlutils.py | 4 +--
python/pyspark/testing/utils.py | 4 +--
python/pyspark/tests/test_serializers.py | 4 +--
python/pyspark/tests/test_util.py | 4 +--
python/pyspark/traceback_utils.py | 2 +-
python/pyspark/util.py | 2 +-
python/setup.py | 3 +-
python/test_support/userlibrary.py | 2 +-
96 files changed, 201 insertions(+), 271 deletions(-)
diff --git a/python/pyspark/_globals.py b/python/pyspark/_globals.py
index a635972..1b8c827 100644
--- a/python/pyspark/_globals.py
+++ b/python/pyspark/_globals.py
@@ -42,7 +42,7 @@ if "_is_loaded" in globals():
_is_loaded = True
-class _NoValueType(object):
+class _NoValueType:
"""Special keyword value.
The instance of this class may be used as the default value assigned to a
diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py
index 2ea2a49..d3dc2e9 100644
--- a/python/pyspark/accumulators.py
+++ b/python/pyspark/accumulators.py
@@ -46,7 +46,7 @@ def _deserialize_accumulator(aid, zero_value, accum_param):
return accum
-class Accumulator(object):
+class Accumulator:
"""
A shared variable that can be accumulated, i.e., has a commutative and
associative "add"
@@ -151,7 +151,7 @@ class Accumulator(object):
return "Accumulator<id=%i, value=%s>" % (self.aid, self._value)
-class AccumulatorParam(object):
+class AccumulatorParam:
"""
Helper object that defines how to accumulate values of a given type.
diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py
index 995da33..903e4ea 100644
--- a/python/pyspark/broadcast.py
+++ b/python/pyspark/broadcast.py
@@ -42,7 +42,7 @@ def _from_id(bid):
return _broadcastRegistry[bid]
-class Broadcast(object):
+class Broadcast:
"""
A broadcast variable created with :meth:`SparkContext.broadcast`.
diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py
index 47ea8b6..536e1f8 100644
--- a/python/pyspark/conf.py
+++ b/python/pyspark/conf.py
@@ -23,7 +23,7 @@ from typing import Dict, List, Optional, Tuple, cast, overload
from py4j.java_gateway import JVMView, JavaObject # type: ignore[import]
-class SparkConf(object):
+class SparkConf:
"""
Configuration for a Spark application. Used to set various Spark
parameters as key-value pairs.
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index e069b4f..d1fca36 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -86,7 +86,7 @@ T = TypeVar("T")
U = TypeVar("U")
-class SparkContext(object):
+class SparkContext:
"""
Main entry point for Spark functionality. A SparkContext represents the
@@ -306,11 +306,6 @@ class SparkContext(object):
self.pythonExec = os.environ.get("PYSPARK_PYTHON", "python3")
self.pythonVer = "%d.%d" % sys.version_info[:2]
- if sys.version_info[:2] < (3, 7):
- with warnings.catch_warnings():
- warnings.simplefilter("once")
-                warnings.warn("Python 3.6 support is deprecated in Spark 3.2.", FutureWarning)
-
# Broadcast's __reduce__ method stores Broadcast instances here.
# This allows other code to determine which Broadcast instances have
# been pickled, so it can determine which Java broadcast objects to
diff --git a/python/pyspark/files.py b/python/pyspark/files.py
index 59ab0b6..5d68f91 100644
--- a/python/pyspark/files.py
+++ b/python/pyspark/files.py
@@ -26,7 +26,7 @@ if TYPE_CHECKING:
from pyspark import SparkContext
-class SparkFiles(object):
+class SparkFiles:
"""
Resolves paths to files added through :meth:`SparkContext.addFile`.
diff --git a/python/pyspark/ml/base.py b/python/pyspark/ml/base.py
index 970444d..9d2a191 100644
--- a/python/pyspark/ml/base.py
+++ b/python/pyspark/ml/base.py
@@ -34,7 +34,7 @@ from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType
-class _FitMultipleIterator(object):
+class _FitMultipleIterator:
"""
Used by default implementation of Estimator.fitMultiple to produce models
in a thread safe
iterator. This class handles the simple case of fitMultiple where each
param map should be
diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py
index bdd1c9d..7188ef3 100644
--- a/python/pyspark/ml/image.py
+++ b/python/pyspark/ml/image.py
@@ -36,7 +36,7 @@ from pyspark.sql import SparkSession
__all__ = ["ImageSchema"]
-class _ImageSchema(object):
+class _ImageSchema:
"""
Internal class for `pyspark.ml.image.ImageSchema` attribute. Meant to be
private and
not to be instantized. Use `pyspark.ml.image.ImageSchema` attribute to
access the
diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index 46a8b97..a82ae0d 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -250,7 +250,7 @@ class MatrixUDT(UserDefinedType):
return "matrix"
-class Vector(object):
+class Vector:
__UDT__ = VectorUDT()
@@ -795,7 +795,7 @@ class SparseVector(Vector):
return result
-class Vectors(object):
+class Vectors:
"""
Factory methods for working with vectors.
@@ -905,7 +905,7 @@ class Vectors(object):
return all_equal
-class Matrix(object):
+class Matrix:
__UDT__ = MatrixUDT()
@@ -1211,7 +1211,7 @@ class SparseMatrix(Matrix):
return np.all(self.toArray() == other.toArray())
-class Matrices(object):
+class Matrices:
@staticmethod
def dense(numRows, numCols, values):
"""
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index e011c50..bab56a0 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -28,7 +28,7 @@ from pyspark.ml.util import Identifiable
__all__ = ["Param", "Params", "TypeConverters"]
-class Param(object):
+class Param:
"""
A param with self-contained documentation.
@@ -68,7 +68,7 @@ class Param(object):
return False
-class TypeConverters(object):
+class TypeConverters:
"""
Factory methods for common type conversion functions for
`Param.typeConverter`.
diff --git a/python/pyspark/ml/stat.py b/python/pyspark/ml/stat.py
index 561ec71..15bb6ca 100644
--- a/python/pyspark/ml/stat.py
+++ b/python/pyspark/ml/stat.py
@@ -24,7 +24,7 @@ from pyspark.sql.column import Column, _to_seq
from pyspark.sql.functions import lit
-class ChiSquareTest(object):
+class ChiSquareTest:
"""
Conduct Pearson's independence test for every feature against the label.
For each feature,
the (feature, label) pairs are converted into a contingency matrix for
which the Chi-squared
@@ -100,7 +100,7 @@ class ChiSquareTest(object):
return _java2py(sc, javaTestObj.test(*args))
-class Correlation(object):
+class Correlation:
"""
Compute the correlation matrix for the input dataset of Vectors using the
specified method.
Methods currently supported: `pearson` (default), `spearman`.
@@ -167,7 +167,7 @@ class Correlation(object):
return _java2py(sc, javaCorrObj.corr(*args))
-class KolmogorovSmirnovTest(object):
+class KolmogorovSmirnovTest:
"""
Conduct the two-sided Kolmogorov Smirnov (KS) test for data sampled from a
continuous
distribution.
@@ -236,7 +236,7 @@ class KolmogorovSmirnovTest(object):
)
-class Summarizer(object):
+class Summarizer:
"""
Tools for vectorized statistics on MLlib Vectors.
The methods in this package provide various statistics for Vectors
contained inside DataFrames.
@@ -459,7 +459,7 @@ class SummaryBuilder(JavaWrapper):
return Column(self._java_obj.summary(featuresCol._jc, weightCol._jc))
-class MultivariateGaussian(object):
+class MultivariateGaussian:
"""Represents a (mean, cov) tuple
.. versionadded:: 3.0.0
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 2a21483..47805c9 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -91,7 +91,7 @@ def _parallelFitTasks(est, train, eva, validation, epm,
collectSubModel):
return [singleTask] * len(epm)
-class ParamGridBuilder(object):
+class ParamGridBuilder:
r"""
Builder for a param grid used in grid search-based model selection.
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index a99e4c5..cb0a64d 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -38,7 +38,7 @@ def _jvm():
raise AttributeError("Cannot load _jvm from SparkContext. Is
SparkContext initialized?")
-class Identifiable(object):
+class Identifiable:
"""
Object with a unique ID.
"""
@@ -60,7 +60,7 @@ class Identifiable(object):
@inherit_doc
-class BaseReadWrite(object):
+class BaseReadWrite:
"""
Base class for MLWriter and MLReader. Stores information about the
SparkContext
and SparkSession.
@@ -210,7 +210,7 @@ class GeneralJavaMLWriter(JavaMLWriter):
@inherit_doc
-class MLWritable(object):
+class MLWritable:
"""
Mixin for ML instances that provide :py:class:`MLWriter`.
@@ -315,7 +315,7 @@ class JavaMLReader(MLReader):
@inherit_doc
-class MLReadable(object):
+class MLReadable:
"""
Mixin for instances that provide :py:class:`MLReader`.
@@ -605,7 +605,7 @@ class DefaultParamsReader(MLReader):
@inherit_doc
-class HasTrainingSummary(object):
+class HasTrainingSummary:
"""
Base class for models that provides Training summary.
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index 179b5f2..c35df2e 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -27,7 +27,7 @@ from pyspark.ml.util import _jvm
from pyspark.ml.common import inherit_doc, _java2py, _py2java
-class JavaWrapper(object):
+class JavaWrapper:
"""
Wrapper class for a Java companion object
"""
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 7decd20..0ccb21f 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -281,7 +281,7 @@ class LogisticRegressionModel(LinearClassificationModel):
return self._call_java("toString")
-class LogisticRegressionWithSGD(object):
+class LogisticRegressionWithSGD:
"""
Train a classification model for Binary Logistic Regression using
Stochastic Gradient Descent.
@@ -373,7 +373,7 @@ class LogisticRegressionWithSGD(object):
return _regression_train_wrapper(train, LogisticRegressionModel, data,
initialWeights)
-class LogisticRegressionWithLBFGS(object):
+class LogisticRegressionWithLBFGS:
"""
Train a classification model for Multinomial/Binary Logistic Regression
using Limited-memory BFGS.
@@ -586,7 +586,7 @@ class SVMModel(LinearClassificationModel):
return model
-class SVMWithSGD(object):
+class SVMWithSGD:
"""
Train a Support Vector Machine (SVM) using Stochastic Gradient Descent.
@@ -771,7 +771,7 @@ class NaiveBayesModel(Saveable, Loader):
return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta))
-class NaiveBayes(object):
+class NaiveBayes:
"""
Train a Multinomial Naive Bayes model.
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 314c38f..a9e4fd8 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -133,7 +133,7 @@ class BisectingKMeansModel(JavaModelWrapper):
return self.call("computeCost", _convert_to_vector(x))
-class BisectingKMeans(object):
+class BisectingKMeans:
"""
A bisecting k-means algorithm based on the paper "A comparison of
document clustering techniques" by Steinbach, Karypis, and Kumar,
@@ -342,7 +342,7 @@ class KMeansModel(Saveable, Loader):
return KMeansModel(_java2py(sc, java_model.clusterCenters()))
-class KMeans(object):
+class KMeans:
"""
K-means clustering.
@@ -595,7 +595,7 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable,
JavaLoader):
return cls(wrapper)
-class GaussianMixture(object):
+class GaussianMixture:
"""
Learning algorithm for Gaussian Mixtures using the
expectation-maximization algorithm.
@@ -739,7 +739,7 @@ class PowerIterationClusteringModel(JavaModelWrapper,
JavaSaveable, JavaLoader):
return PowerIterationClusteringModel(wrapper)
-class PowerIterationClustering(object):
+class PowerIterationClustering:
"""
Power Iteration Clustering (PIC), a scalable graph clustering algorithm.
@@ -914,7 +914,7 @@ class StreamingKMeansModel(KMeansModel):
return self
-class StreamingKMeans(object):
+class StreamingKMeans:
"""
Provides methods to set k, decayFactor, timeUnit to configure the
KMeans algorithm for fitting and predicting on incoming dstreams.
@@ -1143,7 +1143,7 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader):
return LDAModel(model)
-class LDA(object):
+class LDA:
"""
Train Latent Dirichlet Allocation (LDA) model.
diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py
index 5f109be..64fb2bd 100644
--- a/python/pyspark/mllib/common.py
+++ b/python/pyspark/mllib/common.py
@@ -134,7 +134,7 @@ def callMLlibFunc(name: str, *args: Any) ->
"JavaObjectOrPickleDump":
return callJavaFunc(sc, api, *args)
-class JavaModelWrapper(object):
+class JavaModelWrapper:
"""
Wrapper for the model in JVM
"""
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 2ed3727..320ba00 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -43,7 +43,7 @@ __all__ = [
]
-class VectorTransformer(object):
+class VectorTransformer:
"""
Base class for transformation of a vector or RDD of vector
"""
@@ -233,7 +233,7 @@ class StandardScalerModel(JavaVectorTransformer):
return self.call("mean")
-class StandardScaler(object):
+class StandardScaler:
"""
Standardizes features by removing the mean and scaling to unit
variance using column summary statistics on the samples in the
@@ -325,7 +325,7 @@ class ChiSqSelectorModel(JavaVectorTransformer):
return JavaVectorTransformer.transform(self, vector)
-class ChiSqSelector(object):
+class ChiSqSelector:
"""
Creates a ChiSquared feature selector.
The selector supports different selection methods: `numTopFeatures`,
`percentile`, `fpr`,
@@ -481,7 +481,7 @@ class PCAModel(JavaVectorTransformer):
"""
-class PCA(object):
+class PCA:
"""
A feature transformer that projects vectors to a low-dimensional space
using PCA.
@@ -524,7 +524,7 @@ class PCA(object):
return PCAModel(jmodel)
-class HashingTF(object):
+class HashingTF:
"""
Maps a sequence of terms to their term frequencies using the hashing
trick.
@@ -642,7 +642,7 @@ class IDFModel(JavaVectorTransformer):
return self.call("numDocs")
-class IDF(object):
+class IDF:
"""
Inverse document frequency (IDF).
@@ -780,7 +780,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable,
JavaLoader):
return Word2VecModel(model)
-class Word2Vec(object):
+class Word2Vec:
"""Word2Vec creates vector representation of words in a text corpus.
The algorithm first constructs a vocabulary from the corpus
and then learns vector representation of words in the vocabulary.
diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py
index 531e957..a349a55 100644
--- a/python/pyspark/mllib/fpm.py
+++ b/python/pyspark/mllib/fpm.py
@@ -66,7 +66,7 @@ class FPGrowthModel(JavaModelWrapper, JavaSaveable,
JavaLoader):
return FPGrowthModel(wrapper)
-class FPGrowth(object):
+class FPGrowth:
"""
A Parallel FP-growth algorithm to mine frequent itemsets.
@@ -129,7 +129,7 @@ class PrefixSpanModel(JavaModelWrapper):
return self.call("getFreqSequences").map(lambda x:
PrefixSpan.FreqSequence(x[0], x[1]))
-class PrefixSpan(object):
+class PrefixSpan:
"""
A parallel PrefixSpan algorithm to mine frequent sequential patterns.
The PrefixSpan algorithm is described in Jian Pei et al (2001) [1]_
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 9992906..d9dee01 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -253,7 +253,7 @@ class MatrixUDT(UserDefinedType):
return "matrix"
-class Vector(object):
+class Vector:
__UDT__ = VectorUDT()
@@ -895,7 +895,7 @@ class SparseVector(Vector):
return result
-class Vectors(object):
+class Vectors:
"""
Factory methods for working with vectors.
@@ -1061,7 +1061,7 @@ class Vectors(object):
return all_equal
-class Matrix(object):
+class Matrix:
__UDT__ = MatrixUDT()
@@ -1407,7 +1407,7 @@ class SparseMatrix(Matrix):
return np.all(self.toArray() == other.toArray())
-class Matrices(object):
+class Matrices:
@staticmethod
def dense(numRows, numCols, values):
"""
@@ -1448,7 +1448,7 @@ class Matrices(object):
raise TypeError("Unsupported matrix type %s" % type(mat))
-class QRDecomposition(object):
+class QRDecomposition:
"""
Represents QR factors.
"""
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index cce74a2..f892d41 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -43,7 +43,7 @@ __all__ = [
]
-class DistributedMatrix(object):
+class DistributedMatrix:
"""
Represents a distributively stored matrix backed by one or
more RDDs.
@@ -520,7 +520,7 @@ class SingularValueDecomposition(JavaModelWrapper):
return self.call("V")
-class IndexedRow(object):
+class IndexedRow:
"""
Represents a row of an IndexedRowMatrix.
@@ -868,7 +868,7 @@ class IndexedRowMatrix(DistributedMatrix):
return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply",
matrix))
-class MatrixEntry(object):
+class MatrixEntry:
"""
Represents an entry of a CoordinateMatrix.
diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py
index 9684f45..bf8c50e 100644
--- a/python/pyspark/mllib/random.py
+++ b/python/pyspark/mllib/random.py
@@ -39,7 +39,7 @@ def toArray(f):
return func
-class RandomRDDs(object):
+class RandomRDDs:
"""
Generator methods for creating RDDs comprised of i.i.d samples from
some distribution.
diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py
index 7755f1e..cb412fb 100644
--- a/python/pyspark/mllib/recommendation.py
+++ b/python/pyspark/mllib/recommendation.py
@@ -217,7 +217,7 @@ class MatrixFactorizationModel(JavaModelWrapper,
JavaSaveable, JavaLoader):
return MatrixFactorizationModel(wrapper)
-class ALS(object):
+class ALS:
"""Alternating Least Squares matrix factorization
.. versionadded:: 0.9.0
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index a0f4649..2cac731 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -42,7 +42,7 @@ __all__ = [
]
-class LabeledPoint(object):
+class LabeledPoint:
"""
Class that represents the features and labels of a data point.
@@ -76,7 +76,7 @@ class LabeledPoint(object):
return "LabeledPoint(%s, %s)" % (self.label, self.features)
-class LinearModel(object):
+class LinearModel:
"""
A linear model that has a vector of coefficients and an intercept.
@@ -245,7 +245,7 @@ def _regression_train_wrapper(train_func, modelClass, data,
initial_weights):
return modelClass(weights, intercept)
-class LinearRegressionWithSGD(object):
+class LinearRegressionWithSGD:
"""
Train a linear regression model with no regularization using Stochastic
Gradient Descent.
@@ -425,7 +425,7 @@ class LassoModel(LinearRegressionModelBase):
return model
-class LassoWithSGD(object):
+class LassoWithSGD:
"""
Train a regression model with L1-regularization using Stochastic Gradient
Descent.
@@ -601,7 +601,7 @@ class RidgeRegressionModel(LinearRegressionModelBase):
return model
-class RidgeRegressionWithSGD(object):
+class RidgeRegressionWithSGD:
"""
Train a regression model with L2-regularization using Stochastic Gradient
Descent.
@@ -794,7 +794,7 @@ class IsotonicRegressionModel(Saveable, Loader):
return IsotonicRegressionModel(py_boundaries, py_predictions,
java_model.isotonic)
-class IsotonicRegression(object):
+class IsotonicRegression:
"""
Isotonic regression.
Currently implemented using parallelized pool adjacent violators
@@ -843,7 +843,7 @@ class IsotonicRegression(object):
return IsotonicRegressionModel(boundaries.toArray(),
predictions.toArray(), isotonic)
-class StreamingLinearAlgorithm(object):
+class StreamingLinearAlgorithm:
"""
Base class that has to be inherited by any StreamingLinearAlgorithm.
diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py
index ec6b6c5..103c955 100644
--- a/python/pyspark/mllib/stat/KernelDensity.py
+++ b/python/pyspark/mllib/stat/KernelDensity.py
@@ -24,7 +24,7 @@ from pyspark.mllib.common import callMLlibFunc
from pyspark.rdd import RDD
-class KernelDensity(object):
+class KernelDensity:
"""
Estimate probability density at required points given an RDD of samples
from the population.
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index 2fab449..34a373d 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -58,7 +58,7 @@ class MultivariateStatisticalSummary(JavaModelWrapper):
return self.call("normL2").toArray()
-class Statistics(object):
+class Statistics:
@staticmethod
def colStats(rdd):
"""
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 1a379eb..9b477ff 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -141,7 +141,7 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable,
JavaLoader):
return "org.apache.spark.mllib.tree.model.DecisionTreeModel"
-class DecisionTree(object):
+class DecisionTree:
"""
Learning algorithm for a decision tree model for classification or
regression.
@@ -366,7 +366,7 @@ class RandomForestModel(TreeEnsembleModel, JavaLoader):
return "org.apache.spark.mllib.tree.model.RandomForestModel"
-class RandomForest(object):
+class RandomForest:
"""
Learning algorithm for a random forest model for classification or
regression.
@@ -637,7 +637,7 @@ class GradientBoostedTreesModel(TreeEnsembleModel,
JavaLoader):
return "org.apache.spark.mllib.tree.model.GradientBoostedTreesModel"
-class GradientBoostedTrees(object):
+class GradientBoostedTrees:
"""
Learning algorithm for a gradient boosted trees model for
classification or regression.
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index b3e24b4..d3824e8 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -24,7 +24,7 @@ from pyspark.mllib.linalg import Vectors, SparseVector,
_convert_to_vector
from pyspark.sql import DataFrame
-class MLUtils(object):
+class MLUtils:
"""
Helper methods to load, save and pre-process data used in MLlib.
@@ -420,7 +420,7 @@ class MLUtils(object):
return callMLlibFunc("convertMatrixColumnsFromML", dataset, list(cols))
-class Saveable(object):
+class Saveable:
"""
Mixin for models and transformers which may be saved as files.
@@ -468,7 +468,7 @@ class JavaSaveable(Saveable):
self._java_model.save(sc._jsc.sc(), path)
-class Loader(object):
+class Loader:
"""
Mixin for classes which can load saved models from files.
@@ -534,7 +534,7 @@ class JavaLoader(Loader):
return cls(java_model)
-class LinearDataGenerator(object):
+class LinearDataGenerator:
"""Utils for generating linear data.
.. versionadded:: 1.5.0
diff --git a/python/pyspark/pandas/accessors.py b/python/pyspark/pandas/accessors.py
index d04d778..5124be6 100644
--- a/python/pyspark/pandas/accessors.py
+++ b/python/pyspark/pandas/accessors.py
@@ -52,7 +52,7 @@ if TYPE_CHECKING:
from pyspark.sql._typing import UserDefinedFunctionLike
-class PandasOnSparkFrameMethods(object):
+class PandasOnSparkFrameMethods:
"""pandas-on-Spark specific features for DataFrame."""
def __init__(self, frame: "DataFrame"):
@@ -750,7 +750,7 @@ class PandasOnSparkFrameMethods(object):
return DataFrame(internal)
-class PandasOnSparkSeriesMethods(object):
+class PandasOnSparkSeriesMethods:
"""pandas-on-Spark specific features for Series."""
def __init__(self, series: "Series"):
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index e59048b..246e6ac 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -30,7 +30,7 @@ if TYPE_CHECKING:
import pyspark.pandas as ps
-class CategoricalAccessor(object):
+class CategoricalAccessor:
"""
Accessor object for categorical properties of the Series values.
diff --git a/python/pyspark/pandas/datetimes.py b/python/pyspark/pandas/datetimes.py
index 4bd8c2f..f52809d 100644
--- a/python/pyspark/pandas/datetimes.py
+++ b/python/pyspark/pandas/datetimes.py
@@ -30,7 +30,7 @@ if TYPE_CHECKING:
import pyspark.pandas as ps
-class DatetimeMethods(object):
+class DatetimeMethods:
"""Date/Time methods for pandas-on-Spark Series"""
def __init__(self, series: "ps.Series"):
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index de36531..008d958 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -341,24 +341,6 @@ rectangle 16.0 2.348543e+108
"""
-if (3, 5) <= sys.version_info < (3, 7) and __name__ != "__main__":
- from typing import GenericMeta # type: ignore[attr-defined]
-
-    # This is a workaround to support variadic generic in DataFrame in Python 3.5+.
- # See https://github.com/python/typing/issues/193
- # We wrap the input params by a tuple to mimic variadic generic.
- old_getitem = GenericMeta.__getitem__
-
- @no_type_check
- def new_getitem(self, params):
- if hasattr(self, "is_dataframe"):
- return old_getitem(self, create_tuple_for_frame_type(params))
- else:
- return old_getitem(self, params)
-
- GenericMeta.__getitem__ = new_getitem
-
-
class DataFrame(Frame, Generic[T]):
"""
pandas-on-Spark DataFrame that corresponds to pandas DataFrame logically.
This holds Spark
@@ -372,8 +354,6 @@ class DataFrame(Frame, Generic[T]):
data : numpy ndarray (structured or homogeneous), dict, pandas DataFrame,
Spark DataFrame \
or pandas-on-Spark Series
Dict can contain Series, arrays, constants, or list-like objects
- If data is a dict, argument order is maintained for Python 3.6
- and later.
Note that if `data` is a pandas DataFrame, a Spark DataFrame, and a
pandas-on-Spark Series,
other arguments should not be used.
index : Index or array-like
@@ -12315,19 +12295,11 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
internal = this._internal.with_new_columns(applied)
return DataFrame(internal)
-    if sys.version_info >= (3, 7):
-
-        def __class_getitem__(cls, params: Any) -> object:
-            # This is a workaround to support variadic generic in DataFrame in Python 3.7.
-            # See https://github.com/python/typing/issues/193
-            # we always wraps the given type hints by a tuple to mimic the variadic generic.
-            return create_tuple_for_frame_type(params)
-
-    elif (3, 5) <= sys.version_info < (3, 7):
-        # This is a workaround to support variadic generic in DataFrame in Python 3.5+
-        # The implementation is in its metaclass so this flag is needed to distinguish
-        # pandas-on-Spark DataFrame.
-        is_dataframe = None
+    def __class_getitem__(cls, params: Any) -> object:
+        # This is a workaround to support variadic generic in DataFrame in Python 3.7.
+        # See https://github.com/python/typing/issues/193
+        # we always wraps the given type hints by a tuple to mimic the variadic generic.
+        return create_tuple_for_frame_type(params)
def _reduce_spark_multi(sdf: SparkDataFrame, aggs: List[Column]) -> Any:
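As a note on the hunk above: since Python 3.7 (PEP 560), a class can intercept subscription through `__class_getitem__`, which is why the `GenericMeta.__getitem__` monkey-patch for Python 3.5/3.6 can be dropped. A minimal, self-contained sketch (the class name and parameter handling are illustrative, not the actual pandas-on-Spark implementation):

```python
from typing import Any


class FrameLike:
    def __class_getitem__(cls, params: Any) -> object:
        # Mimic a variadic generic by always wrapping the given hints in a tuple,
        # the same idea create_tuple_for_frame_type applies in the real code.
        return params if isinstance(params, tuple) else (params,)


# Subscription now routes through __class_getitem__ without any metaclass hack.
print(FrameLike["id":int, "value":float])
```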
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 7cdfb86..3be5416 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -20,9 +20,8 @@ A wrapper for GroupedData to behave similar to pandas GroupBy.
"""
from abc import ABCMeta, abstractmethod
-import sys
import inspect
-from collections import OrderedDict, namedtuple
+from collections import OrderedDict, defaultdict, namedtuple
from distutils.version import LooseVersion
from functools import partial
from itertools import product
@@ -3309,7 +3308,7 @@ def normalize_keyword_aggregation(
Normalize user-provided kwargs.
Transforms from the new ``Dict[str, NamedAgg]`` style kwargs
- to the old OrderedDict[str, List[scalar]]].
+ to the old defaultdict[str, List[scalar]].
Parameters
----------
@@ -3327,15 +3326,9 @@ def normalize_keyword_aggregation(
Examples
--------
>>> normalize_keyword_aggregation({'output': ('input', 'sum')})
- (OrderedDict([('input', ['sum'])]), ['output'], [('input', 'sum')])
+    (defaultdict(<class 'list'>, {'input': ['sum']}), ['output'], [('input', 'sum')])
    """
-    # this is due to python version issue, not sure the impact on pandas-on-Spark
- PY36 = sys.version_info >= (3, 6)
- if not PY36:
- kwargs = OrderedDict(sorted(kwargs.items()))
-
- # TODO(Py35): When we drop python 3.5, change this to defaultdict(list)
- aggspec: Dict[Union[Any, Tuple], List[str]] = OrderedDict()
+ aggspec: Dict[Union[Any, Tuple], List[str]] = defaultdict(list)
order: List[Tuple] = []
columns, pairs = zip(*kwargs.items())
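The `defaultdict(list)` switch in `normalize_keyword_aggregation` above relies on the fact that, from Python 3.7 on, plain dicts (and therefore `defaultdict`) preserve insertion order. A simplified sketch of the aggspec construction under that assumption (not the full function):

```python
from collections import defaultdict

# Keyword aggregation spec in the new Dict[str, NamedAgg] style.
kwargs = {"min_out": ("input", "min"), "max_out": ("input", "max")}

aggspec = defaultdict(list)
for column, aggfunc in kwargs.values():
    # No need to pre-sort keys or pre-create entries: insertion order is kept.
    aggspec[column].append(aggfunc)

print(dict(aggspec))  # {'input': ['min', 'max']}
```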
diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py
index b561e03..e8b7b5d 100644
--- a/python/pyspark/pandas/indexing.py
+++ b/python/pyspark/pandas/indexing.py
@@ -58,7 +58,7 @@ if TYPE_CHECKING:
from pyspark.pandas.series import Series
-class IndexerLike(object):
+class IndexerLike:
def __init__(self, psdf_or_psser: "Frame"):
from pyspark.pandas.frame import DataFrame
from pyspark.pandas.series import Series
diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index e5786f5..d361c3c 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -211,7 +211,7 @@ class InternalField:
)
-class InternalFrame(object):
+class InternalFrame:
"""
The internal immutable DataFrame which manages Spark DataFrame and column
names and index
information.
diff --git a/python/pyspark/pandas/missing/frame.py b/python/pyspark/pandas/missing/frame.py
index d822c14..775115d 100644
--- a/python/pyspark/pandas/missing/frame.py
+++ b/python/pyspark/pandas/missing/frame.py
@@ -29,7 +29,7 @@ def _unsupported_property(property_name, deprecated=False,
reason=""):
)
-class _MissingPandasLikeDataFrame(object):
+class _MissingPandasLikeDataFrame:
# Functions
asfreq = _unsupported_function("asfreq")
diff --git a/python/pyspark/pandas/missing/groupby.py b/python/pyspark/pandas/missing/groupby.py
index 9a18374..839b266 100644
--- a/python/pyspark/pandas/missing/groupby.py
+++ b/python/pyspark/pandas/missing/groupby.py
@@ -36,7 +36,7 @@ def _unsupported_property(property_name, deprecated=False,
reason=""):
)
-class MissingPandasLikeDataFrameGroupBy(object):
+class MissingPandasLikeDataFrameGroupBy:
# Properties
corr = _unsupported_property("corr")
@@ -68,7 +68,7 @@ class MissingPandasLikeDataFrameGroupBy(object):
sem = _unsupported_function("sem")
-class MissingPandasLikeSeriesGroupBy(object):
+class MissingPandasLikeSeriesGroupBy:
# Properties
corr = _unsupported_property("corr")
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index 73dab4a..efc2e10 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -35,7 +35,7 @@ def _unsupported_property(property_name, deprecated=False,
reason="", cls="Index
)
-class MissingPandasLikeIndex(object):
+class MissingPandasLikeIndex:
# Properties
nbytes = _unsupported_property("nbytes")
@@ -115,7 +115,7 @@ class
MissingPandasLikeTimedeltaIndex(MissingPandasLikeIndex):
mean = _unsupported_function("mean", cls="TimedeltaIndex")
-class MissingPandasLikeMultiIndex(object):
+class MissingPandasLikeMultiIndex:
# Functions
argsort = _unsupported_function("argsort")
diff --git a/python/pyspark/pandas/missing/series.py b/python/pyspark/pandas/missing/series.py
index a26d17b..695866e 100644
--- a/python/pyspark/pandas/missing/series.py
+++ b/python/pyspark/pandas/missing/series.py
@@ -29,7 +29,7 @@ def _unsupported_property(property_name, deprecated=False,
reason=""):
)
-class MissingPandasLikeSeries(object):
+class MissingPandasLikeSeries:
# Functions
asfreq = _unsupported_function("asfreq")
diff --git a/python/pyspark/pandas/missing/window.py b/python/pyspark/pandas/missing/window.py
index a29887e..fb79992 100644
--- a/python/pyspark/pandas/missing/window.py
+++ b/python/pyspark/pandas/missing/window.py
@@ -54,7 +54,7 @@ def _unsupported_property_rolling(property_name,
deprecated=False, reason=""):
)
-class MissingPandasLikeExpanding(object):
+class MissingPandasLikeExpanding:
agg = _unsupported_function_expanding("agg")
aggregate = _unsupported_function_expanding("aggregate")
apply = _unsupported_function_expanding("apply")
@@ -72,7 +72,7 @@ class MissingPandasLikeExpanding(object):
ndim = _unsupported_property_expanding("ndim")
-class MissingPandasLikeRolling(object):
+class MissingPandasLikeRolling:
agg = _unsupported_function_rolling("agg")
aggregate = _unsupported_function_rolling("aggregate")
apply = _unsupported_function_rolling("apply")
@@ -90,7 +90,7 @@ class MissingPandasLikeRolling(object):
ndim = _unsupported_property_rolling("ndim")
-class MissingPandasLikeExpandingGroupby(object):
+class MissingPandasLikeExpandingGroupby:
agg = _unsupported_function_expanding("agg")
aggregate = _unsupported_function_expanding("aggregate")
apply = _unsupported_function_expanding("apply")
@@ -108,7 +108,7 @@ class MissingPandasLikeExpandingGroupby(object):
ndim = _unsupported_property_expanding("ndim")
-class MissingPandasLikeRollingGroupby(object):
+class MissingPandasLikeRollingGroupby:
agg = _unsupported_function_rolling("agg")
aggregate = _unsupported_function_rolling("aggregate")
apply = _unsupported_function_rolling("apply")
diff --git a/python/pyspark/pandas/mlflow.py b/python/pyspark/pandas/mlflow.py
index 067f9ca..32ebe77 100644
--- a/python/pyspark/pandas/mlflow.py
+++ b/python/pyspark/pandas/mlflow.py
@@ -34,7 +34,7 @@ from pyspark.pandas.typedef import as_spark_type
__all__ = ["PythonModelWrapper", "load_model"]
-class PythonModelWrapper(object):
+class PythonModelWrapper:
"""
A wrapper around MLflow's Python object model.
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index ff20f3e..d8b4669 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -344,21 +344,6 @@ dtype: float64
str_type = str
-if (3, 5) <= sys.version_info < (3, 7) and __name__ != "__main__":
- from typing import GenericMeta # type: ignore[attr-defined]
-
- old_getitem = GenericMeta.__getitem__
-
- @no_type_check
- def new_getitem(self, params):
- if hasattr(self, "is_series"):
- return old_getitem(self, create_type_for_series_type(params))
- else:
- return old_getitem(self, params)
-
- GenericMeta.__getitem__ = new_getitem
-
-
class Series(Frame, IndexOpsMixin, Generic[T]):
"""
pandas-on-Spark Series that corresponds to pandas Series logically. This
holds Spark Column
@@ -373,8 +358,6 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
----------
data : array-like, dict, or scalar value, pandas Series
Contains data stored in Series
- If data is a dict, argument order is maintained for Python 3.6
- and later.
Note that if `data` is a pandas Series, other arguments should not be
used.
index : array-like or Index (1d)
Values must be hashable and have the same length as `data`.
diff --git a/python/pyspark/pandas/spark/accessors.py b/python/pyspark/pandas/spark/accessors.py
index e0d4639..f13898b 100644
--- a/python/pyspark/pandas/spark/accessors.py
+++ b/python/pyspark/pandas/spark/accessors.py
@@ -301,7 +301,7 @@ class SparkIndexMethods(SparkIndexOpsMethods["ps.Index"]):
return DataFrame(self._data._internal.resolved_copy).index
-class SparkFrameMethods(object):
+class SparkFrameMethods:
"""Spark related features. Usually, the features here are missing in pandas
but Spark has it."""
diff --git a/python/pyspark/pandas/sql_processor.py b/python/pyspark/pandas/sql_processor.py
index 8126d1e..47cb938 100644
--- a/python/pyspark/pandas/sql_processor.py
+++ b/python/pyspark/pandas/sql_processor.py
@@ -254,7 +254,7 @@ def escape_sql_string(value: str) -> str:
return value.translate(_escape_table)
-class SQLProcessor(object):
+class SQLProcessor:
def __init__(self, scope: Dict[str, Any], statement: str, session:
SparkSession):
self._scope = scope
self._statement = statement
diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
index 8fa530b..986e3d1 100644
--- a/python/pyspark/pandas/strings.py
+++ b/python/pyspark/pandas/strings.py
@@ -43,7 +43,7 @@ if TYPE_CHECKING:
import pyspark.pandas as ps
-class StringMethods(object):
+class StringMethods:
"""String methods for pandas-on-Spark Series"""
def __init__(self, series: "ps.Series"):
diff --git a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
index 1d1a8ec..9f57ad4 100644
--- a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
+++ b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
@@ -41,7 +41,7 @@ if extension_object_dtypes_available:
from pandas import BooleanDtype, StringDtype
-class TestCasesUtils(object):
+class TestCasesUtils:
"""A utility holding common test cases for arithmetic operations of
different data types."""
@property
diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index c2ae4da..84a53c0 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -1478,14 +1478,12 @@ class DataFrameTest(PandasOnSparkTestCase,
SQLTestUtils):
self.assert_eq(psdf.fillna({"x": -1}), pdf.fillna({"x": -1}))
- if sys.version_info >= (3, 6):
- # flaky in Python 3.5.
- self.assert_eq(
-                psdf.fillna({"x": -1, ("x", "b"): -2}), pdf.fillna({"x": -1, ("x", "b"): -2})
-            )
-            self.assert_eq(
-                psdf.fillna({("x", "b"): -2, "x": -1}), pdf.fillna({("x", "b"): -2, "x": -1})
-            )
+        self.assert_eq(
+            psdf.fillna({"x": -1, ("x", "b"): -2}), pdf.fillna({"x": -1, ("x", "b"): -2})
+        )
+        self.assert_eq(
+            psdf.fillna({("x", "b"): -2, "x": -1}), pdf.fillna({("x", "b"): -2, "x": -1})
+        )
# check multi index
pdf = pdf.set_index([("x", "a"), ("x", "b")])
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 6620ffc..544f4e4 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -73,7 +73,8 @@ import pyarrow as pa
import pyspark.sql.types as types
from pyspark.sql.pandas.types import to_arrow_type, from_arrow_type
-from pyspark import pandas as ps  # For running doctests and reference resolution in PyCharm.
+# For running doctests and reference resolution in PyCharm.
+from pyspark import pandas as ps # noqa: F401
from pyspark.pandas._typing import Dtype, T
from pyspark.pandas.typedef.string_typehints import resolve_string_type_hint
@@ -91,7 +92,7 @@ class SeriesType(Generic[T]):
return "SeriesType[{}]".format(self.spark_type)
-class DataFrameType(object):
+class DataFrameType:
def __init__(
self,
index_fields: List["InternalField"],
@@ -114,7 +115,7 @@ class DataFrameType(object):
# The type is a scalar type that is furthermore understood by Spark.
-class ScalarType(object):
+class ScalarType:
def __init__(self, dtype: Dtype, spark_type: types.DataType):
self.dtype = dtype
self.spark_type = spark_type
@@ -124,7 +125,7 @@ class ScalarType(object):
# The type is left unspecified or we do not know about this type.
-class UnknownType(object):
+class UnknownType:
def __init__(self, tpe: Any):
self.tpe = tpe
@@ -132,13 +133,13 @@ class UnknownType(object):
return "UnknownType[{}]".format(self.tpe)
-class IndexNameTypeHolder(object):
+class IndexNameTypeHolder:
name = None
tpe = None
short_name = "IndexNameType"
-class NameTypeHolder(object):
+class NameTypeHolder:
name = None
tpe = None
short_name = "NameType"
@@ -571,12 +572,6 @@ def infer_return_type(f: Callable) -> Union[SeriesType,
DataFrameType, ScalarTyp
# This type hint can happen when given hints are string to avoid
forward reference.
tpe = resolve_string_type_hint(tpe)
- if hasattr(tpe, "__origin__") and (
- tpe.__origin__ == ps.DataFrame or tpe.__origin__ == ps.Series
- ):
-            # When Python version is lower then 3.7. Unwrap it to a Tuple/SeriesType type hints.
- tpe = tpe.__args__[0]
-
if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
tpe = tpe.__args__[0]
if issubclass(tpe, NameTypeHolder):
@@ -585,17 +580,12 @@ def infer_return_type(f: Callable) -> Union[SeriesType,
DataFrameType, ScalarTyp
return SeriesType(dtype, spark_type)
# Note that, DataFrame type hints will create a Tuple.
- # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`.
- # Check if the name is Tuple.
+ # Tuple has _name but other types have __name__
name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
+ # Check if the name is Tuple.
if name == "Tuple":
tuple_type = tpe
- if hasattr(tuple_type, "__tuple_params__"):
- # Python 3.5.0 to 3.5.2 has '__tuple_params__' instead.
- # See https://github.com/python/cpython/blob/v3.5.2/Lib/typing.py
- parameters = getattr(tuple_type, "__tuple_params__")
- else:
- parameters = getattr(tuple_type, "__args__")
+ parameters = getattr(tuple_type, "__args__")
index_parameters = [
        p for p in parameters if isclass(p) and issubclass(p, IndexNameTypeHolder)
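The `__tuple_params__` fallback removed above was only needed on Python 3.5.0 to 3.5.2; on every interpreter Spark now supports, a parameterized `typing.Tuple` exposes its element types through `__args__`. A quick illustration:

```python
from typing import Tuple

hint = Tuple[int, str]
# On Python 3.7+ the element types are always available via __args__.
print(hint.__args__)  # (<class 'int'>, <class 'str'>)
```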
diff --git a/python/pyspark/pandas/usage_logging/usage_logger.py b/python/pyspark/pandas/usage_logging/usage_logger.py
index 286b569..a17c52a 100644
--- a/python/pyspark/pandas/usage_logging/usage_logger.py
+++ b/python/pyspark/pandas/usage_logging/usage_logger.py
@@ -37,7 +37,7 @@ def _format_signature(signature):
)
-class PandasOnSparkUsageLogger(object):
+class PandasOnSparkUsageLogger:
"""
The reference implementation of usage logger.
diff --git a/python/pyspark/profiler.py b/python/pyspark/profiler.py
index 3d3a65a..6271bbc 100644
--- a/python/pyspark/profiler.py
+++ b/python/pyspark/profiler.py
@@ -24,7 +24,7 @@ import sys
from pyspark.accumulators import AccumulatorParam
-class ProfilerCollector(object):
+class ProfilerCollector:
"""
This class keeps track of different profilers on a per
stage/UDF basis. Also this is used to create new profilers for
@@ -70,7 +70,7 @@ class ProfilerCollector(object):
self.profilers[i][2] = True
-class Profiler(object):
+class Profiler:
"""
PySpark supports custom profilers, this is to allow for different
profilers to
be used as well as outputting to different formats than what is provided
in the
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 2452d69..80d63b4 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -70,7 +70,7 @@ from pyspark.util import fail_on_stopiteration, _parse_memory
__all__ = ["RDD"]
-class PythonEvalType(object):
+class PythonEvalType:
"""
Evaluation type of python rdd.
@@ -189,7 +189,7 @@ def _load_from_socket(sock_info, serializer):
def _local_iterator_from_socket(sock_info, serializer):
- class PyLocalIterable(object):
+ class PyLocalIterable:
"""Create a synchronous local iterable over a socket"""
def __init__(self, _sock_info, _serializer):
@@ -235,7 +235,7 @@ def _local_iterator_from_socket(sock_info, serializer):
return iter(PyLocalIterable(sock_info, serializer))
-class Partitioner(object):
+class Partitioner:
def __init__(self, numPartitions, partitionFunc):
self.numPartitions = numPartitions
self.partitionFunc = partitionFunc
@@ -251,7 +251,7 @@ class Partitioner(object):
return self.partitionFunc(k) % self.numPartitions
-class RDD(object):
+class RDD:
"""
A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
@@ -2947,7 +2947,7 @@ def _wrap_function(sc, func, deserializer, serializer,
profiler=None):
)
-class RDDBarrier(object):
+class RDDBarrier:
"""
Wraps an RDD in a barrier stage, which forces Spark to launch tasks of
this stage together.
diff --git a/python/pyspark/rddsampler.py b/python/pyspark/rddsampler.py
index d917c69..11da447 100644
--- a/python/pyspark/rddsampler.py
+++ b/python/pyspark/rddsampler.py
@@ -20,7 +20,7 @@ import random
import math
-class RDDSamplerBase(object):
+class RDDSamplerBase:
def __init__(self, withReplacement, seed=None):
self._seed = seed if seed is not None else random.randint(0,
sys.maxsize)
self._withReplacement = withReplacement
diff --git a/python/pyspark/resource/information.py b/python/pyspark/resource/information.py
index 60ba96f..bcd78eb 100644
--- a/python/pyspark/resource/information.py
+++ b/python/pyspark/resource/information.py
@@ -18,7 +18,7 @@
from typing import List
-class ResourceInformation(object):
+class ResourceInformation:
"""
Class to hold information about a type of Resource. A resource could be a
GPU, FPGA, etc.
diff --git a/python/pyspark/resource/profile.py b/python/pyspark/resource/profile.py
index f2f2ba7..f54aea2 100644
--- a/python/pyspark/resource/profile.py
+++ b/python/pyspark/resource/profile.py
@@ -26,7 +26,7 @@ from pyspark.resource.requests import (
)
-class ResourceProfile(object):
+class ResourceProfile:
"""
Resource profile to associate with an RDD. A
:class:`pyspark.resource.ResourceProfile`
@@ -102,7 +102,7 @@ class ResourceProfile(object):
return self._executor_resource_requests
-class ResourceProfileBuilder(object):
+class ResourceProfileBuilder:
"""
Resource profile Builder to build a resource profile to associate with an
RDD.
diff --git a/python/pyspark/resource/requests.py b/python/pyspark/resource/requests.py
index f578956..f5f07ec 100644
--- a/python/pyspark/resource/requests.py
+++ b/python/pyspark/resource/requests.py
@@ -21,7 +21,7 @@ from py4j.java_gateway import JavaObject, JVMView
from pyspark.util import _parse_memory # type: ignore[attr-defined]
-class ExecutorResourceRequest(object):
+class ExecutorResourceRequest:
"""
An Executor resource request. This is used in conjunction with the
ResourceProfile to
programmatically specify the resources needed for an RDD that will be
applied at the
@@ -94,7 +94,7 @@ class ExecutorResourceRequest(object):
return self._vendor
-class ExecutorResourceRequests(object):
+class ExecutorResourceRequests:
"""
A set of Executor resource requests. This is used in conjunction with the
@@ -230,7 +230,7 @@ class ExecutorResourceRequests(object):
return self._executor_resources
-class TaskResourceRequest(object):
+class TaskResourceRequest:
"""
A task resource request. This is used in conjunction with the
:class:`pyspark.resource.ResourceProfile` to programmatically specify the
resources
@@ -267,7 +267,7 @@ class TaskResourceRequest(object):
return self._amount
-class TaskResourceRequests(object):
+class TaskResourceRequests:
"""
A set of task resource requests. This is used in conjunction with the
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index 766ea64..a0941afd 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -78,7 +78,7 @@ __all__ = [
]
-class SpecialLengths(object):
+class SpecialLengths:
END_OF_DATA_SECTION = -1
PYTHON_EXCEPTION_THROWN = -2
TIMING_DATA = -3
@@ -87,7 +87,7 @@ class SpecialLengths(object):
START_ARROW_STREAM = -6
-class Serializer(object):
+class Serializer:
def dump_stream(self, iterator, stream):
"""
Serialize an iterator of objects to the output stream.
@@ -605,7 +605,7 @@ def write_with_length(obj, stream):
stream.write(obj)
-class ChunkedStream(object):
+class ChunkedStream:
"""
This is a file-like object takes a stream of data, of unknown length, and
breaks it into fixed
diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py
index bd45566..410519d 100644
--- a/python/pyspark/shuffle.py
+++ b/python/pyspark/shuffle.py
@@ -90,7 +90,7 @@ MemoryBytesSpilled = 0
DiskBytesSpilled = 0
-class Aggregator(object):
+class Aggregator:
"""
Aggregator has tree functions to merge values into combiner.
@@ -117,7 +117,7 @@ class SimpleAggregator(Aggregator):
Aggregator.__init__(self, lambda x: x, combiner, combiner)
-class Merger(object):
+class Merger:
"""
Merge shuffled data together by aggregator
@@ -438,7 +438,7 @@ class ExternalMerger(Merger):
shutil.rmtree(d, True)
-class ExternalSorter(object):
+class ExternalSorter:
"""
ExternalSorter will divide the elements into chunks, sort them in
memory and dump them into disks, finally merge them back.
@@ -528,7 +528,7 @@ class ExternalSorter(object):
return heapq.merge(*chunks, key=key, reverse=reverse)
-class ExternalList(object):
+class ExternalList:
"""
ExternalList can have many items which cannot be hold in memory in
the same time.
@@ -663,7 +663,7 @@ class ExternalListOfList(ExternalList):
yield v
-class GroupByKey(object):
+class GroupByKey:
"""
Group a sorted iterator as [(k1, it1), (k2, it2), ...]
diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
index b0c9148..bbe6227 100644
--- a/python/pyspark/sql/catalog.py
+++ b/python/pyspark/sql/catalog.py
@@ -59,7 +59,7 @@ class Function(NamedTuple):
isTemporary: bool
-class Catalog(object):
+class Catalog:
"""User-facing catalog API, accessible through `SparkSession.catalog`.
This is a thin wrapper around its Scala implementation
org.apache.spark.sql.catalog.Catalog.
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 26c34b8..2f78608 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -176,7 +176,7 @@ def _reverse_op(
return _
-class Column(object):
+class Column:
"""
A column in a DataFrame.
diff --git a/python/pyspark/sql/conf.py b/python/pyspark/sql/conf.py
index 8489e55..7e8a565 100644
--- a/python/pyspark/sql/conf.py
+++ b/python/pyspark/sql/conf.py
@@ -24,7 +24,7 @@ from pyspark import since, _NoValue # type:
ignore[attr-defined]
from pyspark._globals import _NoValueType
-class RuntimeConfig(object):
+class RuntimeConfig:
"""User-facing configuration API, accessible through `SparkSession.conf`.
Options set here are automatically propagated to the Hadoop configuration
during I/O.
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index b64de05..78b2064 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -60,7 +60,7 @@ __all__ = ["SQLContext", "HiveContext"]
# TODO: ignore[attr-defined] will be removed, once SparkContext is inlined
-class SQLContext(object):
+class SQLContext:
"""The entry point for working with structured data (rows and columns) in
Spark, in Spark 1.x.
As of Spark 2.0, this is replaced by :class:`SparkSession`. However, we
are keeping the class
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index e2574e5..a1071e0 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -3284,7 +3284,7 @@ def _to_scala_map(sc: SparkContext, jm: Dict) ->
JavaObject:
return sc._jvm.PythonUtils.toScalaMap(jm) # type: ignore[attr-defined]
-class DataFrameNaFunctions(object):
+class DataFrameNaFunctions:
"""Functionality for working with missing data in :class:`DataFrame`.
.. versionadded:: 1.4
@@ -3356,7 +3356,7 @@ class DataFrameNaFunctions(object):
replace.__doc__ = DataFrame.replace.__doc__
-class DataFrameStatFunctions(object):
+class DataFrameStatFunctions:
"""Functionality for statistic functions with :class:`DataFrame`.
.. versionadded:: 1.4
diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index 045115f7..70bb0f7 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -48,7 +48,7 @@ if TYPE_CHECKING:
from pyspark.sql import DataFrame
-class PandasConversionMixin(object):
+class PandasConversionMixin:
"""
Min-in for the conversion from Spark to pandas. Currently, only
:class:`DataFrame`
can use this class.
@@ -361,7 +361,7 @@ class PandasConversionMixin(object):
return [batches[i] for i in batch_order]
-class SparkConversionMixin(object):
+class SparkConversionMixin:
"""
Min-in for the conversion from pandas to Spark. Currently, only
:class:`SparkSession`
can use this class.
diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py
index ca3263b..5f50220 100644
--- a/python/pyspark/sql/pandas/functions.py
+++ b/python/pyspark/sql/pandas/functions.py
@@ -26,7 +26,7 @@ from pyspark.sql.types import DataType
from pyspark.sql.udf import _create_udf
-class PandasUDFType(object):
+class PandasUDFType:
"""Pandas UDF Types. See :meth:`pyspark.sql.functions.pandas_udf`."""
SCALAR = PythonEvalType.SQL_SCALAR_PANDAS_UDF
diff --git a/python/pyspark/sql/pandas/group_ops.py b/python/pyspark/sql/pandas/group_ops.py
index 593b72d..81afb82 100644
--- a/python/pyspark/sql/pandas/group_ops.py
+++ b/python/pyspark/sql/pandas/group_ops.py
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
from pyspark.sql.group import GroupedData
-class PandasGroupedOpsMixin(object):
+class PandasGroupedOpsMixin:
"""
Min-in for pandas grouped operations. Currently, only :class:`GroupedData`
can use this class.
@@ -237,7 +237,7 @@ class PandasGroupedOpsMixin(object):
return PandasCogroupedOps(self, other)
-class PandasCogroupedOps(object):
+class PandasCogroupedOps:
"""
A logical grouping of two :class:`GroupedData`,
created by :func:`GroupedData.cogroup`.
diff --git a/python/pyspark/sql/pandas/map_ops.py b/python/pyspark/sql/pandas/map_ops.py
index ce1480f..c1c29ec 100644
--- a/python/pyspark/sql/pandas/map_ops.py
+++ b/python/pyspark/sql/pandas/map_ops.py
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
from pyspark.sql.pandas._typing import PandasMapIterFunction, ArrowMapIterFunction
-class PandasMapOpsMixin(object):
+class PandasMapOpsMixin:
"""
Min-in for pandas map operations. Currently, only :class:`DataFrame`
can use this class.
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index 44276a4..47c98c8 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -22,7 +22,7 @@ Serializers for PyArrow and pandas conversions. See `pyspark.serializers` for mo
from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer
-class SpecialLengths(object):
+class SpecialLengths:
END_OF_DATA_SECTION = -1
PYTHON_EXCEPTION_THROWN = -2
TIMING_DATA = -3
diff --git a/python/pyspark/sql/pandas/typehints.py b/python/pyspark/sql/pandas/typehints.py
index 1779b7b..167104c 100644
--- a/python/pyspark/sql/pandas/typehints.py
+++ b/python/pyspark/sql/pandas/typehints.py
@@ -138,7 +138,7 @@ def infer_eval_type(
def check_tuple_annotation(
annotation: Any, parameter_check_func: Optional[Callable[[Any], bool]] = None
) -> bool:
- # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`.
+ # Tuple has _name but other types have __name__
# Check if the name is Tuple first. After that, check the generic types.
name = getattr(annotation, "_name", getattr(annotation, "__name__", None))
return name == "Tuple" and (
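For context on the comment rewrite above: on CPython 3.7 and later, typing.Tuple (and its subscriptions) carries the typing-internal _name attribute, while ordinary classes only expose __name__, which is exactly what the getattr chain relies on. A rough sketch of that lookup, written against CPython 3.7+ behaviour (the helper name is made up for illustration):

    from typing import Tuple

    def annotation_name(annotation):
        # Prefer the typing-internal '_name', fall back to '__name__'.
        return getattr(annotation, "_name", getattr(annotation, "__name__", None))

    print(annotation_name(Tuple))  # 'Tuple' on CPython 3.7+
    print(annotation_name(int))    # 'int'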
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 2ab3f52..c827de4 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -37,7 +37,7 @@ PathOrPaths = Union[str, List[str]]
TupleOrListOfString = Union[List[str], Tuple[str, ...]]
-class OptionUtils(object):
+class OptionUtils:
def _set_opts(
self,
schema: Optional[Union[StructType, str]] = None,
@@ -1349,7 +1349,7 @@ class DataFrameWriter(OptionUtils):
self.mode(mode)._jwrite.jdbc(url, table, jprop)
-class DataFrameWriterV2(object):
+class DataFrameWriterV2:
"""
Interface used to write a class:`pyspark.sql.dataframe.DataFrame`
to external storage using the v2 API.
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 6ff63bc..fefd73c 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -138,7 +138,7 @@ class SparkSession(SparkConversionMixin):
[(1, 'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
"""
- class Builder(object):
+ class Builder:
"""Builder for :class:`SparkSession`."""
_lock = RLock()
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 74593d0..30618fa 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -36,7 +36,7 @@ if TYPE_CHECKING:
__all__ = ["StreamingQuery", "StreamingQueryManager", "DataStreamReader",
"DataStreamWriter"]
-class StreamingQuery(object):
+class StreamingQuery:
"""
A handle to a query that is executing continuously in the background as new data arrives.
All these methods are thread-safe.
@@ -211,7 +211,7 @@ class StreamingQuery(object):
return None
-class StreamingQueryManager(object):
+class StreamingQueryManager:
"""A class to manage all the :class:`StreamingQuery` StreamingQueries
active.
.. versionadded:: 2.0.0
@@ -840,7 +840,7 @@ class DataStreamReader(OptionUtils):
raise TypeError("tableName can be only a single string")
-class DataStreamWriter(object):
+class DataStreamWriter:
"""
Interface used to write a streaming :class:`DataFrame <pyspark.sql.DataFrame>` to external
storage systems (e.g. file systems, key-value stores, etc).
diff --git a/python/pyspark/sql/tests/test_streaming.py b/python/pyspark/sql/tests/test_streaming.py
index 3750d02..783eaae 100644
--- a/python/pyspark/sql/tests/test_streaming.py
+++ b/python/pyspark/sql/tests/test_streaming.py
@@ -373,7 +373,7 @@ class StreamingTests(ReusedSQLTestCase):
def __setstate__(self, state):
self.open_events_dir, self.process_events_dir, self.close_events_dir = state
- # Those foreach tests are failed in Python 3.6 and macOS High Sierra by defined rules
+ # Those foreach tests are failed in macOS High Sierra by defined rules
# at http://sealiesoftware.com/blog/archive/2017/6/5/Objective-C_and_fork_in_macOS_1013.html
# To work around this, OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES.
def test_streaming_foreach_with_simple_function(self):
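For readers hitting the macOS issue referenced in the comment above, the workaround is an environment variable rather than a code change; it is normally exported in the shell before the test process starts, but a Python-flavoured sketch of the same setting would be:

    import os

    # Sketch only: set before any fork happens on macOS 10.13+, otherwise
    # forked Python workers may abort inside Objective-C initializers.
    os.environ.setdefault("OBJC_DISABLE_INITIALIZE_FORK_SAFETY", "YES")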
diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index 135660f..2502387 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -134,7 +134,7 @@ class TypesTests(ReusedSQLTestCase):
def test_infer_schema_specification(self):
from decimal import Decimal
- class A(object):
+ class A:
def __init__(self):
self.a = 1
diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py
index 52d6fa4..7f421aa 100644
--- a/python/pyspark/sql/tests/test_udf.py
+++ b/python/pyspark/sql/tests/test_udf.py
@@ -570,7 +570,7 @@ class UDFTests(ReusedSQLTestCase):
self.assertEqual(f, f_.func)
self.assertEqual(return_type, f_.returnType)
- class F(object):
+ class F:
"""Identity"""
def __call__(self, x):
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 3847ce4..9cf767a 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -76,7 +76,7 @@ __all__ = [
]
-class DataType(object):
+class DataType:
"""Base class for data types."""
def __repr__(self) -> str:
@@ -1910,7 +1910,7 @@ class Row(tuple):
return "<Row(%s)>" % ", ".join("%r" % field for field in self)
-class DateConverter(object):
+class DateConverter:
def can_convert(self, obj: Any) -> bool:
return isinstance(obj, datetime.date)
@@ -1919,7 +1919,7 @@ class DateConverter(object):
return Date.valueOf(obj.strftime("%Y-%m-%d"))
-class DatetimeConverter(object):
+class DatetimeConverter:
def can_convert(self, obj: Any) -> bool:
return isinstance(obj, datetime.datetime)
@@ -1933,7 +1933,7 @@ class DatetimeConverter(object):
return t
-class DatetimeNTZConverter(object):
+class DatetimeNTZConverter:
def can_convert(self, obj: Any) -> bool:
from pyspark.sql.utils import is_timestamp_ntz_preferred
@@ -1953,7 +1953,7 @@ class DatetimeNTZConverter(object):
)
-class DayTimeIntervalTypeConverter(object):
+class DayTimeIntervalTypeConverter:
def can_convert(self, obj: Any) -> bool:
return isinstance(obj, datetime.timedelta)
diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py
index 0b47f87..317bd47 100644
--- a/python/pyspark/sql/udf.py
+++ b/python/pyspark/sql/udf.py
@@ -73,7 +73,7 @@ def _create_udf(
return udf_obj._wrapped()
-class UserDefinedFunction(object):
+class UserDefinedFunction:
"""
User defined function in Python
@@ -303,7 +303,7 @@ class UserDefinedFunction(object):
return self
-class UDFRegistration(object):
+class UDFRegistration:
"""
Wrapper for user-defined function registration. This instance can be accessed by
:attr:`spark.udf` or :attr:`sqlContext.udf`.
diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py
index ee6ffe8..8f9049a 100644
--- a/python/pyspark/sql/utils.py
+++ b/python/pyspark/sql/utils.py
@@ -258,7 +258,7 @@ def require_test_compiled() -> None:
)
-class ForeachBatchFunction(object):
+class ForeachBatchFunction:
"""
This is the Python implementation of Java interface 'ForeachBatchFunction'. This wraps
the user-defined 'foreachBatch' function such that it can be called from the JVM when
diff --git a/python/pyspark/sql/window.py b/python/pyspark/sql/window.py
index 2957b29..019fd21 100644
--- a/python/pyspark/sql/window.py
+++ b/python/pyspark/sql/window.py
@@ -36,7 +36,7 @@ def _to_java_cols(cols: Tuple[Union["ColumnOrName", List["ColumnOrName"]], ...])
return _to_seq(sc, cast(Iterable["ColumnOrName"], cols), _to_java_column)
-class Window(object):
+class Window:
"""
Utility functions for defining window in DataFrames.
@@ -220,7 +220,7 @@ class Window(object):
return WindowSpec(jspec)
-class WindowSpec(object):
+class WindowSpec:
"""
A window specification that defines the partitioning, ordering,
and frame boundaries.
diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py
index a994671..758d0a6 100644
--- a/python/pyspark/statcounter.py
+++ b/python/pyspark/statcounter.py
@@ -29,7 +29,7 @@ except ImportError:
sqrt = math.sqrt # type: ignore[assignment]
-class StatCounter(object):
+class StatCounter:
def __init__(self, values: Optional[Iterable[float]] = None):
if values is None:
values = list()
diff --git a/python/pyspark/status.py b/python/pyspark/status.py
index d2e1b0c..193b9ff 100644
--- a/python/pyspark/status.py
+++ b/python/pyspark/status.py
@@ -47,7 +47,7 @@ class SparkStageInfo(NamedTuple):
numFailedTasks: int
-class StatusTracker(object):
+class StatusTracker:
"""
Low-level status reporting APIs for monitoring job and stage progress.
diff --git a/python/pyspark/storagelevel.py b/python/pyspark/storagelevel.py
index 279f4ea4..754e0b9 100644
--- a/python/pyspark/storagelevel.py
+++ b/python/pyspark/storagelevel.py
@@ -20,7 +20,7 @@ __all__ = ["StorageLevel"]
from typing import ClassVar
-class StorageLevel(object):
+class StorageLevel:
"""
Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory,
diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py
index 5b76f09..cc9875d 100644
--- a/python/pyspark/streaming/context.py
+++ b/python/pyspark/streaming/context.py
@@ -27,7 +27,7 @@ from pyspark.streaming.util import TransformFunction, TransformFunctionSerialize
__all__ = ["StreamingContext"]
-class StreamingContext(object):
+class StreamingContext:
"""
Main entry point for Spark Streaming functionality. A StreamingContext
represents the connection to a Spark cluster, and can be used to create
diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py
index 3f5e0ce..0c1aa19 100644
--- a/python/pyspark/streaming/dstream.py
+++ b/python/pyspark/streaming/dstream.py
@@ -31,7 +31,7 @@ from pyspark.resultiterable import ResultIterable
__all__ = ["DStream"]
-class DStream(object):
+class DStream:
"""
A Discretized Stream (DStream), the basic abstraction in Spark Streaming,
is a continuous sequence of RDDs (of the same type) representing a
diff --git a/python/pyspark/streaming/kinesis.py b/python/pyspark/streaming/kinesis.py
index f973fed..b7298a3 100644
--- a/python/pyspark/streaming/kinesis.py
+++ b/python/pyspark/streaming/kinesis.py
@@ -35,7 +35,7 @@ def utf8_decoder(s: Optional[bytes]) -> Optional[str]:
return s.decode("utf-8")
-class KinesisUtils(object):
+class KinesisUtils:
@staticmethod
@overload
def createStream(
@@ -188,5 +188,5 @@ class KinesisUtils(object):
return stream.map(lambda v: decoder(v))
-class InitialPositionInStream(object):
+class InitialPositionInStream:
LATEST, TRIM_HORIZON = (0, 1)
diff --git a/python/pyspark/streaming/listener.py b/python/pyspark/streaming/listener.py
index 675f27e..75c4aa2 100644
--- a/python/pyspark/streaming/listener.py
+++ b/python/pyspark/streaming/listener.py
@@ -21,7 +21,7 @@ from typing import Any
__all__ = ["StreamingListener"]
-class StreamingListener(object):
+class StreamingListener:
def __init__(self) -> None:
pass
diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py
index 19ee458..57b1615 100644
--- a/python/pyspark/streaming/util.py
+++ b/python/pyspark/streaming/util.py
@@ -25,7 +25,7 @@ from py4j.java_gateway import is_instance_of
from pyspark import SparkContext, RDD
-class TransformFunction(object):
+class TransformFunction:
"""
This class wraps a function RDD[X] -> RDD[Y] that was passed to
DStream.transform(), allowing it to be called from Java via Py4J's
@@ -91,7 +91,7 @@ class TransformFunction(object):
implements = ["org.apache.spark.streaming.api.python.PythonTransformFunction"]
-class TransformFunctionSerializer(object):
+class TransformFunctionSerializer:
"""
This class implements a serializer for PythonTransformFunction Java
objects.
diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py
index 7333a68..627456b 100644
--- a/python/pyspark/taskcontext.py
+++ b/python/pyspark/taskcontext.py
@@ -21,7 +21,7 @@ from pyspark.resource import ResourceInformation
from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer
-class TaskContext(object):
+class TaskContext:
"""
Contextual information about a task which can be read or mutated during
@@ -282,7 +282,7 @@ class BarrierTaskContext(TaskContext):
return [BarrierTaskInfo(h.strip()) for h in addresses.split(",")]
-class BarrierTaskInfo(object):
+class BarrierTaskInfo:
"""
Carries all task infos of a barrier task.
diff --git a/python/pyspark/testing/pandasutils.py b/python/pyspark/testing/pandasutils.py
index 42cc14b..5da2f9e 100644
--- a/python/pyspark/testing/pandasutils.py
+++ b/python/pyspark/testing/pandasutils.py
@@ -247,7 +247,7 @@ class PandasOnSparkTestCase(unittest.TestCase, SQLTestUtils):
return obj
-class TestUtils(object):
+class TestUtils:
@contextmanager
def temp_dir(self):
tmp = tempfile.mkdtemp()
diff --git a/python/pyspark/testing/sqlutils.py b/python/pyspark/testing/sqlutils.py
index 652cd45..3eb58ff 100644
--- a/python/pyspark/testing/sqlutils.py
+++ b/python/pyspark/testing/sqlutils.py
@@ -154,13 +154,13 @@ class PythonOnlyPoint(ExamplePoint):
__UDT__ = PythonOnlyUDT() # type: ignore
-class MyObject(object):
+class MyObject:
def __init__(self, key, value):
self.key = key
self.value = value
-class SQLTestUtils(object):
+class SQLTestUtils:
"""
This util assumes the instance of this to have 'spark' attribute, having a spark session.
It is usually used with 'ReusedSQLTestCase' class but can be used if you feel sure the
diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py
index 0cde179..b97bfe6 100644
--- a/python/pyspark/testing/utils.py
+++ b/python/pyspark/testing/utils.py
@@ -98,7 +98,7 @@ def eventually(condition, timeout=30.0, catch_assertions=False):
)
-class QuietTest(object):
+class QuietTest:
def __init__(self, sc):
self.log4j = sc._jvm.org.apache.log4j
@@ -138,7 +138,7 @@ class ReusedPySparkTestCase(unittest.TestCase):
cls.sc.stop()
-class ByteArrayOutput(object):
+class ByteArrayOutput:
def __init__(self):
self.buffer = bytearray()
diff --git a/python/pyspark/tests/test_serializers.py b/python/pyspark/tests/test_serializers.py
index 019f527..1c04295 100644
--- a/python/pyspark/tests/test_serializers.py
+++ b/python/pyspark/tests/test_serializers.py
@@ -81,7 +81,7 @@ class SerializationTestCase(unittest.TestCase):
ser = CloudPickleSerializer()
- class C(object):
+ class C:
def __getattr__(self, item):
return item
@@ -113,7 +113,7 @@ class SerializationTestCase(unittest.TestCase):
self.assertEqual(out1, out2)
def test_func_globals(self):
- class Unpicklable(object):
+ class Unpicklable:
def __reduce__(self):
raise RuntimeError("not picklable")
diff --git a/python/pyspark/tests/test_util.py b/python/pyspark/tests/test_util.py
index a1519da..7291083 100644
--- a/python/pyspark/tests/test_util.py
+++ b/python/pyspark/tests/test_util.py
@@ -23,7 +23,7 @@ from pyspark.testing.utils import PySparkTestCase
class KeywordOnlyTests(unittest.TestCase):
- class Wrapped(object):
+ class Wrapped:
@keyword_only
def set(self, x=None, y=None):
if "x" in self._input_kwargs:
@@ -46,7 +46,7 @@ class KeywordOnlyTests(unittest.TestCase):
def test_kwarg_ownership(self):
# test _input_kwargs is owned by each class instance and not a shared static variable
- class Setter(object):
+ class Setter:
@keyword_only
def set(self, x=None, other=None, other_x=None):
if "other" in self._input_kwargs:
diff --git a/python/pyspark/traceback_utils.py b/python/pyspark/traceback_utils.py
index 4a51295..af4169e 100644
--- a/python/pyspark/traceback_utils.py
+++ b/python/pyspark/traceback_utils.py
@@ -46,7 +46,7 @@ def first_spark_call():
return CallSite(function=sfun, file=ufile, linenum=uline)
-class SCCallSiteSync(object):
+class SCCallSiteSync:
"""
Helper for setting the spark context call site.
diff --git a/python/pyspark/util.py b/python/pyspark/util.py
index 6af489d..3a54a90 100644
--- a/python/pyspark/util.py
+++ b/python/pyspark/util.py
@@ -39,7 +39,7 @@ def print_exec(stream: TextIO) -> None:
traceback.print_exception(ei[0], ei[1], ei[2], None, stream)
-class VersionUtils(object):
+class VersionUtils:
"""
Provides utility method to determine Spark versions with given input string.
"""
diff --git a/python/setup.py b/python/setup.py
index 89e74f0..008b62f 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -272,11 +272,10 @@ try:
'numpy>=1.14',
],
},
- python_requires='>=3.6',
+ python_requires='>=3.7',
classifiers=[
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: Apache Software License',
- 'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
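The setup.py change above is what actually blocks installation on Python 3.6: pip reads the python_requires metadata and refuses to install a release whose requirement the running interpreter does not satisfy, while the trove classifiers are informational only. A minimal, hypothetical packaging sketch of the same idea (package name and version are placeholders):

    from setuptools import setup

    setup(
        name="example-pkg",        # placeholder name
        version="0.0.1",           # placeholder version
        python_requires=">=3.7",   # the field pip enforces at install time
        classifiers=[
            "Programming Language :: Python :: 3.7",
            "Programming Language :: Python :: 3.8",
            "Programming Language :: Python :: 3.9",
        ],
    )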
diff --git a/python/test_support/userlibrary.py b/python/test_support/userlibrary.py
index 90cd307..2d262ba 100755
--- a/python/test_support/userlibrary.py
+++ b/python/test_support/userlibrary.py
@@ -20,7 +20,7 @@ Used to test shipping of code dependencies with SparkContext.addPyFile().
"""
-class UserClass(object):
+class UserClass:
def hello(self):
return "Hello World!"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]