Github user jkbradley commented on a diff in the pull request:
https://github.com/apache/spark/pull/20112#discussion_r159096655
--- Diff: python/pyspark/ml/feature.py ---
@@ -3466,6 +3466,72 @@ def selectedFeatures(self):
return self._call_java("selectedFeatures")
+@inherit_doc
+class VectorSizeHint(JavaTransformer, HasInputCol, HasHandleInvalid,
JavaMLReadable,
+ JavaMLWritable):
+ """
+ A feature transformer that adds size information to the metadata of a vector column.
+ VectorAssembler needs size information for its input columns and cannot be used on streaming
+ dataframes without this metadata.
+
+ >>> from pyspark.ml.linalg import Vectors
+ >>> from pyspark.ml import Pipeline, PipelineModel
+ >>> data = [(Vectors.dense([1., 2., 3.]), 4.)]
+ >>> df = spark.createDataFrame(data, ["vector", "float"])
+ >>>
+ >>> sizeHint = VectorSizeHint(inputCol="vector", size=3,
handleInvalid="skip")
+ >>> vecAssembler = VectorAssembler(inputCols=["vector", "float"],
outputCol="assembled")
+ >>> pipeline = Pipeline(stages=[sizeHint, vecAssembler])
+ >>>
+ >>> pipelineModel = pipeline.fit(df)
+ >>> pipelineModel.transform(df).head().assembled
+ DenseVector([1.0, 2.0, 3.0, 4.0])
+ >>> vectorSizeHintPath = temp_path + "/vector-size-hint-pipeline"
+ >>> pipelineModel.save(vectorSizeHintPath)
+ >>> loadedPipeline = PipelineModel.load(vectorSizeHintPath)
+ >>> loaded = loadedPipeline.transform(df).head().assembled
+ >>> expected = pipelineModel.transform(df).head().assembled
+ >>> loaded == expected
+ True
+
+ .. versionadded:: 2.3.0
+ .. note:: Experimental
+ """
+
+ size = Param(Params._dummy(), "size", "Size of vectors in column.",
+ typeConverter=TypeConverters.toInt)
+
+ @since("2.3.0")
+ def getSize(self):
+ """ Gets size param, the size of vectors in `inputCol`."""
+ self.getOrDefault(self.size)
+
+ @since("2.3.0")
+ def setSize(self, value):
+ """ Sets size param, the size of vectors in `inputCol`."""
+ self._set(size=value)
+
+ @keyword_only
+ def __init__(self, inputCol=None, size=None, handleInvalid="error"):
--- End diff --
Let's stick with the order that all other Python classes follow: dummy
Params, __init__, then Param setters & getters.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]