zhengruifeng commented on code in PR #43199:
URL: https://github.com/apache/spark/pull/43199#discussion_r1348144761


##########
python/pyspark/ml/connect/feature.py:
##########
@@ -256,3 +262,124 @@ def _load_core_model(self, path: str) -> None:
         self.scale_values = sk_model.scale_
         self.mean_values = sk_model.mean_
         self.n_samples_seen = sk_model.n_samples_seen_
+
+
+class VectorAssembler(
+    Transformer,
+    HasInputCols,
+    HasOutputCol,
+    HasInputFeatureSizeList,
+    HasHandleInvalid,
+    ParamsReadWrite,
+):
+    """
+    A feature transformer that merges multiple input columns into a array type 
column.

Review Comment:
   ```suggestion
       A feature transformer that merges multiple input columns into a vector 
type column.
   ```



##########
python/pyspark/ml/connect/feature.py:
##########
@@ -256,3 +262,124 @@ def _load_core_model(self, path: str) -> None:
         self.scale_values = sk_model.scale_
         self.mean_values = sk_model.mean_
         self.n_samples_seen = sk_model.n_samples_seen_
+
+
+class VectorAssembler(
+    Transformer,
+    HasInputCols,
+    HasOutputCol,
+    HasInputFeatureSizeList,
+    HasHandleInvalid,
+    ParamsReadWrite,
+):
+    """
+    A feature transformer that merges multiple input columns into a array type 
column.
+    You need to set param `inputCols` for specifying input column names,

Review Comment:
   What about having a `Parameters` section for those parameters?



##########
python/pyspark/ml/connect/feature.py:
##########
@@ -256,3 +262,124 @@ def _load_core_model(self, path: str) -> None:
         self.scale_values = sk_model.scale_
         self.mean_values = sk_model.mean_
         self.n_samples_seen = sk_model.n_samples_seen_
+
+
+class VectorAssembler(
+    Transformer,
+    HasInputCols,
+    HasOutputCol,
+    HasInputFeatureSizeList,
+    HasHandleInvalid,
+    ParamsReadWrite,
+):
+    """
+    A feature transformer that merges multiple input columns into a array type 
column.
+    You need to set param `inputCols` for specifying input column names,
+    and set param `inputFeatureSizeList` for specifying corresponding input 
column
+    feature size, for scalar type input column, corresponding feature size 
must be set to 1,
+    otherwise, set corresponding feature size to feature array length.
+    Output column is "array<double>" type and contains array of assembled
features.
+    All elements in input feature columns must be convertible to double type.
+
+    You can set 'handler_invalid' param to specify how to handle invalid input 
value
+    (None or NaN), if it is set to 'error', error is thrown for invalid input 
value,
+    if it is set to 'keep', it returns relevant number of NaN in the output.
+
+    .. versionadded:: 4.0.0
+
+    Examples
+    --------
+    >>> from pyspark.ml.connect.feature import VectorAssembler
+    >>> import numpy as np
+    >>>
+    >>> spark_df = spark.createDataFrame(
+    ...     [
+    ...         ([2.0, 3.5, 1.5], 3.0, True, 1),
+    ...         ([-3.0, np.nan, -2.5], 4.0, False, 2),
+    ...     ],
+    ...     schema=["f1", "f2", "f3", "f4"],
+    ... )
+    >>> assembler = VectorAssembler(
+    ...     inputCols=["f1", "f2", "f3", "f4"],
+    ...     outputCol="out",
+    ...     inputFeatureSizeList=[3, 1, 1, 1],
+    ...     handleInvalid="keep",
+    ... )
+    >>> assembler.transform(spark_df).select("out").show(truncate=False)
+    """
+
+    _input_kwargs: Dict[str, Any]
+
+    @keyword_only
+    def __init__(
+        self,
+        *,
+        inputCols: Optional[List[str]] = None,
+        outputCol: Optional[str] = None,
+        inputFeatureSizeList: Optional[List[int]] = None,

Review Comment:
   The existing implementation doesn't have this parameter.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to