yunfengzhou-hub commented on code in PR #156:
URL: https://github.com/apache/flink-ml/pull/156#discussion_r1015280099
##########
flink-ml-lib/src/main/java/org/apache/flink/ml/feature/vectorassembler/VectorAssembler.java:
##########
@@ -74,64 +85,107 @@ public Table[] transform(Table... inputs) {
DataStream<Row> output =
tEnv.toDataStream(inputs[0])
.flatMap(
- new AssemblerFunc(getInputCols(),
getHandleInvalid()),
+ new AssemblerFunction(
+ getInputCols(), getHandleInvalid(),
getInputSizes()),
outputTypeInfo);
Table outputTable = tEnv.fromDataStream(output);
return new Table[] {outputTable};
}
- private static class AssemblerFunc implements FlatMapFunction<Row, Row> {
+ private static class AssemblerFunction implements FlatMapFunction<Row,
Row> {
private final String[] inputCols;
private final String handleInvalid;
+ private final Integer[] inputSizes;
+ private final boolean keepInvalid;
- public AssemblerFunc(String[] inputCols, String handleInvalid) {
+ public AssemblerFunction(String[] inputCols, String handleInvalid,
Integer[] inputSizes) {
this.inputCols = inputCols;
this.handleInvalid = handleInvalid;
+ this.inputSizes = inputSizes;
+ keepInvalid = handleInvalid.equals(HasHandleInvalid.KEEP_INVALID);
}
@Override
public void flatMap(Row value, Collector<Row> out) {
int nnz = 0;
int vectorSize = 0;
try {
- for (String inputCol : inputCols) {
- Object object = value.getField(inputCol);
- Preconditions.checkNotNull(object, "Input column value
should not be null.");
- if (object instanceof Number) {
- nnz += 1;
- vectorSize += 1;
- } else if (object instanceof SparseVector) {
- nnz += ((SparseVector) object).indices.length;
- vectorSize += ((SparseVector) object).size();
- } else if (object instanceof DenseVector) {
- nnz += ((DenseVector) object).size();
- vectorSize += ((DenseVector) object).size();
+ for (int i = 0; i < inputCols.length; ++i) {
+ Object object = value.getField(inputCols[i]);
+ if (object != null) {
+ if (object instanceof Number) {
+
checkVectorAndNumberSizeIfNotKeepInvalid(inputSizes[i], 1);
+ vectorSize += 1;
+ nnz += 1;
+ } else if (object instanceof SparseVector) {
+ int localSize = ((SparseVector) object).size();
+
checkVectorAndNumberSizeIfNotKeepInvalid(inputSizes[i], localSize);
+ nnz += ((SparseVector) object).indices.length;
+ vectorSize += localSize;
+ } else if (object instanceof DenseVector) {
+ int localSize = ((DenseVector) object).size();
+
checkVectorAndNumberSizeIfNotKeepInvalid(inputSizes[i], localSize);
+
+ vectorSize += localSize;
+ nnz += ((DenseVector) object).size();
+ } else {
+ throw new IllegalArgumentException(
+ "Input type has not been supported yet.
Only Vector and Number types are supported.");
Review Comment:
How about the following error message? It adds detailed information about the
data type that is not supported.
```java
throw new IllegalArgumentException(
String.format("Input type %s has not been supported yet. Only Vector and
Number types are supported.", object.getClass()));
```
##########
flink-ml-python/pyflink/ml/lib/feature/vectorassembler.py:
##########
@@ -27,21 +29,61 @@ class _VectorAssemblerParams(
HasOutputCol,
HasHandleInvalid
):
+
+ """
+ Checks the inputSizes parameter.
+ """
+ def sizes_validator(self) -> ParamValidator[Tuple[int]]:
+ class SizesValidator(ParamValidator[Tuple[int]]):
+ def validate(self, indices: Tuple[int]) -> bool:
+ if indices is None:
+ return False
+ for val in indices:
+ if val < 0:
Review Comment:
Can the values be 0?
##########
flink-ml-python/pyflink/ml/lib/feature/vectorassembler.py:
##########
@@ -27,21 +29,61 @@ class _VectorAssemblerParams(
HasOutputCol,
HasHandleInvalid
):
+
+ """
+ Checks the inputSizes parameter.
+ """
+ def sizes_validator(self) -> ParamValidator[Tuple[int]]:
+ class SizesValidator(ParamValidator[Tuple[int]]):
+ def validate(self, indices: Tuple[int]) -> bool:
+ if indices is None:
+ return False
+ for val in indices:
+ if val < 0:
+ return False
+ return len(indices) != 0
+ return SizesValidator()
+
"""
Params for :class:`VectorAssembler`.
"""
+ INPUT_SIZES: Param[Tuple[int, ...]] = IntArrayParam(
+ "input_sizes",
+ "Sizes of the input elements to be assembled.",
+ None,
+ sizes_validator(None))
+
def __init__(self, java_params):
super(_VectorAssemblerParams, self).__init__(java_params)
+ def set_input_sizes(self, *sizes: int):
+ return self.set(self.INPUT_SIZES, sizes)
+
+ def get_input_sizes(self) -> Tuple[int, ...]:
+ return self.get(self.INPUT_SIZES)
+
+ @property
+ def input_sizes(self) -> Tuple[int, ...]:
+ return self.get_input_sizes()
+
class VectorAssembler(JavaFeatureTransformer, _VectorAssemblerParams):
"""
- A Transformer which combines a given list of input columns into a vector
column. Types of
- input columns must be either vector or numerical value.
-
- The `keep` option of :class:HasHandleInvalid means that we output bad rows
with output column
- set to null.
+ A Transformer which combines a given list of input columns into a vector
column. Input columns
+ would be numerics or vectors whose size is specified by the {@link
#INPUT_SIZES} parameter.
+ Invalid input data with null values or values with wrong sizes would be
dealt with according to
+ the strategy specified by the {@link HasHandleInvalid} parameter as
follows:
Review Comment:
`{@link ...}` is Javadoc syntax and is unsuitable for Python docstrings. Let's
change it back to the form originally used, like `:class:HasHandleInvalid`.
##########
flink-ml-lib/src/test/java/org/apache/flink/ml/feature/VectorAssemblerTest.java:
##########
@@ -45,13 +45,15 @@
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
/** Tests VectorAssembler. */
public class VectorAssemblerTest extends AbstractTestBase {
private StreamTableEnvironment tEnv;
+ private StreamExecutionEnvironment env;
Review Comment:
It is enough to keep this variable as a local variable, as it was before.
##########
docs/content/docs/operators/feature/vectorassembler.md:
##########
@@ -156,12 +168,13 @@ input_data_table = t_env.from_data_stream(
[DenseVectorTypeInfo(), Types.DOUBLE(), SparseVectorTypeInfo()])))
# create a vector assembler object and initialize its parameters
-vector_assembler = VectorAssembler() \
- .set_input_cols('vec', 'num', 'sparse_vec') \
- .set_output_col('assembled_vec') \
+vector_assembler = VectorAssembler()\
+ .set_input_cols('vec', 'num', 'sparse_vec')\
+ .set_output_col('assembled_vec')\
+ .set_input_sizes(2, 1, 5)\
Review Comment:
It might be better to change `)\` to `) \`, to stay consistent with the Python
example file.
##########
flink-ml-python/pyflink/ml/lib/feature/vectorassembler.py:
##########
@@ -27,21 +29,61 @@ class _VectorAssemblerParams(
HasOutputCol,
HasHandleInvalid
):
+
+ """
+ Checks the inputSizes parameter.
+ """
+ def sizes_validator(self) -> ParamValidator[Tuple[int]]:
+ class SizesValidator(ParamValidator[Tuple[int]]):
+ def validate(self, indices: Tuple[int]) -> bool:
+ if indices is None:
+ return False
+ for val in indices:
+ if val < 0:
+ return False
+ return len(indices) != 0
+ return SizesValidator()
+
"""
Params for :class:`VectorAssembler`.
"""
+ INPUT_SIZES: Param[Tuple[int, ...]] = IntArrayParam(
+ "input_sizes",
+ "Sizes of the input elements to be assembled.",
+ None,
+ sizes_validator(None))
Review Comment:
Would it be enough to use `SizesValidator()` directly, instead of adding a
`sizes_validator` method?
##########
flink-ml-lib/src/main/java/org/apache/flink/ml/feature/vectorassembler/VectorAssemblerParams.java:
##########
@@ -21,11 +21,44 @@
import org.apache.flink.ml.common.param.HasHandleInvalid;
import org.apache.flink.ml.common.param.HasInputCols;
import org.apache.flink.ml.common.param.HasOutputCol;
+import org.apache.flink.ml.param.IntArrayParam;
+import org.apache.flink.ml.param.Param;
+import org.apache.flink.ml.param.ParamValidator;
/**
* Params of {@link VectorAssembler}.
*
* @param <T> The class type of this instance.
*/
public interface VectorAssemblerParams<T>
- extends HasInputCols<T>, HasOutputCol<T>, HasHandleInvalid<T> {}
+ extends HasInputCols<T>, HasOutputCol<T>, HasHandleInvalid<T> {
+ Param<Integer[]> INPUT_SIZES =
+ new IntArrayParam(
+ "inputSizes",
+ "Sizes of the input elements to be assembled.",
+ null,
+ sizesValidator());
+
+ default Integer[] getInputSizes() {
+ return get(INPUT_SIZES);
+ }
+
+ default T setInputSizes(Integer... value) {
+ return set(INPUT_SIZES, value);
+ }
+
+ // Checks the inputSizes parameter.
+ static ParamValidator<Integer[]> sizesValidator() {
+ return inputSizes -> {
+ if (inputSizes == null) {
+ return false;
+ }
+ for (Integer ele : inputSizes) {
+ if (ele < 0) {
Review Comment:
Can the size of the elements to be assembled be equal to 0?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]