This is an automated email from the ASF dual-hosted git repository.

lindong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/flink-ml.git

commit 58fce034a36d0e2416f03c61a4de450cc709bfc8
Author: zhangzp <[email protected]>
AuthorDate: Mon Jun 20 10:02:59 2022 +0800

    [FLINK-27877] Add benchmark configuration for StringIndexer, StandardScaler and Bucketizer
---
 .../datagenerator/common/DoubleGenerator.java      | 58 +++++++++++++++++
 .../common/RandomStringGenerator.java              | 76 ++++++++++++++++++++++
 .../src/main/resources/bucketizer-benchmark.json   | 53 +++++++++++++++
 .../main/resources/standardscaler-benchmark.json   | 42 ++++++++++++
 .../main/resources/stringindexer-benchmark.json    | 46 +++++++++++++
 .../flink/ml/benchmark/DataGeneratorTest.java      | 51 +++++++++++++++
 6 files changed, 326 insertions(+)

diff --git a/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/DoubleGenerator.java b/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/DoubleGenerator.java
new file mode 100644
index 0000000..3dffe52
--- /dev/null
+++ b/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/DoubleGenerator.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.benchmark.datagenerator.common;
+
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.api.java.typeutils.RowTypeInfo;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.Preconditions;
+
+import java.util.Arrays;
+
+/** A DataGenerator which creates a table of doubles. */
+public class DoubleGenerator extends InputTableGenerator<DoubleGenerator> {
+
+    @Override
+    protected RowGenerator[] getRowGenerators() {
+        String[][] colNames = getColNames();
+        Preconditions.checkState(colNames.length == 1);
+        int numOutputCols = colNames[0].length;
+
+        return new RowGenerator[] {
+            new RowGenerator(getNumValues(), getSeed()) {
+                @Override
+                public Row nextRow() {
+                    Row r = new Row(numOutputCols);
+                    for (int i = 0; i < numOutputCols; i++) {
+                        r.setField(i, random.nextDouble());
+                    }
+                    return r;
+                }
+
+                @Override
+                protected RowTypeInfo getRowTypeInfo() {
+                    TypeInformation[] outputTypes = new TypeInformation[colNames[0].length];
+                    Arrays.fill(outputTypes, Types.DOUBLE);
+                    return new RowTypeInfo(outputTypes, colNames[0]);
+                }
+            }
+        };
+    }
+}
diff --git a/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/RandomStringGenerator.java b/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/RandomStringGenerator.java
new file mode 100644
index 0000000..b14cf61
--- /dev/null
+++ b/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/RandomStringGenerator.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.benchmark.datagenerator.common;
+
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.api.java.typeutils.RowTypeInfo;
+import org.apache.flink.ml.param.IntParam;
+import org.apache.flink.ml.param.Param;
+import org.apache.flink.ml.param.ParamValidators;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.Preconditions;
+
+import java.util.Arrays;
+
+/** A DataGenerator which creates a table of random strings. */
+public class RandomStringGenerator extends InputTableGenerator<RandomStringGenerator> {
+    public static final Param<Integer> NUM_DISTINCT_VALUE =
+            new IntParam(
+                    "numDistinctValue",
+                    "Number of distinct values of the data to be generated.",
+                    10,
+                    ParamValidators.gt(0));
+
+    public int getNumDistinctValue() {
+        return get(NUM_DISTINCT_VALUE);
+    }
+
+    public RandomStringGenerator setNumDistinctValue(int value) {
+        return set(NUM_DISTINCT_VALUE, value);
+    }
+
+    @Override
+    protected RowGenerator[] getRowGenerators() {
+        String[][] colNames = getColNames();
+        Preconditions.checkState(colNames.length == 1);
+        int numOutputCols = colNames[0].length;
+        int numDistinctValues = getNumDistinctValue();
+
+        return new RowGenerator[] {
+            new RowGenerator(getNumValues(), getSeed()) {
+                @Override
+                public Row nextRow() {
+                    Row r = new Row(numOutputCols);
+                    for (int i = 0; i < numOutputCols; i++) {
+                        r.setField(i, Integer.toString(random.nextInt(numDistinctValues)));
+                    }
+                    return r;
+                }
+
+                @Override
+                protected RowTypeInfo getRowTypeInfo() {
+                    TypeInformation[] outputTypes = new TypeInformation[colNames[0].length];
+                    Arrays.fill(outputTypes, Types.STRING);
+                    return new RowTypeInfo(outputTypes, colNames[0]);
+                }
+            }
+        };
+    }
+}
diff --git a/flink-ml-benchmark/src/main/resources/bucketizer-benchmark.json b/flink-ml-benchmark/src/main/resources/bucketizer-benchmark.json
new file mode 100644
index 0000000..6362e75
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/bucketizer-benchmark.json
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "bucketizer100000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DoubleGenerator",
+      "paramMap": {
+        "colNames": [
+          [
+            "col0"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 100000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.bucketizer.Bucketizer",
+      "paramMap": {
+        "outputCols": [
+          "outputCol0"
+        ],
+        "handleInvalid": "skip",
+        "inputCols": [
+          "col0"
+        ],
+        "splitsArray": [
+          [
+            -1.0,
+            0.0,
+            0.5,
+            1.0,
+            2.0
+          ]
+        ]
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/main/resources/standardscaler-benchmark.json b/flink-ml-benchmark/src/main/resources/standardscaler-benchmark.json
new file mode 100644
index 0000000..0995ef9
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/standardscaler-benchmark.json
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "standardscaler10000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorGenerator",
+      "paramMap": {
+        "vectorDim": 100,
+        "colNames": [
+          [
+            "features"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 10000000
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.standardscaler.StandardScaler",
+      "paramMap": {
+        "inputCol": "features",
+        "withMean": true,
+        "withStd": true,
+        "outputCol": "outputCol"
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/main/resources/stringindexer-benchmark.json b/flink-ml-benchmark/src/main/resources/stringindexer-benchmark.json
new file mode 100644
index 0000000..f8b97cf
--- /dev/null
+++ b/flink-ml-benchmark/src/main/resources/stringindexer-benchmark.json
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+{
+  "version": 1,
+  "stringindexer100000000": {
+    "inputData": {
+      "className": "org.apache.flink.ml.benchmark.datagenerator.common.RandomStringGenerator",
+      "paramMap": {
+        "colNames": [
+          [
+            "col0"
+          ]
+        ],
+        "seed": 2,
+        "numValues": 100000000,
+        "numDistinctValue": 100
+      }
+    },
+    "stage": {
+      "className": "org.apache.flink.ml.feature.stringindexer.StringIndexer",
+      "paramMap": {
+        "outputCols": [
+          "outputCol0"
+        ],
+        "handleInvalid": "skip",
+        "inputCols": [
+          "col0"
+        ],
+        "stringOrderType": "arbitrary"
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/flink-ml-benchmark/src/test/java/org/apache/flink/ml/benchmark/DataGeneratorTest.java b/flink-ml-benchmark/src/test/java/org/apache/flink/ml/benchmark/DataGeneratorTest.java
index 7d2883a..25e9930 100644
--- a/flink-ml-benchmark/src/test/java/org/apache/flink/ml/benchmark/DataGeneratorTest.java
+++ b/flink-ml-benchmark/src/test/java/org/apache/flink/ml/benchmark/DataGeneratorTest.java
@@ -22,7 +22,9 @@ import org.apache.flink.api.common.restartstrategy.RestartStrategies;
 import org.apache.flink.configuration.Configuration;
 import org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorArrayGenerator;
 import org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorGenerator;
+import org.apache.flink.ml.benchmark.datagenerator.common.DoubleGenerator;
 import org.apache.flink.ml.benchmark.datagenerator.common.LabeledPointWithWeightGenerator;
+import org.apache.flink.ml.benchmark.datagenerator.common.RandomStringGenerator;
 import org.apache.flink.ml.linalg.DenseVector;
 import org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions;
 import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
@@ -136,4 +138,53 @@ public class DataGeneratorTest {
         }
         assertEquals(generator.getNumValues(), count);
     }
+
+    @Test
+    public void testRandomStringGenerator() {
+        String col1 = "col1";
+        String col2 = "col2";
+
+        RandomStringGenerator generator =
+                new RandomStringGenerator()
+                        .setColNames(new String[] {col1, col2})
+                        .setSeed(2L)
+                        .setNumValues(5)
+                        .setNumDistinctValue(2);
+
+        int count = 0;
+        for (CloseableIterator<Row> it = generator.getData(tEnv)[0].execute().collect();
+                it.hasNext(); ) {
+            Row row = it.next();
+            count++;
+            String value1 = (String) row.getField(col1);
+            String value2 = (String) row.getField(col2);
+            assertTrue(Integer.parseInt(value1) < generator.getNumDistinctValue());
+            assertTrue(Integer.parseInt(value2) < generator.getNumDistinctValue());
+        }
+        assertEquals(generator.getNumValues(), count);
+    }
+
+    @Test
+    public void testDoubleGenerator() {
+        String col1 = "col1";
+        String col2 = "col2";
+
+        DoubleGenerator generator =
+                new DoubleGenerator()
+                        .setColNames(new String[] {"col1", "col2"})
+                        .setSeed(2L)
+                        .setNumValues(5);
+
+        int count = 0;
+        for (CloseableIterator<Row> it = generator.getData(tEnv)[0].execute().collect();
+                it.hasNext(); ) {
+            Row row = it.next();
+            count++;
+            double value1 = (Double) row.getField(col1);
+            double value2 = (Double) row.getField(col2);
+            assertTrue(value1 <= 1 && value1 >= 0);
+            assertTrue(value2 <= 1 && value2 >= 0);
+        }
+        assertEquals(generator.getNumValues(), count);
+    }
 }

Reply via email to