This is an automated email from the ASF dual-hosted git repository. lindong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/flink-ml.git
commit 58fce034a36d0e2416f03c61a4de450cc709bfc8 Author: zhangzp <[email protected]> AuthorDate: Mon Jun 20 10:02:59 2022 +0800 [FLINK-27877] Add benchmark configuration for StringIndexer, StandardScaler and Bucketizer --- .../datagenerator/common/DoubleGenerator.java | 58 +++++++++++++++++ .../common/RandomStringGenerator.java | 76 ++++++++++++++++++++++ .../src/main/resources/bucketizer-benchmark.json | 53 +++++++++++++++ .../main/resources/standardscaler-benchmark.json | 42 ++++++++++++ .../main/resources/stringindexer-benchmark.json | 46 +++++++++++++ .../flink/ml/benchmark/DataGeneratorTest.java | 51 +++++++++++++++ 6 files changed, 326 insertions(+) diff --git a/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/DoubleGenerator.java b/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/DoubleGenerator.java new file mode 100644 index 0000000..3dffe52 --- /dev/null +++ b/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/DoubleGenerator.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.benchmark.datagenerator.common; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import java.util.Arrays; + +/** A DataGenerator which creates a table of doubles. */ +public class DoubleGenerator extends InputTableGenerator<DoubleGenerator> { + + @Override + protected RowGenerator[] getRowGenerators() { + String[][] colNames = getColNames(); + Preconditions.checkState(colNames.length == 1); + int numOutputCols = colNames[0].length; + + return new RowGenerator[] { + new RowGenerator(getNumValues(), getSeed()) { + @Override + public Row nextRow() { + Row r = new Row(numOutputCols); + for (int i = 0; i < numOutputCols; i++) { + r.setField(i, random.nextDouble()); + } + return r; + } + + @Override + protected RowTypeInfo getRowTypeInfo() { + TypeInformation[] outputTypes = new TypeInformation[colNames[0].length]; + Arrays.fill(outputTypes, Types.DOUBLE); + return new RowTypeInfo(outputTypes, colNames[0]); + } + } + }; + } +} diff --git a/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/RandomStringGenerator.java b/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/RandomStringGenerator.java new file mode 100644 index 0000000..b14cf61 --- /dev/null +++ b/flink-ml-benchmark/src/main/java/org/apache/flink/ml/benchmark/datagenerator/common/RandomStringGenerator.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.benchmark.datagenerator.common; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.ml.param.IntParam; +import org.apache.flink.ml.param.Param; +import org.apache.flink.ml.param.ParamValidators; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import java.util.Arrays; + +/** A DataGenerator which creates a table of random strings. */ +public class RandomStringGenerator extends InputTableGenerator<RandomStringGenerator> { + public static final Param<Integer> NUM_DISTINCT_VALUE = + new IntParam( + "numDistinctValue", + "Number of distinct values of the data to be generated.", + 10, + ParamValidators.gt(0)); + + public int getNumDistinctValue() { + return get(NUM_DISTINCT_VALUE); + } + + public RandomStringGenerator setNumDistinctValue(int value) { + return set(NUM_DISTINCT_VALUE, value); + } + + @Override + protected RowGenerator[] getRowGenerators() { + String[][] colNames = getColNames(); + Preconditions.checkState(colNames.length == 1); + int numOutputCols = colNames[0].length; + int numDistinctValues = getNumDistinctValue(); + + return new RowGenerator[] { + new RowGenerator(getNumValues(), getSeed()) { + @Override + public Row nextRow() { + Row r = new Row(numOutputCols); + for (int i = 0; i < numOutputCols; i++) { + r.setField(i, Integer.toString(random.nextInt(numDistinctValues))); + } + return r; + } + + @Override + protected 
RowTypeInfo getRowTypeInfo() { + TypeInformation[] outputTypes = new TypeInformation[colNames[0].length]; + Arrays.fill(outputTypes, Types.STRING); + return new RowTypeInfo(outputTypes, colNames[0]); + } + } + }; + } +} diff --git a/flink-ml-benchmark/src/main/resources/bucketizer-benchmark.json b/flink-ml-benchmark/src/main/resources/bucketizer-benchmark.json new file mode 100644 index 0000000..6362e75 --- /dev/null +++ b/flink-ml-benchmark/src/main/resources/bucketizer-benchmark.json @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +{ + "version": 1, + "bucketizer100000000": { + "inputData": { + "className": "org.apache.flink.ml.benchmark.datagenerator.common.DoubleGenerator", + "paramMap": { + "colNames": [ + [ + "col0" + ] + ], + "seed": 2, + "numValues": 100000000 + } + }, + "stage": { + "className": "org.apache.flink.ml.feature.bucketizer.Bucketizer", + "paramMap": { + "outputCols": [ + "outputCol0" + ], + "handleInvalid": "skip", + "inputCols": [ + "col0" + ], + "splitsArray": [ + [ + -1.0, + 0.0, + 0.5, + 1.0, + 2.0 + ] + ] + } + } + } +} \ No newline at end of file diff --git a/flink-ml-benchmark/src/main/resources/standardscaler-benchmark.json b/flink-ml-benchmark/src/main/resources/standardscaler-benchmark.json new file mode 100644 index 0000000..0995ef9 --- /dev/null +++ b/flink-ml-benchmark/src/main/resources/standardscaler-benchmark.json @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +{ + "version": 1, + "standardscaler10000000": { + "inputData": { + "className": "org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorGenerator", + "paramMap": { + "vectorDim": 100, + "colNames": [ + [ + "features" + ] + ], + "seed": 2, + "numValues": 10000000 + } + }, + "stage": { + "className": "org.apache.flink.ml.feature.standardscaler.StandardScaler", + "paramMap": { + "inputCol": "features", + "withMean": true, + "withStd": true, + "outputCol": "outputCol" + } + } + } +} \ No newline at end of file diff --git a/flink-ml-benchmark/src/main/resources/stringindexer-benchmark.json b/flink-ml-benchmark/src/main/resources/stringindexer-benchmark.json new file mode 100644 index 0000000..f8b97cf --- /dev/null +++ b/flink-ml-benchmark/src/main/resources/stringindexer-benchmark.json @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +{ + "version": 1, + "stringindexer100000000": { + "inputData": { + "className": "org.apache.flink.ml.benchmark.datagenerator.common.RandomStringGenerator", + "paramMap": { + "colNames": [ + [ + "col0" + ] + ], + "seed": 2, + "numValues": 100000000, + "numDistinctValue": 100 + } + }, + "stage": { + "className": "org.apache.flink.ml.feature.stringindexer.StringIndexer", + "paramMap": { + "outputCols": [ + "outputCol0" + ], + "handleInvalid": "skip", + "inputCols": [ + "col0" + ], + "stringOrderType": "arbitrary" + } + } + } +} \ No newline at end of file diff --git a/flink-ml-benchmark/src/test/java/org/apache/flink/ml/benchmark/DataGeneratorTest.java b/flink-ml-benchmark/src/test/java/org/apache/flink/ml/benchmark/DataGeneratorTest.java index 7d2883a..25e9930 100644 --- a/flink-ml-benchmark/src/test/java/org/apache/flink/ml/benchmark/DataGeneratorTest.java +++ b/flink-ml-benchmark/src/test/java/org/apache/flink/ml/benchmark/DataGeneratorTest.java @@ -22,7 +22,9 @@ import org.apache.flink.api.common.restartstrategy.RestartStrategies; import org.apache.flink.configuration.Configuration; import org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorArrayGenerator; import org.apache.flink.ml.benchmark.datagenerator.common.DenseVectorGenerator; +import org.apache.flink.ml.benchmark.datagenerator.common.DoubleGenerator; import org.apache.flink.ml.benchmark.datagenerator.common.LabeledPointWithWeightGenerator; +import org.apache.flink.ml.benchmark.datagenerator.common.RandomStringGenerator; import org.apache.flink.ml.linalg.DenseVector; import org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; @@ -136,4 +138,53 @@ public class DataGeneratorTest { } assertEquals(generator.getNumValues(), count); } + + @Test + public void testRandomStringGenerator() { + String col1 = "col1"; + String col2 = "col2"; + + RandomStringGenerator generator = + new RandomStringGenerator() + 
.setColNames(new String[] {col1, col2}) + .setSeed(2L) + .setNumValues(5) + .setNumDistinctValue(2); + + int count = 0; + for (CloseableIterator<Row> it = generator.getData(tEnv)[0].execute().collect(); + it.hasNext(); ) { + Row row = it.next(); + count++; + String value1 = (String) row.getField(col1); + String value2 = (String) row.getField(col2); + assertTrue(Integer.parseInt(value1) < generator.getNumDistinctValue()); + assertTrue(Integer.parseInt(value2) < generator.getNumDistinctValue()); + } + assertEquals(generator.getNumValues(), count); + } + + @Test + public void testDoubleGenerator() { + String col1 = "col1"; + String col2 = "col2"; + + DoubleGenerator generator = + new DoubleGenerator() + .setColNames(new String[] {"col1", "col2"}) + .setSeed(2L) + .setNumValues(5); + + int count = 0; + for (CloseableIterator<Row> it = generator.getData(tEnv)[0].execute().collect(); + it.hasNext(); ) { + Row row = it.next(); + count++; + double value1 = (Double) row.getField(col1); + double value2 = (Double) row.getField(col2); + assertTrue(value1 <= 1 && value1 >= 0); + assertTrue(value2 <= 1 && value2 >= 0); + } + assertEquals(generator.getNumValues(), count); + } }
