IGNITE-9158: [ML] Pipeline this closes #4715
Project: http://git-wip-us.apache.org/repos/asf/ignite/repo Commit: http://git-wip-us.apache.org/repos/asf/ignite/commit/cd0ead32 Tree: http://git-wip-us.apache.org/repos/asf/ignite/tree/cd0ead32 Diff: http://git-wip-us.apache.org/repos/asf/ignite/diff/cd0ead32 Branch: refs/heads/ignite-7251 Commit: cd0ead329e7621a50bc0dc2aaa9acaab83c5eb78 Parents: 7acf4e5 Author: zaleslaw <zaleslaw....@gmail.com> Authored: Tue Sep 18 15:10:16 2018 +0300 Committer: Yury Babak <yba...@gridgain.com> Committed: Tue Sep 18 15:10:16 2018 +0300 ---------------------------------------------------------------------- .../examples/ml/tutorial/Step_3_Categorial.java | 4 +- .../Step_3_Categorial_with_One_Hot_Encoder.java | 6 +- .../ml/tutorial/Step_4_Add_age_fare.java | 4 +- .../examples/ml/tutorial/Step_5_Scaling.java | 4 +- .../tutorial/Step_5_Scaling_with_Pipeline.java | 107 +++++++++++++ .../ignite/examples/ml/tutorial/Step_6_KNN.java | 4 +- .../ml/tutorial/Step_7_Split_train_test.java | 4 +- .../ignite/examples/ml/tutorial/Step_8_CV.java | 4 +- .../ml/tutorial/Step_8_CV_with_Param_Grid.java | 4 +- .../ml/tutorial/Step_9_Go_to_LogReg.java | 4 +- .../org/apache/ignite/ml/pipeline/Pipeline.java | 152 +++++++++++++++++++ .../apache/ignite/ml/pipeline/PipelineMdl.java | 84 ++++++++++ .../apache/ignite/ml/pipeline/package-info.java | 22 +++ .../preprocessing/encoding/EncoderTrainer.java | 4 +- .../org/apache/ignite/ml/IgniteMLTestSuite.java | 2 + .../ignite/ml/pipeline/PipelineMdlTest.java | 75 +++++++++ .../apache/ignite/ml/pipeline/PipelineTest.java | 111 ++++++++++++++ .../ignite/ml/pipeline/PipelineTestSuite.java | 33 ++++ .../encoding/EncoderTrainerTest.java | 12 +- .../scoring/evaluator/EvaluatorTest.java | 4 +- 20 files changed, 615 insertions(+), 29 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial.java 
---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial.java index 54726ec..03ff527 100644 --- a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial.java +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial.java @@ -66,8 +66,8 @@ public class Step_3_Categorial { IgniteBiFunction<Integer, Object[], Vector> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(1) - .encodeFeature(4) + .withEncodedFeature(1) + .withEncodedFeature(4) .fit(ignite, dataCache, featureExtractor http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial_with_One_Hot_Encoder.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial_with_One_Hot_Encoder.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial_with_One_Hot_Encoder.java index 4a10c29..a4535ba 100644 --- a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial_with_One_Hot_Encoder.java +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_3_Categorial_with_One_Hot_Encoder.java @@ -68,9 +68,9 @@ public class Step_3_Categorial_with_One_Hot_Encoder { IgniteBiFunction<Integer, Object[], Vector> oneHotEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.ONE_HOT_ENCODER) - .encodeFeature(0) - .encodeFeature(1) - .encodeFeature(4) + .withEncodedFeature(0) + .withEncodedFeature(1) + .withEncodedFeature(4) .fit(ignite, dataCache, featureExtractor 
http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_4_Add_age_fare.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_4_Add_age_fare.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_4_Add_age_fare.java index 166de44..789d7e8 100644 --- a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_4_Add_age_fare.java +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_4_Add_age_fare.java @@ -65,8 +65,8 @@ public class Step_4_Add_age_fare { IgniteBiFunction<Integer, Object[], Vector> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(1) - .encodeFeature(6) // <--- Changed index here. + .withEncodedFeature(1) + .withEncodedFeature(6) // <--- Changed index here. .fit(ignite, dataCache, featureExtractor http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_5_Scaling.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_5_Scaling.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_5_Scaling.java index 937a041..e3de585 100644 --- a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_5_Scaling.java +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_5_Scaling.java @@ -68,8 +68,8 @@ public class Step_5_Scaling { IgniteBiFunction<Integer, Object[], Vector> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(1) - .encodeFeature(6) // <--- Changed index here. + .withEncodedFeature(1) + .withEncodedFeature(6) // <--- Changed index here. 
.fit(ignite, dataCache, featureExtractor http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_5_Scaling_with_Pipeline.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_5_Scaling_with_Pipeline.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_5_Scaling_with_Pipeline.java new file mode 100644 index 0000000..1d5900f --- /dev/null +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_5_Scaling_with_Pipeline.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.examples.ml.tutorial; + +import java.io.FileNotFoundException; +import org.apache.ignite.Ignite; +import org.apache.ignite.IgniteCache; +import org.apache.ignite.Ignition; +import org.apache.ignite.ml.math.functions.IgniteBiFunction; +import org.apache.ignite.ml.pipeline.Pipeline; +import org.apache.ignite.ml.pipeline.PipelineMdl; +import org.apache.ignite.ml.preprocessing.encoding.EncoderTrainer; +import org.apache.ignite.ml.preprocessing.encoding.EncoderType; +import org.apache.ignite.ml.preprocessing.imputing.ImputerTrainer; +import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerTrainer; +import org.apache.ignite.ml.preprocessing.normalization.NormalizationTrainer; +import org.apache.ignite.ml.selection.scoring.evaluator.Evaluator; +import org.apache.ignite.ml.selection.scoring.metric.Accuracy; +import org.apache.ignite.ml.tree.DecisionTreeClassificationTrainer; +import org.apache.ignite.thread.IgniteThread; + +/** + * {@link MinMaxScalerTrainer} and {@link NormalizationTrainer} are used in this example due to different values + * distribution in columns and rows. + * <p> + * Code in this example launches Ignite grid and fills the cache with test data (based on Titanic passengers data).</p> + * <p> + * After that it defines preprocessors that extract features from an upstream data and perform other desired changes + * over the extracted data, including the scaling.</p> + * <p> + * Then, it trains the model based on the processed data using decision tree classification.</p> + * <p> + * Finally, this example uses {@link Evaluator} functionality to compute metrics from predictions.</p> + */ +public class Step_5_Scaling_with_Pipeline { + /** Run example. 
*/ + public static void main(String[] args) throws InterruptedException { + System.out.println(); + System.out.println(">>> Tutorial step 5 (scaling) via Pipeline example started."); + + try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) { + IgniteThread igniteThread = new IgniteThread(ignite.configuration().getIgniteInstanceName(), + Step_5_Scaling_with_Pipeline.class.getSimpleName(), () -> { + try { + IgniteCache<Integer, Object[]> dataCache = TitanicUtils.readPassengers(ignite); + + // Defines first preprocessor that extracts features from an upstream data. + // Extracts "pclass", "sibsp", "parch", "sex", "embarked", "age", "fare". + IgniteBiFunction<Integer, Object[], Object[]> featureExtractor + = (k, v) -> new Object[]{v[0], v[3], v[4], v[5], v[6], v[8], v[10]}; + + IgniteBiFunction<Integer, Object[], Double> lbExtractor = (k, v) -> (double) v[1]; + + PipelineMdl<Integer, Object[]> mdl = new Pipeline<Integer, Object[], Object[]>() + .addFeatureExtractor(featureExtractor) + .addLabelExtractor(lbExtractor) + .addPreprocessor(new EncoderTrainer<Integer, Object[]>() + .withEncoderType(EncoderType.STRING_ENCODER) + .withEncodedFeature(1) + .withEncodedFeature(6)) + .addPreprocessor(new ImputerTrainer<Integer, Object[]>()) + .addPreprocessor(new MinMaxScalerTrainer<Integer, Object[]>()) + .addPreprocessor(new NormalizationTrainer<Integer, Object[]>() + .withP(1)) + .addTrainer(new DecisionTreeClassificationTrainer(5, 0)) + .fit(ignite, dataCache); + + System.out.println("\n>>> Trained model: " + mdl); + + double accuracy = Evaluator.evaluate( + dataCache, + mdl, + mdl.getFeatureExtractor(), + mdl.getLabelExtractor(), + new Accuracy<>() + ); + + System.out.println("\n>>> Accuracy " + accuracy); + System.out.println("\n>>> Test Error " + (1 - accuracy)); + + System.out.println(">>> Tutorial step 5 (scaling) via Pipeline example completed."); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + }); + + igniteThread.start(); + 
igniteThread.join(); + } + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_6_KNN.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_6_KNN.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_6_KNN.java index 7d0986d..e99494b 100644 --- a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_6_KNN.java +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_6_KNN.java @@ -69,8 +69,8 @@ public class Step_6_KNN { IgniteBiFunction<Integer, Object[], Vector> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(1) - .encodeFeature(6) // <--- Changed index here. + .withEncodedFeature(1) + .withEncodedFeature(6) // <--- Changed index here. .fit(ignite, dataCache, featureExtractor http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_7_Split_train_test.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_7_Split_train_test.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_7_Split_train_test.java index 9f1a505..2ce2b27 100644 --- a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_7_Split_train_test.java +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_7_Split_train_test.java @@ -74,8 +74,8 @@ public class Step_7_Split_train_test { IgniteBiFunction<Integer, Object[], Vector> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(1) - .encodeFeature(6) // <--- Changed index here. + .withEncodedFeature(1) + .withEncodedFeature(6) // <--- Changed index here. 
.fit(ignite, dataCache, featureExtractor http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV.java index 12e5fe8..83c2cca 100644 --- a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV.java +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV.java @@ -86,8 +86,8 @@ public class Step_8_CV { IgniteBiFunction<Integer, Object[], Vector> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(1) - .encodeFeature(6) // <--- Changed index here. + .withEncodedFeature(1) + .withEncodedFeature(6) // <--- Changed index here. .fit(ignite, dataCache, featureExtractor http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV_with_Param_Grid.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV_with_Param_Grid.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV_with_Param_Grid.java index 785064c..73a0303 100644 --- a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV_with_Param_Grid.java +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV_with_Param_Grid.java @@ -88,8 +88,8 @@ public class Step_8_CV_with_Param_Grid { IgniteBiFunction<Integer, Object[], Vector> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(1) - .encodeFeature(6) // <--- Changed index here. 
+ .withEncodedFeature(1) + .withEncodedFeature(6) // <--- Changed index here. .fit(ignite, dataCache, featureExtractor http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_9_Go_to_LogReg.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_9_Go_to_LogReg.java b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_9_Go_to_LogReg.java index 93dff53..088caf7 100644 --- a/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_9_Go_to_LogReg.java +++ b/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_9_Go_to_LogReg.java @@ -79,8 +79,8 @@ public class Step_9_Go_to_LogReg { IgniteBiFunction<Integer, Object[], Vector> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(1) - .encodeFeature(6) // <--- Changed index here + .withEncodedFeature(1) + .withEncodedFeature(6) // <--- Changed index here .fit(ignite, dataCache, featureExtractor http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/Pipeline.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/Pipeline.java b/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/Pipeline.java new file mode 100644 index 0000000..ba5740e --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/Pipeline.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.pipeline; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import org.apache.ignite.Ignite; +import org.apache.ignite.IgniteCache; +import org.apache.ignite.ml.Model; +import org.apache.ignite.ml.dataset.DatasetBuilder; +import org.apache.ignite.ml.dataset.impl.cache.CacheBasedDatasetBuilder; +import org.apache.ignite.ml.dataset.impl.local.LocalDatasetBuilder; +import org.apache.ignite.ml.math.functions.IgniteBiFunction; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.preprocessing.PreprocessingTrainer; +import org.apache.ignite.ml.trainers.DatasetTrainer; + +/** + * A simple pipeline, which acts as a global trainer which produces a Pipeline Model. + * A Pipeline consists of a sequence of stages, each of which is either a Preprocessing Stage or a Trainer. + * When the {@code fit()} method is called, the stages are executed in order. + * + * @param <K> Type of a key in {@code upstream} data. + * @param <V> Type of a value in {@code upstream} data. + * @param <R> Type of a result in {@code upstream} feature extractor. + */ +public class Pipeline<K, V, R> { + /** Feature extractor. */ + private IgniteBiFunction<K, V, R> finalFeatureExtractor; + + /** Label extractor. */ + private IgniteBiFunction<K, V, Double> lbExtractor; + + /** Preprocessor stages. 
*/ + private List<PreprocessingTrainer> preprocessors = new ArrayList<>(); + + /** Final trainer stage. */ + private DatasetTrainer finalStage; + + /** + * Adds feature extractor as a zero stage. + * + * @param featureExtractor The parameter value. + * @return The updated Pipeline. + */ + public Pipeline<K, V, R> addFeatureExtractor(IgniteBiFunction<K, V, R> featureExtractor) { + this.finalFeatureExtractor = featureExtractor; + return this; + } + + /** + * Adds a label extractor for the produced model. + * + * @param lbExtractor The parameter value. + * @return The updated Pipeline. + */ + public Pipeline<K, V, R> addLabelExtractor(IgniteBiFunction<K, V, Double> lbExtractor) { + this.lbExtractor = lbExtractor; + return this; + } + + /** + * Adds a preprocessor. + * + * @param preprocessor The parameter value. + * @return The updated Pipeline. + */ + public Pipeline<K, V, R> addPreprocessor(PreprocessingTrainer preprocessor) { + preprocessors.add(preprocessor); + return this; + } + + /** + * Adds a trainer. + * + * @param trainer The parameter value. + * @return The updated Pipeline. + */ + public Pipeline<K, V, R> addTrainer(DatasetTrainer trainer) { + this.finalStage = trainer; + return this; + } + + /** + * Fits the pipeline to the input cache. + * + * @param ignite Ignite instance. + * @param cache Ignite cache with {@code upstream} data. + * @return The fitted model based on chain of preprocessors and final trainer. + */ + public PipelineMdl<K, V> fit(Ignite ignite, IgniteCache<K, V> cache) { + DatasetBuilder datasetBuilder = new CacheBasedDatasetBuilder<>(ignite, cache); + return fit(datasetBuilder); + } + + /** + * Fits the pipeline to the input mock data. + * + * @param data Data. + * @param parts Number of partitions. + * @return The fitted model based on chain of preprocessors and final trainer. 
+ */ + public PipelineMdl<K, V> fit(Map<K, V> data, int parts) { + DatasetBuilder datasetBuilder = new LocalDatasetBuilder<>(data, parts); + return fit(datasetBuilder); + } + + /** Fits the pipeline to the input dataset builder. */ + private PipelineMdl<K, V> fit(DatasetBuilder datasetBuilder) { + assert lbExtractor != null; + assert finalFeatureExtractor != null; + + if (finalStage == null) + throw new IllegalStateException("The Pipeline should be finished with the Training Stage."); + + preprocessors.forEach(e -> { + + finalFeatureExtractor = e.fit( + datasetBuilder, + finalFeatureExtractor + ); + }); + + Model<Vector, Double> internalMdl = finalStage + .fit( + datasetBuilder, + finalFeatureExtractor, + lbExtractor + ); + + return new PipelineMdl<K, V>() + .withFeatureExtractor(finalFeatureExtractor) + .withLabelExtractor(lbExtractor) + .withInternalMdl(internalMdl); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/PipelineMdl.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/PipelineMdl.java b/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/PipelineMdl.java new file mode 100644 index 0000000..edd70eb --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/PipelineMdl.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.pipeline; + +import org.apache.ignite.ml.Model; +import org.apache.ignite.ml.math.functions.IgniteBiFunction; +import org.apache.ignite.ml.math.primitives.vector.Vector; + +/** + * Wraps the model produced by {@link Pipeline}. + * + * @param <K> Type of a key in {@code upstream} data. + * @param <V> Type of a value in {@code upstream} data. + */ +public class PipelineMdl<K, V> implements Model<Vector, Double> { + /** Internal model produced by {@link Pipeline}. */ + private Model<Vector, Double> internalMdl; + + /** Feature extractor. */ + private IgniteBiFunction<K, V, Vector> featureExtractor; + + /** Label extractor. 
*/ + private IgniteBiFunction<K, V, Double> lbExtractor; + + /** */ + @Override public Double apply(Vector vector) { + return internalMdl.apply(vector); + } + + /** */ + public IgniteBiFunction<K, V, Vector> getFeatureExtractor() { + return featureExtractor; + } + + /** */ + public IgniteBiFunction<K, V, Double> getLabelExtractor() { + return lbExtractor; + } + + /** */ + public Model<Vector, Double> getInternalMdl() { + return internalMdl; + } + + /** */ + public PipelineMdl<K, V> withInternalMdl(Model<Vector, Double> internalMdl) { + this.internalMdl = internalMdl; + return this; + } + + /** */ + public PipelineMdl<K, V> withFeatureExtractor(IgniteBiFunction featureExtractor) { + this.featureExtractor = featureExtractor; + return this; + } + + /** */ + public PipelineMdl<K, V> withLabelExtractor(IgniteBiFunction<K, V, Double> lbExtractor) { + this.lbExtractor = lbExtractor; + return this; + } + + /** */ + @Override public String toString() { + return "PipelineMdl{" + + "internalMdl=" + internalMdl + + '}'; + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/package-info.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/package-info.java b/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/package-info.java new file mode 100644 index 0000000..d29140a --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/pipeline/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <!-- Package description. --> + * Contains Pipeline API. + */ +package org.apache.ignite.ml.pipeline; http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java index a23d642..8b2d9b7 100644 --- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java +++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java @@ -207,7 +207,7 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V, Object[] * @param idx The index of encoded feature. * @return The changed trainer. */ - public EncoderTrainer<K, V> encodeFeature(int idx) { + public EncoderTrainer<K, V> withEncodedFeature(int idx) { handledIndices.add(idx); return this; } @@ -229,7 +229,7 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V, Object[] * @param handledIndices Indices of features which should be encoded. * @return The changed trainer. 
*/ - public EncoderTrainer<K, V> withEncoderType(Set<Integer> handledIndices) { + public EncoderTrainer<K, V> withEncodedFeatures(Set<Integer> handledIndices) { this.handledIndices = handledIndices; return this; } http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/test/java/org/apache/ignite/ml/IgniteMLTestSuite.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/IgniteMLTestSuite.java b/modules/ml/src/test/java/org/apache/ignite/ml/IgniteMLTestSuite.java index 22ae953..481e1fa 100644 --- a/modules/ml/src/test/java/org/apache/ignite/ml/IgniteMLTestSuite.java +++ b/modules/ml/src/test/java/org/apache/ignite/ml/IgniteMLTestSuite.java @@ -26,6 +26,7 @@ import org.apache.ignite.ml.genetic.GAGridTestSuite; import org.apache.ignite.ml.knn.KNNTestSuite; import org.apache.ignite.ml.math.MathImplMainTestSuite; import org.apache.ignite.ml.nn.MLPTestSuite; +import org.apache.ignite.ml.pipeline.PipelineTestSuite; import org.apache.ignite.ml.preprocessing.PreprocessingTestSuite; import org.apache.ignite.ml.regressions.RegressionsTestSuite; import org.apache.ignite.ml.selection.SelectionTestSuite; @@ -49,6 +50,7 @@ import org.junit.runners.Suite; KNNTestSuite.class, MLPTestSuite.class, DatasetTestSuite.class, + PipelineTestSuite.class, PreprocessingTestSuite.class, GAGridTestSuite.class, SelectionTestSuite.class, http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineMdlTest.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineMdlTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineMdlTest.java new file mode 100644 index 0000000..d740577 --- /dev/null +++ b/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineMdlTest.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.pipeline; + +import org.apache.ignite.ml.TestUtils; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.math.primitives.vector.impl.DenseVector; +import org.apache.ignite.ml.regressions.logistic.binomial.LogisticRegressionModel; +import org.junit.Test; + +/** + * Tests for {@link PipelineMdl}. + */ +public class PipelineMdlTest { + /** Precision in test checks. 
*/ + private static final double PRECISION = 1e-6; + + /** */ + @Test + public void testPredict() { + Vector weights = new DenseVector(new double[] {2.0, 3.0}); + + verifyPredict(getMdl(new LogisticRegressionModel(weights, 1.0).withRawLabels(true))); + } + + private PipelineMdl<Integer, double[]> getMdl(LogisticRegressionModel internalMdl) { + return new PipelineMdl<Integer, double[]>() + .withFeatureExtractor(null) + .withLabelExtractor(null) + .withInternalMdl(internalMdl); + } + + /** */ + private void verifyPredict(PipelineMdl mdl) { + Vector observation = new DenseVector(new double[] {1.0, 1.0}); + TestUtils.assertEquals(sigmoid(1.0 + 2.0 * 1.0 + 3.0 * 1.0), mdl.apply(observation), PRECISION); + + observation = new DenseVector(new double[] {2.0, 1.0}); + TestUtils.assertEquals(sigmoid(1.0 + 2.0 * 2.0 + 3.0 * 1.0), mdl.apply(observation), PRECISION); + + observation = new DenseVector(new double[] {1.0, 2.0}); + TestUtils.assertEquals(sigmoid(1.0 + 2.0 * 1.0 + 3.0 * 2.0), mdl.apply(observation), PRECISION); + + observation = new DenseVector(new double[] {-2.0, 1.0}); + TestUtils.assertEquals(sigmoid(1.0 - 2.0 * 2.0 + 3.0 * 1.0), mdl.apply(observation), PRECISION); + + observation = new DenseVector(new double[] {1.0, -2.0}); + TestUtils.assertEquals(sigmoid(1.0 + 2.0 * 1.0 - 3.0 * 2.0), mdl.apply(observation), PRECISION); + } + + /** + * Sigmoid function. + * + * @param z The regression value. + * @return The result. 
+ */ + private static double sigmoid(double z) { + return 1.0 / (1.0 + Math.exp(-z)); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineTest.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineTest.java new file mode 100644 index 0000000..91bbcd4 --- /dev/null +++ b/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineTest.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.ml.pipeline; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import org.apache.ignite.ml.TestUtils; +import org.apache.ignite.ml.common.TrainerTest; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.math.primitives.vector.VectorUtils; +import org.apache.ignite.ml.nn.UpdatesStrategy; +import org.apache.ignite.ml.optimization.updatecalculators.SimpleGDParameterUpdate; +import org.apache.ignite.ml.optimization.updatecalculators.SimpleGDUpdateCalculator; +import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerTrainer; +import org.apache.ignite.ml.preprocessing.normalization.NormalizationTrainer; +import org.apache.ignite.ml.regressions.logistic.binomial.LogisticRegressionSGDTrainer; +import org.junit.Test; + +/** + * Tests for {@link Pipeline}. + */ +public class PipelineTest extends TrainerTest { + /** + * Fits a pipeline with a min-max scaler, a normalization preprocessor (p = 1) and a logistic regression SGD trainer on the two linearly separable classes, then checks that (100, 10) is predicted as 0 and (10, 100) as 1. + */ + @Test + public void testTrainWithTheLinearlySeparableCase() { + Map<Integer, Double[]> cacheMock = new HashMap<>(); + + for (int i = 0; i < twoLinearlySeparableClasses.length; i++) { + double[] row = twoLinearlySeparableClasses[i]; + Double[] convertedRow = new Double[row.length]; + for (int j = 0; j < row.length; j++) + convertedRow[j] = row[j]; + cacheMock.put(i, convertedRow); + } + + LogisticRegressionSGDTrainer<?> trainer = new LogisticRegressionSGDTrainer<>(new UpdatesStrategy<>( + new SimpleGDUpdateCalculator().withLearningRate(0.2), + SimpleGDParameterUpdate::sumLocal, + SimpleGDParameterUpdate::avg + ), 100000, 10, 100, 123L); + + PipelineMdl<Integer, Double[]> mdl = new Pipeline<Integer, Double[], Vector>() + .addFeatureExtractor((k, v) -> VectorUtils.of(Arrays.copyOfRange(v, 1, v.length))) + .addLabelExtractor((k, v) -> v[0]) + .addPreprocessor(new MinMaxScalerTrainer<Integer, Object[]>()) + .addPreprocessor(new NormalizationTrainer<Integer, Object[]>() + .withP(1)) + .addTrainer(trainer) + 
.fit( + cacheMock, + parts + ); + + TestUtils.assertEquals(0, mdl.apply(VectorUtils.of(100, 10)), PRECISION); + TestUtils.assertEquals(1, mdl.apply(VectorUtils.of(10, 100)), PRECISION); + } + + /** + * Builds the same pipeline without calling {@code addTrainer} (the final stage is missed); the test passes only if {@link IllegalStateException} is thrown. + */ + @Test(expected = IllegalStateException.class) + public void testTrainWithMissedFinalStage() { + Map<Integer, Double[]> cacheMock = new HashMap<>(); + + for (int i = 0; i < twoLinearlySeparableClasses.length; i++) { + double[] row = twoLinearlySeparableClasses[i]; + Double[] convertedRow = new Double[row.length]; + for (int j = 0; j < row.length; j++) + convertedRow[j] = row[j]; + cacheMock.put(i, convertedRow); + } + + LogisticRegressionSGDTrainer<?> trainer = new LogisticRegressionSGDTrainer<>(new UpdatesStrategy<>( + new SimpleGDUpdateCalculator().withLearningRate(0.2), + SimpleGDParameterUpdate::sumLocal, + SimpleGDParameterUpdate::avg + ), 100000, 10, 100, 123L); + + PipelineMdl<Integer, Double[]> mdl = new Pipeline<Integer, Double[], Vector>() + .addFeatureExtractor((k, v) -> VectorUtils.of(Arrays.copyOfRange(v, 1, v.length))) + .addLabelExtractor((k, v) -> v[0]) + .addPreprocessor(new MinMaxScalerTrainer<Integer, Object[]>()) + .addPreprocessor(new NormalizationTrainer<Integer, Object[]>() + .withP(1)) + .fit( + cacheMock, + parts + ); + + TestUtils.assertEquals(0, mdl.apply(VectorUtils.of(100, 10)), PRECISION); + TestUtils.assertEquals(1, mdl.apply(VectorUtils.of(10, 100)), PRECISION); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineTestSuite.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineTestSuite.java b/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineTestSuite.java new file mode 100644 index 0000000..4c28db9 --- /dev/null +++ b/modules/ml/src/test/java/org/apache/ignite/ml/pipeline/PipelineTestSuite.java @@ -0,0 +1,33 @@ +/* 
+ * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.pipeline; + +import org.junit.runner.RunWith; +import org.junit.runners.Suite; + +/** + * Test suite for the pipeline package: runs {@link PipelineTest} and {@link PipelineMdlTest}. + */ +@RunWith(Suite.class) +@Suite.SuiteClasses({ + PipelineTest.class, + PipelineMdlTest.class +}) +public class PipelineTestSuite { + // No-op. 
+} http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java index c0157e9..6d01901 100644 --- a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java +++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java @@ -68,8 +68,8 @@ public class EncoderTrainerTest { EncoderTrainer<Integer, String[]> strEncoderTrainer = new EncoderTrainer<Integer, String[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(0) - .encodeFeature(1); + .withEncodedFeature(0) + .withEncodedFeature(1); EncoderPreprocessor<Integer, String[]> preprocessor = strEncoderTrainer.fit( datasetBuilder, @@ -94,8 +94,8 @@ public class EncoderTrainerTest { EncoderTrainer<Integer, Object[]> strEncoderTrainer = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.ONE_HOT_ENCODER) - .encodeFeature(0) - .encodeFeature(1); + .withEncodedFeature(0) + .withEncodedFeature(1); EncoderPreprocessor<Integer, Object[]> preprocessor = strEncoderTrainer.fit( datasetBuilder, @@ -120,8 +120,8 @@ public class EncoderTrainerTest { EncoderTrainer<Integer, Object[]> strEncoderTrainer = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(0) - .encodeFeature(1); + .withEncodedFeature(0) + .withEncodedFeature(1); EncoderPreprocessor<Integer, Object[]> preprocessor = strEncoderTrainer.fit( datasetBuilder, http://git-wip-us.apache.org/repos/asf/ignite/blob/cd0ead32/modules/ml/src/test/java/org/apache/ignite/ml/selection/scoring/evaluator/EvaluatorTest.java 
---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/selection/scoring/evaluator/EvaluatorTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/selection/scoring/evaluator/EvaluatorTest.java index ad91df0..6f7aa36 100644 --- a/modules/ml/src/test/java/org/apache/ignite/ml/selection/scoring/evaluator/EvaluatorTest.java +++ b/modules/ml/src/test/java/org/apache/ignite/ml/selection/scoring/evaluator/EvaluatorTest.java @@ -286,8 +286,8 @@ public class EvaluatorTest extends GridCommonAbstractTest { IgniteBiFunction<Integer, Object[], Vector> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>() .withEncoderType(EncoderType.STRING_ENCODER) - .encodeFeature(1) - .encodeFeature(6) // <--- Changed index here + .withEncodedFeature(1) + .withEncodedFeature(6) // <--- Changed index here .fit(ignite, cache, featureExtractor