This is an automated email from the ASF dual-hosted git repository.

jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
     new 67c0ad7  Add Kaggle w. gluon pipeline and k fold cross validation (#7915)
67c0ad7 is described below

commit 67c0ad7b945e90935d97f4663504fad025880f89
Author: Aston Zhang <ast...@amazon.com>
AuthorDate: Wed Sep 20 13:06:07 2017 -0700

    Add Kaggle w. gluon pipeline and k fold cross validation (#7915)

    * Add Kaggle w. gluon pipeline and k fold cross validation

    * add descriptions
---
 example/gluon/kaggle_k_fold_cross_validation.py | 164 ++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/example/gluon/kaggle_k_fold_cross_validation.py b/example/gluon/kaggle_k_fold_cross_validation.py
new file mode 100644
index 0000000..7911e4d
--- /dev/null
+++ b/example/gluon/kaggle_k_fold_cross_validation.py
@@ -0,0 +1,164 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# This example provides an end-to-end pipeline for a common Kaggle competition.
+# The entire pipeline includes common utilities such as k-fold cross validation
+# and data pre-processing.
+#
+# Specifically, the example studies the `House Prices: Advanced Regression
+# Techniques` challenge as a case study.
+#
+# The link to the problem on Kaggle:
+# https://www.kaggle.com/c/house-prices-advanced-regression-techniques
+
+import numpy as np
+import pandas as pd
+from mxnet import autograd
+from mxnet import gluon
+from mxnet import ndarray as nd
+
+# After logging in to www.kaggle.com, the training and testing data sets can be
+# downloaded at:
+# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/train.csv
+# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/test.csv
+train = pd.read_csv("train.csv")
+test = pd.read_csv("test.csv")
+all_X = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
+                   test.loc[:, 'MSSubClass':'SaleCondition']))
+
+# Get all the numerical features and apply standardization.
+numeric_feas = all_X.dtypes[all_X.dtypes != "object"].index
+all_X[numeric_feas] = all_X[numeric_feas].apply(lambda x:
+                                                (x - x.mean()) / (x.std()))
+# Convert categorical feature values to numerical (including N/A).
+all_X = pd.get_dummies(all_X, dummy_na=True)
+# Approximate N/A feature values by the mean value of the current feature.
+all_X = all_X.fillna(all_X.mean())
+
+num_train = train.shape[0]
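# A minimal, self-contained sketch (separate from the committed file above;
# the 'Area' and 'Zone' columns are invented for illustration) of what the
# three pre-processing steps do on a toy frame:
import pandas as pd

toy = pd.DataFrame({'Area': [1.0, 2.0, None, 4.0],
                    'Zone': ['A', 'B', None, 'A']})
numeric = toy.dtypes[toy.dtypes != "object"].index
# Standardize numeric columns (pandas skips N/A when computing mean/std).
toy[numeric] = toy[numeric].apply(lambda x: (x - x.mean()) / x.std())
# One-hot encode categoricals; dummy_na=True gives N/A its own indicator column.
toy = pd.get_dummies(toy, dummy_na=True)
# Fill the remaining numeric N/A entries with the column mean.
toy = toy.fillna(toy.mean())
print(toy)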
+
+# Convert data formats to NDArrays to feed into gluon.
+X_train = all_X[:num_train].as_matrix()
+X_test = all_X[num_train:].as_matrix()
+y_train = train.SalePrice.as_matrix()
+
+X_train = nd.array(X_train)
+y_train = nd.array(y_train)
+# NDArray.reshape returns a new array rather than reshaping in place, so the
+# result must be assigned back.
+y_train = y_train.reshape((num_train, 1))
+
+X_test = nd.array(X_test)
+square_loss = gluon.loss.L2Loss()
+
+def get_rmse_log(net, X_train, y_train):
+    """Gets root MSE between the logarithms of the prediction and the truth."""
+    num_train = X_train.shape[0]
+    # Clip predictions to [1, inf) so that taking their log is safe.
+    clipped_preds = nd.clip(net(X_train), 1, float('inf'))
+    # L2Loss is half the squared error, hence the factor of 2.
+    return np.sqrt(2 * nd.sum(square_loss(
+        nd.log(clipped_preds), nd.log(y_train))).asscalar() / num_train)
+
+def get_net():
+    """Gets a neural network. Better results are obtained with modifications."""
+    net = gluon.nn.Sequential()
+    with net.name_scope():
+        net.add(gluon.nn.Dense(50, activation="relu"))
+        net.add(gluon.nn.Dense(1))
+    net.initialize()
+    return net
+
+def train(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
+          weight_decay, batch_size):
+    """Trains the model."""
+    dataset_train = gluon.data.ArrayDataset(X_train, y_train)
+    data_iter_train = gluon.data.DataLoader(dataset_train, batch_size,
+                                            shuffle=True)
+    trainer = gluon.Trainer(net.collect_params(), 'adam',
+                            {'learning_rate': learning_rate,
+                             'wd': weight_decay})
+    net.collect_params().initialize(force_reinit=True)
+    for epoch in range(epochs):
+        for data, label in data_iter_train:
+            with autograd.record():
+                output = net(data)
+                loss = square_loss(output, label)
+            loss.backward()
+            trainer.step(batch_size)
+        avg_loss = get_rmse_log(net, X_train, y_train)
+        if epoch > verbose_epoch:
+            print("Epoch %d, train loss: %f" % (epoch, avg_loss))
+    return avg_loss
+
+def k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
+                       learning_rate, weight_decay, batch_size):
+    """Conducts k-fold cross validation for the model."""
+    assert k > 1
+    fold_size = X_train.shape[0] // k
+
+    train_loss_sum = 0.0
+    test_loss_sum = 0.0
+    for test_idx in range(k):
+        # The test_idx-th fold is held out for validation ...
+        X_val_test = X_train[test_idx * fold_size: (test_idx + 1) *
+                             fold_size, :]
+        y_val_test = y_train[test_idx * fold_size: (test_idx + 1) * fold_size]
+        val_train_defined = False
+        # ... and the remaining k - 1 folds are concatenated for training.
+        for i in range(k):
+            if i != test_idx:
+                X_cur_fold = X_train[i * fold_size: (i + 1) * fold_size, :]
+                y_cur_fold = y_train[i * fold_size: (i + 1) * fold_size]
+                if not val_train_defined:
+                    X_val_train = X_cur_fold
+                    y_val_train = y_cur_fold
+                    val_train_defined = True
+                else:
+                    X_val_train = nd.concat(X_val_train, X_cur_fold, dim=0)
+                    y_val_train = nd.concat(y_val_train, y_cur_fold, dim=0)
+        net = get_net()
+        train_loss = train(net, X_val_train, y_val_train, epochs, verbose_epoch,
+                           learning_rate, weight_decay, batch_size)
+        train_loss_sum += train_loss
+        test_loss = get_rmse_log(net, X_val_test, y_val_test)
+        print("Test loss: %f" % test_loss)
+        test_loss_sum += test_loss
+    return train_loss_sum / k, test_loss_sum / k
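# A small sketch (separate from the committed file; N is made up) of how the
# slicing in k_fold_cross_valid partitions the rows. fold_size is computed
# with integer division, so when k does not divide the number of rows, the
# last N % k rows land in neither the training nor the validation split.
N, k_demo = 103, 5
fold_size_demo = N // k_demo                    # 20; rows 100..102 are unused
for test_idx in range(k_demo):
    lo, hi = test_idx * fold_size_demo, (test_idx + 1) * fold_size_demo
    print("fold %d: validation rows [%d, %d), all other folds train" %
          (test_idx, lo, hi))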
+
+# The sets of parameters. Better results are obtained with modifications.
+# These parameters can be fine-tuned with k-fold cross-validation.
+k = 5
+epochs = 100
+verbose_epoch = 95
+learning_rate = 0.3
+weight_decay = 100
+batch_size = 100
+
+train_loss, test_loss = \
+    k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
+                       learning_rate, weight_decay, batch_size)
+print("%d-fold validation: Avg train loss: %f, Avg test loss: %f" %
+      (k, train_loss, test_loss))
+
+def learn(epochs, verbose_epoch, X_train, y_train, test, learning_rate,
+          weight_decay, batch_size):
+    """Trains the model and predicts on the test data set."""
+    net = get_net()
+    _ = train(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
+              weight_decay, batch_size)
+    preds = net(X_test).asnumpy()
+    test['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
+    submission = pd.concat([test['Id'], test['SalePrice']], axis=1)
+    submission.to_csv('submission.csv', index=False)
+
+learn(epochs, verbose_epoch, X_train, y_train, test, learning_rate,
+      weight_decay, batch_size)
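The comment above notes that these parameters can be fine-tuned with k-fold
cross-validation. As one illustration of that (a sketch under assumptions, not
part of the commit; the candidate values are invented), the k_fold_cross_valid
function defined earlier can drive a small grid search over learning rates:

best_lr, best_loss = None, float('inf')
for lr in [0.1, 0.3, 1.0]:  # illustrative candidate values only
    _, val_loss = k_fold_cross_valid(k, epochs, verbose_epoch, X_train,
                                     y_train, lr, weight_decay, batch_size)
    if val_loss < best_loss:
        best_lr, best_loss = lr, val_loss
print("Best learning rate: %s (avg validation loss: %f)" % (best_lr, best_loss))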