[scikit-learn] Validation curve - Learning curve

S Hamidizade Wed, 28 Mar 2018 23:56:03 -0700

Dear Mr. / Ms.

I would appreciate if you could let me know in the following example code:


from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import
train_test_split,StratifiedKFold,learning_curve,validation_curve,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(train_sizes, train_scores, test_scores, title,
alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score',
color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red',
marker='o')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean -
test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F-measure')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()


def plot_validation_curve(param_range, train_scores, test_scores,
title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score',
color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red',
marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean -
test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()

X, y = make_classification(n_classes=2, class_sep=2,weights=[0.9,
0.1], n_informative=3, n_redundant=1, flip_y=0, n_features=20,
n_clusters_per_class=1, n_samples=1000, random_state=10)
print('Original dataset shape {}'.format(Counter(y)))

ln = X.shape
names = ["x%s" % i for i in range(1, ln[1] + 1)]

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=0)
st=StandardScaler()

rg = LogisticRegression(class_weight = { 0:1, 1:6.5 }, random_state =
42, solver = 'saga',max_iter=100,n_jobs=-1)

param_grid = {'clf__C': [0.001,0.01,0.1,0.002,0.02,0.005,0.0007,.0006,0.0005],
              'clf__class_weight':[{ 0:1, 1:6 },{ 0:1, 1:4 },{ 0:1,
1:5.5 },{ 0:1, 1:4.5 },{ 0:1, 1:5 }]
              }

pipeline = Pipeline(steps=[('scaler', st),
                           ('clf', rg )])

cv=StratifiedKFold(n_splits=5,random_state=42)
rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring =  'f1')
rg_cv.fit(X_train, y_train)
print("Tuned rg best params: {}".format(rg_cv.best_params_))

ypred = rg_cv.predict(X_train)
print(classification_report(y_train, ypred))
print('######################')
ypred2 = rg_cv.predict(X_test)
print(classification_report(y_test, ypred2))

plt.figure(figsize=(9,6))
param_range1=[i / 10000.0 for i in range(1, 11)]
param_range2=[{ 0:1, 1:6 },{ 0:1, 1:4 },{ 0:1, 1:5.5 },{ 0:1, 1:4.5
},{ 0:1, 1:5 }]

if __name__ == '__main__':
    train_sizes, train_scores, test_scores = learning_curve(
              estimator= rg_cv.best_estimator_ , X= X_train, y = y_train,
                train_sizes=np.arange(0.1,1.1,0.1), cv= cv,
scoring='f1', n_jobs= - 1)

    plot_learning_curve(train_sizes, train_scores, test_scores,
title='Learning curve for Logistic Regression')

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
param_name="clf__C", param_range=param_range1,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range1, train_scores, test_scores,
title="Validation Curve for C", alpha=0.1)

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
param_name="clf__class_weight", param_range=param_range2,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range2, train_scores, test_scores,
title="Validation Curve for class_weight", alpha=0.1)


1- Why when the best estimator of GridSearchCv is passed into the learning
curve function, it prints all the previous print lines several times (run
on windows)?

2- How to plot validation curve for class weight? TypeError: float()
argument must be a string or a number, not 'dict'
Thanks in advance.
Best regards,

_______________________________________________
scikit-learn mailing list
scikit-learn@python.org
https://mail.python.org/mailman/listinfo/scikit-learn

[scikit-learn] Validation curve - Learning curve

Reply via email to