Hello, I need some advice on the usage of K-Fold cross-validation for the master's thesis I'm supervising. As far as I know, we run it with the best parameters, but do we use the training or the test dataset? I'm sharing the Python code that I'm working on. I would appreciate it if you could correct my mistakes. Yours sincerely, Murat Dıramalı
#!/usr/bin/env python
# coding: utf-8
"""Tune a random forest with grid search and K-Fold cross-validation.

Workflow:
  1. Generate a synthetic classification dataset and hold out 20% as a
     test set.
  2. Run a grid search with K-Fold cross-validation on the TRAINING set
     only to pick the hyperparameters.
  3. Re-run K-Fold cross-validation on the training set with the best
     hyperparameters and report the per-fold accuracies.
  4. Train on the whole training set and report accuracy on the held-out
     test set, which is used exactly once, for the final estimate.
"""
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# One seed for every source of randomness so that repeated runs print the
# same numbers.
RANDOM_STATE = 42

# Generate 1000 samples with 10 features
data, labels = make_classification(
    n_samples=1000, n_features=10, random_state=RANDOM_STATE
)

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=RANDOM_STATE
)

# Set up the K-Fold cross-validation.
# With shuffle=True an integer random_state is required for reproducible
# folds; without it the grid search and the manual loop below would be
# evaluated on different partitions of the training set.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

# Set up the Random Forest Classifier (seeded, so that score differences
# between parameter settings are not caused by a different random draw)
rfc = RandomForestClassifier(random_state=RANDOM_STATE)

# Set up the hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Set up the grid search with cross-validation
grid_search = GridSearchCV(
    rfc, param_grid, cv=kf, scoring='accuracy', n_jobs=-1
)

# Run the grid search on the training set only; the test set is not
# touched during model selection.
grid_search.fit(X_train, y_train)

# Report the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

# Run k-fold cross-validation with the best hyperparameters on the
# training set. Each fold is a train/validation split of X_train.
accuracies = []
for train_index, val_index in kf.split(X_train):
    # Split the training set into training and validation sets
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Set up and train the Random Forest Classifier with the best
    # hyperparameters
    fold_model = RandomForestClassifier(
        **best_params, random_state=RANDOM_STATE
    )
    fold_model.fit(X_train_fold, y_train_fold)

    # Make predictions on the validation set and calculate the accuracy
    # score
    y_pred = fold_model.predict(X_val_fold)
    acc = accuracy_score(y_val_fold, y_pred)

    # Add the accuracy score to the list of accuracies
    accuracies.append(acc)

# Report the accuracy scores for each fold
print("Accuracy scores for each fold:", accuracies)

# Train the Random Forest Classifier with the best hyperparameters on the
# entire training set
rfc = RandomForestClassifier(**best_params, random_state=RANDOM_STATE)
rfc.fit(X_train, y_train)

# Make predictions on the test set and calculate the accuracy score.
# This is the only place the held-out test set is used.
y_pred = rfc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", acc)
_______________________________________________ scikit-learn mailing list scikit-learn@python.org https://mail.python.org/mailman/listinfo/scikit-learn