Hi,
The OS denied my process memory when running cross-validation in the script
below. I am still investigating whether the server's scheduler made a
mistake, but as far as I can tell the process had access to 240 GB of memory
and still reproducibly crashes after using 120035176K (roughly 114 GB), with
the error message below. My conda info output is pasted at the very end.
Could it be an sklearn/NumPy/Python issue?
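In case it is the kernel's memory accounting rather than sklearn, this is
the check I have in mind (a Linux-specific sketch; a value of 2 means strict
accounting, under which fork() must be able to commit a full copy of the
parent's address space for each child):

# Sketch (Linux only): inspect the kernel overcommit policy that
# governs whether os.fork() must reserve memory for the child up front.
with open('/proc/sys/vm/overcommit_memory') as f:
    print(f.read().strip())  # 0 = heuristic, 1 = always allow, 2 = strict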
Thanks!
Laszlo
******* Script:
#import scipy as sp
import numpy as np
import pandas as pd
import multiprocessing as mp
# import iopro
from sklearn import grid_search
from sklearn import cross_validation
from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import r2_score
from sklearn.externals import joblib

def main():
    print("Started.")
    # n = 10**6
    # notreatadapter = iopro.text_adapter('/n/regal/chetty_lab/lsandor/.sweden/T/data/controls/notreat.csv', parser='csv')
    # X = notreatadapter[1:][:]  # [0:n]
    # y = notreatadapter[0][:]  # [0:n]
    # notreatdata = pd.read_stata('/n/regal/chetty_lab/lsandor/.sweden/T/data/controls/notreat.dta')
    # notreatdata = notreatdata.iloc[:10000, :]
    # X = notreatdata.iloc[:, 1:]
    # y = notreatdata.iloc[:, 0]
    X = pd.read_csv('/n/regal/chetty_lab/lsandor/.sweden/T/data/controls/notreat.csv')  # , nrows=833333)
    y = X.iloc[:, 0].values
    # alternative: y = pd.read_csv('/n/regal/chetty_lab/lsandor/.sweden/T/data/controls/absnt.csv', parser='csv', usecols=[0])
    X = X.iloc[:, 1:].values
    n = y.shape[0]
    print("Data loaded.")
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.4, random_state=0)
    print("Data split.")
    scaler = StandardScaler()
    scaler.fit(X_train)  # Don't cheat - fit only on training data
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)  # apply same transformation to test data
    print("Data scaled.")
    # build a model
    joblib.dump(X_train, '/n/regal/chetty_lab/lsandor/.sweden/T/data/controls/notreatX')
    joblib.dump(y_train, '/n/regal/chetty_lab/lsandor/.sweden/T/data/controls/notreaty')
    print("Data dumped.")
    X_train = joblib.load('/n/regal/chetty_lab/lsandor/.sweden/T/data/controls/notreatX', mmap_mode='r+')
    y_train = joblib.load('/n/regal/chetty_lab/lsandor/.sweden/T/data/controls/notreaty', mmap_mode='r+')
    print("Data reloaded and mmapped.")
    # model = SGDClassifier(penalty='elasticnet', n_iter=np.ceil(10**6 / float(n)), shuffle=True)
    print("CV starts.")
    en = ElasticNetCV(l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1], n_jobs=-1)
    en.fit(X_train, y_train)
    print("Best alpha:")
    print(en.alpha_)
    print("Best l1-ratio:")
    print(en.l1_ratio_)
    print("Coefficients:")
    print(en.coef_)
    # evaluate
    y_pred = en.predict(X_test)
    test_score = r2_score(y_test, y_pred)
    print("Test estimator has R^2 %2.2f in the test sample." % test_score)

if __name__ == '__main__':
    mp.freeze_support()
    main()
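One workaround I may try (a sketch, assuming the failure comes from each
fork()ed worker being charged a full copy of the parent's address space):
free the large in-memory arrays just before en.fit(), since only the
memmapped copies are needed there.

# Sketch of a change inside main(), placed just before en.fit().
# Shrinking the parent before joblib spawns the pool should reduce
# what every fork() has to account for. X_test/y_test would have to
# be dumped and reloaded as memmaps too before predict() (not shown).
import gc
del X, y, X_test   # drop the in-memory arrays; fit() only needs the
                   # memmapped X_train/y_train loaded above
gc.collect()       # ask the allocator to release what it can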
*** Error:
Traceback (most recent call last):
  File "/n/chetty/Users/lsandor/sweden/.code/controls/lasso/scikit_notreat_predictors_en.py", line 67, in <module>
    main()
  File "/n/chetty/Users/lsandor/sweden/.code/controls/lasso/scikit_notreat_predictors_en.py", line 50, in main
    en.fit(X_train, y_train)
  File "/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/site-packages/sklearn/linear_model/coordinate_descent.py", line 1101, in fit
    mse_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
  File "/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 604, in __call__
    self._pool = MemmapingPool(n_jobs, **poolargs)
  File "/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 559, in __init__
    super(MemmapingPool, self).__init__(**poolargs)
  File "/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 400, in __init__
    super(PicklingPool, self).__init__(**poolargs)
  File "/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/multiprocessing/pool.py", line 159, in __init__
    self._repopulate_pool()
  File "/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/multiprocessing/pool.py", line 223, in _repopulate_pool
    w.start()
  File "/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/multiprocessing/process.py", line 130, in start
    self._popen = Popen(self)
  File "/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/multiprocessing/forking.py", line 121, in __init__
    self.pid = os.fork()
OSError: [Errno 12] Cannot allocate memory
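For what it's worth, the numbers above are consistent with strict overcommit
accounting (vm.overcommit_memory = 2), where every os.fork() must be able to
commit another copy of the parent's address space even though copy-on-write
means it is never actually touched. A back-of-the-envelope check:

parent_kb = 120035176                  # peak usage reported above
per_fork_gb = parent_kb / 1024.0**2    # what each fork() is charged
print(per_fork_gb)                     # ~114.5 GB
print(per_fork_gb * 3)                 # parent + 2 workers: ~343 GB > 240 GB

With n_jobs=-1 spawning one worker per core, already the second or third
fork() would exceed the 240 GB limit and fail with ENOMEM.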
*** Conda:
Current conda install:
platform : linux-64
conda version : 3.4.1
python version : 2.7.6.final.0
root environment : /n/sw/fasrcsw/apps/Core/Anaconda/1.9.2-fasrc01/x (read only)
default environment : /n/chetty/Users/lsandor/envs/laszlo
envs directories : /n/chetty/Users/lsandor/envs
/n/sw/fasrcsw/apps/Core/Anaconda/1.9.2-fasrc01/x/envs
package cache : /n/chetty/Users/lsandor/envs/.pkgs
/n/sw/fasrcsw/apps/Core/Anaconda/1.9.2-fasrc01/x/pkgs
channel URLs : http://repo.continuum.io/pkgs/free/linux-64/
http://repo.continuum.io/pkgs/pro/linux-64/
config file : None
is foreign system : False
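For completeness, the exact library versions (which conda info does not
list) could be checked with a small snippet run in the same environment:

import sys
import numpy
import sklearn
print(sys.version)          # Python build
print(numpy.__version__)    # NumPy release
print(sklearn.__version__)  # scikit-learn release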