Hi,

The OS denied me memory when I ran cross-validation (CV) in the script
below. I am still investigating whether it was a mistake of the scheduler
on the server, but I believe the process had access to 240 GB of memory,
yet it reproducibly crashes after using 120035176K, with the error message
below. I have pasted my conda info output at the very end.


Could it be an sklearn/NumPy/Python issue?


Thanks!


Laszlo




******* Script:

#import scipy as sp

import numpy as np

import pandas as pd

import multiprocessing as mp

# import iopro

from sklearn import grid_search

from sklearn import cross_validation

from sklearn.preprocessing import StandardScaler

# from sklearn.linear_model import SGDClassifier

from sklearn.linear_model import ElasticNetCV

from sklearn.externals import joblib

def main():
    """Fit an ElasticNetCV model on the 'notreat' data and report the result.

    Steps:
      1. Read the CSV; column 0 is used as the target, the remaining
         columns as features.
      2. Hold out 40% of the rows as a test set (random_state=0).
      3. Standardize features with a scaler fitted on the training split
         only, then apply it to both splits.
      4. Dump the training arrays with joblib and reload them with
         mmap_mode='r+'.
      5. Run ElasticNetCV over a grid of l1_ratio values with n_jobs=-1.
      6. Print the selected alpha, l1_ratio, the coefficients and the
         R^2 on the held-out test set.
    """
    # The module-level imports never bring in r2_score, so import it here;
    # otherwise the evaluation step fails with a NameError after the fit.
    from sklearn.metrics import r2_score

    data_dir = '/n/regal/chetty_lab/lsandor/.sweden/T/data/controls/'

    print("Started.")

    # Earlier loading variants, kept for reference:
    # n = 10**6
    # notreatadapter = iopro.text_adapter(data_dir + 'notreat.csv',
    #                                     parser='csv')
    # X = notreatadapter[1:][:] #[0:n]
    # y = notreatadapter[0][:] #[0:n]
    # notreatdata = pd.read_stata(data_dir + 'notreat.dta')
    # notreatdata = notreatdata.iloc[:10000,:]
    # X = notreatdata.iloc[:,1:]
    # y = notreatdata.iloc[:,0]

    X = pd.read_csv(data_dir + 'notreat.csv')  # ,nrows=833333)
    # Alternative target source:
    # pd.read_csv(data_dir + 'absnt.csv', parser='csv', usecols=[0])
    y = X.iloc[:, 0].values
    X = X.iloc[:, 1:].values
    # Only referenced by the commented-out SGDClassifier variant below.
    n = y.shape[0]
    print("Data lodaded.")

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.4, random_state=0)
    print("Data split.")

    scaler = StandardScaler()
    scaler.fit(X_train)  # Don't cheat - fit only on training data
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)  # apply same transformation to test data
    print("Data scaled.")

    # build a model
    # Write the training arrays to disk and reload them memory-mapped.
    joblib.dump(X_train, data_dir + 'notreatX')
    joblib.dump(y_train, data_dir + 'notreaty')
    print("Data dumped.")

    X_train = joblib.load(data_dir + 'notreatX', mmap_mode='r+')
    y_train = joblib.load(data_dir + 'notreaty', mmap_mode='r+')
    print("Data reloaded and mmapped.")

    # model = SGDClassifier(penalty='elasticnet',
    #                       n_iter = np.ceil(10**6 / float(n)), shuffle=True)

    print("CV starts.")
    # n_jobs=-1 asks for all available CPUs to be used for the CV fits.
    en = ElasticNetCV(l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1],
                      n_jobs=-1)
    en.fit(X_train, y_train)

    print("Best for alphas:")
    print(en.alpha_)
    print("Best l1-ratio:")
    print(en.l1_ratio_)
    print("Coefficients:")
    print(en.coef_)

    # evaluate
    y_pred = en.predict(X_test)
    test_score = r2_score(y_test, y_pred)
    # Interpolate with % so the score is formatted into the message instead
    # of being printed as a separate argument next to the raw format string.
    print("Test estimator has R^2 %2.2f in the test sample." % test_score)

if __name__ == '__main__':
    # Script entry point. freeze_support() is called before any worker
    # processes are created; per the multiprocessing docs it only has an
    # effect in frozen Windows executables.
    mp.freeze_support()
    main()




*** Error:

Traceback (most recent call last):

  File
"/n/chetty/Users/lsandor/sweden/.code/controls/lasso/scikit_notreat_predictors_en.py",
line 67, in <module>

    main()

  File
"/n/chetty/Users/lsandor/sweden/.code/controls/lasso/scikit_notreat_predictors_en.py",
line 50, in main

    en.fit(X_train, y_train)

  File
"/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/site-packages/sklearn/linear_model/coordinate_descent.py",
line 1101, in fit

    mse_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)

  File
"/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py",
line 604, in __call__

    self._pool = MemmapingPool(n_jobs, **poolargs)

  File
"/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py",
line 559, in __init__

    super(MemmapingPool, self).__init__(**poolargs)

  File
"/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py",
line 400, in __init__

    super(PicklingPool, self).__init__(**poolargs)

  File
"/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/multiprocessing/pool.py",
line 159, in __init__

    self._repopulate_pool()

  File
"/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/multiprocessing/pool.py",
line 223, in _repopulate_pool

    w.start()

  File
"/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/multiprocessing/process.py",
line 130, in start

    self._popen = Popen(self)

  File
"/n/chetty/Users/lsandor/envs/laszlo/lib/python2.7/multiprocessing/forking.py",
line 121, in __init__

    self.pid = os.fork()

OSError: [Errno 12] Cannot allocate memory



*** Conda:

Current conda install:

             platform : linux-64

        conda version : 3.4.1

       python version : 2.7.6.final.0

     root environment : /n/sw/fasrcsw/apps/Core/Anaconda/1.9.2-fasrc01/x
 (read only)

  default environment : /n/chetty/Users/lsandor/envs/laszlo

     envs directories : /n/chetty/Users/lsandor/envs


/n/sw/fasrcsw/apps/Core/Anaconda/1.9.2-fasrc01/x/envs

        package cache : /n/chetty/Users/lsandor/envs/.pkgs


/n/sw/fasrcsw/apps/Core/Anaconda/1.9.2-fasrc01/x/pkgs

         channel URLs : http://repo.continuum.io/pkgs/free/linux-64/

                        http://repo.continuum.io/pkgs/pro/linux-64/


          config file : None


    is foreign system : False
------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
Scikit-learn-general mailing list
Scikit-learn-general@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/scikit-learn-general

Reply via email to