Hi Andy, Thanks for your reply. The full traceback is below, weights.shape and the training data shape are:
(773,) (773, 82) I was using an ExtraTreeClassifier but the same thing happens with an SVC. It doesn't seem to be an estimator-specific issue. """ Traceback (most recent call last): File "/Users/jgcdesouza/Desktop/script.py", line 228, in <module> main() File "/Users/jgcdesouza/Desktop/script.py", line 195, in main search.fit(X_train, y_train) File "/Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 898, in fit return self._fit(X, y, sampled_params) File "/Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 505, in _fit for parameters in parameter_iterable File "/Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 666, in __call__ self.retrieve() File "/Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 549, in retrieve raise exception_type(report) sklearn.externals.joblib.my_exceptions.JoblibValueError: JoblibValueError ___________________________________________________________________________ Multiprocessing exception: ........................................................................... /Users/jgcdesouza/Desktop/script.py in <module>() 223 wwc = calculate_weighting(ywc_train) 224 # In[ ]: 225 226 227 if __name__ == "__main__": --> 228 main() 229 230 231 232 ........................................................................... /Users/jgcdesouza/Desktop/script.py in main() 190 print weights.shape 191 print X_train.shape 192 search = RandomizedSearchCV(svc_pipe, svc_param_dist, n_iter=n_iter, scoring="accuracy", 193 n_jobs=-1, iid=True, cv=5, refit=True, verbose=1, random_state=seed, 194 fit_params={"sample_weight": weights}) --> 195 search.fit(X_train, y_train) search.fit = <bound method RandomizedSearchCV.fit of Randomiz...t=True, scoring='accuracy', verbose=1)> X_train = array([[ 20. , 20. , 1. , .... 1. , 4. , 4. 
]]) y_train = array([ 1., 1., 1., 0., 0., 0., 0., 0., ..., 0., 1., 1., 0., 0., 1., 1., 0.]) 196 print "CPS ET _79ner best params: ", search.best_params_ 197 cps_et_preds_79ner = search.predict(Xcps_test) 198 cps_et_acc_79ner = accuracy_score(ycps_test, cps_et_preds_79ner) 199 ........................................................................... /Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py in fit(self=RandomizedSearchCV(cv=5, error_score='raise', ...it=True, scoring='accuracy', verbose=1), X=array([[ 20. , 20. , 1. , .... 1. , 4. , 4. ]]), y=array([ 1., 1., 1., 0., 0., 0., 0., 0., ..., 0., 1., 1., 0., 0., 1., 1., 0.])) 893 894 """ 895 sampled_params = ParameterSampler(self.param_distributions, 896 self.n_iter, 897 random_state=self.random_state) --> 898 return self._fit(X, y, sampled_params) self._fit = <bound method RandomizedSearchCV._fit of Randomi...t=True, scoring='accuracy', verbose=1)> X = array([[ 20. , 20. , 1. , .... 1. , 4. , 4. ]]) y = array([ 1., 1., 1., 0., 0., 0., 0., 0., ..., 0., 1., 1., 0., 0., 1., 1., 0.]) sampled_params = <sklearn.grid_search.ParameterSampler object> 899 900 901 902 ........................................................................... /Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py in _fit(self=RandomizedSearchCV(cv=5, error_score='raise', ...it=True, scoring='accuracy', verbose=1), X=array([[ 20. , 20. , 1. , .... 1. , 4. , 4. 
]]), y=array([ 1., 1., 1., 0., 0., 0., 0., 0., ..., 0., 1., 1., 0., 0., 1., 1., 0.]), parameter_iterable=<sklearn.grid_search.ParameterSampler object>) 500 )( 501 delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, 502 train, test, self.verbose, parameters, 503 self.fit_params, return_parameters=True, 504 error_score=self.error_score) --> 505 for parameters in parameter_iterable parameters = undefined parameter_iterable = <sklearn.grid_search.ParameterSampler object> 506 for train, test in cv) 507 508 # Out is a list of triplet: score, estimator, n_test_samples 509 n_fits = len(out) ........................................................................... /Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<itertools.islice object>) 661 if pre_dispatch == "all" or n_jobs == 1: 662 # The iterable was consumed all at once by the above for loop. 663 # No need to wait for async callbacks to trigger to 664 # consumption. 665 self._iterating = False --> 666 self.retrieve() self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)> 667 # Make sure that we get a last message telling us we are done 668 elapsed_time = time.time() - self._start_time 669 self._print('Done %3i out of %3i | elapsed: %s finished', 670 (len(self._output), --------------------------------------------------------------------------- Sub-process traceback: --------------------------------------------------------------------------- ValueError Tue Jun 9 10:43:22 2015 PID: 4756 Python 2.7.10: /Users/jgcdesouza/anaconda/bin/python ........................................................................... /Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator=Pipeline(steps=[('standardscaler', StandardScale...one, shrinking=True, tol=0.001, verbose=False))]), X=array([[ 20. , 20. , 1. , .... 1. , 4. , 4. 
]]), y=array([ 1., 1., 1., 0., 0., 0., 0., 0., ..., 0., 1., 1., 0., 0., 1., 1., 0.]), scorer=make_scorer(accuracy_score), train=array([149, 151, 153, 154, 155, 156, 160, 162, 1..., 765, 766, 767, 768, 769, 770, 771, 772]), test=array([ 0, 1, 2, 3, 4, 5, 6, 7, ...45, 146, 147, 148, 150, 152, 157, 158, 159, 161]), verbose=1, parameters={'svc__C': 0.033262207360466804}, fit_params={'sample_weight': array([ 0.54980595, 0.54980595, 0.54980595, 0...5, 0.54980595, 0.54980595, 0.45019405])}, return_train_score=False, return_parameters=True, error_score='raise') 1454 1455 try: 1456 if y_train is None: 1457 estimator.fit(X_train, **fit_params) 1458 else: -> 1459 estimator.fit(X_train, y_train, **fit_params) 1460 1461 except Exception as e: 1462 if error_score == 'raise': 1463 raise ........................................................................... /Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self=Pipeline(steps=[('standardscaler', StandardScale...one, shrinking=True, tol=0.001, verbose=False))]), X=array([[ 16. , 16. , 1. , .... 1. , 4. , 4. ]]), y=array([ 1., 1., 1., 1., 1., 1., 1., 0., ..., 0., 1., 1., 0., 0., 1., 1., 0.]), **fit_params={'sample_weight': array([ 0.54980595, 0.54980595, 0.54980595, 0...5, 0.54980595, 0.54980595, 0.45019405])}) 135 pipeline. 136 y : iterable, default=None 137 Training targets. Must fulfill label requirements for all steps of 138 the pipeline. 139 """ --> 140 Xt, fit_params = self._pre_transform(X, y, **fit_params) 141 self.steps[-1][-1].fit(Xt, y, **fit_params) 142 return self 143 144 def fit_transform(self, X, y=None, **fit_params): ........................................................................... /Users/jgcdesouza/anaconda/lib/python2.7/site-packages/sklearn/pipeline.pyc in _pre_transform(self=Pipeline(steps=[('standardscaler', StandardScale...one, shrinking=True, tol=0.001, verbose=False))]), X=array([[ 16. , 16. , 1. , .... 1. , 4. , 4. 
]]), y=array([ 1., 1., 1., 1., 1., 1., 1., 0., ..., 0., 1., 1., 0., 0., 1., 1., 0.]), **fit_params={'sample_weight': array([ 0.54980595, 0.54980595, 0.54980595, 0...5, 0.54980595, 0.54980595, 0.45019405])}) 111 # Estimator interface 112 113 def _pre_transform(self, X, y=None, **fit_params): 114 fit_params_steps = dict((step, {}) for step, _ in self.steps) 115 for pname, pval in six.iteritems(fit_params): --> 116 step, param = pname.split('__', 1) 117 fit_params_steps[step][param] = pval 118 Xt = X 119 for name, transform in self.steps[:-1]: 120 if hasattr(transform, "fit_transform"): ValueError: need more than 1 value to unpack ___________________________________________________________________________ Process finished with exit code 1 """ José Guilherme On Mon, Jun 8, 2015 at 6:21 PM, Andy <t3k...@gmail.com> wrote: > Hi Jose. > That should work. > Can you provide the full traceback? > Also can you provide weights.shape? > > Andy > > On 06/08/2015 08:49 PM, José Guilherme Camargo de Souza wrote: >> Hi all, >> >> I am having a different issue when trying to use sample_weights with >> RandomizedSearchCV: >> >> weights = np.array(calculate_weighting(y_train)) >> search = RandomizedSearchCV(estimator, param_dist, n_iter=n_iter, >> scoring="accuracy", >> n_jobs=-1, iid=True, cv=5, >> refit=True, verbose=1, random_state=seed, >> fit_params={"sample_weight": >> weights}) >> >> search.fit(X_train, y_train) >> >> where weights has the same number of instances in X_train. >> I get the following error: >> >> ValueError: need more than 1 value to unpack >> >> I am using scikit-learn 0.16.1, therefore a more recent version than >> 0.15b. Was there some sort of change in the behavior of fit_params >> from 0.15b to 0.16.1? >> >> What is the current recommended way to pass the sample_weights vector >> to a *SearchCV object, if any? >> >> Thanks! >> José >> >> >> On Tue, Jul 8, 2014 at 9:33 AM, Hamed Zamani <hamedzam...@acm.org> wrote: >>> Dear Joel, >>> >>> Yes. 
After updating the version of Scikit-learn to 0.15b2 the problem was >>> solved. >>> >>> Thanks, >>> Hamed >>> >>> >>> >>> On Tue, Jul 8, 2014 at 2:51 PM, Joel Nothman <joel.noth...@gmail.com> wrote: >>>> This shouldn't be the case, though it's not altogether well-documented. >>>> According to >>>> https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/cross_validation.py#L1225, >>>> if the fit_params value has the same length as the samples, it should be >>>> similarly indexed. >>>> >>>> So this would be a bug ... if it is found at master. I'm guessing, Hamed, >>>> that you are using scikit-learn version 0.14? Please check this works with >>>> the latest 0.15b. >>>> >>>> However, fit_params will not account for the weights in the scoring >>>> function. Noel has solved this; pending some more tests, this should >>>> hopefully be merged, including support for RandomizedSearchCV(..., >>>> sample_weight=weights_array) soon. (The work seems to have stalled a >>>> little. >>>> If someone wants to see this feature included quickly, perhaps Noel would >>>> be >>>> willing for someone else to finish this PR for him.) >>>> >>>> - Joel >>>> >>>> >>>> On 8 July 2014 07:49, Kyle Kastner <kastnerk...@gmail.com> wrote: >>>>> It looks like fit_params are passed wholesale to the classifier being fit >>>>> - this means the sample weights will be a different size than the fold of >>>>> (X, y) fed to the classifier (since the weights aren't getting >>>>> KFolded...). >>>>> Unfortunately I do not see a way to accommodate for this currently - >>>>> sample_weights may be a special case where we would need to introspect the >>>>> fit_params and modify them before passing to the underlying classifier... >>>>> can you file a bug report on github? >>>>> >>>>> >>>>> On Tue, Jul 8, 2014 at 1:27 PM, Hamed Zamani <hamedzam...@acm.org> wrote: >>>>>> Dear all, >>>>>> >>>>>> I am using Scikit-Learn library and I want to weight all training >>>>>> samples (according to unbalanced data). 
According to the tutorial and >>>>>> what I >>>>>> found on the web, I should use this method: >>>>>> >>>>>> search = RandomizedSearchCV(estimator, param_distributions, >>>>>> n_iter=args.iterations, scoring=mae_scorer,n_jobs=1, refit=True, >>>>>> cv=KFold(X_train.shape[0], 10, shuffle=True, random_state=args.seed), >>>>>> verbose=1, random_state=args.seed, fit_params={'sample_weight': >>>>>> weights_array}) >>>>>> >>>>>> search.fit(X_trains, y_train) >>>>>> >>>>>> where "weights_array" is an array containing the weight of each training >>>>>> sample. After running the code, I was stopped with the following >>>>>> exception: >>>>>> >>>>>> ValueError: operands could not be broadcast together with shapes (1118,) >>>>>> (1006,) (1118,) >>>>>> >>>>>> It is worth noting that the size of "X_trains", "y_train", and >>>>>> "weights_array" are equal to 1118. >>>>>> >>>>>> When I changed the number of folds from 10 to 2, the exception was >>>>>> changed to this one: >>>>>> >>>>>> ValueError: operands could not be broadcast together with shapes (1118,) >>>>>> (559,) (1118,) >>>>>> >>>>>> Do you know what is the problem? I guess the problem is with "KFold" >>>>>> method. Any idea is appreciated. 
>>>>>> >>>>>> Kind Regards, >>>>>> Hamed >>>>>> >>>>>> >>>>>> >>>>>> ------------------------------------------------------------------------------ >>>>>> Open source business process management suite built on Java and Eclipse >>>>>> Turn processes into business applications with Bonita BPM Community >>>>>> Edition >>>>>> Quickly connect people, data, and systems into organized workflows >>>>>> Winner of BOSSIE, CODIE, OW2 and Gartner awards >>>>>> http://p.sf.net/sfu/Bonitasoft >>>>>> _______________________________________________ >>>>>> Scikit-learn-general mailing list >>>>>> Scikit-learn-general@lists.sourceforge.net >>>>>> https://lists.sourceforge.net/lists/listinfo/scikit-learn-general >>>>>> >>>>> >>>>> >>>>> ------------------------------------------------------------------------------ >>>>> Open source business process management suite built on Java and Eclipse >>>>> Turn processes into business applications with Bonita BPM Community >>>>> Edition >>>>> Quickly connect people, data, and systems into organized workflows >>>>> Winner of BOSSIE, CODIE, OW2 and Gartner awards >>>>> http://p.sf.net/sfu/Bonitasoft >>>>> _______________________________________________ >>>>> Scikit-learn-general mailing list >>>>> Scikit-learn-general@lists.sourceforge.net >>>>> https://lists.sourceforge.net/lists/listinfo/scikit-learn-general >>>>> >>>> >>>> >>>> ------------------------------------------------------------------------------ >>>> Open source business process management suite built on Java and Eclipse >>>> Turn processes into business applications with Bonita BPM Community >>>> Edition >>>> Quickly connect people, data, and systems into organized workflows >>>> Winner of BOSSIE, CODIE, OW2 and Gartner awards >>>> http://p.sf.net/sfu/Bonitasoft >>>> _______________________________________________ >>>> Scikit-learn-general mailing list >>>> Scikit-learn-general@lists.sourceforge.net >>>> https://lists.sourceforge.net/lists/listinfo/scikit-learn-general >>>> >>> >>> 
------------------------------------------------------------------------------ >>> Open source business process management suite built on Java and Eclipse >>> Turn processes into business applications with Bonita BPM Community Edition >>> Quickly connect people, data, and systems into organized workflows >>> Winner of BOSSIE, CODIE, OW2 and Gartner awards >>> http://p.sf.net/sfu/Bonitasoft >>> _______________________________________________ >>> Scikit-learn-general mailing list >>> Scikit-learn-general@lists.sourceforge.net >>> https://lists.sourceforge.net/lists/listinfo/scikit-learn-general >>> >> ------------------------------------------------------------------------------ >> _______________________________________________ >> Scikit-learn-general mailing list >> Scikit-learn-general@lists.sourceforge.net >> https://lists.sourceforge.net/lists/listinfo/scikit-learn-general > > > ------------------------------------------------------------------------------ > _______________________________________________ > Scikit-learn-general mailing list > Scikit-learn-general@lists.sourceforge.net > https://lists.sourceforge.net/lists/listinfo/scikit-learn-general ------------------------------------------------------------------------------ _______________________________________________ Scikit-learn-general mailing list Scikit-learn-general@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/scikit-learn-general