Hello,
I posted this question in the Google groups but it does not seem to attract any attention. So I am posting this here. If this is not correct, please tell me.
I have taken some Scikit source code that used the standard grid search and adapted it to using a
pipe with the use of the SFS. I use the "seuclidean" metric with the ball-tree algorithm that requires a metric parameter - a variance vector. When I execute the Scikit standard code I have no problem. However with the SFS in a Pipeline I have two errors:
- If I do not provide the metric's parameters I get the following (see stack trace 1):
TypeError: __init__() takes exactly 1 positional argument (0 given)
- If I provide the parameter I get (see stack trace 2):
ValueError: SEuclidean dist: size of V does not match
Error 2 is understandable - because SFS does feature selection, I cannot pre-calculate this value. It depends on the features used. I was expecting the metric parameters to be automatically calculated and therefore not require this input. I also tried to pass None
as the parameter, but with no success.
Can anyone shed light on how I should proceed? I have added my code below in case this helps
(data sets managed with Pandas).
TIA,
Hugo
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
# NOTE(review): `dy` is a pandas DataFrame defined elsewhere (data sets are
# "managed with Pandas" per the question text); it is not created in this script.
# Split features (everything except 'label') from the target column.
# get the unnormalized data
X = dy[ dy.columns.difference(['label']).values ]
y = dy['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# Per-feature variance over the FULL training feature set.  This is the root of
# error 2: SFS refits the estimator on feature SUBSETS, so a V of length
# n_features cannot match the subset dimensionality seen by BallTree/seuclidean.
V = X_train.var().values
# Covariance and its (pseudo-)inverses — presumably precomputed for the
# 'mahalanobis' metric variants; unused in the seuclidean grid below.
C = X_train.cov().values
CPI = np.linalg.pinv(C)
CI = np.linalg.inv(C)
# k_range : must be less than the training size. What happens if number of features > sample size
k_range = range(1, len(X.columns))
weights = ['uniform' , 'distance']
#algos_all = ['auto', 'ball_tree', 'kd_tree', 'brute']
algos_all = ['ball_tree', 'kd_tree', 'brute']
algos = ['brute', 'kd_tree']
leaf_sizes = range(5, 60, 10)
metrics = ["euclidean", "manhattan", "chebyshev", "minkowski"]
# Metric can only be used with certain algorithms
# Metrics intended for real-valued vector spaces:
# Grid-search parameter dictionary.  Keys are routed through the pipeline:
# 'sfs__estimator__X' sets attribute X on the KNeighborsClassifier wrapped
# inside the SFS step.
seuclidean = {
'sfs__k_features' : list(range(1,len(X.columns))),
'sfs__estimator__metric' : ['seuclidean'],
# Without this key sklearn passes no 'V' at all and SEuclideanDistance.__init__
# raises TypeError (stack trace 1); with it, len(V) == total feature count
# mismatches the SFS-selected subset and raises ValueError (stack trace 2).
'sfs__estimator__metric_params': [ {'V':V} ], # will be automatically calculated
'sfs__estimator__algorithm' : ['ball_tree'], # TODO , ['brute', 'ball_tree'],
'sfs__estimator__n_neighbors' : list(k_range),
'sfs__estimator__weights' : weights,
'sfs__estimator__leaf_size' : list(leaf_sizes) }
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import mlxtend
# Instantiate the algorithm
knn = KNeighborsClassifier(n_neighbors=10)
#print(knn.get_params().keys())
# Forward (non-floating) sequential feature selection wrapping the KNN;
# the grid search overrides k_features and the estimator's params above.
sfs1 = SFS(estimator=knn,
k_features=3,
forward=True,
floating=False,
scoring='accuracy',
print_progress=False,
cv=5)
# !?!? n_jobs=-1)
# Pipeline: scale to [0,1], select features, then classify.  NOTE(review):
# the same `knn` object is shared by the 'sfs' step and the final 'knn' step.
pipe = Pipeline([
('standardize', preprocessing.MinMaxScaler()),
('sfs', sfs1),
('knn', knn)])
# See KNeighborsClassifier equivalent param_grid
param_grid = [
seuclidean
]
# Instantiate the grid search
gs = GridSearchCV(estimator=pipe,
param_grid=param_grid,
scoring='accuracy',
#n_jobs=-1, for better stack tracing
cv=5,
verbose=1,
refit=True)
# Run the grid search
gs = gs.fit(X_train.values, y_train)
Stack Trace 1
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
TypeError Traceback (most recent call last)
<ipython-input-68-4ef553dad211> in <module>()
167
168 # Run the grid search
--> 169 gs = gs.fit(X_train.values, y_train)
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/grid_search.py in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/grid_search.py in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
162 the pipeline.
163 """
--> 164 Xt, fit_params = self._pre_transform(X, y, **fit_params)
165 self.steps[-1][-1].fit(Xt, y, **fit_params)
166 return self
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/pipeline.py in _pre_transform(self, X, y, **fit_params)
143 for name, transform in self.steps[:-1]:
144 if hasattr(transform, "fit_transform"):
--> 145 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
146 else:
147 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/home/hmf/my_py3/lib/python3.4/site-packages/mlxtend/feature_selection/sequential_feature_selector.py in fit_transform(self, X, y)
239
240 def fit_transform(self, X, y):
--> 241 self.fit(X, y)
242 return self.transform(X)
243
/home/hmf/my_py3/lib/python3.4/site-packages/mlxtend/feature_selection/sequential_feature_selector.py in fit(self, X, y)
136 self._inclusion(orig_set=orig_set,
137 subset=prev_subset,
--> 138 X=X, y=y)
139 else:
140 k_idx, k_score, cv_scores = \
/home/hmf/my_py3/lib/python3.4/site-packages/mlxtend/feature_selection/sequential_feature_selector.py in _inclusion(self, orig_set, subset, X, y)
205 for feature in remaining:
206 new_subset = tuple(subset | {feature})
--> 207 cv_scores = self._calc_score(X, y, new_subset)
208 all_avg_scores.append(cv_scores.mean())
209 all_cv_scores.append(cv_scores)
/home/hmf/my_py3/lib/python3.4/site-packages/mlxtend/feature_selection/sequential_feature_selector.py in _calc_score(self, X, y, indices)
190 scoring=self.scorer,
191 n_jobs=self.n_jobs,
--> 192 pre_dispatch=self.pre_dispatch)
193 else:
194 self.est_.fit(X[:, indices], y)
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
1431 train, test, verbose, None,
1432 fit_params)
-> 1433 for train, test in cv)
1434 return np.array(scores)[:, 0]
1435
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/neighbors/base.py in fit(self, X, y)
801 self._y = self._y.ravel()
802
--> 803 return self._fit(X)
804
805
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/neighbors/base.py in _fit(self, X)
256 self._tree = BallTree(X, self.leaf_size,
257 metric=self.effective_metric_,
--> 258 **self.effective_metric_params_)
259 elif self._fit_method == 'kd_tree':
260 self._tree = KDTree(X, self.leaf_size,
sklearn/neighbors/binary_tree.pxi in sklearn.neighbors.ball_tree.BinaryTree.__init__ (sklearn/neighbors/ball_tree.c:8381)()
sklearn/neighbors/dist_metrics.pyx in sklearn.neighbors.dist_metrics.DistanceMetric.get_metric (sklearn/neighbors/dist_metrics.c:4330)()
sklearn/neighbors/dist_metrics.pyx in sklearn.neighbors.dist_metrics.SEuclideanDistance.__init__ (sklearn/neighbors/dist_metrics.c:5888)()
TypeError: __init__() takes exactly 1 positional argument (0 given)
Stack Trace 2
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
ValueError Traceback (most recent call last)
<ipython-input-69-558dd50887b6> in <module>()
167
168 # Run the grid search
--> 169 gs = gs.fit(X_train.values, y_train)
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/grid_search.py in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/grid_search.py in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
162 the pipeline.
163 """
--> 164 Xt, fit_params = self._pre_transform(X, y, **fit_params)
165 self.steps[-1][-1].fit(Xt, y, **fit_params)
166 return self
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/pipeline.py in _pre_transform(self, X, y, **fit_params)
143 for name, transform in self.steps[:-1]:
144 if hasattr(transform, "fit_transform"):
--> 145 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
146 else:
147 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/home/hmf/my_py3/lib/python3.4/site-packages/mlxtend/feature_selection/sequential_feature_selector.py in fit_transform(self, X, y)
239
240 def fit_transform(self, X, y):
--> 241 self.fit(X, y)
242 return self.transform(X)
243
/home/hmf/my_py3/lib/python3.4/site-packages/mlxtend/feature_selection/sequential_feature_selector.py in fit(self, X, y)
136 self._inclusion(orig_set=orig_set,
137 subset=prev_subset,
--> 138 X=X, y=y)
139 else:
140 k_idx, k_score, cv_scores = \
/home/hmf/my_py3/lib/python3.4/site-packages/mlxtend/feature_selection/sequential_feature_selector.py in _inclusion(self, orig_set, subset, X, y)
205 for feature in remaining:
206 new_subset = tuple(subset | {feature})
--> 207 cv_scores = self._calc_score(X, y, new_subset)
208 all_avg_scores.append(cv_scores.mean())
209 all_cv_scores.append(cv_scores)
/home/hmf/my_py3/lib/python3.4/site-packages/mlxtend/feature_selection/sequential_feature_selector.py in _calc_score(self, X, y, indices)
190 scoring=self.scorer,
191 n_jobs=self.n_jobs,
--> 192 pre_dispatch=self.pre_dispatch)
193 else:
194 self.est_.fit(X[:, indices], y)
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
1431 train, test, verbose, None,
1432 fit_params)
-> 1433 for train, test in cv)
1434 return np.array(scores)[:, 0]
1435
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/neighbors/base.py in fit(self, X, y)
801 self._y = self._y.ravel()
802
--> 803 return self._fit(X)
804
805
/home/hmf/my_py3/lib/python3.4/site-packages/sklearn/neighbors/base.py in _fit(self, X)
256 self._tree = BallTree(X, self.leaf_size,
257 metric=self.effective_metric_,
--> 258 **self.effective_metric_params_)
259 elif self._fit_method == 'kd_tree':
260 self._tree = KDTree(X, self.leaf_size,
sklearn/neighbors/binary_tree.pxi in sklearn.neighbors.ball_tree.BinaryTree.__init__ (sklearn/neighbors/ball_tree.c:8793)()
sklearn/neighbors/binary_tree.pxi in sklearn.neighbors.ball_tree.BinaryTree._recursive_build (sklearn/neighbors/ball_tree.c:10053)()
sklearn/neighbors/ball_tree.pyx in sklearn.neighbors.ball_tree.init_node (sklearn/neighbors/ball_tree.c:20030)()
sklearn/neighbors/binary_tree.pxi in sklearn.neighbors.ball_tree.BinaryTree.rdist (sklearn/neighbors/ball_tree.c:9932)()
sklearn/neighbors/dist_metrics.pyx in sklearn.neighbors.dist_metrics.SEuclideanDistance.rdist (sklearn/neighbors/dist_metrics.c:6065)()
ValueError: SEuclidean dist: size of V does not match