我最近正在使用管道学习 gridsearchCV。但是,我很确定我的管道模型没有问题,如下所示。
from sklearn.pipeline import Pipeline, FeatureUnion
from Transformers import TextTransformer, ColumnExtractor

# One TF-IDF transformer per text column, each capped at 300 features.
a = TextTransformer('description', max_features=300)
b = TextTransformer('features', max_features=300)
c = TextTransformer('street_address', max_features = 300)
d = TextTransformer('display_address', max_features = 300)

# FeatureUnion concatenates the (currently single) column transformer's
# output; the result feeds the SVC classifier.
# NOTE(review): SVC is used here but not imported in this snippet —
# presumably `from sklearn.svm import SVC` exists elsewhere; confirm.
pipeline = Pipeline([
    ('test', FeatureUnion
    ([
        ('description', a ), # can pass in either a pipeline
        #('features', b),
        #('street', c),
        #('display', d),
        #('lat_long', ColumnExtractor(['latitude', 'longitude']))
    ])),
    ('clf', SVC())
])
使用同一段代码:
这行得通
# Calling fit/predict directly on the pipeline works: the transformers
# are used exactly as constructed, so `column` keeps its value.
pipeline.fit(df,df['interest_level'])
pipeline.predict(df)
这失败了:
# GridSearchCV clones the pipeline via get_params()/set_params(); that
# round-trip loses the `column` argument because TextTransformer stores
# it under a different attribute name — hence the ValueError below.
pg = {'clf__C': [0.1,0.2]}
grid = GridSearchCV(pipeline, param_grid = pg)
grid.fit(df, df['interest_level'])
我相信这不是我的管道问题,因为在管道上纯粹调用 fit 和 predict 就可以正常工作。但是使用 gridsearch 它会抛出一个值错误。这对我来说没有任何意义。我很确定我也正确使用了 api
更新错误:
这段代码会失败
Transformer class
class TextTransformer(BaseEstimator, TransformerMixin):
    # BUG (the subject of this question): sklearn's BaseEstimator
    # introspects __init__ and expects every constructor argument to be
    # stored under the SAME attribute name. Storing `column` as
    # `self._column` (leading underscore) means get_params() returns the
    # wrong value and clone()/set_params() — used by GridSearchCV —
    # rebuild the estimator without the column, raising
    # "ValueError: cannot label index with a null key".
    def __init__(self, column, max_features):
        self.tfidfVectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
                                               tokenizer=self._custom_tokenizer, analyzer='word',
                                               max_features=max_features)
        self._vectorizer = None
        self._column = column
但是,如果我将 init 更改为此。它会起作用
class TextTransformer(BaseEstimator, TransformerMixin):
    # This variant only "works" because the column is hard-coded to
    # 'description', so it no longer matters that clone() fails to pass
    # the `column` argument back in. (Python 2 print statements.)
    def __init__(self, column, max_features):
        print 'column', column
        print 'init'
        self.tfidfVectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
                                               tokenizer=self._custom_tokenizer, analyzer='word',
                                               max_features=max_features)
        self._vectorizer = None
        self._column = 'description'
问题在于 TextTransformer 类被初始化了两次,而第二次初始化(由 GridSearchCV 克隆估计器时触发)没有把 column 参数正确地传回来,从而导致了这个错误。
转换器(Transformer)类
from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
class TextTransformer(BaseEstimator, TransformerMixin):
    """TF-IDF vectorizer for a single text column of a DataFrame.

    Parameters
    ----------
    column : str
        Name of the DataFrame column to vectorize.
    max_features : int
        Upper bound on the TF-IDF vocabulary size.
    """

    def __init__(self, column, max_features):
        # FIX: BaseEstimator.get_params()/set_params() introspect the
        # __init__ signature and read/write attributes with EXACTLY the
        # same names as the constructor arguments. GridSearchCV and
        # cross_val_score clone estimators through that mechanism, so
        # every __init__ argument must be stored un-renamed. The old
        # `self._column = column` broke clone(), producing
        # "ValueError: cannot label index with a null key".
        self.column = column
        self.max_features = max_features
        self.tfidfVectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
                                               tokenizer=self._custom_tokenizer, analyzer='word',
                                               max_features=max_features)
        self._vectorizer = None

    def _custom_tokenizer(self, string):
        """Tokenize with nltk, mapping digit tokens to the '_NUM_' marker
        and keeping only alphabetic tokens (or the marker)."""
        # string = re.sub('^[\w]', '', string)
        tokens = nltk.word_tokenize(string)
        cleaned = [x if not x.isdigit() else '_NUM_' for x in tokens]
        return [str(x.encode('utf-8')) for x in cleaned if (x.isalpha() or x == '_NUM_')]

    def _clean_html_tags(self, content):
        """Strip HTML markup, returning plain text."""
        return BeautifulSoup(content, 'lxml').text

    def fit(self, df, y=None):
        """Fit the TF-IDF vocabulary on the configured column.

        NOTE(review): the 'features' branch mutates the caller's
        DataFrame in place (joins list cells into strings) — kept for
        behavioral compatibility, but consider operating on a copy.
        """
        if self.column == 'features':
            df[self.column] = df[self.column].apply(lambda x: ' '.join(x))
        self._vectorizer = self.tfidfVectorizer.fit(df[self.column].apply(self._clean_html_tags))
        return self

    def transform(self, df, y=None):
        """Transform the configured column into the fitted TF-IDF matrix."""
        if self.column == 'features':
            df[self.column] = df[self.column].apply(lambda x: ' '.join(x))
        return self._vectorizer.transform(df[self.column])
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Select a fixed subset of DataFrame columns as a raw ndarray.

    Note that `cols` is stored under the same attribute name as the
    constructor argument, as BaseEstimator's get_params/set_params
    machinery requires.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, df, y=None):
        # Return the underlying numpy values for the selected columns.
        return df[self.cols].values
堆栈跟踪:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-39-437510c295ef> in <module>()
23 ('clf',SVC(probability = True))
24 ])
---> 25 scores = cross_val_score(pipeline, df[['description','features','street_address','display_address','latitude', 'longitude']], df['interest_level'], cv=2)
26
27 for train_index, test_index in skf.split(df, df['interest_level']):
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
138 train, test, verbose, None,
139 fit_params)
--> 140 for train, test in cv_iter)
141 return np.array(scores)[:, 0]
142
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
236 estimator.fit(X_train, **fit_params)
237 else:
--> 238 estimator.fit(X_train, y_train, **fit_params)
239
240 except Exception as e:
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
732 delayed(_fit_transform_one)(trans, name, weight, X, y,
733 **fit_params)
--> 734 for name, trans, weight in self._iter())
735
736 if not result:
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
575 **fit_params):
576 if hasattr(transformer, 'fit_transform'):
--> 577 res = transformer.fit_transform(X, y, **fit_params)
578 else:
579 res = transformer.fit(X, y, **fit_params).transform(X)
C:\ProgramData\Anaconda2\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
495 else:
496 # fit method of arity 2 (supervised transformation)
--> 497 return self.fit(X, y, **fit_params).transform(X)
498
499
<ipython-input-38-8d6ae99b7816> in fit(self, df, y)
29 if self._column == 'features':
30 df[self._column] = df[self._column].apply(lambda x : ' '.join(x))
---> 31 self._vectorizer = self.tfidfVectorizer.fit(df[self._column].apply(self._clean_html_tags))
32 return self
33
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1384 res = cache.get(item)
1385 if res is None:
-> 1386 values = self._data.get(item)
1387 res = self._box_item_values(item, values)
1388 cache[item] = res
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3550 loc = indexer.item()
3551 else:
-> 3552 raise ValueError("cannot label index with a null key")
3553
3554 return self.iget(loc, fastpath=fastpath)
ValueError: cannot label index with a null key
最佳答案
我知道这是旧的,但万一有人偶然发现了这个:
问题出在 TextTransformer
类中。特别是这一行:
self._column = column
应该是
self.column = column
(没有下划线)。据我所知,GridSearchCV
在实例化转换器后使用了 set_params
函数。因此,如果您要保存通过 __init__
传入的任何参数,就必须使用与参数完全相同的属性名称。
关于python - 转换器(Transformer)在管道中被初始化两次,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43308042/