python - 变压器在管道中初始化两次

标签 python pandas scikit-learn

我最近正在使用管道学习 gridsearchCV。但是,我很确定我的管道模型没有问题,如下所示。

from sklearn.pipeline import Pipeline, FeatureUnion
from Transformers import TextTransformer, ColumnExtractor

a = TextTransformer('description', max_features=300)
b = TextTransformer('features', max_features=300)
c = TextTransformer('street_address', max_features = 300)
d = TextTransformer('display_address', max_features = 300)
pipeline = Pipeline([
        ('test', FeatureUnion
         ([
            ('description', a ), # can pass in either a pipeline
            #('features', b),
            #('street', c),
            #('display', d),
            #('lat_long', ColumnExtractor(['latitude', 'longitude']))
        ])),
    ('clf', SVC())
    ])

使用同一段代码:

这行得通

pipeline.fit(df,df['interest_level'])
pipeline.predict(df)

这失败了:

pg = {'clf__C': [0.1,0.2]}
grid = GridSearchCV(pipeline, param_grid = pg)
grid.fit(df, df['interest_level'])

我相信这不是我的管道问题,因为在管道上纯粹调用 fit 和 predict 就可以正常工作。但是使用 gridsearch 它会抛出一个值错误。这对我来说没有任何意义。我很确定我也正确使用了 api

更新错误:

这段代码会失败

Transformer class 
class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column, max_features):
        self.tfidfVectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
                                               tokenizer=self._custom_tokenizer, analyzer='word',
                                               max_features=max_features)
        self._vectorizer = None
        self._column = column

但是,如果我将 init 更改为此。它会起作用

class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column, max_features):
        print 'column', column
        print 'init'
        self.tfidfVectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
                                               tokenizer=self._custom_tokenizer, analyzer='word',
                                               max_features=max_features)
        self._vectorizer = None
        self._column = 'description'

问题在于 TextTransformer 类被初始化了两次。而恰好第二次没有将参数传递给列导致错误

变形金刚类

from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk


class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column, max_features):
        self.tfidfVectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
                                               tokenizer=self._custom_tokenizer, analyzer='word',
                                               max_features=max_features)
        self._vectorizer = None
        self._column = column

    def _custom_tokenizer(self, string):
        # string = re.sub('^[\w]', '', string)
        tokens = nltk.word_tokenize(string)
        cleaned = [x if not x.isdigit() else '_NUM_' for x in tokens]
        return [str(x.encode('utf-8')) for x in cleaned if (x.isalpha() or x == '_NUM_')]

    def _clean_html_tags(self, content):
        return BeautifulSoup(content, 'lxml').text

    def fit(self, df, y = None):
        if self._column == 'features':
            df[self._column] = df[self._column].apply(lambda x : ' '.join(x))
        self._vectorizer = self.tfidfVectorizer.fit(df[self._column].apply(self._clean_html_tags))
        return self

    def transform(self, df, y = None):
        if self._column == 'features':
            df[self._column] = df[self._column].apply(lambda x : ' '.join(x))
        return self._vectorizer.transform(df[self._column])

class ColumnExtractor(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def transform(self, df, y = None):
        return df[self.cols].values

    def fit(self, X, y=None):
        return self

堆栈跟踪:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-39-437510c295ef> in <module>()
     23     ('clf',SVC(probability = True))
     24     ])
---> 25 scores = cross_val_score(pipeline, df[['description','features','street_address','display_address','latitude', 'longitude']], df['interest_level'], cv=2)
     26 
     27 for train_index, test_index in skf.split(df, df['interest_level']):

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
    138                                               train, test, verbose, None,
    139                                               fit_params)
--> 140                       for train, test in cv_iter)
    141     return np.array(scores)[:, 0]
    142 

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    236             estimator.fit(X_train, **fit_params)
    237         else:
--> 238             estimator.fit(X_train, y_train, **fit_params)
    239 
    240     except Exception as e:

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    266             This estimator
    267         """
--> 268         Xt, fit_params = self._fit(X, y, **fit_params)
    269         if self._final_estimator is not None:
    270             self._final_estimator.fit(Xt, y, **fit_params)

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
    232                 pass
    233             elif hasattr(transform, "fit_transform"):
--> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    235             else:
    236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
    732             delayed(_fit_transform_one)(trans, name, weight, X, y,
    733                                         **fit_params)
--> 734             for name, trans, weight in self._iter())
    735 
    736         if not result:

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
    575                        **fit_params):
    576     if hasattr(transformer, 'fit_transform'):
--> 577         res = transformer.fit_transform(X, y, **fit_params)
    578     else:
    579         res = transformer.fit(X, y, **fit_params).transform(X)

C:\ProgramData\Anaconda2\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
    495         else:
    496             # fit method of arity 2 (supervised transformation)
--> 497             return self.fit(X, y, **fit_params).transform(X)
    498 
    499 

<ipython-input-38-8d6ae99b7816> in fit(self, df, y)
     29         if self._column == 'features':
     30             df[self._column] = df[self._column].apply(lambda x : ' '.join(x))
---> 31         self._vectorizer = self.tfidfVectorizer.fit(df[self._column].apply(self._clean_html_tags))
     32         return self
     33 

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
   3550                         loc = indexer.item()
   3551                     else:
-> 3552                         raise ValueError("cannot label index with a null key")
   3553 
   3554             return self.iget(loc, fastpath=fastpath)

ValueError: cannot label index with a null key

最佳答案

我知道这是旧的,但万一有人偶然发现了这个:

问题出在 TextTransformer 类中。特别是这一行:

self._column = column

应该是

self.column = column

(没有下划线)。据我所知,GridSearchCV 在实例化转换器后使用了 set_params 函数。因此,如果您保存通过 __int__ 传入的任何功能,您需要使用完全相同的名称

关于python - 变压器在管道中初始化两次,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43308042/

相关文章:

python - csv 读取器范围属性错误

Python CSV 插入最后换行符 - 我怎样才能避免它?

python - Pandas 群体内的变量转移

python - 具有多个时间序列的 PCA 作为具有 sklearn 的一个实例的特征

python - 用于特征选择的单变量线性回归测试?

python - 用于大型数据集的 sklearn utils compute_class_weight 函数

python - OneHotEncoder错误: cannot convert string to float

python - 使用 PatchCollection 重新绘制轮廓填充图

python - Pandas 计算列中的负值序列

python - 如何计算列中使用的前 3 个单词并将结果存储在字典中