python - 完整的 sklearn 管道示例

标签 python pandas scikit-learn pipeline

我正在尝试使用 sklearn 管道。我在网上尝试了各种教程,但都没有解决我的问题。

import pandas as pd 
import numpy as np
import json
import seaborn as sb 
from sklearn.metrics import log_loss
from sklearn import linear_model 
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from scipy.stats import zscore
from Transformers import TextTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline
# Load the training data and keep only the free-text column and the target.
df = pd.read_json('data/train.json', encoding='utf-8', dtype={'description': str})
len(df)
df = df[['description', 'interest_level']]
from sklearn.pipeline import Pipeline, FeatureUnion
a = TextTransformer('description', max_features=50)
b = TextTransformer('features', max_features=10)
# Each step is a (name, estimator) tuple; the last step is the classifier.
pipeline = Pipeline([
    ('description', a),  # can pass in either a pipeline
    # ('features', b)  # or a transformer
    ('clf', SVC()),  # classifier
])
# NOTE: this is the call that raises the TypeError shown in the traceback
# below. `df[:, 'interest_level']` indexes the DataFrame with a
# (slice, label) tuple, which is unhashable, and fit() is only being handed
# a single argument instead of the features plus the target.
pipeline.fit(df[:,'interest_level'])

我的文本转换器

from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk


class TextTransformer(BaseEstimator, TransformerMixin):
    """Vectorize one text column of a DataFrame for use as a pipeline step.

    The selected column is stripped of HTML markup, tokenized with NLTK and
    turned into term-frequency features (``use_idf=False``) limited to the
    ``max_features`` most frequent terms.

    Parameters
    ----------
    column : str
        Name of the DataFrame column that holds the raw text.
    max_features : int, default 5000
        Upper bound on the vocabulary size passed to ``TfidfVectorizer``.
    """

    def __init__(self, column, max_features=5000):
        # Constructor arguments are stored under their own names so that
        # BaseEstimator.get_params()/set_params() and sklearn.base.clone()
        # (used by GridSearchCV) can read them back.
        self.column = column
        self.max_features = max_features
        # Unfitted vectorizer kept available right after construction;
        # fit() rebuilds it so later set_params() changes are honoured.
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = None

    def _build_vectorizer(self):
        """Return a fresh, unfitted vectorizer for the current parameters."""
        return TfidfVectorizer(use_idf=False, stop_words='english',
                               tokenizer=self._custom_tokenizer, analyzer='word',
                               max_features=self.max_features)

    def _custom_tokenizer(self, string):
        """Split ``string`` into alphabetic tokens, mapping numbers to ``_NUM_``."""
        tokens = nltk.word_tokenize(string)
        cleaned = [x if not x.isdigit() else '_NUM_' for x in tokens]
        # Tokens are returned as text. Wrapping them in
        # str(x.encode('utf-8')) yields "b'word'" strings on Python 3.
        # '_NUM_' is checked explicitly because isalpha() is False for it.
        return [x for x in cleaned if (x.isalpha() or x == '_NUM_')]

    def _clean_html_tags(self, content):
        """Return the visible text of ``content`` with HTML tags removed."""
        return BeautifulSoup(content, 'lxml').text

    def _clean_column(self, df):
        """Return the configured column of ``df`` with HTML stripped."""
        return df[self.column].apply(self._clean_html_tags)

    def fit(self, df, y=None):
        """Learn the vocabulary from ``df[column]``.

        ``y`` is accepted and ignored so the transformer can be driven by
        ``Pipeline.fit(X, y)``, which forwards the target to every step.
        Returns ``self``.
        """
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = self.tfidfVectorizer.fit(self._clean_column(df))
        return self

    def transform(self, df):
        """Return the dense feature array for ``df[column]``.

        The same HTML cleaning used in fit() is applied here so that the
        text seen at transform time matches what the vocabulary was learned
        from. ``toarray()`` is used instead of ``todense()`` so the result
        is a plain ndarray rather than an ``np.matrix``.
        """
        return self._vectorizer.transform(self._clean_column(df)).toarray()

但是,我似乎怎么也做不对。它一直在 ipython notebook 中抛出这个异常:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-11-b3788282dc5c> in <module>()
      8     ('clf', SVC())  # classifier
      9 ])
---> 10 pipeline.fit(df[:,'interest_level'])

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1382         """Return the cached item, item represents a label indexer."""
   1383         cache = self._item_cache
-> 1384         res = cache.get(item)
   1385         if res is None:
   1386             values = self._data.get(item)

TypeError: unhashable type

数据说明

    description interest_level
10  A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...   medium
10000       low
100004  Top Top West Village location, beautiful Pre-w...   high
100007  Building Amenities - Garage - Garden - fitness...   low
100013  Beautifully renovated 3 bedroom flex 4 bedroom...   low

兴趣水平将是我的目标变量

最佳答案

您只用一列 (df[:, 'interest_level']) 来拟合 (fit),但是您的第一步 (transformer a: TextTransformer) 正在尝试访问 description 列

关于python - 完整的 sklearn 管道示例,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43013565/

相关文章:

Python - 点文件到 png 文件未找到错误

python - 获取公共(public) Google 文档文件的下载链接

python - 在 Pandas 数据框中扩展时间序列数据

python - 如何同时监控loss和val_loss以避免神经网络对训练集或测试集过度拟合?

Python Pandas - 有条件地覆盖另一个数据框中的 x 行

python - 如何按列拆分 DataFrame

python - 如何更改算法中的参数以获得更好的性能?

Python MatPlot 条函数参数

python - 在 pytest 参数化中标记输入

python - 自动识别图像中的图案