数据来自: https://www.kaggle.com/andieminogue/newspaper-churn
代码如下所示,df_og是加载的数据帧,一些列已被删除,NaN行被删除,除此之外,数据尚未转换:
df_og2 = df_og.drop(['Address','State','City','SubscriptionID','Zip Code'], axis = 1)
df_og2 = df_og2.dropna()
df_og2.reset_index(drop=True, inplace=True)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders import ordinal
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
from imblearn.under_sampling import RandomUnderSampler
X = df_og2.drop(['Subscriber'], axis=1)
y = df_og2[['Subscriber']]
X, y = RandomUnderSampler(random_state=2).fit_resample(X,y)
numeric = df_og2.select_dtypes(include=['int64']).columns.tolist()
multi_label_cat = ['HH Income', 'Age range', 'weekly fee']
binary_label_cat = ['Home Ownership', 'dummy for Children']
onehot_cat = ['County','Deliveryperiod','Ethnicity','Language','Nielsen Prizm', 'Source Channel']
def getScores(model):
HH = df_og2['HH Income'].unique().tolist()
AR = df_og2['Age range'].unique().tolist()
WF = df_og2['weekly fee'].unique().tolist()
HH.sort()
HH.insert(0,HH.pop())
AR.sort()
WF.sort()
WF.append(WF.pop(5))
HH_map = {i:j for i,j in zip(HH,[i for i in range(len(HH))])}
AR_map = {i:j for i,j in zip(AR,[i for i in range(len(AR))])}
WF_map = {i:j for i,j in zip(WF,[i for i in range(len(WF))])}
col_map = [{'col': 'HH Income', 'mapping': HH_map}, {'col': 'Age range', 'mapping': AR_map},
{'col': 'weekly fee', 'mapping': WF_map}]
encode_pipeline = ColumnTransformer([
('multi_label_cat', ordinal.OrdinalEncoder(mapping=col_map), multi_label_cat),
('binary_label_cat', OrdinalEncoder(), binary_label_cat),
('onehot_cat', OneHotEncoder(), onehot_cat),
], remainder = 'passthrough', sparse_threshold = 0)
full_pipeline = Pipeline([
('encode',encode_pipeline),
('scale', StandardScaler()),
('model', model)
])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2)
scoring = {'acc': 'accuracy',
'f1': 'f1',
'prec': 'precision',
'rec': 'recall'}
scores = cross_validate(full_pipeline, X, y, scoring=scoring , cv=cv, n_jobs=-1)
print(scores)
from sklearn.linear_model import LogisticRegression
getScores(LogisticRegression())
有时,将折叠和重复次数更改为较大的数字会在分数中产生一些值(在相同的过程中),其中大多数为 NaN。然而,在尝试修复它之后,我无法再重现该现象。
最佳答案
我通过使用显示错误消息的 cross_val_score
解决了该问题。有 2 个错误,第一个是 OneHot 编码的问题,由于某些奇怪的原因,它没有对仅与一行关联的唯一值进行编码,将 handle_unknown='ignore'
参数添加到 OneHotEncoder()
修复了这个问题。之后遇到的另一个问题是 invalid pos_label=1 can only be one of ['YES','NO']
,我假设 y
的序数编码已完成当情况不成立时,由 cross_validate()
函数自动执行。我通过预先编码 y
解决了这个问题。
关于python - 为什么 cross_validate 返回 NaN 分数?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/68992742/