python - 单独训练和部署

标签 python machine-learning

我正在尝试一段自动作文评分的代码。使用以下代码时,每次运行程序都会重新开始训练,最后才能看到结果,这需要很长时间。

我尝试使用 pickle 将训练和部署分开,但没有成功。

def main():
    print "Fetching data..."
    train_df = util.get_training_data('../data/training_set_rel3.tsv')
    valid_df = util.get_validation_data('../data/valid_set.tsv')

    print "Standardizing scores..."
    train_df, valid_df = util.append_standardized_column(train_df, valid_df, 'score')

print "Calculating perplexity feature..."

train_df, valid_df = Perplexity().fill_perplexity_columns(train_df, valid_df)

print "Calculating number of sentences feature..."

train_df, valid_df = fill_sentence_column(train_df, valid_df)

print "Cleaning for spelling and word count..."
# cleaned up data for spelling feature
vectorizer_train_spelling = util.vectorizer_clean_spelling(train_df)
train_essays_spelling = vectorizer_train_spelling['essay'].values
vectorizer_valid_spelling = util.vectorizer_clean_spelling(valid_df)
valid_essays_spelling = vectorizer_valid_spelling['essay'].values

print "Calculating total words feature..."

train_df, valid_df = fill_total_words_column(train_df, valid_df, train_essays_spelling, valid_essays_spelling)

print "Calculating unique words feature..."

train_df, valid_df = fill_unique_words_column(train_df, valid_df, train_essays_spelling, valid_essays_spelling)

print "Calculating spelling feature..."
# spelling feature
train_df, valid_df = fill_spelling_column(train_df, valid_df, train_essays_spelling, valid_essays_spelling)

print "Calculating pos tags features..."

train_df, valid_df = fill_pos_columns(train_df, valid_df)

print "Cleaning for TFIDF..."
# cleaned up data for tfidf vector feature
vectorizer_train = util.vectorizer_clean(train_df)
train_essays = vectorizer_train['essay'].values
vectorizer_valid = util.vectorizer_clean(valid_df)
valid_essays = vectorizer_valid['essay'].values

print "Calculating TFIDF features with unigram..."
train_df, valid_df = fill_tfidf_column(train_df, valid_df, train_essays, valid_essays, 1)

# print "Calculating TFIDF features with trigram..."
# train_df, valid_df = fill_tfidf_column(train_df, valid_df, train_essays, valid_essays, 3)

print train_df.head()

print valid_df.head()

COLS = ['essay_set', 'spelling_correct', 'std_sentence_count', 'std_unique_words', 'std_total_words',
        'std_unique_words',
        'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB', '.', 'X', 'std_perplexity',
        'std_score']

train_df = train_df[COLS].join(train_df.filter(regex=("tfidf_*")))
valid_df = valid_df[COLS].join(valid_df.filter(regex=("tfidf_*")))

print train_df.shape
print valid_df.shape

max_essay_set = max(train_df['essay_set'])

linreg_scores_df = pd.DataFrame(columns=['essay_set', 'p', 'spearman'])

lasso_scores_df = pd.DataFrame(columns=['essay_set', 'alpha', 'p', 'spearman'])
ridge_scores_df = pd.DataFrame(columns=['essay_set', 'alpha', 'p', 'spearman'])

alphas = [x * 1.0 / 20 for x in range(20, 0, -1)]

for i in range(1, max_essay_set + 1):

    print ""

    train_x = np.asarray((train_df[train_df['essay_set'] == i]).drop(['essay_set', 'std_score'], axis=1))
    train_std_scores = np.asarray((train_df[train_df['essay_set'] == i])['std_score'], dtype="|S6").astype(np.float)

    regr = LinReg(fit_intercept=False, copy_X=False)
    regr.fit(train_x, train_std_scores)

    valid_x = np.asarray((valid_df[valid_df['essay_set'] == i]).drop(['essay_set', 'std_score'], axis=1))
    valid_pred_std_scores = regr.predict(valid_x)

    linreg_spear, p = Spearman(a=(valid_df[valid_df['essay_set'] == i])["std_score"], b=valid_pred_std_scores)
    linreg_scores_df = linreg_scores_df.append({'essay_set': i, 'p': p, 'spearman': linreg_spear},
                                               ignore_index=True)

    print "Linear for Essay Set " + str(i) + ":", linreg_spear

    for a in alphas:
        ridge = linear_model.Ridge(alpha=a)
        ridge.fit(train_x, train_std_scores)
        valid_pred_std_scores_ridge = ridge.predict(valid_x)

        ridge_spear, p = Spearman(a=(valid_df[valid_df['essay_set'] == i])["std_score"],
                                  b=valid_pred_std_scores_ridge)
        ridge_scores_df = ridge_scores_df.append({'essay_set': i, 'alpha': a, 'p': p, 'spearman': ridge_spear},
                                                 ignore_index=True)

        print "Alpha = " + str(a) + " Ridge for Essay Set " + str(i) + ":", ridge_spear

        lasso = linear_model.Lasso(alpha=a)
        lasso.fit(train_x, train_std_scores)
        valid_pred_std_scores_lasso = lasso.predict(valid_x)

        lasso_spear, p = Spearman(a=(valid_df[valid_df['essay_set'] == i])["std_score"],
                                  b=valid_pred_std_scores_lasso)
        lasso_scores_df = lasso_scores_df.append({'essay_set': i, 'alpha': a, 'p': p, 'spearman': lasso_spear},
                                                 ignore_index=True)

        print "Alpha = " + str(a) + "Lasso for Essay Set " + str(i) + ":", lasso_spear

print linreg_scores_df
print ridge_scores_df
print lasso_scores_df

linreg_scores_df.to_pickle('linreg_scores-01.pickle')
ridge_scores_df.to_pickle('ridge_scores-01.pickle')
lasso_scores_df.to_pickle('lasso_scores-01.pickle')

因此,我想将训练和部署分开,以便当用户运行程序时可以直接查看输出,并且仅在第一次进行训练。

最佳答案

如果您使用的是 sklearn 库,它提供了保存已训练模型的方法:用 joblib.dump() 保存模型,用 joblib.load() 加载已训练的模型。
文档链接:https://scikit-learn.org/stable/modules/model_persistence.html

关于python - 单独训练和部署,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/56058639/

相关文章:

python - python中的 'super object returned is unbound'是什么意思?

python - Razorpay Webhook 签名验证错误

python - 将自定义按钮添加到 NavigationToolbarTkAgg

machine-learning - 最大似然估计到底意味着什么?

python - 事件序列、递归神经网络、PyBrain

python-3.x - 支持向量回归

python - 从另一个 ipynb 文件导入 ipynb 文件?

python - 如何使用 python 计算一个字母出现的次数?

r - 错误 : *** line 1 of `undefined.cases' : bad value of . .. 属性

algorithm - 强化学习中的 SARSA