python - 如何从整数中取回数据。我的 model.predict() 不起作用

标签 python machine-learning scikit-learn nlp text-classification

我有一个 csv 文件,包含 'gender'(性别)、'diagnosis'(诊断)、'test'(检查)、'physical_exam'(体格检查)、'medicine'(药物)这些列。我想根据 'gender'、'diagnosis'、'test'、'physical_exam' 这几列来预测 'medicine' 列。我已经写了如下代码:

import nltk
import re
import pandas as pd
from io import StringIO
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer

# English stop-word list from the NLTK corpus, stored as a set so that the
# membership test in stop_words_filtering is a hash lookup.
stop_words=set(stopwords.words("english"))

def stop_words_filtering(wordlist):
    """Return the tokens of *wordlist* that are not in ``stop_words``.

    The comparison is exact (case-sensitive); the order and any duplicates
    of the remaining tokens are preserved.
    """
    return [token for token in wordlist if token not in stop_words]

def tagg(wordlist):
    """Part-of-speech tag a list of tokens with ``nltk.pos_tag``.

    Args:
        wordlist: Sequence of word tokens.

    Returns:
        A list of ``(word, tag)`` tuples. If tagging fails, the error is
        printed and an empty list is returned, so callers that iterate over
        the result keep working instead of receiving ``None``.
    """
    try:
        return nltk.pos_tag(wordlist)
    except Exception as e:  # best-effort: report the problem and carry on
        print(e)
        return []

def filter_words_by_parts_of_speach(words):
    """Drop numeric and untagged tokens from a POS-tagged word list.

    Args:
        words: Iterable of ``(word, tag)`` pairs, e.g. the output of ``tagg``.

    Returns:
        List of the words whose tag is neither ``'CD'`` (cardinal number in
        the Penn Treebank tagset) nor the empty string.
    """
    return [word for word, pp in words if pp not in ('CD', '')]


def join_words(words):
    """Join an iterable of tokens into one space-separated string."""
    return ' '.join(words)


def remove_numbering(string):
    """Delete every "<digit>." pair from *string* (e.g. list numbering ``1.``)."""
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    return re.sub(r"\d\.", "", string)


def remove_punchuation(string):
    """Delete every character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', string)

# Load the prescription CSV and keep the four feature columns plus the
# target column ('medicine').
df=pd.read_csv('Pescription_details.csv')
col = ['gender','diagnosis','test','physical_exam','medicine']
df = df[col]

# Replace missing values with empty strings so the string/text steps below
# can run on every cell.
df = df.replace(np.nan, '', regex=True)
# Re-assigns the same five names that were just selected above.
df.columns = ['gender','diagnosis','test','physical_exam','medicine']

# Clean every column (features and target alike): flatten newlines, strip
# "<digit>." numbering and punctuation, tokenize, drop stop words, POS-tag,
# drop 'CD'/untagged tokens, and join the surviving words back into a string.
for colm in col:
    df[colm]=df[colm].str.replace('\n',' ')
    df[colm]=df[colm].apply(remove_numbering).apply(remove_punchuation).apply(word_tokenize).apply(stop_words_filtering).apply(tagg).apply(filter_words_by_parts_of_speach).apply(join_words)

#df.to_csv('keyword.csv')
# Replace each distinct string in every column with an integer code.
# NOTE: only the codes (pd.factorize(x)[0]) are kept; the array of unique
# values (index [1]) that would map a code back to its text is discarded,
# so a predicted integer cannot be turned back into a medicine name here.
df=df.apply(lambda x: pd.factorize(x)[0])
#print(df.head(10))
# Integer-coded features and target.
X=df[['gender','diagnosis','test','physical_exam']]
y=df[['medicine']]


# NOTE(review): X is a DataFrame of integer codes; iterating a DataFrame
# yields its column names, so this vectorizer presumably learns the column
# labels rather than any cell text -- confirm.
vect = CountVectorizer()
vect.fit(X)
# simple_train_dtm, percent and rds are not used again in the visible code.
simple_train_dtm = vect.transform(X)
percent = 0.0
rds=0

# Split features and target into train/test parts with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=138)

# Flatten the single-column target frames into 1-D arrays.
y_train, y_test=y_train.values.ravel(),y_test.values.ravel()

# A second vectorizer is fitted on X_train; X_train_dtm is not passed to
# either classifier below, and the vocabulary lookup result is discarded.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
vect.vocabulary_.get(u'algorithm')

nb = MultinomialNB()

# Trained on the integer-coded X_train, not on a document-term matrix.
nb.fit(X_train,y_train)

y_pred_class = nb.predict(X_test)

# Accuracy on the held-out split, as a percentage.
percentage=metrics.accuracy_score(y_test, y_pred_class)*100
print(percentage)

# Same procedure with a Bernoulli naive Bayes model.
ber=BernoulliNB()
ber.fit(X_train,y_train)
y_pred_class = ber.predict(X_test)
percentage=metrics.accuracy_score(y_test, y_pred_class)*100
print('bernoli',percentage)

# Attempt a prediction for one new patient.
# NOTE(review): each text is vectorized separately and the resulting 1-row
# sparse matrices are collected in a plain Python list, whereas `ber` was
# fitted on a 2-D table of integer codes. This is the call that raises the
# "Expected 2D array, got 1D array" ValueError in the traceback quoted below.
print(ber.predict( [vect.transform(["Male"]),
vect.transform(["Old inferior myocardial infarction.Occasional chest pain on lifting weight at shop."]),
vect.transform(["Electrocardiogram:  Old inferior myocardial infarction. FBS:  95 mg/dL. Creatinine: 1.99 mg/dL. SGPT:  Normal. Fasting lipid profile:  Normal. Echo:  Akinetic inferior wall."]),
vect.transform(["Chest:  Clear. 1st heart sound and 2nd heart sound: Audible."])
] ))

但是这段代码运行时报错,输出如下:


47.02702702702703
bernoli 51.891891891891895
Traceback (most recent call last):
  File "backend.py", line 90, in <module>
    vect.transform(["Chest:  Clear. 1st heart sound and 2nd heart sound: Audible."])
  File "/home/android/.local/lib/python3.6/site-packages/sklearn/naive_bayes.py", line 65, in predict
    jll = self._joint_log_likelihood(X)
  File "/home/android/.local/lib/python3.6/site-packages/sklearn/naive_bayes.py", line 943, in _joint_log_likelihood
    X = check_array(X, accept_sparse='csr')
  File "/home/android/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 521, in check_array
    "if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
array=[<1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>
 <1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>
 <1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>
 <1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>
 <1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

如何预测 'medicine'(药物)列?这段代码的输出应该类似于“Olmesartan Clopidogrel Rosuvastatin5 Ivabradine”,这些都是药品名称。

最佳答案

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Dummy dataframe with two textual columns ('text', 'Gender') and an
# integer 'label' column.
df = pd.DataFrame(
    [
        ['Old inferior myocardial infarction.Occasional chest pain on lifting weight at shop.', 1, 'Male'],
        ['Chest:  Clear. 1st heart sound and 2nd heart sound: Audible.', 0, 'Female'],
    ],
    columns=['text', 'label', 'Gender'],
)

# Assign x the way the question does and pass the DataFrame to the
# CountVectorizer object.
x = df[['text', 'Gender']]

vect = CountVectorizer()  # must be an instance, not the class itself
x_train = vect.fit_transform(x)

# Look at the vocabulary created by CountVectorizer: it contains only the
# column names, whereas it should contain the words present in the 'text'
# and 'Gender' columns.
print(vect.vocabulary_)
# output: {'text': 1, 'gender': 0}

# To deal with this, concatenate all string columns into one text column.
df['combined_text'] = df['text'] + ' ' + df['Gender']
x_train = vect.fit_transform(df['combined_text'])

# Now the vocabulary contains every word present in the combined text column.
print(vect.vocabulary_)
# output: {'old': 15, 'inferior': 10, 'myocardial': 13, 'infarction': 9, 'occasional': 14, 'chest': 5, 'pain': 17, 'on': 16, 'lifting': 11, 'weight': 20, 'at': 3, 'shop': 18, 'male': 12, 'clear': 6, '1st': 0, 'heart': 8, 'sound': 19, 'and': 2, '2nd': 1, 'audible': 4, 'female': 7}

# Fit the model on the document-term matrix. The original answer left
# `model` undefined; MultinomialNB is used here, as in the question.
model = MultinomialNB()
model.fit(x_train, df['label'])

# In the question, vect.transform is called separately for every text inside
# predict(), which creates separate sparse matrices collected in a list and
# throws the error. Pass all texts to a single transform call instead.
predictions = model.predict(vect.transform(['old inferior myocardial', 'Clear. 1st heart sound']))

# Two texts were passed, so two predictions come back; the original answer
# reports: array([1, 0], dtype=int64)
print(predictions)

关于python - 如何从整数中取回数据。我的 model.predict() 不起作用,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59091802/

相关文章:

parsing - 将大型文档/文本/HTML 分解为片段的良好 ML 模型/技术是什么?

python - 如何修改 Scikit-Learn 决策树算法中的分割标准(基尼/熵)?

python - 具有不同特征维度的FeatureUnion

python - 正则化线性回归中的知识迁移

python - Python 中的 AI 工具入门

python - 接收 bool 结果以查看 crontab 是否存在

python - 如何让 Python 的 multiprocessing Queue 的 .empty() 方法返回正确的值?还是替代品?

python - 使用 dictConfig 时将 Python 日志设置为覆盖日志文件?

r - 如何计算 R 中的频率(计数)变量?

tensorflow - 最小化前馈神经网络的tensorflow.js中的损失