python - 如何从整数中取回数据。我的 model.predict() 不起作用

我有一个 csv 文件，包含 'gender'（性别）、'diagnosis'（诊断）、'test'（检查）、'physical_exam'（体格检查）、'medicine'（药物）这些列。我想根据 gender、diagnosis、test、physical_exam 这四列来预测 medicine 列。我已经写了如下代码：

import nltk
import re
import pandas as pd
from io import StringIO
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer

# English stop words, built once at module level and stored as a set so the
# membership test in stop_words_filtering() is a fast hash lookup.
stop_words=set(stopwords.words("english"))

def stop_words_filtering(wordlist):
    """Return the tokens of *wordlist* that are not in the module-level ``stop_words`` set.

    The order of the remaining tokens is preserved and the comparison is
    case-sensitive (tokens are compared to the set as-is).
    """
    kept = []
    for token in wordlist:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def tagg(wordlist):
    """Part-of-speech tag a list of tokens with ``nltk.pos_tag``.

    Args:
        wordlist: list of token strings.

    Returns:
        The list of ``(word, tag)`` pairs produced by ``nltk.pos_tag``.
        If tagging fails, the error is printed and an empty list is returned.
    """
    try:
        return nltk.pos_tag(wordlist)
    except Exception as e:
        # Best-effort: report the problem but keep the pipeline going.
        # Returning an empty list (rather than falling through to None) keeps
        # the result iterable for the step that unpacks (word, tag) pairs.
        print(e)
        return []

def filter_words_by_parts_of_speach(words):
    """Return the words of ``(word, tag)`` pairs whose tag is neither 'CD' nor empty.

    NOTE(review): 'CD' is presumably the cardinal-number tag of the default
    nltk tagger's tag set -- confirm against the tagger in use.
    """
    return [word for word, pp in words if pp not in ('CD', '')]


def join_words(words):
    """Join an iterable of strings into one string separated by single spaces."""
    return ' '.join(words)


def remove_numbering(string):
    """Remove every "digit followed by a period" pair (e.g. list markers like ``1.``)."""
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r"\d\.", "", string)


def remove_punchuation(string):
    """Remove every character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', string)

# Load the prescription data and keep only the five columns used below.
df=pd.read_csv('Pescription_details.csv')
col = ['gender','diagnosis','test','physical_exam','medicine']
df = df[col]

# Replace missing values with empty strings so the .str / text steps below
# always receive a string.
df = df.replace(np.nan, '', regex=True)
# Re-assigns the same five names that were just selected via `col`.
df.columns = ['gender','diagnosis','test','physical_exam','medicine']

# Clean every column (including the target 'medicine'): flatten newlines, then
# strip numbering and punctuation, tokenize, drop stop words, POS-tag, drop
# words by tag, and join the surviving words back into one string.
for colm in col:
    df[colm]=df[colm].str.replace('\n',' ')
    df[colm]=df[colm].apply(remove_numbering).apply(remove_punchuation).apply(word_tokenize).apply(stop_words_filtering).apply(tagg).apply(filter_words_by_parts_of_speach).apply(join_words)

#df.to_csv('keyword.csv')
# Replace every cell by an integer code, column by column. Only element [0]
# of pd.factorize (the codes) is kept; the array of unique values that would
# map a code back to its text is discarded here, so from this point on the
# frame holds integers only.
df=df.apply(lambda x: pd.factorize(x)[0])
#print(df.head(10))
# Features: the four integer-coded input columns. Target: the integer-coded
# 'medicine' column (kept as a one-column DataFrame).
X=df[['gender','diagnosis','test','physical_exam']]
y=df[['medicine']]


# NOTE(review): X is a DataFrame of integer codes at this point. Iterating a
# DataFrame yields its column labels, so this vectorizer presumably learns the
# column names rather than any cell text -- confirm. `simple_train_dtm` is not
# used again in the visible code.
vect = CountVectorizer()
vect.fit(X)
simple_train_dtm = vect.transform(X)
# `percent` and `rds` are not referenced again in the visible code.
percent = 0.0
rds=0

# Train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=138)

# Flatten the one-column target frames into 1-D arrays.
y_train, y_test=y_train.values.ravel(),y_test.values.ravel()

# A second vectorizer is fitted on X_train; `X_train_dtm` is not used by the
# fit() calls below, and the vocabulary lookup result is discarded.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
vect.vocabulary_.get(u'algorithm')

nb = MultinomialNB()

# The model is trained on X_train (the integer-coded columns), not on the
# document-term matrix built above.
nb.fit(X_train,y_train)

y_pred_class = nb.predict(X_test)

# Accuracy on the held-out split, as a percentage.
percentage=metrics.accuracy_score(y_test, y_pred_class)*100
print(percentage)

# Same procedure with a Bernoulli naive Bayes model.
ber=BernoulliNB()
ber.fit(X_train,y_train)
y_pred_class = ber.predict(X_test)
percentage=metrics.accuracy_score(y_test, y_pred_class)*100
print('bernoli',percentage)

# predict() receives a Python list holding four separate sparse matrices (one
# vect.transform() result per input field) instead of a single 2-D feature
# matrix; this is the call the traceback below reports the ValueError for.
print(ber.predict( [vect.transform(["Male"]),
vect.transform(["Old inferior myocardial infarction.Occasional chest pain on lifting weight at shop."]),
vect.transform(["Electrocardiogram:  Old inferior myocardial infarction. FBS:  95 mg/dL. Creatinine: 1.99 mg/dL. SGPT:  Normal. Fasting lipid profile:  Normal. Echo:  Akinetic inferior wall."]),
vect.transform(["Chest:  Clear. 1st heart sound and 2nd heart sound: Audible."])
] ))

但是这段代码运行后报错，输出如下：


47.02702702702703
bernoli 51.891891891891895
Traceback (most recent call last):
  File "backend.py", line 90, in <module>
    vect.transform(["Chest:  Clear. 1st heart sound and 2nd heart sound: Audible."])
  File "/home/android/.local/lib/python3.6/site-packages/sklearn/naive_bayes.py", line 65, in predict
    jll = self._joint_log_likelihood(X)
  File "/home/android/.local/lib/python3.6/site-packages/sklearn/naive_bayes.py", line 943, in _joint_log_likelihood
    X = check_array(X, accept_sparse='csr')
  File "/home/android/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 521, in check_array
    "if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
array=[<1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>
 <1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>
 <1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>
 <1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>
 <1x5 sparse matrix of type '<class 'numpy.int64'>'
        with 0 stored elements in Compressed Sparse Row format>].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

如何预测 medicine（药物）列？该代码的输出应该类似于“Olmesartan Clopidogrel Rosuvastatin5 Ivabradine”，这些都是药品名称。

最佳答案

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Create a dummy dataframe with two textual columns ('text' and 'Gender')
# and an integer label.
df = pd.DataFrame([['Old inferior myocardial infarction.Occasional chest pain on lifting weight at shop.',1,'Male'],\
              ['Chest:  Clear. 1st heart sound and 2nd heart sound: Audible.',0,'Female']],columns = ['text','label','Gender'])


# Now assign x as you have done and pass it to a CountVectorizer object.
x = df[['text','Gender']]

# CountVectorizer must be instantiated; fit_transform is an instance method.
vect = CountVectorizer()
x_train = vect.fit_transform(x)

# Now look at the vocabulary created by CountVectorizer.
print(vect.vocabulary_)
# Output:
# {'text': 1, 'gender': 0}
# It gives you only the column names, whereas it should give you the words
# present in your text and gender columns.

# To deal with this, concatenate all string columns into one.
df['combined_text'] = df['text'] + ' ' + df['Gender']
x_train = vect.fit_transform(df['combined_text'])

# Now the vocabulary contains every word present in the combined text column.
print(vect.vocabulary_)
# Output:
# {'old': 15, 'inferior': 10, 'myocardial': 13, 'infarction': 9, 'occasional': 14, 'chest': 5, 'pain': 17, 'on': 16, 'lifting': 11, 'weight': 20, 'at': 3, 'shop': 18, 'male': 12, 'clear': 6, '1st': 0, 'heart': 8, 'sound': 19, 'and': 2, '2nd': 1, 'audible': 4, 'female': 7}

# Now coming to the predict function of your model. The original snippet left
# `model` undefined; any scikit-learn classifier works, MultinomialNB is used
# here because the question uses it.
model = MultinomialNB()
model.fit(x_train, df['label'])

# You are calling vect.transform for every individual text inside predict(),
# which creates separate sparse matrices and throws the error. Instead, pass
# all texts to a single vect.transform call so predict() gets one 2-D matrix.
predictions = model.predict(vect.transform(['old inferior myocardial','Clear. 1st heart sound']))

# You get two outputs since you are passing two texts to predict.
print(predictions)
# Expected result, as reported in the original answer:
# array([1, 0], dtype=int64)

关于python - 如何从整数中取回数据。我的 model.predict() 不起作用，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/59091802/

python - 如何从整数中取回数据。我的 model.predict() 不起作用

上一篇：python - <训练样本> 和 <验证样本> 是什么意思？

下一篇：machine-learning - 强化学习文献中的 "soft"是什么意思？