如何使用 Python 一次性预处理 NLP 文本(小写字母、删除特殊字符、删除数字、删除电子邮件等)?
Here are all the things I want to do to a pandas DataFrame in one pass in Python:
1. Lowercase text
2. Remove whitespace
3. Remove numbers
4. Remove special characters
5. Remove emails
6. Remove stop words
7. Remove NaN values
8. Remove weblinks
9. Expand contractions (if possible; not required)
10. Tokenize
以下是我单独完成所有操作的方式:
def preprocess(self, dataframe):
    """Run the full text-cleaning pipeline and return the tokenized result.

    Steps, in order: drop missing values, lowercase, normalise whitespace,
    strip e-mail addresses, strip web links, strip special characters,
    strip digits, remove stop words, tokenize.

    Args:
        dataframe: The text data to clean. The helper methods rely on the
            ``.str`` accessor, so this is presumably a pandas Series of
            strings -- TODO confirm against callers.

    Returns:
        Whatever ``self.tokenize`` returns for the cleaned text.
    """
    self.log.info("In preprocess function.")
    cleaned = self.remove_nan(dataframe)
    cleaned = self.lowercase(cleaned)
    cleaned = self.remove_whitespace(cleaned)
    # Remove emails and websites before removing special characters:
    # stripping punctuation first would destroy the "@" and "://" that the
    # e-mail and link patterns look for.
    cleaned = self.remove_emails(cleaned)
    cleaned = self.remove_website_links(cleaned)
    cleaned = self.remove_special_characters(cleaned)
    cleaned = self.remove_numbers(cleaned)
    # remove_stop_words may not return anything; only adopt its result when
    # it actually produces one, so the pipeline never continues with None.
    without_stop_words = self.remove_stop_words(cleaned)
    if without_stop_words is not None:
        cleaned = without_stop_words
    tokenized = self.tokenize(cleaned)
    self.log.info(f"Sample of preprocessed data: {tokenized.head()}")
    return tokenized
def remove_nan(self, dataframe):
    """Return ``dataframe`` with its missing (NaN/None) entries dropped.

    This is a thin wrapper around ``dataframe.dropna()`` called with its
    default arguments. For a Series the missing elements are removed; for
    a DataFrame, pandas' default is to drop every row that contains at
    least one missing value.

    Args:
        dataframe: A pandas object exposing ``dropna()``.

    Returns:
        The object returned by ``dataframe.dropna()``.
    """
    return dataframe.dropna()
def lowercase(self, dataframe):
    """Return ``dataframe`` with every string converted to lowercase.

    Uses the vectorised ``.str.lower()`` accessor, so missing values are
    left as missing instead of raising.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- TODO confirm).

    Returns:
        A new pandas object holding the lowercased text.
    """
    self.log.info("Converting dataframe to lowercase")
    return dataframe.str.lower()
def remove_special_characters(self, dataframe):
    """Return ``dataframe`` with all special characters deleted.

    Every run of characters other than ASCII letters, ASCII digits and the
    space character is replaced by the empty string.

    Args:
        dataframe: A pandas object whose string values should be cleaned.

    Returns:
        The object returned by ``dataframe.replace(..., regex=True)``.
    """
    self.log.info("Removing special characters from dataframe")
    return dataframe.replace(r'[^A-Za-z0-9 ]+', '', regex=True)
def remove_numbers(self, dataframe):
    """Return ``dataframe`` with every run of digits deleted.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- TODO confirm).

    Returns:
        A new pandas object with all digit runs removed from each string.
    """
    self.log.info("Removing numbers from dataframe")
    # regex=True is required: without it recent pandas versions treat the
    # pattern as a literal string and nothing is removed.
    return dataframe.str.replace(r'\d+', '', regex=True)
def remove_whitespace(self, dataframe):
    """Collapse repeated whitespace and trim both ends of every string.

    Any run of two or more whitespace characters becomes a single space;
    leading and trailing whitespace is then stripped. A lone whitespace
    character inside the text (e.g. a single tab) is left untouched.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- TODO confirm).

    Returns:
        A new pandas object with the whitespace normalised.
    """
    self.log.info("Removing whitespace from dataframe")
    # Replace more than one whitespace character with a single space.
    # regex=True is required for the pattern to be treated as a regex.
    merged_spaces = dataframe.str.replace(r"\s\s+", ' ', regex=True)
    # Delete beginning and trailing spaces. The elements are plain strings,
    # so the vectorised accessor is used rather than a per-element ``.str``.
    return merged_spaces.str.strip()
def remove_stop_words(self, dataframe, stop_words=None):
    """Return ``dataframe`` with stop words removed from every string.

    Each string is split on whitespace, words found in the stop-word
    collection are dropped, and the remaining words are re-joined with
    single spaces (so runs of whitespace are collapsed as a side effect).
    Matching is exact and case-sensitive.

    Args:
        dataframe: Text data whose elements are strings (presumably a
            pandas Series with missing values already removed -- TODO
            confirm).
        stop_words: Optional iterable of words to remove. When omitted,
            ``stopwords.words('english')`` is used.

    Returns:
        The result of applying the filter to every element.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A frozenset gives O(1) membership tests and is built only once.
    blocked = frozenset(stop_words)
    return dataframe.apply(
        lambda text: " ".join(
            word for word in text.split() if word not in blocked
        )
    )
def remove_website_links(self, dataframe):
    """Return ``dataframe`` with web links deleted from every string.

    A link is anything that starts with ``http`` and runs up to the next
    whitespace character. Surrounding whitespace is left in place.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- TODO confirm).

    Returns:
        A new pandas object with the links removed.
    """
    self.log.info("Removing website links from dataframe")
    # regex=True is required for the pattern to be treated as a regex.
    return dataframe.str.replace(r"http\S+", "", regex=True)
def tokenize(self, dataframe):
    """Return ``dataframe`` with ``word_tokenize`` applied via ``apply``.

    Args:
        dataframe: A pandas object exposing ``apply``. NOTE(review):
            ``word_tokenize`` is not defined in this snippet; presumably it
            is ``nltk.tokenize.word_tokenize`` -- confirm it is imported.

    Returns:
        The object returned by ``dataframe.apply(word_tokenize)``.
    """
    return dataframe.apply(word_tokenize)
def remove_emails(self, dataframe):
    """Return ``dataframe`` with e-mail addresses deleted from every string.

    An address is any run of non-whitespace characters containing ``@``;
    one whitespace character directly after it is removed as well, so the
    surrounding words do not end up separated by a double space.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- TODO confirm).

    Returns:
        A new pandas object with the addresses removed.
    """
    # The replacement string is mandatory, and regex=True is required for
    # the pattern to be treated as a regular expression.
    return dataframe.str.replace(r"\S*@\S*\s?", "", regex=True)
def expand_contractions(self, dataframe):
    """Placeholder for contraction expansion; returns the input unchanged.

    Args:
        dataframe: The text data that would have its contractions expanded.

    Returns:
        The very same ``dataframe`` object that was passed in.
    """
    # TODO: Not a priority right now. Come back to it later.
    return dataframe
最佳答案
以下函数执行您提到的所有事情。
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
# Module-level NLTK helper instances, constructed once when this code is loaded.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
def preprocess(sentence):
    """Clean one piece of text and return it as a space-joined string.

    The text is lowercased, then stripped of the literal ``{html}`` marker,
    HTML tags, ``http...`` links and digits. It is split into
    word-character tokens, and only tokens longer than two characters that
    are not English stop words are kept. Tokens are neither stemmed nor
    lemmatized.

    Args:
        sentence: The value to clean. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # str(None) / str(nan) would otherwise leak the literal words "none" /
    # "nan" into the cleaned text. NaN is the only float unequal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ""
    text = str(sentence).lower()
    text = text.replace('{html}', "")
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[0-9]+', '', text)
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Build the stop-word set once so each membership test is O(1).
    stop_words = set(stopwords.words('english'))
    kept = [w for w in tokens if len(w) > 2 and w not in stop_words]
    return " ".join(kept)
# Clean every entry of the 'Text' column and store the result in 'cleanText'.
df['cleanText'] = df['Text'].map(preprocess)
关于python - 如何一次性预处理 NLP 文本(小写、删除特殊字符、删除数字、删除电子邮件等)?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/54396405/