Reputation: 253
Hi all, I have a bunch of tweet text in a pandas column. I have made a class to process the various aspects of the text, i.e. remove punctuation, expand contractions, remove special characters, etc. I have succeeded in processing individual rows with the class; however, I can't figure out how to apply the methods to the entire column of text.
class ProcessTweetText:
    """Clean the text of a single tweet.

    The instance holds the working string in ``self.text``. Every
    ``remove_*`` / ``replace_*`` method rewrites ``self.text`` and also
    returns the new value, so the steps can be used one at a time or all
    together through :meth:`process_text`.

    The names ``re``, ``string`` and ``contractions`` are expected to be
    imported at module level by the surrounding file.
    """

    def __init__(self, text):
        """Store *text* as the string the cleaning steps operate on."""
        self.text = text

    def remove_web_link(self):
        """Delete every run of non-whitespace that starts with ``http``."""
        self.text = re.sub(r"http\S+", "", self.text)
        return self.text

    def remove_html(self):
        """Replace newline characters with single spaces.

        NOTE: despite the name, no HTML markup is stripped here; only
        ``'\\n'`` is replaced.
        """
        self.text = self.text.replace('\n', ' ')
        return self.text

    def replace_contractions(self):
        """Run ``contractions.fix`` on the text and keep the result."""
        # The result must be stored: returning it without assigning would
        # leave self.text unchanged for the following pipeline steps.
        self.text = contractions.fix(self.text)
        return self.text

    def remove_hyphen(self):
        """Replace em dashes and ASCII hyphens with spaces."""
        self.text = self.text.replace('—', ' ')
        self.text = self.text.replace('-', ' ')
        return self.text

    def remove_mentions(self):
        """Delete ``@`` mentions (``@``, one word character, then non-spaces)."""
        self.text = re.sub(r'@[A-Za-z0-9_]\S+', '', self.text)
        return self.text

    def remove_hashtags(self):
        """Delete ``#`` hashtags (``#``, one word character, then non-spaces)."""
        self.text = re.sub(r'#[A-Za-z0-9_]\S+', '', self.text)
        return self.text

    def remove_punctuation(self):
        """Drop every character listed in ``string.punctuation``."""
        # A single translate() pass removes the same characters as the
        # per-character membership filter.
        self.text = self.text.translate(
            str.maketrans('', '', string.punctuation)
        )
        return self.text

    def remove_special_characters(self):
        """Keep only ASCII letters, digits, spaces and hyphens."""
        self.text = re.sub(r'[^a-zA-Z0-9 -]', '', self.text)
        return self.text

    def process_text(self):
        """Apply every cleaning step, in order, to this instance's text.

        Returns:
            The fully processed text (also available as ``self.text``).
        """
        # Operate on self, not on a module-level variable, so that any
        # instance (e.g. one created per row of a column) is processed.
        self.remove_web_link()
        self.remove_html()
        # Contractions are expanded before punctuation is stripped, since
        # the apostrophes would otherwise already be gone.
        self.replace_contractions()
        self.remove_hyphen()
        self.remove_mentions()
        self.remove_hashtags()
        self.remove_punctuation()
        self.remove_special_characters()
        return self.text
# Try the pipeline on one tweet: the entry of df['original_tweets'] indexed by 100.
example = ProcessTweetText(df['original_tweets'][100])
example.process_text()
# Bare expression: shows the text when run interactively (e.g. notebook or REPL).
example.text
Perhaps this is not the correct way to go about this, as I am still new to using classes. However, any help applying the desired changes to the pandas column would be much appreciated. Thanks, guys!
Upvotes: 0
Views: 436
Reputation: 4215
If you want to keep your structure (with process_text calling the methods on self rather than on the global variable example), you can use something like this:
def foo(text):
    """Run ProcessTweetText.process_text on *text* and return the instance's text."""
    processor = ProcessTweetText(text)
    processor.process_text()
    return processor.text
# Series.apply calls foo on each value and returns a new Series; df itself is
# not modified, so assign the result (e.g. to a column of df) to keep it.
df['original_tweets'].apply(foo)
But actually I don't see the point of using a class for this purpose. You could simply do it like this:
def foo(text):
    """Return a cleaned copy of the tweet string *text*.

    Steps, in order: drop ``http...`` links, turn newlines into spaces,
    turn em dashes and hyphens into spaces, drop ``@`` mentions and ``#``
    hashtags, strip ``string.punctuation`` characters, and finally discard
    anything that is not an ASCII letter, digit, space or hyphen.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The processed string. Whitespace left behind by removed tokens is
        not collapsed.
    """
    # Raw strings keep the ``\S`` escapes from being read as (invalid)
    # string-literal escape sequences.
    text = re.sub(r"http\S+", "", text)
    text = text.replace('\n', ' ')
    text = text.replace('—', ' ')
    text = text.replace('-', ' ')
    text = re.sub(r'@[A-Za-z0-9_]\S+', '', text)
    text = re.sub(r'#[A-Za-z0-9_]\S+', '', text)
    # One translate() pass removes every punctuation character.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^a-zA-Z0-9 -]', '', text)
    return text
# Series.apply calls foo on each value and returns a new Series; df itself is
# not modified, so assign the result (e.g. to a column of df) to keep it.
df['original_tweets'].apply(foo)
Upvotes: 1