Reputation: 51
I am trying to clean my text data from a text file and I am facing this error: TypeError: expected string or bytes-like object.
filenames = os.listdir("/input")
raw_files = []
for filename in filenames:
    with open('/input') as myfile:
        raw_files.append(myfile.read().split())
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
global stopwords
import gensim
import re
stopwords = stopwords.words('english')
stemmer = SnowballStemmer("english")
def clean_sentences(text):
    tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)
    sent_list = []
    for sent in tokens:
        sent_str = ''
        for i, word in enumerate(nltk.word_tokenize(sent)):
            # nltk doesn't handle apostrophes correctly
            if word[0] == "'":
                sent_str = sent_str[:-1]
            # only adds words and digits
            if re.sub('[a-zA-Z0-9]', "", str(word)):
                sent_str += str(word.lower() + ' ')
        sent_list.append(sent_str.strip()).apply(str)
    return str(sent_list)
# takes list of clean sentences and converts to list of tokens
def tokens_only(text):
    tokens = []
    for sentence in text:
        tokens.extend(sentence.split(" "))
    return tokens
# takes in text, cleans it, and returns lemmas only
def lemma_tokens(text):
    import gensim
    tokens = tokens_only(str(clean_sentences(text)))
    return [stemmer.stem(token) for token in tokens]
all_lemma = []
all_tokens = []
all_sentences = []
all_sentences_label = []
for i, doc in enumerate(raw_files):
    # clean sentences
    tmp_list = str(clean_sentences(doc))
    all_sentences.extend(tmp_list)
    for j in range(len(tmp_list)):
        all_sentences_label.append(filenames[i])
    # convert list of clean sentences to tokens
    tmp_list = tokens_only(tmp_list)
    all_tokens.extend(tmp_list)
    # gets root word for tokens in document
    all_lemma.extend(lemma_tokens(doc))
I am getting the errors below.
Traceback (most recent call last):
File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\exception.py" in inner
34. response = get_response(request)
File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
115. response = self.process_exception_by_middleware(e, request)
File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
113. response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "C:\Users\User\waqaf\waqaf\views.py" in output4
572. tmp_list= str(clean_sentences(doc))
File "C:\Users\User\waqaf\waqaf\views.py" in clean_sentences
531. tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py" in sent_tokenize
106. return tokenizer.tokenize(text)
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in tokenize
1277. return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in sentences_from_text
1331. return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in <listcomp>
1331. return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in span_tokenize
1321. for sl in slices:
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _realign_boundaries
1362. for sl1, sl2 in _pair_iter(slices):
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _pair_iter
318. prev = next(it)
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _slices_from_text
1335. for match in self._lang_vars.period_context_re().finditer(text):
Exception Type: TypeError at /output4
Exception Value: expected string or bytes-like object
I have seen many similar posts, but none of them helped me solve my problem. I already used str() and apply(str), but it still doesn't work; I continue to get the error.
Upvotes: 2
Views: 2841
Reputation: 23150
Ultimately, what is passed to sent_tokenize is one of the items in raw_files, i.e. the output of myfile.read().split(), which is a list of strings. But sent_tokenize expects a single string. I suggest omitting .split().
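As a minimal sketch of that change, the reading loop could look like the following. It assumes the intent is to read each file found in /input as one string; note it also joins the directory with each filename via os.path.join, whereas the posted loop reopens the '/input' path itself on every iteration.

import os

input_dir = "/input"
filenames = os.listdir(input_dir)

raw_files = []
for filename in filenames:
    # Read the whole file as a single string; nltk.sent_tokenize expects a str,
    # not the list of words that .read().split() would produce.
    with open(os.path.join(input_dir, filename)) as myfile:
        raw_files.append(myfile.read())

With that change, each doc passed to clean_sentences (and on to nltk.sent_tokenize) is a single string rather than a list of strings, so this particular TypeError should go away.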
Upvotes: 2