ntsha

Reputation: 51

TypeError: expected string or bytes-like object

I am trying to clean my text data from a text file and I get this error: TypeError: expected string or bytes-like object.

import os

filenames = os.listdir("/input")
raw_files = []

for filename in filenames:
    with open(os.path.join("/input", filename)) as myfile:
        raw_files.append(myfile.read().split())

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
global stopwords
import gensim
import re

stopwords = stopwords.words('english')
stemmer = SnowballStemmer("english")


def clean_sentences(text):
    tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)

    sent_list = []
    for sent in tokens:
        sent_str = ''
        for i, word in enumerate(nltk.word_tokenize(sent)):
            # nltk doesn't handle apostrophes correctly
            if word[0] == "'":
                sent_str = sent_str[:-1]

            # only adds words and digits
            if re.sub('[a-zA-Z0-9]',"", str(word)):
                sent_str += str(word.lower() + ' ')
                sent_list.append(sent_str.strip()).apply(str)

    return str(sent_list)

# takes list of clean sentences and converts to list of tokens
def tokens_only(text):
    tokens = []

    for sentence in text:
        tokens.extend(sentence.split(" "))

        return tokens

# takes in text, cleans it, and returns lemma only
def lemma_tokens(text):
    import gensim
    tokens = tokens_only(str(clean_sentences(text)))
    return [stemmer.stem(token) for token in tokens]

all_lemma = []
all_tokens = []
all_sentences = []
all_sentences_label = []

for i, doc in enumerate(raw_files):

    # clean sentences    
    tmp_list= str(clean_sentences(doc))
    all_sentences.extend(tmp_list)
    for j in range(len(tmp_list)):
        all_sentences_label.append(filenames[i])

    # convert list of clean sentences to tokens
    tmp_list = tokens_only(tmp_list)
    all_tokens.extend(tmp_list)

    # gets root word for tokens in document
    all_lemma.extend(lemma_tokens(doc))

I am getting the errors below.

Traceback:

File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\exception.py" in inner
  34.             response = get_response(request)

File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
  115.                 response = self.process_exception_by_middleware(e, request)

File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
  113.                 response = wrapped_callback(request, *callback_args, **callback_kwargs)

File "C:\Users\User\waqaf\waqaf\views.py" in output4
  572.      tmp_list= str(clean_sentences(doc))

File "C:\Users\User\waqaf\waqaf\views.py" in clean_sentences
  531.      tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py" in sent_tokenize
  106.     return tokenizer.tokenize(text)

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in tokenize
  1277.         return list(self.sentences_from_text(text, realign_boundaries))

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in sentences_from_text
  1331.         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in <listcomp>
  1331.         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in span_tokenize
  1321.         for sl in slices:

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _realign_boundaries
  1362.         for sl1, sl2 in _pair_iter(slices):

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _pair_iter
  318.         prev = next(it)

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _slices_from_text
  1335.         for match in self._lang_vars.period_context_re().finditer(text):

Exception Type: TypeError at /output4
Exception Value: expected string or bytes-like object

I have seen many similar posts, but none of them helped me solve my problem. I already used str() and apply(str), but it still doesn't work and I continue to get the error.

Upvotes: 2

Views: 2841

Answers (1)

mkrieger1

Reputation: 23150

Ultimately what is passed to sent_tokenize is one of the items in raw_files, i.e. the output of myfile.read().split(), which is a list of strings. But it expects a single string.

I suggest omitting .split().
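For example, a minimal sketch of the reading loop without .split() (assuming your files are plain-text files sitting directly under /input) could look like this; each item in raw_files is then a single string, which is what sent_tokenize expects:

import os

input_dir = "/input"
raw_files = []

for filename in os.listdir(input_dir):
    # read the whole file as one string; don't split it into words here
    with open(os.path.join(input_dir, filename)) as myfile:
        raw_files.append(myfile.read())

If you need individual words later, tokenize after sent_tokenize has split the text into sentences (for example with nltk.word_tokenize), rather than splitting the raw text at read time.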

Upvotes: 2
