Reputation: 29
This is about plain word counting: collecting which words appear in a document and how often.
I am trying to write a function where the input is a list of text lines. I go through all the lines, split them into words, accumulate the recognized words and finally return the complete list.
First I have a while loop that goes through all the characters in the line, ignoring the white space. Inside this while loop I also try to recognize what kind of word I have. In this context, there are three kinds of words: words consisting of letters, words consisting of digits, and single special characters (anything that is neither a letter nor a digit).
I have three if statements which check what kind of character I have. When I know what kind of word I have encountered, I try to extract the word itself. When the word starts with a letter or a digit, I take all consecutive characters of the same kind as part of the word.
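Concretely, the recognition relies on Python's built-in character tests; a tiny sketch of the classification I mean:

for char in "a7&":
    # The three tests that decide which kind of word starts at this character.
    if char.isalpha():
        print(repr(char), "starts a word of letters")
    elif char.isdigit():
        print(repr(char), "starts a word of digits")
    else:
        print(repr(char), "is a one-character word of its own")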
But in the third if statement, which takes care of the case where the current character is neither a letter nor a digit, I get problems.
When I give the input
wordfreq.tokenize(['15, delicious& Tarts.'])
I want the output to be
['15', ',', 'delicious', '&', 'tarts', '.']
When I test the function in the Python Console, it looks like this:
PyDev console: starting.
Python 3.7.4 (v3.7.4:e09359112e, Jul 8 2019, 14:54:52)
[Clang 6.0 (clang-600.0.57)] on darwin
import wordfreq
wordfreq.tokenize(['15, delicious& Tarts.'])
['15', 'delicious', 'tarts']
The function takes neither the comma, the ampersand, nor the dot into account! How do I fix this? See below for the code.
(The lower() call is there because I want to ignore capitalization; e.g. 'Tarts' and 'tarts' are really the same word.)
# wordfreq.py
def tokenize(lines):
    words = []
    for line in lines:
        start = 0
        while start < len(line):
            while line[start].isspace():
                start = start + 1
            if line[start].isalpha():
                end = start
                while line[end].isalpha():
                    end = end + 1
                word = line[start:end]
                words.append(word.lower())
                start = end
            elif line[start].isdigit():
                end = start
                while line[end].isdigit():
                    end = end + 1
                word = line[start:end]
                words.append(word)
                start = end
            else:
                words.append(line[start])
            start = start + 1
    return words
Upvotes: 0
Views: 378
Reputation: 29
I found what the problem was. The line
start = start + 1
was supposed to be inside the last else statement. Because it sat at the level of the if/elif/else chain, it ran after every branch, so the character immediately following each word (the comma, the ampersand and the dot) was skipped before it could be examined.
So my code looks like this and gives me the desired output specified above:
def tokenize(lines):
    words = []
    for line in lines:
        start = 0
        while start < len(line):
            while line[start].isspace():
                start = start + 1
            end = start
            if line[start].isalpha():
                while line[end].isalpha():
                    end = end + 1
                word = line[start:end]
                word = word.lower()
                words.append(word)
                start = end
            elif line[start].isdigit():
                while line[end].isdigit():
                    end = end + 1
                word = line[start:end]
                words.append(word)
                start = end
            else:
                word = line[start]
                words.append(word)
                start = start + 1
    return words
However, when I use the testing script below to make sure that there are no corner cases that the function tokenize misses...
import io
import sys
import importlib.util

def test(fun, x, y):
    global pass_tests, fail_tests
    if type(x) == tuple:
        z = fun(*x)
    else:
        z = fun(x)
    if y == z:
        pass_tests = pass_tests + 1
    else:
        if type(x) == tuple:
            s = repr(x)
        else:
            s = "("+repr(x)+")"
        print("Condition failed:")
        print(" "+fun.__name__+s+" == "+repr(y))
        print(fun.__name__+" returned/printed:")
        print(str(z))
        fail_tests = fail_tests + 1

def run(src_path=None):
    global pass_tests, fail_tests
    if src_path == None:
        import wordfreq
    else:
        spec = importlib.util.spec_from_file_location("wordfreq", src_path+"/wordfreq.py")
        wordfreq = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(wordfreq)
    pass_tests = 0
    fail_tests = 0
    fun_count = 0

    def printTopMost(freq, n):
        saved = sys.stdout
        sys.stdout = io.StringIO()
        wordfreq.printTopMost(freq, n)
        out = sys.stdout.getvalue()
        sys.stdout = saved
        return out

    if hasattr(wordfreq, "tokenize"):
        fun_count = fun_count + 1
        test(wordfreq.tokenize, [], [])
        test(wordfreq.tokenize, [""], [])
        test(wordfreq.tokenize, [" "], [])
        test(wordfreq.tokenize, ["This is a simple sentence"], ["this","is","a","simple","sentence"])
        test(wordfreq.tokenize, ["I told you!"], ["i","told","you","!"])
        test(wordfreq.tokenize, ["The 10 little chicks"], ["the","10","little","chicks"])
        test(wordfreq.tokenize, ["15th anniversary"], ["15","th","anniversary"])
        test(wordfreq.tokenize, ["He is in the room, she said."], ["he","is","in","the","room",",","she","said","."])
    else:
        print("tokenize is not implemented yet!")

    if hasattr(wordfreq, "countWords"):
        fun_count = fun_count + 1
        test(wordfreq.countWords, ([],[]), {})
        test(wordfreq.countWords, (["clean","water"],[]), {"clean":1,"water":1})
        test(wordfreq.countWords, (["clean","water","is","drinkable","water"],[]), {"clean":1,"water":2,"is":1,"drinkable":1})
        test(wordfreq.countWords, (["clean","water","is","drinkable","water"],["is"]), {"clean":1,"water":2,"drinkable":1})
    else:
        print("countWords is not implemented yet!")

    if hasattr(wordfreq, "printTopMost"):
        fun_count = fun_count + 1
        test(printTopMost, ({},10), "")
        test(printTopMost, ({"horror": 5, "happiness": 15},0), "")
        test(printTopMost, ({"C": 3, "python": 5, "haskell": 2, "java": 1},3), "python 5\nC 3\nhaskell 2\n")
    else:
        print("printTopMost is not implemented yet!")

    print(str(pass_tests)+" out of "+str(pass_tests+fail_tests)+" passed.")
    return (fun_count == 3 and fail_tests == 0)

if __name__ == "__main__":
    run()
... I get the following output:
/usr/local/bin/python3.7 "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py"
Traceback (most recent call last):
File "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py", line 81, in <module>
run()
File "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py", line 50, in run
test(wordfreq.tokenize, [" "], [])
File "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py", line 10, in test
z = fun(x)
File "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/wordfreq.py", line 44, in tokenize
while line[start].isspace():
IndexError: string index out of range
Why does it say that the string index is out of range? How do I fix that problem?
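For context: the crash comes from the inner while line[start].isspace() loop. For a line that contains only whitespace (the [" "] test case), start is incremented past the last character before the outer loop's condition is rechecked, so line[start] indexes beyond the end of the string. A minimal sketch of one possible fix (not taken from any answer below): guard every index with a length check and fold the whitespace skip into the outer loop:

def tokenize(lines):
    # Sketch: every index into line is guarded by a length check, so a
    # whitespace-only line such as " " ends the scan instead of crashing.
    words = []
    for line in lines:
        start = 0
        while start < len(line):
            if line[start].isspace():
                start = start + 1
            elif line[start].isalpha():
                end = start
                while end < len(line) and line[end].isalpha():
                    end = end + 1
                words.append(line[start:end].lower())
                start = end
            elif line[start].isdigit():
                end = start
                while end < len(line) and line[end].isdigit():
                    end = end + 1
                words.append(line[start:end])
                start = end
            else:
                words.append(line[start])
                start = start + 1
    return words

print(tokenize([" "]))                      # []
print(tokenize(["15, delicious& Tarts."]))  # ['15', ',', 'delicious', '&', 'tarts', '.']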
Upvotes: 0
Reputation: 10809
itertools.groupby could simplify this quite a bit. Basically, you group the characters in your string based on the category or type of character: alpha, digit or punctuation. In this example I only defined those three categories, but you can define as many or as few categories as you wish. Any character that doesn't match any of the categories (whitespace, in this example) is ignored:
def get_tokens(string):
    from itertools import groupby
    from string import ascii_lowercase, ascii_uppercase, digits, punctuation
    # The categories a character can belong to; anything else (whitespace here) is ignored.
    categories = (ascii_lowercase + ascii_uppercase, digits, punctuation)

    def category_of(char):
        # The category this character belongs to, or "" if it matches none.
        return next((category for category in categories if char in category), "")

    # Consecutive characters of the same category are joined into one token.
    yield from ("".join(group) for key, group in groupby(string, key=category_of) if key)

print(list(get_tokens("15, delicious& Tarts.")))
Output:
['15', ',', 'delicious', '&', 'Tarts', '.']
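Note that groupby only batches consecutive elements whose keys compare equal, which is exactly why each run of same-category characters becomes one token. A quick self-contained illustration:

from itertools import groupby

# groupby batches *consecutive* items with equal keys, in input order.
pairs = [(key, "".join(group)) for key, group in groupby("aa11 bb", str.isalnum)]
print(pairs)  # [(True, 'aa11'), (False, ' '), (True, 'bb')]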
Upvotes: 0
Reputation: 23
I'm not sure why you're doing upper and lower, but here's how you can split it:
lines = ['15, delicious& Tarts.']
line = lines[0]
words = line.split(' ')
words = [word for word in words if word]  # drop empty strings left by repeated spaces
Output:
['15,', 'delicious&', 'Tarts.']
Edit: I saw that you changed how you want your output to be. Just skip this line to get that output:
words = [word for word in words if word]
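As an aside, that filter matters because splitting on an explicit ' ' keeps an empty string for every extra separator, while split() with no argument collapses runs of whitespace on its own. A small demonstration on a hypothetical input with double spaces:

line = "15,  delicious&  Tarts."  # hypothetical variant with double spaces
print(line.split(' '))  # ['15,', '', 'delicious&', '', 'Tarts.']
print(line.split())     # ['15,', 'delicious&', 'Tarts.']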
Upvotes: 0