Reputation: 438
I wrote a simple script to grab Unicode text from a website, and I want the result written to a file row by row. The code works, apart from the part that writes to the file. `print(item)` works perfectly fine and gives the exact result I need.
import requests
import unicodecsv, os
from bs4 import BeautifulSoup
import re
countries = ["ar","th","bn","my","chin","de","es","fr","hi","ja","ko","pt","ru","th","tr","vi","zh"]
f = open("lesson_list.txt","wb")
w = unicodecsv.writer(f, encoding='utf-8', delimiter=',', quotechar='"')
for country in countries:
toi = 1
print country
while toi<101:
print toi,
url = "http://www.englishspeak.com/"+ country +"/english-lesson.cfm?lessonID=" + str(toi)
r = requests.get(url)
soup = BeautifulSoup(r.content)
soup.unicode
titles = soup.find_all('font', {"color": "#006633"})
data = [0]
for index, item in enumerate(titles):
tmp = titles[index].encode("utf-8")
replaced = re.sub(r'<font color="#006633" face="Verdana" size="4">', ' ', tmp)
replaced = re.sub(r'\n', ' ', replaced)
replaced = re.sub(r'\r', ' ', replaced)
replaced = re.sub(r'</font>', ' ', replaced)
replaced = re.sub(r'\s+', ' ', replaced)
data[index] = replaced
toi += 1
for index, item in enumerate(data):
print(item)
w.writerow(item)
Upvotes: 0
Views: 71
Reputation: 20583
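From a quick look: your list starts out as `[0]`, so it has exactly one slot. As soon as `enumerate` reaches index 1 there is no slot left to assign to, so nothing more gets stored in it (`data[1] = ...` on a one-element list raises `IndexError`). Build the list with `append` instead, and pass the whole list to `writerow`: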
# try rewriting this
data = []
for index, item in enumerate(titles):
tmp = titles[index].encode("utf-8")
replaced = re.sub(r'<font color="#006633" face="Verdana" size="4">', ' ', tmp)
replaced = re.sub(r'\n', ' ', replaced)
replaced = re.sub(r'\r', ' ', replaced)
replaced = re.sub(r'</font>', ' ', replaced)
replaced = re.sub(r'\s+', ' ', replaced)
data.append(replaced) # use append to add replaced to the list
...
# and writerow with data
print data # if you want to see the data on each loop
w.writerow(data)
Upvotes: 1
Reputation: 19
I think this should serve the purpose:
import csv
import re
import requests
from bs4 import BeautifulSoup
# Language codes; run() inserts each one as the first path segment of the
# lesson URL.
# NOTE(review): "th" appears twice, so those pages are requested twice —
# looks unintentional; confirm before removing.
countries = ["ar", "th", "bn", "my", "chin", "de", "es", "fr", "hi", "ja", "ko",
"pt", "ru", "th", "tr", "vi", "zh"]
# Default output path used by csv_export().
FILENAME = "lesson_list.txt"
def run():
for country in countries:
toi = 1
while toi < 101:
url = "http://www.englishspeak.com/" + country + "/english-lesson.cfm?lessonID=" + str(toi)
r = requests.get(url)
soup = BeautifulSoup(r.content)
soup.unicode
titles = soup.find_all('font', {"color": "#006633"})
data = []
for index, item in enumerate(titles):
replaced = re.sub('<[^>]*>|\n|\r', '', titles[index].encode("utf-8"))
data.append(replaced)
toi += 1
print data, "item"
csv_export(data)
def csv_export(data, file_name=FILENAME):
    """Write *data* to *file_name* as CSV, one row per item.

    Each item of *data* is either a sequence of fields or a single string; a
    single string is written as a one-field row.  The file is opened in
    ``'wb'`` mode, so every call replaces any previous contents — pass all
    rows in one call.
    """
    # type(u"") is `unicode` on Python 2, so both byte and text strings match.
    string_types = (bytes, type(u""))
    # `with` guarantees the handle is flushed and closed, even on error.
    with open(file_name, 'wb') as file_item:
        writer = csv.writer(file_item)
        for row in data:
            # The csv writer iterates over a row to get its fields; a bare
            # string would therefore come out as one column per character.
            if isinstance(row, string_types):
                row = [row]
            writer.writerow(row)
# Start the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run()
Upvotes: 1