Reputation: 2259
[Using Python 3.1] Does anyone have any idea how to make a Python 3 application that lets the user write a text file containing multiple words separated by commas? The program should read the file and download the Wikipedia page for each requested item. E.g., if they typed hello,python-3,chicken, it would go to Wikipedia and download http://www.wikipedia.com/wiki/hello, http://www.wikip... Does anyone think they can do this?
When I say "download", I mean download the text; the images don't matter.
Upvotes: 0
Views: 828
Reputation: 11
Check the following code. It downloads the HTML without the images, but you can still get at the images through the XML that is parsed to obtain the URL. Note that the code is written for Python 2 (print statements, urllib2), not Python 3.
from time import sleep
import urllib
import urllib2
from xml.dom import minidom, Node
def _read_keywords(path):
    """Return the search terms listed in the text file at *path*.

    Terms may be separated by commas, by newlines, or by both. Surrounding
    whitespace (including a stray carriage return) is stripped and empty
    entries are skipped, so blank lines never trigger an empty search.
    """
    with open(path, 'r') as key_file:
        text = key_file.read()
    keywords = []
    for line in text.splitlines():
        for term in line.split(","):
            term = term.strip()
            if term:
                keywords.append(term)
    return keywords


def _child_elements(parent, name):
    """Return the direct child elements of *parent* whose tag is *name*."""
    return [
        child for child in parent.childNodes
        if child.nodeType == Node.ELEMENT_NODE and child.nodeName == name
    ]


def _first_text(item, name):
    """Return the content of the first *name* child of *item* that holds
    exactly one child node, or None when there is no such element."""
    for element in _child_elements(item, name):
        if len(element.childNodes) == 1:
            return element.childNodes[0].data
    return None


def _search(keyword):
    """Query the Wikipedia opensearch API and return the parsed XML document."""
    import contextlib

    # urlencode escapes spaces, '&', '#', ... that would otherwise corrupt
    # the query string when the keyword is pasted in verbatim.
    query = urllib.urlencode([
        ("format", "xml"),
        ("action", "opensearch"),
        ("search", keyword),
    ])
    url = "http://en.wikipedia.org/w/api.php?" + query
    # The Python 2 response object is not a context manager; closing()
    # guarantees the socket is released even if parsing fails.
    with contextlib.closing(urllib.urlopen(url)) as response:
        return minidom.parse(response)


def _save_page(title, page_url):
    """Download *page_url* and store it as Html/<title>.html."""
    import contextlib
    import os

    out_dir = "Html"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Path separators inside a title (e.g. "AC/DC") would point into a
    # directory that does not exist, so they are replaced.
    safe_title = title.encode('utf-8').replace("/", "_").replace("\\", "_")
    file_name = os.path.join(out_dir, "%s.html" % safe_title)

    user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)'
    header = {'User-Agent': user_agent}
    request = urllib2.Request(url=page_url, headers=header)
    with contextlib.closing(urllib2.urlopen(request)) as response:
        html = response.read()

    # Fetch first, write second: a failed download no longer leaves an
    # empty file behind. Binary mode stores the bytes exactly as received.
    with open(file_name, 'wb') as out_file:
        out_file.write(html)


def main():
    """Download the top Wikipedia search hit for every term in example.txt.

    For each term the opensearch API is queried, the titles of all results
    are printed, and the page of the first result is saved below the
    ``Html`` directory. The function pauses two seconds after every term.

    NOTE: uses the Python 2 modules imported at the top of the file
    (``urllib.urlopen``, ``urllib2``).
    """
    print("Hello World")
    keywords = _read_keywords("example.txt")
    print("Total keywords: %d" % len(keywords))

    for keyword in keywords:
        xmldoc = _search(keyword)
        # documentElement is the root element even when the document
        # starts with a comment or processing instruction.
        sections = _child_elements(xmldoc.documentElement, "Section")
        if sections:
            items = _child_elements(sections[0], "Item")
            if not items:
                print("NO results found")
            else:
                print("\nResults found for " + keyword + ":\n")
                for item in items:
                    title = _first_text(item, "Text")
                    if title is not None:
                        print(title.encode('utf-8'))

                # Only the best (first) match is downloaded.
                best_title = _first_text(items[0], "Text")
                best_url = _first_text(items[0], "Url")
                if best_title is not None and best_url is not None:
                    _save_page(best_title, best_url)

        print("Sleeping")
        sleep(2)
if __name__ == "__main__":
    # Run the downloader only when executed as a script, not on import.
    main()
Upvotes: 0
Reputation: 172269
You described exactly how to make such a program. So what is the question?
You read the file, split on commas, and download the URL. Done!
Upvotes: 1