Reputation: 3359
How do I get the text value of the title element? Is this even possible with a DOM Element, or will I have to parse out the text by hand?
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
import re
import urllib.request
from xml.dom import minidom
def download(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the charset named in the response's
    Content-Type header. When the header names no charset (or names one
    Python does not know), UTF-8 is used instead. Undecodable bytes are
    replaced rather than raising, so the function still always returns a
    string.
    """
    with urllib.request.urlopen(url) as res:
        charset = res.headers.get_content_charset() or 'utf-8'
        payload = res.read()
    try:
        return payload.decode(charset, errors='replace')
    except LookupError:
        # The server advertised a codec name Python does not recognise.
        return payload.decode('utf-8', errors='replace')
class RSSFeed(object):
    """A feed downloaded from a URL and parsed into a minidom document."""

    def __init__(self, url):
        """Download *url*, parse it and collect the document's <link> elements."""
        self.url = url
        self.raw_xml = download(url)
        self.dom = minidom.parseString(self.raw_xml)
        self.links = self.dom.getElementsByTagName('link')

    def entries(self):
        """Print the pretty-printed XML of the first <title> of every <entry>.

        Nothing is returned; the output goes to stdout. An <entry> without
        any <title> raises IndexError.
        """
        for element in self.dom.getElementsByTagName('entry'):
            title = element.getElementsByTagName('title')[0]
            print(title.toprettyxml())

    def __str__(self):
        """Return the whole document as pretty-printed XML."""
        return self.dom.toprettyxml()
# Example run: download the feed, keep a handle on its DOM, and call
# entries(), printing whatever that call returns.
feed_url = 'https://rickys-python-notes.blogspot.com/atom.xml?redirect=false&start-index=1&max-results=500'
feed = RSSFeed(feed_url)
dom = feed.dom
print(feed.entries())
Upvotes: 2
Views: 3892
Reputation: 3359
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
import urllib.request
from xml.dom import minidom
def parse_feed(url):
    """Yield one ``Entry`` object per <entry> element of the feed at *url*.

    Each yielded object exposes ``title``, ``link``, ``author`` and
    ``article`` attributes. A text field is None when its element is empty.
    An <entry> missing one of the expected elements raises IndexError.
    """
    def first_value(parent, tag):
        # An empty element has no first child, so there is no value to read.
        node = parent.getElementsByTagName(tag)[0].firstChild
        return node.nodeValue if node is not None else None

    # Give the parser the raw byte stream so it detects the document's
    # encoding itself instead of relying on a hard-coded decode.
    with urllib.request.urlopen(url) as res:
        dom = minidom.parse(res)

    for element in dom.getElementsByTagName('entry'):
        title = first_value(element, 'title')
        link = element.getElementsByTagName('link')[0].getAttribute('href')
        author = first_value(element, 'name')
        article = first_value(element, 'content')
        yield type('Entry', (object,), dict(title=title, link=link, author=author, article=article))
# Example run: print the title and link of every entry in the feed.
feed_url = 'https://rickys-python-notes.blogspot.com/atom.xml?redirect=false&start-index=1&max-results=500'
for entry in parse_feed(feed_url):
    print(entry.title, entry.link)
Upvotes: 0
Reputation: 338316
The canonical way to determine the node value (i.e. text content) of any XML element is to collect all of its descendant text nodes and concatenate their values.
Minidom inexplicably does not implement this procedure, so if you must use minidom, you need to do it yourself.
So we need a few helper functions.
Let's collect them in a module.
# minidom_helpers.py
def get_descendant_nodes(context_node, predicate):
    """Yield every descendant of *context_node* accepted by *predicate*.

    Nodes are visited depth-first in document order. The search descends
    into every child, whether or not the child itself matched. When
    *context_node* is None nothing is yielded.
    """
    if context_node is None:
        # Nothing to walk; end the generator instead of touching childNodes.
        return
    for child in context_node.childNodes:
        if predicate(child):
            yield child
        yield from get_descendant_nodes(child, predicate)
def get_text_value(context_node, default=None):
    """Return the combined text found beneath *context_node*.

    The value of every descendant text node is stripped and the pieces are
    joined with single spaces. If the joined string is empty, *default* is
    returned instead.
    """
    def is_text(node):
        return node.nodeType == node.TEXT_NODE

    pieces = []
    for text_node in get_descendant_nodes(context_node, is_text):
        pieces.append(str.strip(text_node.nodeValue))
    return ' '.join(pieces) or default
def get_first_child(context_node, element_name):
    """Return the first element named *element_name* under *context_node*.

    The lookup uses ``getElementsByTagName``, so the match may be any
    descendant, not only a direct child. Returns None when nothing matches.
    """
    matches = context_node.getElementsByTagName(element_name)
    if not matches:
        return None
    return matches[0]
Now we can do
import re
import urllib.request
from xml.dom import minidom
from minidom_helpers import *
class RSSFeed(object):
    """A feed fetched from a URL and parsed into a minidom document."""

    def __init__(self, url):
        """Fetch and parse *url*, then collect the document's <link> elements."""
        self.url = url
        # The parser reads the raw byte stream and works out the encoding
        # itself; the with-block closes the response once parsing is done.
        with urllib.request.urlopen(url) as response:
            self.dom = minidom.parse(response)
        self.links = self.dom.getElementsByTagName('link')

    def entries(self):
        """Yield a dict with a "title" key for every <entry> in the feed."""
        for entry in self.dom.getElementsByTagName('entry'):
            yield {
                "title": get_text_value(get_first_child(entry, 'title'))
            }

    def __str__(self):
        """Return the whole document as pretty-printed XML."""
        return self.dom.toprettyxml()
# Example run: build the feed and print each dict yielded by entries().
feed_url = 'https://rickys-python-notes.blogspot.com/atom.xml?redirect=false&start-index=1&max-results=500'
feed = RSSFeed(feed_url)
for entry in feed.entries():
    print(entry)
A general note on parsing XML. Try to get into the habit of thinking of XML as binary data, instead of text.
XML parsers implement a complex mechanism of figuring out the file encoding automatically. It's not necessary and not smart to circumvent that mechanism by trying to decode the file or HTTP response into a string yourself ahead of time:
# BAD CODE, DO NOT USE
# Anti-pattern shown for illustration: the response bytes are decoded with a
# fixed codec before the XML parser ever sees them.
def download(url):
    with urllib.request.urlopen(url) as res:
        # latin-1 is hard-coded here, whatever encoding the document uses.
        return res.read().decode('latin-1')
raw_xml = download(url)
# NOTE(review): `self` is not defined in this snippet; presumably `raw_xml`
# was meant. Left as written.
dom = minidom.parseString(self.raw_xml)
The above makes hard-coded (and in your case: wrong) assumptions about the file encoding and will break when the server decides to start sending the file in UTF-16 for some reason.
If you think of XML as binary data instead of text, it gets both a lot easier and a lot more robust.
# The response object goes to the parser unread, so nothing is decoded by hand.
dom = minidom.parse(urllib.request.urlopen(url))
The XML parser will sniff the bytes and decide what encoding they are in.
This is also true for reading XML from files. Instead of
# BAD CODE, DO NOT USE
# Text mode with a fixed latin-1 encoding: the parser is handed a str that
# was already decoded.
with open(path, 'r', encoding='latin-1') as fp:
    dom = minidom.parseString(fp.read())
Use
# Binary mode: the parser is handed the file's raw bytes.
with open(path, 'rb') as fp:
    dom = minidom.parse(fp)
or simply
# minidom.parse is given the path itself rather than an open file object.
dom = minidom.parse(path)
Upvotes: 2
Reputation: 3359
def entries(self):
    """Yield one ``Entry`` class per <entry> element in ``self.dom``.

    Each yielded object carries ``title``, ``link`` and ``author`` as
    strings. ``article`` is the first child node of <content> itself, not
    its text value.
    """
    for entry_node in self.dom.getElementsByTagName('entry'):
        find = entry_node.getElementsByTagName
        fields = {
            'title': find('title')[0].firstChild.nodeValue,
            'link': find('link')[0].getAttribute('href'),
            'author': find('name')[0].firstChild.nodeValue,
            'article': find('content')[0].firstChild,
        }
        yield type('Entry', (object,), fields)
Upvotes: 0