Mark
Mark

Reputation: 1295

Insert un-escaped html into rss feed made in django

I'm trying to use django to create a podcast rss feed using feedgenerator.Rss201rev2Feed. As a feed generator, it works somewhat the opposite way to BeautifulSoup: it puts info into the appropriate xml tags.

It's working well, except I'd like to not escape all the html

In particular, I'd like the <itunes:summary> value of the rss feed to appear like this: <itunes:summary><![CDATA[Link to <a href="http://www.website.com">the website</a>]]></itunes:summary> as per the Apple spec

If I were rendering html in a normal view, I could use the |safe filter in the html template. I need something similar now, to selectively prevent < being escaped in the rss feed.

That is, I need the rss to appear with <![CDATA[...]]> rather than escaping to &lt;![CDATA[...]]&gt;

However, it seems that "Django autoescapes special characters in RSS feeds (or any XML for that matter) no matter what, regardless of whether you pass it through the safe filter or not" (see this 2009 question)

No luck so far:

Therefore, attempts to use mark_safe thus far have proved useless.

I'm also unsure how to interpret one idea to pass "autoescape=False to the render() calls in django.contrib.syndication.feeds".

The suggestion to add , escape=False to the addQuickElement call raised an error:

 handler.addQuickElement(u'itunes:summary',item['summary'], escape=False)
 TypeError: addQuickElement() got an unexpected keyword argument 'escape'

It's an old issue, but I can't find any solution so far.

Anybody know a neat way to get <![CDATA[... to appear in the final feed, rather than escaping to &lt;![CDATA[...?


EDIT: Here's the code in its current form, as it was when I posted this question (I haven't yet tried incorporating @Lego's answer)

import mimetypes

from django.conf import settings
from django.contrib.syndication.views import Feed

# For customising the feed
from django.utils.feedgenerator import Rss201rev2Feed
from django.utils import feedgenerator
# see also https://github.com/blancltd/blanc-basic-podcast/blob/master/blanc_basic_podcast/podcast/itunesfeed.py
# and https://github.com/aneumeier/feeds/blob/master/feeds/rss.py
# and https://github.com/blancltd/blanc-basic-podcast/blob/master/blanc_basic_podcast/podcast/feeds.py
# and https://docs.djangoproject.com/en/1.7/ref/contrib/syndication/#custom-feed-generators

from django.contrib.auth.models import User
from django.shortcuts import get_object_or_404
from django.utils.translation import ugettext_lazy as _
from django.contrib.sites.models import Site

from audiotracks.models import get_track_model, Playlist
Track = get_track_model()

ITEMS_PER_FEED = getattr(settings, 'AUDIOTRACKS_PODCAST_LIMIT', 99)
# MarkAdded @ToDo revisit that default maximum num. tracks per feed

from django.core.urlresolvers import reverse, reverse_lazy


from django_slack import slack_message




######################################################################
##### try adapting code from https://github.com/CaptainHayashi/django-lass-uryplayer/blob/master/uryplayer/feeds.py

from django.utils.feedgenerator import Rss201rev2Feed
from django.contrib.syndication.views import Feed
from django.contrib.sites.models import Site
from django.db.models import permalink
# from uryplayer.models import Podcast
import datetime
# MarkAdded in attempt to have un-escaped <![CDATA[...]]
from django.utils.safestring import mark_safe


# from https://stackoverflow.com/questions/275174/how-do-i-perform-html-decoding-encoding-using-python-django
try:
    from html.parser import HTMLParser  # py3
except ImportError:
    from HTMLParser import HTMLParser  # py2

# HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9, so
# prefer the public html.unescape() and only fall back to the parser method
# on interpreters that do not provide it (Python 2 and very old Python 3).
try:
    from html import unescape
except ImportError:
    unescape = HTMLParser().unescape
# print(unescape("&gt;"))
# That proved useless so far



class iTunesPodcastsFeedGenerator(Rss201rev2Feed):
    """RSS 2.01 feed generator that also emits the iTunes podcast tags."""

    def rss_attributes(self):
        """Return the <rss> attributes, declaring the atom and itunes namespaces."""
        attributes = {u"version": self._version}
        attributes[u"xmlns:atom"] = u"http://www.w3.org/2005/Atom"
        attributes[u'xmlns:itunes'] = u'http://www.itunes.com/dtds/podcast-1.0.dtd'
        return attributes

    def add_root_elements(self, handler):
        """Call the parent implementation, then append the channel-level iTunes tags."""
        super(iTunesPodcastsFeedGenerator, self).add_root_elements(handler)
        channel = self.feed

        handler.addQuickElement(u'itunes:subtitle', channel['subtitle'])
        handler.addQuickElement(u'itunes:author', channel['author_name'])

        # Attempt at an un-escaped summary: the description is marked safe and
        # then HTML-unescaped before being handed to the handler.
        channel_summary = unescape(mark_safe(channel['description']))
        handler.addQuickElement(u'itunes:summary', channel_summary)

        # The image URL goes into an href attribute of an otherwise empty tag;
        # that's from https://gitorious.org/podjango/podjango/commit/621857be0a3d7c44f1925c7daf471c38ea62c180?diffmode=sidebyside
        image_attrs = {'href': channel['iTunes_image_url']}
        handler.addQuickElement('itunes:image', '', image_attrs)

        handler.addQuickElement(u'itunes:explicit', channel['iTunes_explicit'])

        # <itunes:owner> wraps the owner's name and e-mail address.
        handler.startElement(u"itunes:owner", {})
        handler.addQuickElement(u'itunes:name', channel['iTunes_name'])
        handler.addQuickElement(u'itunes:email', channel['iTunes_email'])
        handler.endElement(u"itunes:owner")

        # @ToDo: add categories

    def add_item_elements(self, handler, item):
        """Call the parent implementation, then append the per-item iTunes tags."""
        super(iTunesPodcastsFeedGenerator, self).add_item_elements(handler, item)

        item_summary = unescape(item['summary'])
        handler.addQuickElement(u'itunes:summary', item_summary)
        handler.addQuickElement(u'itunes:explicit', item['explicit'])

        # The item image currently reuses the channel-level image URL.
        image_attrs = {'href': self.feed['iTunes_image_url']}
        handler.addQuickElement(u'itunes:image', '', image_attrs)

    # def __unicode__(self):
    #     return unicode(self.order_num)

class iTunesPodcastPost():
    """Adapter that exposes a track (called ``podcast`` here) as a feed item."""

    def __init__(self, podcast):  # note: podcast here = Track for me
        """Copy the fields the feed needs from *podcast*, filling in defaults."""
        self.id = podcast.id
        self.pub_date = podcast.pub_date
        self.title = podcast.title or "Track"

        # The summary is the HTML-unescaped description; both fall back to the
        # same placeholder text when the track has no description.
        track_description = podcast.description
        if track_description:
            self.summary = unescape(track_description)
        else:
            self.summary = "Cool thing"
        self.description = track_description or "Cool thing"

        self.enclosure_url = podcast.awe_url  # defined in models.py
        self.enclosure_length = podcast.size or 1
        self.enclosure_mime_type = u'audio/mpeg'  # @ToDo generalise once we have other types
        self.explicit = u'yes' if podcast.explicit else u'no'
        self.url = podcast.get_absolute_url  # stored as the bound method, not called

        self.iTunes_image_url = podcast.main_image_url

        # Two views of the duration with different placeholder defaults.
        track_duration = podcast.time_duration
        self.length = track_duration or 11
        self.duration = track_duration or "5:00"

        self.user_id = podcast.user_id
        self.user = User.objects.get(id=podcast.user_id)
        self.slug = podcast.slug

        # see https://docs.djangoproject.com/en/1.7/_modules/django/utils/feedgenerator/#SyndicationFeed.add_root_elements

    def __unicode__(self):
        return self.title

    def get_absolute_url(self):
        """Return the URL of the track detail page for this item."""
        url_args = [self.user.username, self.slug]
        return reverse('track_detail', args=url_args)


class iTunesPodcastsFeed(Feed):
    """
    A feed of podcasts for iTunes and other compatible podcatchers.
    Based on https://github.com/CaptainHayashi/django-lass-uryplayer/blob/master/uryplayer/feeds.py

    The feed object is a Playlist selected by the ``username`` and
    ``playlist_slug`` URL parameters; the playlist's tracks are the items.
    """

    def get_object(self, request, username, playlist_slug):
        """Return the playlist with this slug owned by this user (404 otherwise)."""
        self.request = request
        user = get_object_or_404(User, username=username)
        return get_object_or_404(Playlist, user_id=user.id, slug=playlist_slug)

    def link(self, playlist):
        """Return the URL of the playlist's index page."""
        user = User.objects.get(id=playlist.user_id)
        return reverse('playlist_index', args=[user.username, playlist.slug])

    def title(self, playlist):
        """Return the playlist title as the feed title."""
        return playlist.title

    def description(self, playlist):
        """Return the playlist description, or a placeholder when it is empty."""
        if playlist.description:
            # Not wrapped in CDATA here: the "<" of the wrapper would itself
            # be escaped by the feed handler.
            return playlist.description
        return "[Auto text] The creator has not written a description."

    def iTunes_image_url(self, obj):
        """Return the playlist image URL as text, or a default icon URL."""
        if obj.main_image_url:
            # Coerced to text; passing the raw attribute raised
            # "TypeError: coercing to Unicode: need string or buffer".
            return unicode(obj.main_image_url)
        return u'https://dl.dropboxusercontent.com/u/16441973/publicstatic/img/playlist-icon.png'

    # modified from https://github.com/aneumeier/feeds/blob/master/feeds/rss.py
    def author_name(self, obj):  # obj = playlist
        """
        Return the author for this feed.
        The feed is in `obj`, provided by `get_object`
        """
        if obj.author:
            return u"%s" % obj.author
        return 'Playlist created by %s' % (obj.user.username)

    def subtitle(self, obj):  # obj = playlist
        """
        Return the subtitle for this feed.
        The feed is in `obj`, provided by `get_object`
        """
        if obj.subtitle:
            # Previously returned obj.author here (copy-paste from author_name).
            return u"%s" % obj.subtitle
        return '%s created in 2015' % (obj.title)

    # @ToDo: finish adapting rest of this from the hard-coded URY values to actual values for my implementation

    iTunes_name = u'Hard-coded iTunes name for now'
    iTunes_email = u'[email protected]'
    # @ToDo: make dynamic, not hard-coded

    iTunes_explicit = u'no'
    feed_type = iTunesPodcastsFeedGenerator
    feed_copyright = "Copyright 1967-%s University Radio York" % datetime.date.today().year

    def feed_extra_kwargs(self, playlist):
        """Return the extra channel-level values read by the feed generator."""
        # @ToDo: replace the placeholder with Awesound logo
        image_url = playlist.main_image_url or (
            "https://dl.dropboxusercontent.com/u/16441973/publicstatic/img/rss_playlist_icon_placeholder.png"
        )
        return {
            'iTunes_name': self.iTunes_name,
            'iTunes_email': self.iTunes_email,
            'iTunes_image_url': image_url,
            'iTunes_explicit': self.iTunes_explicit,
        }

    def items(self, playlist):
        """
        Returns a list of items to publish in this feed.

        Tracks are ordered newest first by publication date, with creation
        time as the tie-breaker, and capped at ITEMS_PER_FEED.
        """
        # Both keys go in ONE order_by() call: a second chained order_by()
        # replaces the first instead of adding to it.
        tracks = playlist.tracks.all().order_by('-pub_date', '-created_at')[:ITEMS_PER_FEED]
        return [iTunesPodcastPost(track) for track in tracks]

    def item_extra_kwargs(self, item):
        """Return the extra per-item values read by the feed generator."""
        return {
            'summary': unescape(mark_safe(item.description)),
            'explicit': item.explicit,
            'iTunes_image_url': item.iTunes_image_url,
        }

    # MarkAdded
    def item_link(self, item):
        """Return the track detail URL, or a marker string for user-less tracks."""
        if item.user_id:
            # we have a normal track created by a user
            return reverse('track_detail', args=[item.user.username, item.slug])
        # we have a funny track without a user, e.g., created via command line
        return 'Exception:TrackWithoutUser'

    def item_pubdate(self, item):
        return item.pub_date

    def item_enclosure_url(self, item):
        return item.enclosure_url

    def item_enclosure_length(self, item):
        # Uses the item's ``length`` attribute rather than ``enclosure_length``.
        return item.length

    def item_enclosure_mime_type(self, item):
        return 'audio/mpeg'  # @ToDo: make dynamic

    def item_description(self, item):
        """Return the HTML-unescaped item description, or a placeholder."""
        if item.description:
            return unescape(mark_safe(item.description))
        return "User has not written a description. This is an automatic message"


# current_site = Site.objects.get_current()
# Hard-coded site URL, used in place of the Sites framework lookup above.
current_site = 'https://greatsite.com'
# Module-level feed instance that choose_feed() dispatches playlist URLs to.
iTunes_feed = iTunesPodcastsFeed()



### the above will be called if both username and playlist_slug are detected in the url
### there are two older methods to handle other situations

class AllTracks(Feed):
    #
    # working old method, not relevant to html escaping question
    #


class UserTracks(AllTracks):
    #
    # working old method, not relevant to my question
    #

all_tracks = AllTracks()
user_tracks = UserTracks()

### note, both of those are also subject to full html escaping



def choose_feed(request, *args, **kwargs):
    """
    Pick the playlist feed, the user feed or the global feed depending on
    which of the ``username`` / ``playlist_slug`` parameters the URL contains.

    When a playlist feed is requested, a Slack notification is sent first.
    Returns whatever the selected feed view returns for this request.
    """
    if 'username' not in kwargs:
        feed = all_tracks
    elif 'playlist_slug' not in kwargs:
        feed = user_tracks
    else:
        feed = iTunes_feed
        if request.user:
            # Look the playlist up by owner AND slug, like the feed's own
            # get_object(): a slug-only .get() can raise DoesNotExist or
            # MultipleObjectsReturned and turn the request into a 500.
            playlist = Playlist.objects.filter(
                user__username=kwargs['username'],
                slug=kwargs['playlist_slug'],
            ).first()
            # No match: skip the notification and let the feed answer 404.
            if playlist is not None:
                slack_message('slackmessages/playlist_feed.slack', {  # django_slack/slackmessages/
                    'playlist': playlist,
                    'user': request.user,
                })

    return feed(request, *args, **kwargs)

Upvotes: 5

Views: 2684

Answers (6)

Jolaiya Emmanuel
Jolaiya Emmanuel

Reputation: 31

What worked for me was improving on @RNC's answer using the code below:

from django.utils.feedgenerator import Rss201rev2Feed
from django.utils.xmlutils import SimplerXMLGenerator
from django.contrib.syndication.views import Feed

#override django's simplerXMLGenerator class

class CustomXMLGenerator(SimplerXMLGenerator):
    """Override defaults django XML Generator to allow writing contents with CDATA prefix"""

    def addQuickElement(self, name, contents=None, attrs=None):
        "Convenience method for adding an element with no children"
        if attrs is None:
            attrs = {}
        self.startElement(name, attrs)
        if contents is not None:
            if contents.startswith("<![CDATA["):
                # ignorableWhitespace() writes its argument without escaping,
                # so the CDATA markers reach the output intact.
                self.ignorableWhitespace(contents)
            else:
                # Everything else goes through the normal, escaping path.
                self.characters(contents)
        self.endElement(name)


class RSSFeedMixin(Rss201rev2Feed):
    """The wrapper class for the base RSSFeed class"""

    def write(self, outfile, encoding):
        """Serialise the feed to *outfile* using CustomXMLGenerator as the handler."""
        # point to the custom class
        handler = CustomXMLGenerator(outfile, encoding)
        handler.startDocument()
        handler.startElement("rss", self.rss_attributes())
        handler.startElement("channel", self.root_attributes())
        self.add_root_elements(handler)
        self.write_items(handler)
        self.endChannelElement(handler)
        handler.endElement("rss")

class GlobalFeed(Feed):

    def wrap_with_cdata(self, text):
        """Utility method to wrap a text in CDATA block"""
        # A literal "]]>" inside the text would close the section early, so
        # split it across two adjacent CDATA sections.
        safe_text = text.replace("]]>", "]]]]><![CDATA[>")
        return "<![CDATA[ " + safe_text + " ]]>"

    ...

    def item_author_name(self, item: Article) -> str:
        """
        Takes an item, as returned by items(), and returns the item's
        author's name as a normal Python string.
        """
        # wrap with the utility method
        return self.wrap_with_cdata(item.author.fullname)

I hope it helps.

Upvotes: 0

RNC
RNC

Reputation: 107

This is still the number one hit on google for this issue, so here's the fully fleshed out answer based on Nick's reply here:

from xml.sax.saxutils import XMLGenerator

class MySimplerXMLGenerator(XMLGenerator):
    """XMLGenerator with addQuickElement() that writes CDATA blocks unescaped."""

    def addQuickElement(self, name, contents=None, attrs=None):
        "Convenience method for adding an element with no children"
        if attrs is None:
            attrs = {}
        self.startElement(name, attrs)
        if contents is not None:
            if contents.startswith('<![CDATA['):
                self.unescaped_characters(contents)
            else:
                self.characters(contents)
        self.endElement(name)

    def _reject_control_chars(self, content):
        """Raise UnserializableContentError if *content* has control characters."""
        # Imported locally: the snippet's only module-level import is XMLGenerator.
        import re

        if content and re.search(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]', content):
            # Fail loudly when content has control chars (unsupported in XML 1.0)
            # See https://www.w3.org/International/questions/qa-controls
            from django.utils.xmlutils import UnserializableContentError
            raise UnserializableContentError("Control characters are not supported in XML 1.0")

    def characters(self, content):
        """Write *content* as escaped character data."""
        self._reject_control_chars(content)
        XMLGenerator.characters(self, content)

    def unescaped_characters(self, content):
        """Write *content* verbatim, without XML escaping."""
        self._reject_control_chars(content)
        # ignorableWhitespace() writes its argument without escaping it.
        XMLGenerator.ignorableWhitespace(self, content)

    def startElement(self, name, attrs):
        # Sort attrs for a deterministic output.
        sorted_attrs = dict(sorted(attrs.items())) if attrs else attrs
        super().startElement(name, sorted_attrs)

The above is the same as Django's handler, but with the additional "unescaped_characters" method added and a conditional check on 'contents' to see if it begins with '<![CDATA['. As you can see, unescaped_characters calls the "ignorableWhitespace" method of saxutils' XMLGenerator, which is identical to its "characters" method except that it does not escape anything.

From this point, you can add a new "write()" method to your Feed class, which should be done as noted in the Django code's comments to override handler methods, which provides your modified handler, like so, identical to the original but for the replaced handler class definition:

class iTunesPodcastsFeedGenerator(Rss201rev2Feed):
    """Rss201rev2Feed whose write() uses MySimplerXMLGenerator as its handler."""

    def write(self, outfile, encoding):
        """Serialise the feed to *outfile* in the given *encoding*."""
        # The handler class is the only thing this override swaps in.
        handler = MySimplerXMLGenerator(outfile, encoding)
        handler.startDocument()
        # Open <rss> and <channel>, emit channel metadata, then the items.
        handler.startElement("rss", self.rss_attributes())
        handler.startElement("channel", self.root_attributes())
        self.add_root_elements(handler)
        self.write_items(handler)
        self.endChannelElement(handler)
        handler.endElement("rss")

Now you've got a new handler that will selectively not-escape strings that begin with '<![CDATA[' so all you have to do is manually pre/post-pend those tags to your HTML fields and escape those strings by other means, like so if you have installed and want to use bleach for example...

 class iTunesPodcastsFeedGenerator(Rss201rev2Feed):

    def add_root_elements(self, handler):
        # Emit <description> as a CDATA block whose HTML has been cleaned by
        # bleach, keeping only the <p>, <ul>, <li> and <a> tags.
        # NOTE(review): there is no super() call here, so presumably the
        # parent's other channel elements are not written by this method as
        # shown - confirm before using it as-is. `bleach` must be imported.
        handler.addQuickElement("description", '<![CDATA[' + bleach.clean(self.feed['description'], strip=True, tags=['p', 'ul', 'li', 'a']) + ']]>')

As of this writing Apple (and the bulk of other podcast directories) allow paragraphs, unordered lists, and links in descriptions, so the above is a podcast feed example that should work fine.

Upvotes: 2

You can replace the code:

    # A CDATA section is terminated by "]]>", not "]]".
    contents = '<![CDATA[ contents ]]>'
    xml.addQuickElement('element', contents=contents)

with:

    contents = 'contents'
    xml.startElement('element', {})
    # _write() emits the text without escaping; the CDATA section must be
    # closed with "]]>" or the resulting XML is not well-formed.
    xml._write(f'<![CDATA[ {contents} ]]>')
    xml.endElement('element')

Upvotes: 2

T. Lacy
T. Lacy

Reputation: 11

Here is how I was able to get CDATA tags into my output without them being escaped. I created the AppleGenerator class, which inherits from the SimplerXMLGenerator that Rss201rev2Feed uses by default. I then overrode the write function of Rss201rev2Feed so that it uses the new AppleGenerator. Finally, in AppleGenerator I overrode the characters and addQuickElement functions to take an optional argument that disables escaping when you want.

from django.utils.xmlutils import SimplerXMLGenerator
from xml.sax.saxutils import escape

class AppleGenerator(SimplerXMLGenerator):
    """SimplerXMLGenerator whose text-writing methods can skip XML escaping."""

    def addQuickElement(self, name, contents=None, attrs=None, escape_char=True):
        "Convenience method for adding an element with no children"
        if attrs is None:
            attrs = {}
        self.startElement(name, attrs)
        if contents is not None:
            self.characters(contents, escape_char=escape_char)
        self.endElement(name)

    def characters(self, content, escape_char=True):
        """Write *content*; pass ``escape_char=False`` to write it verbatim."""
        if escape_char:
            # Delegate the default path to the parent so that its own
            # validation and escaping stay in force instead of being
            # re-implemented (and partly lost) here.
            super().characters(content)
        elif content:
            # Same steps as the stock implementation, minus the escape() call.
            self._finish_pending_start_element()
            if not isinstance(content, str):
                content = str(content, self._encoding)
            self._write(content)

class ApplePodcastsFeedGenerator(Rss201rev2Feed):
    """Rss201rev2Feed whose write() uses AppleGenerator as its handler."""

    def write(self, outfile, encoding):
        """Serialise the feed to *outfile* in the given *encoding*."""
        # The handler class is the only thing this override swaps in.
        handler = AppleGenerator(outfile, encoding)
        handler.startDocument()
        # Open <rss> and <channel>, emit channel metadata, then the items.
        handler.startElement("rss", self.rss_attributes())
        handler.startElement("channel", self.root_attributes())
        self.add_root_elements(handler)
        self.write_items(handler)
        self.endChannelElement(handler)
        handler.endElement("rss")

The overrides are basically exactly what the function did before but added a way to not escape them. Here is the source code for saxutils:

https://github.com/python/cpython/blob/3.7/Lib/xml/sax/saxutils.py

Here is the source code for django SimplerXMLGenerator: https://github.com/django/django/blob/master/django/utils/xmlutils.py

Upvotes: 1

ascripter
ascripter

Reputation: 6223

I was facing the same problem in Django 1.10 and traced it back to the point where all the escaping occurs. django.utils.feedgenerator.RssFeed.write() writes items using django.utils.xmlutils.SimplerXMLGenerator as a handler. This handler derives from xml.sax.saxutils.XMLGenerator, which has a characters method that escapes all content. So, to stop everything you put into the feed from being escaped, first override the XML handler:

from django.utils.xmlutils import SimplerXMLGenerator
class UnescapedXMLGenerator(SimplerXMLGenerator):
    """SimplerXMLGenerator that writes character data without XML escaping."""

    def characters(self, content):
        """
        code is mainly copy-paste from Django 1.10 SimplerXMLGenerator.characters

        Writes *content* verbatim; raises UnserializableContentError when it
        contains control characters.
        """
        # Imported locally: the snippet's module-level import only brings in
        # SimplerXMLGenerator.
        import re
        from django.utils.xmlutils import UnserializableContentError

        if not content:
            return
        if re.search(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]', content):
            # Fail loudly when content has control chars (unsupported in XML 1.0)
            # See http://www.w3.org/International/questions/qa-controls
            raise UnserializableContentError("Control characters are not supported in XML 1.0")

        # next part from sax.saxutils.XMLGenerator, but without escaping.
        # ``bytes`` is ``str`` on Python 2 and the byte type on Python 3, so
        # this decodes byte strings on both without needing ``unicode``.
        if isinstance(content, bytes):
            content = content.decode(self._encoding)
        self._write(content)

The next step is to override your feed's write method to use the new handler. Here is an RSS 2.01 feed, for example:

from django.utils import feedgenerator
class Rss201rev2FeedUnescaped(feedgenerator.Rss201rev2Feed):
    """
    Rss 2.01 Feed that doesn't escape content
    """
    def write(self, outfile, encoding):
        """
        code is mainly copy-paste from django.utils.feedgenerator.Rss201rev2Feed
        except that the handler is set to UnescapedXMLGenerator
        """
        # The handler class is the only thing this override swaps in.
        handler = UnescapedXMLGenerator(outfile, encoding)
        handler.startDocument()
        # Open <rss> and <channel>, emit channel metadata, then the items.
        handler.startElement("rss", self.rss_attributes())
        handler.startElement("channel", self.root_attributes())
        self.add_root_elements(handler)
        self.write_items(handler)
        self.endChannelElement(handler)
        handler.endElement("rss")

Upvotes: 0

user764357
user764357

Reputation:

So, based on the documentation, handler is an XMLGenerator, and calling addQuickElement assumes that all of the content is character data. That is why it's being escaped.

What you are probably going to have to do is override SyndicationFeed.add_item_elements(self, handler, item), insert the <a> elements using addQuickElement, and add the itunes:summary tags using startElement and endElement.

class iTunesFeed(Rss201rev2Feed):
    """Rss201rev2Feed that adds an <itunes:summary> with a nested <a> element."""

    def add_item_elements(self, handler, item):
        """Write the standard item elements, then the itunes:summary element."""
        # Call the item-level parent method (not add_root_elements).
        super(iTunesFeed, self).add_item_elements(handler, item)
        # startElement() requires an attrs mapping, even when it is empty.
        handler.startElement('itunes:summary', {})
        handler.characters('Link to ')
        handler.addQuickElement('a', 'the website', {'href': 'http://www.website.com'})
        handler.endElement('itunes:summary')

This might not be 100% functional, but should get you pretty close.

Upvotes: 1

Related Questions