Remi Castonguay
Remi Castonguay

Reputation: 69

Looping through URLS in Python and generating feed with Feedgen

I'm pulling together a bunch of RSS feeds and grabbing items from them based on matching keywords. In the process I'm pulling certain fields--title, description, etc. but I'd like to pull the title of the source (title of the actual RSS feed).

import feedparser
from feedgen.feed import FeedGenerator
from dateutil import parser
from dateutil import tz
from pytz import timezone

# Define a custom time zone mapping for EDT
tzinfos = {"PST": tz.gettz("America/Los_Angeles"), "EDT": tz.gettz("America/New_York")}

feeds = [
    'https://vancouversun.com/feed/?x=1',
    'https://rss.cbc.ca/lineup/canada-britishcolumbia.xml',
    'https://rss.cbc.ca/lineup/topstories.xml',
    # Add more feeds as needed
]

keywords = ['autism', 'autistic', 'autisme', 'autistique', 'asperger', '#autism', 'neurodiversity']

# Create a new feed generator object.
fg = FeedGenerator()
fg.title('Autism News')
fg.link(href='https://rcastonguay.github.io/autismfeeds/index.xml', rel='self')
fg.description('An RSS feed filtered by autism keywords.')

# Iterate over feeds
for feed_url in feeds:
    d = feedparser.parse(feed_url)

    # Retrieve the source title
    if d.feed.get('title'):
        source_title = d.feed.title  # Use d.feed.title to get the feed title
    else:
        source_title = feed_url  # If feed title is not available, use the feed URL as the source title

    # Add source title as a regular entry in the feed
    fe = fg.add_entry()
    fe.title(f"{source_title}\n")

    # Iterate over entries for each feed and add them to the feed
    for entry in d.entries:
        if any(keyword in entry.title.lower() or keyword in entry.summary.lower() or keyword in entry.description.lower() for keyword in keywords):
            fe = fg.add_entry()
            fe.title(entry.title)
            fe.link(href=entry.link)
            fe.description(entry.description)
            date = parser.parse(entry.published, fuzzy=True, tzinfos=tzinfos)
            if date.tzinfo is None or date.tzinfo.utcoffset(date) is None:
                date = date.replace(tzinfo=tz.gettz('UTC')).astimezone(tz.gettz('PST'))
            else:
                date = date.astimezone(tz.gettz('PST'))
            fe.pubDate(date)

# Generate the RSS feed XML file.
rssfeed = fg.rss_str(pretty=True)
print(rssfeed.decode())

The output I get makes it clear that the code only executes the first loop and ignores the second one. Any advice is appreciated. Here's the XML output:

<?xml version='1.0' encoding='UTF-8'?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
  <channel>
    <title>Autism News</title>
    <link>https://rcastonguay.github.io/autismfeeds/index.xml</link>
    <description>An RSS feed filtered by autism keywords.</description>
    <atom:link href="https://rcastonguay.github.io/autismfeeds/index.xml" rel="self"/>
    <docs>http://www.rssboard.org/rss-specification</docs>
    <generator>python-feedgen</generator>
    <lastBuildDate>Thu, 23 Nov 2023 20:36:23 +0000</lastBuildDate>
    <item>
      <title>CBC | Top Stories News
</title>
    </item>
    <item>
      <title>CBC | British Columbia News
</title>
    </item>
    <item>
      <title>Vancouver Sun
</title>
    </item>
  </channel>
</rss>

Upvotes: 0

Views: 298

Answers (0)

Related Questions