Extracting links and title from several pages

Question

I'm trying to build my own RSS feed with download links

but the rss feed provides only the link to the whole season.

I'm taking that link to the whole season and want to extract the specific download link for the episode itself (uploaded/ul).

This is what I've got so far. Is there any way to get this working?

import feedparser, urllib2, re
from BeautifulSoup import BeautifulSoup

episodenliste = ['Game.of.Thrones','Arrow']

episode_link = []
episode_title = []
d = feedparser.parse('http://serienjunkies.org/xml/feeds/episoden.xml')
for post in d.entries:
    if ('DEUTSCH' in post.title) and any (word in post.title for word in episodenliste) and ('720p' in post.title):
        post.title = post.title.replace('[DEUTSCH] ','')
        post.title = re.sub(r'(.*S\d+E\d+)(.*)',r'\1' ,post.title)
        episode_link.append(post.link)
        episode_title.append(post.title)
        print post.title + ": " + post.link + "
"

for search_title in episode_title:
    for get_dlLink in episode_link:
        page_ = urllib2.Request(get_dlLink)
        page = urllib2.urlopen(page_).read()
        soup = BeautifulSoup(page)
        print search_title
        title = soup.find('strong', text=search_title)
        if title is not None:
            print title
  #          link = title.parent
   #         links = link.find_all('a')
    #        print links
    #        for link2 in links:
     #           url = link2['href']
      #          print url
       #         pattern = 'http://download\.serienjunkies\.org.*%s_.*\.html' % ul
        #        if re.match(pattern, url):
         #           print url

As far as I can tell, it works up to the point where I'm searching the page for the title.

It gets to the pages parsed from the RSS feed, but it doesn't find the title.

my idea was:

First find the title and then extract the 'children'/links from it.

any help is appreciated thanks in advance

BlackJack · Accepted Answer

Without JavaScript enabled the HTML looks quite different:

Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS

Download: hier | filemonkey.in

Download: hier | share-online.biz

Download: hier | uploaded.to

As the title from the RSS feed without the [DEUTSCH] prefix is the first text in the paragraph on the page for the series, it can be the basis for searching and extracting the entry. Two elements up there is the `<p>` tag containing all the data for the episode. And those are links, each followed by the name of the file hoster.

import feedparser
import requests
from bs4 import BeautifulSoup

# URL of the serienjunkies.org episode feed that main() passes to process_feed().
FEED_URL = 'http://serienjunkies.org/xml/feeds/episoden.xml'


def is_interesting_entry(entry, title_prefix, series_names):
    """Tell whether a feed entry belongs to one of the wanted series.

    An entry qualifies when its ``title`` begins with `title_prefix` and
    contains at least one of the strings in `series_names`.

    Returns ``True`` or ``False``.
    """
    title = entry.title
    if not title.startswith(title_prefix):
        return False
    for name in series_names:
        if name in title:
            return True
    return False


def process_entry(entry, title_prefix):
    """Turn a feed entry into a ``(title, link)`` pair.

    The returned title is ``entry.title`` with the leading `title_prefix`
    cut off; the link is ``entry.link`` unchanged.

    Raises ``ValueError`` if the entry's title does not start with
    `title_prefix`.
    """
    full_title = entry.title
    if full_title.startswith(title_prefix):
        bare_title = full_title[len(title_prefix):]
        return (bare_title, entry.link)
    message = 'expected prefix {0!r} not found in {1!r}'.format(
        title_prefix, full_title
    )
    raise ValueError(message)


def process_feed(feed_url, title_prefix, series_names):
    """Lazily yield ``(title, link)`` pairs for the wanted feed entries.

    The feed at `feed_url` is parsed right away; the filtering with
    ``is_interesting_entry`` and the conversion with ``process_entry``
    happen as the returned generator is consumed.
    """
    entries = feedparser.parse(feed_url).entries
    wanted = (
        candidate
        for candidate in entries
        if is_interesting_entry(candidate, title_prefix, series_names)
    )
    return (process_entry(candidate, title_prefix) for candidate in wanted)


def get_series_soup(url, cache=dict()):
    """Return the parsed HTML of the page at `url`, fetching it at most once.

    `cache` maps URLs to already parsed pages.  The default dictionary is
    created once at definition time and deliberately shared between calls,
    so a page that holds several episodes is only downloaded and parsed
    once.  Pass your own dictionary to use a separate cache.
    """
    if url in cache:
        return cache[url]
    # Name the parser explicitly.  Without it Beautiful Soup picks whichever
    # parser library happens to be installed, so the resulting tree can
    # differ between machines, and bs4 emits a warning.
    result = BeautifulSoup(requests.get(url).text, 'html.parser')
    cache[url] = result
    return result


def get_download_urls(soup, title):
    """Map hoster names to download URLs for the episode called `title`.

    The text node equal to `title` is looked up in `soup`; the element two
    levels above it is searched for ``<a>`` tags.  For each link, the text
    that directly follows it (with surrounding ``|`` and spaces stripped)
    becomes the key and the link's ``href`` the value.

    Returns an empty dictionary when the title is not found.
    """
    urls_by_hoster = {}
    title_node = soup.find(text=title)
    if title_node:
        episode_tag = title_node.parent.parent
        for link in episode_tag('a'):
            hoster = link.next_sibling.strip('| ')
            urls_by_hoster[hoster] = link['href']
    return urls_by_hoster


def main():
    series_names = ['Game.of.Thrones', 'Arrow']
    for title, url in process_feed(FEED_URL, '[DEUTSCH] ', series_names):
        print
        print title
        hoster2url = get_download_urls(get_series_soup(url), title)
        if hoster2url:
            for hoster, download_url in sorted(hoster2url.iteritems()):
                print '{0:>20s}: {1}'.format(hoster, download_url)
        else:
            print '  --- No downloads ---'


# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Extracting links and title from several pages

Answers (2)

Related Questions