Reputation: 329
I'm trying to build my own RSS feed with download links,
but the source RSS feed provides only the link to the whole season.
I'm taking that link to the whole season and want to extract the specific download link for the episode itself (uploaded/ul).
This is what I've got so far. Is there any way to get this working?
import feedparser, urllib2, re
from BeautifulSoup import BeautifulSoup
# Names of the wanted series; matched as substrings of the feed entry titles.
episodenliste = ['Game.of.Thrones','Arrow']
# Filled in step with each other: one link and one shortened title per
# matching feed entry.
episode_link = []
episode_title = []
d = feedparser.parse('http://serienjunkies.org/xml/feeds/episoden.xml')
for post in d.entries:
    # Keep only entries that mention DEUTSCH, 720p and one of the wanted series.
    if ('DEUTSCH' in post.title) and any (word in post.title for word in episodenliste) and ('720p' in post.title):
        post.title = post.title.replace('[DEUTSCH] ','')
        # Keep the title only up to and including the SxxEyy marker; whatever
        # follows the marker is dropped.
        post.title = re.sub(r'(.*S\d+E\d+)(.*)',r'\1' ,post.title)
        episode_link.append(post.link)
        episode_title.append(post.title)
        print post.title + ": " + post.link + "\n"

# NOTE: the loops are nested, so every collected title is searched for on
# every collected page, and each page is downloaded once per title.
for search_title in episode_title:
    for get_dlLink in episode_link:
        page_ = urllib2.Request(get_dlLink)
        page = urllib2.urlopen(page_).read()
        soup = BeautifulSoup(page)
        print search_title
        # NOTE(review): search_title was shortened to end at SxxEyy above,
        # while the page presumably shows the full release name -- confirm
        # whether this text match can succeed at all.
        title = soup.find('strong', text=search_title)
        if title is not None:
            print title
            # Unfinished idea: walk from the title to its parent and collect
            # the download links found there.
            # link = title.parent
            # links = link.find_all('a')
            # print links
            # for link2 in links:
            # url = link2['href']
            # print url
            # pattern = 'http:\/\/download\.serienjunkies\.org.*%s_.*\.html' % ul
            # if re.match(pattern, url):
            # print url
As far as I can tell, it works up to the point where I'm searching the page for the title.
It gets to the pages parsed from the RSS feed, but it doesn't find the title.
My idea was:
first find the title and then extract the 'children'/links from it.
Any help is appreciated. Thanks in advance.
Upvotes: 0
Views: 347
Reputation: 4679
Without JavaScript enabled, the HTML looks quite different:
<p><strong>Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS</strong><br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-55bc328624d93658/fm_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | filemonkey.in<br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-25023a87144345f9/so_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | share-online.biz<br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-3e8ea978a2cf7bda/ul_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | uploaded.to</p>
Because the title from the RSS feed, without the [DEUTSCH] prefix, is the first text in the episode's paragraph on the page for the series, it can serve as the basis for searching and extracting the entry.
Two elements up from that text is the <p> tag containing all the data for the episode:
the download links, each followed by the name of the file hoster.
import feedparser
import requests
from bs4 import BeautifulSoup
FEED_URL = 'http://serienjunkies.org/xml/feeds/episoden.xml'
def is_interesting_entry(entry, title_prefix, series_names):
    """Tell whether a feed entry belongs to one of the wanted series.

    An entry qualifies when its title starts with ``title_prefix`` and
    contains at least one of the ``series_names`` as a substring.

    :param entry: object with a ``title`` string attribute.
    :param title_prefix: prefix the title has to start with.
    :param series_names: iterable of series names to look for in the title.
    :return: ``True`` if both conditions hold, ``False`` otherwise.
    """
    entry_title = entry.title
    if not entry_title.startswith(title_prefix):
        return False
    for series_name in series_names:
        if series_name in entry_title:
            return True
    return False
def process_entry(entry, title_prefix):
    """Reduce a feed entry to a ``(title, link)`` pair without the prefix.

    :param entry: object with ``title`` and ``link`` attributes.
    :param title_prefix: prefix to cut off the front of the title.
    :return: tuple of the title with ``title_prefix`` removed and the
        entry's link.
    :raises ValueError: if the title does not start with ``title_prefix``.
    """
    if entry.title.startswith(title_prefix):
        bare_title = entry.title[len(title_prefix):]
        return (bare_title, entry.link)

    message = 'expected prefix {0!r} not found in {1!r}'.format(
        title_prefix, entry.title
    )
    raise ValueError(message)
def process_feed(feed_url, title_prefix, series_names):
    """Yield ``(title, link)`` pairs for the interesting entries of a feed.

    The feed is parsed as soon as this function is called; filtering and
    conversion of the entries happen lazily while the result is iterated.

    :param feed_url: location of the feed, handed to ``feedparser.parse``.
    :param title_prefix: prefix an entry title must have; it is removed
        from the titles in the result.
    :param series_names: iterable of series names to filter the entries by.
    :return: generator of ``(title, link)`` tuples.
    """
    all_entries = feedparser.parse(feed_url).entries
    wanted_entries = (
        candidate
        for candidate in all_entries
        if is_interesting_entry(candidate, title_prefix, series_names)
    )
    return (
        process_entry(wanted, title_prefix) for wanted in wanted_entries
    )
def get_series_soup(url, cache=dict()):
    """Return the parsed page for ``url``, downloading it at most once.

    The default ``cache`` dictionary is created once, when the function is
    defined, and is therefore shared by all calls that do not pass their
    own: it is used here on purpose to remember already parsed pages.

    :param url: address of the page to fetch and parse.
    :param cache: mapping from URL to parsed page; it is filled in place.
    :return: the cached value for ``url``, i.e. the ``BeautifulSoup``
        object built from the downloaded page text.
    """
    if url not in cache:
        # Only a cache miss costs a download and a parse.
        cache[url] = BeautifulSoup(requests.get(url).text)
    return cache[url]
def get_download_urls(soup, title):
    """Map file hoster names to the download URLs listed for ``title``.

    The text node equal to ``title`` is looked up in ``soup``. Every ``a``
    tag below the element two levels above that node contributes one item:
    the text following the tag, stripped of ``'|'`` and spaces, becomes the
    key and the tag's ``href`` the value.

    :param soup: parsed page to search.
    :param title: exact text identifying the episode on the page.
    :return: dictionary of hoster name to URL; empty if ``title`` is not
        found on the page.
    """
    title_node = soup.find(text=title)
    if not title_node:
        return dict()

    # Text node -> enclosing tag -> tag holding the whole episode entry.
    episode_block = title_node.parent.parent
    hoster2url = dict()
    for anchor in episode_block('a'):
        hoster_name = anchor.next_sibling.strip('| ')
        hoster2url[hoster_name] = anchor['href']
    return hoster2url
def main():
    """Print the download links of all new episodes of the wanted series.

    For every interesting feed entry, the series page is fetched (or taken
    from the cache) and the episode's links are printed sorted by hoster
    name, one per line. Entries without any link are reported as such.

    Only constructs that behave the same under Python 2 and Python 3 are
    used, so the output is unchanged on Python 2 and the function also
    runs on Python 3.
    """
    series_names = ['Game.of.Thrones', 'Arrow']
    for title, url in process_feed(FEED_URL, '[DEUTSCH] ', series_names):
        # An explicit empty string gives one blank line on both Python 2
        # (print statement) and Python 3 (print function).
        print('')
        print(title)
        hoster2url = get_download_urls(get_series_soup(url), title)
        if hoster2url:
            # items() exists on both major versions; iteritems() does not.
            for hoster, download_url in sorted(hoster2url.items()):
                print('{0:>20s}: {1}'.format(hoster, download_url))
        else:
            print(' --- No downloads ---')


if __name__ == '__main__':
    main()
Upvotes: 1
Reputation: 329
<item>
<title>[DEUTSCH] Arrow.S02E14.Gegen.die.Zeit.GERMAN.DUBBED.720p.HDTV.x264-ZZGtv</title>
<description>[DEUTSCH] Arrow.S02E14.Gegen.die.Zeit.GERMAN.DUBBED.720p.HDTV.x264-ZZGtv</description>
<pubDate>Fri, 18 Jul 2014 00:00:00 +0200</pubDate>
<link>http://serienjunkies.org/arrow/arrow-staffel-2-hdtvweb-dl-sd720p1080p/</link>
</item>
Sorry, I didn't know that.
<p><strong>Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS</strong><br><div id="download_mirrors" class="download_main"><strong>Download:</strong> <a href="http://download.serienjunkies.org/f-3e8ea978a2cf7bda/ul_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html" target="_blank" style="font-size:14px;font-weight:bold;">uploaded.net</a> <span style="font-size:10px">(best speed) </span><br><strong style="margin-left:14px">Mirrors:</strong> <img src="http://serienjunkies.org/media/img/stream/application_cascade.png" style="cursor:pointer;" title="Mirrors zeigen" onclick="toggle("Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS");"><div id="Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS" style="display: none;">
<strong style="margin-left:20px">Mirror:</strong> <a href="http://download.serienjunkies.org/f-55bc328624d93658/fm_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html" target="_blank">filemonkey.in</a><br>
<strong style="margin-left:20px">Mirror:</strong> <a href="http://download.serienjunkies.org/f-25023a87144345f9/so_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html" target="_blank">share-online.biz</a><br>
</div><div><strong style="margin-left:18px">Usenet:</strong> <a href="http://www.firstload.com/affiliate/log.php?log=50393&fn=Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS" target="_blank">Highspeed Mirror</a></div></div></p>
Upvotes: 0