Reputation: 349
I am trying to fetch all movie/show Netflix links from http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html, together with the country names listed for each title. For example, from the page source I want http://www.netflix.com/WiMovie/80048948, USA, etc. I have written the following, but it returns all links instead of only the Netflix ones I want. I am a little new to regex. How should I go about this?
from BeautifulSoup import BeautifulSoup
import urllib2
import re
# Download the listing page and parse it.
html_page = urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html')
soup = BeautifulSoup(html_page)

# Print the href of every anchor on the page (no filtering is active).
for link in soup.findAll('a'):
    ##reqlink = re.search('netflix',link.get('href'))
    ##if reqlink:
    print(link.get('href'))

# Print the alt text of every image whose alt is exactly 'UK' or 'USA'.
for img in soup.findAll('img'):
    alt = img.get('alt')
    if alt in ('UK', 'USA'):
        print(alt)
If I uncomment the lines above, I get the following error:
TypeError: expected string or buffer
What should I do?
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import requests
url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url, stream=True)
count = 1
title = []
country = []
for line in r.iter_lines():
    # Only the 746th line of the response body is parsed
    # (presumably the line holding the whole listing table - confirm).
    if count == 746:
        # The page is already being streamed through `r`; the extra
        # urllib2.urlopen() call that used to sit here fetched the page a
        # second time and threw the result away, so it has been dropped.
        soup = BeautifulSoup(line)
        # Collect every href that contains "netflix".
        for link in soup.findAll('a', href=re.compile('netflix')):
            title.append(link.get('href'))
        # Collect (and echo) the alt text of every image on that line.
        for link in soup.findAll('img'):
            print(link.get('alt'))
            country.append(link.get('alt'))
    count = count + 1
# Same "N M" output as the Python 2 statement `print len(title), len(country)`.
print('%d %d' % (len(title), len(country)))
The previous error has been fixed. The only remaining problem is films that are available in multiple countries: how can I collect all of a film's countries together with its link?
e.g. for 10.0 Earthquake, link = http://www.netflix.com/WiMovie/80049286, country = UK, USA.
Upvotes: 0
Views: 148
Reputation: 349
url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url, stream=True)
final = []
# Lines are counted from 1; only line 746 of the response is parsed.
for count, line in enumerate(r.iter_lines(), 1):
    if count != 746:
        continue
    soup = BeautifulSoup(line)
    for row in soup.findAll('tr'):
        # First anchor in the row whose href contains "netflix", if any.
        anchor = row.find('a', href=re.compile('netflix'))
        if not anchor:
            continue
        # One entry per row: title text, link, and the alt text of every
        # image found in the same row.
        flags = [flag.get('alt') for flag in row.findAll('img')]
        final.append({
            'Title': anchor.string,
            'Url': anchor.get('href'),
            'Countries': flags,
        })
`final` is the final list.
Upvotes: 0
Reputation: 180522
Your code can be simplified to a couple of selects:
import requests
from bs4 import BeautifulSoup
url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url)
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(r.content, "html.parser")
# CSS attribute-substring selector: anchors whose href contains "netflix".
for a in soup.select("a[href*=netflix]"):
    print(a["href"])
And for the img:
# Alt texts of interest.
co = {"UK", "USA"}
# "img[alt]" selects only images that carry an alt attribute, so the
# subscript below cannot fail on a missing key.
matching_flags = [img for img in soup.select("img[alt]") if img["alt"] in co]
for img in matching_flags:
    print(img)
Upvotes: 1
Reputation: 1429
I think you'd have an easier time iterating through the listing rows and using a generator to assemble the data structure you're looking for (ignore the minor differences in my code; I'm using Python 3):
from bs4 import BeautifulSoup
import requests
url = ('http://netflixukvsusa.netflixable.com/2016/07/'
       'complete-alphabetical-list-k-sat-jul-9.html')
r = requests.get(url)
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(r.content, 'html.parser')
# Every table row nested inside a span whose class attribute is "listings".
rows = soup.select('span[class="listings"] tr')
def get_movie_info(rows):
    """Yield ``(link, countries)`` for every row that links to Netflix.

    Args:
        rows: iterable of BeautifulSoup tags, one per listing row.

    Yields:
        tuple: the ``href`` of the first anchor in the row whose href
        contains ``http://www.netflix.com/``, and a list with the ``alt``
        text of every ``img`` of class ``flag`` in that row. Rows without
        such an anchor are skipped.
    """
    netflix_url_prefix = 'http://www.netflix.com/'

    def is_netflix_href(href):
        # href is None for anchors that have no href attribute.
        return bool(href) and netflix_url_prefix in href

    for row in rows:
        link = row.find('a', href=is_netflix_href)
        if link is None:
            continue
        href = link['href']
        # Calling a tag directly is BeautifulSoup shorthand for find_all().
        countries = [img['alt'] for img in row('img', class_='flag')]
        yield href, countries
# Print every (link, countries) tuple produced by the generator, one per line.
print('\n'.join(map(str, get_movie_info(rows))))
Edit: Or if you're looking for a dict instead of a list:
def get_movie_info(rows):
    """Build a dict of Netflix links and countries keyed by link text.

    Args:
        rows: iterable of BeautifulSoup tags, one per listing row.

    Returns:
        dict: maps the text of the first anchor in each row whose href
        contains ``http://www.netflix.com/`` to
        ``{'link': href, 'countries': [alt, ...]}``, where the countries
        are the ``alt`` texts of the row's ``img`` tags of class ``flag``.
        An empty link text is stored under the key ``'some_default'``.
        Rows without such an anchor are skipped; a later row with the same
        key overwrites an earlier one.
    """
    output = {}
    netflix_url_prefix = 'http://www.netflix.com/'

    def is_netflix_href(href):
        # href is None for anchors that have no href attribute.
        return bool(href) and netflix_url_prefix in href

    for row in rows:
        link = row.find('a', href=is_netflix_href)
        if link is None:
            continue
        name = link.text
        href = link['href']
        # Calling a tag directly is BeautifulSoup shorthand for find_all().
        countries = [img['alt'] for img in row('img', class_='flag')]
        output[name or 'some_default'] = {'link': href, 'countries': countries}
    return output
# Print every (name, info) item of the returned dict, one per line.
print('\n'.join(map(str, get_movie_info(rows).items())))
Upvotes: 0
Reputation: 2395
As for the first question: it failed for links that didn't have an href value, so instead of a string you got None.
The following works:
from BeautifulSoup import BeautifulSoup
import urllib2
import re
# The URL must be one string; implicit concatenation of adjacent literals
# keeps the line short without breaking the literal itself.
html_page = urllib2.urlopen(
    'http://netflixukvsusa.netflixable.com/2016/07/'
    'complete-alphabetical-list-k-sat-jul-9.html')
soup = BeautifulSoup(html_page)

# Print only the hrefs that contain "netflix".
for link in soup.findAll('a'):
    link_href = link.get('href')
    # Anchors without an href give None, which re.search() rejects with
    # "TypeError: expected string or buffer", so skip them first.
    if link_href:
        reqlink = re.search('netflix', link_href)
        if reqlink:
            print(link_href)

# Print the alt text of every image whose alt is exactly 'UK' or 'USA'.
for link in soup.findAll('img'):
    if link.get('alt') == 'UK' or link.get('alt') == 'USA':
        print(link.get('alt'))
As for the second question, I would recommend building a dictionary that maps each movie to the list of countries it appears in; it would then be easier to format it as a string the way you want.
Upvotes: 0