Reputation: 263
from bs4 import BeautifulSoup
import urllib.request, time

class scrape(object):
    def __init__(self):
        self.urls = ['https://www.onthemarket.com/for-sale/property/wigan/',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=1',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=2',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=3',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=4',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=6']
        self.telephones = []

    def extract_info(self):
        for link in self.urls:
            data = urllib.request.urlopen(link).read()
            soup = BeautifulSoup(data, "lxml")
            for tel in soup.findAll("span", {"class": "call"}):
                self.telephones.append(tel.text.strip())
            time.sleep(1)
        return self.telephones

to = scrape()
print(to.extract_info())
What is wrong? This code hangs after the second website. It should extract the phone numbers from each webpage in the list self.urls.
Upvotes: 0
Views: 526
Reputation: 22440
All you need to do is pass a headers dict in your request and you are good to go. Try this:
from bs4 import BeautifulSoup
import requests, time

class scrape(object):
    def __init__(self):
        self.urls = ['https://www.onthemarket.com/for-sale/property/wigan/',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=1',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=2',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=3',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=4',
                     'https://www.onthemarket.com/for-sale/property/wigan/?page=6']
        self.telephones = []

    def extract_info(self):
        for link in self.urls:
            # A browser-like User-Agent keeps the site from blocking or stalling the request
            data = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})  # it should do the trick
            soup = BeautifulSoup(data.text, "lxml")
            for tel in soup.find_all("span", {"class": "call"}):
                self.telephones.append(tel.text.strip())
            time.sleep(1)
        return self.telephones

crawl = scrape()
print(crawl.extract_info())
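If you would rather stay with urllib from the original snippet, the same header trick applies there as well; here is a minimal sketch for a single page (untested against the site, URL taken from the question):

from bs4 import BeautifulSoup
import urllib.request

url = 'https://www.onthemarket.com/for-sale/property/wigan/'
# Wrap the URL in a Request object so a browser-like User-Agent header can be attached
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(urllib.request.urlopen(req).read(), "lxml")
print([tel.text.strip() for tel in soup.find_all("span", {"class": "call"})])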
Upvotes: 2