Reputation: 905
Hi guys, I need to define a function in Python that gets the list of all the paginated URLs at the bottom of a page, for each link in a txt file.
Here is an example of what I need done.
Input link
http://www.apartmentguide.com/apartments/Alabama/Hartselle/
Desired Output
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=6
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=7
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=8
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=9
and so on, up to whatever limit each input URL has.
This is the function I have written so far, but it's not working. I am not good with Python either.
import requests
#from bs4 import BeautifulSoup
from scrapy import Selector as Se
import urllib2

lists = open("C:\Users\Administrator\Desktop\\3.txt","r")
read_list = lists.read()
line = read_list.split("\n")

def get_links(line):
    for each in line:
        r = requests.get(each)
        sel = Se(text=r.text, type="html")
        next_ = sel.xpath('//a[@class="next sprite"]//@href').extract()
        for next_1 in next_:
            next_2 = "http://www.apartmentguide.com" + next_1
            print next_2
            get_links(next_1)

get_links(line)
Upvotes: 1
Views: 1020
Reputation: 30210
Below are two ways to do this.
import mechanize
import requests
from bs4 import BeautifulSoup, SoupStrainer
import urlparse
import pprint

#-- Mechanize --
br = mechanize.Browser()

def get_links_mechanize(root):
    links = []
    br.open(root)
    for link in br.links():
        try:
            if dict(link.attrs)['class'] == 'page':
                links.append(link.absolute_url)
        except:
            pass
    return links

#-- Requests / BeautifulSoup / urlparse --
def get_links_bs(root):
    links = []
    r = requests.get(root)
    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('class') and 'page' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))
    return links

#with open("C:\Users\Administrator\Desktop\\3.txt","r") as f:
#    for root in f:
#        links = get_links(root)
#        # <Do something with links>

root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'

print "Mech:"
pprint.pprint( get_links_mechanize(root) )
print "Requests/BS4/urlparse:"
pprint.pprint( get_links_bs(root) )
One uses mechanize -- it's a bit smarter with URLs, but it's a lot slower and may be overkill depending on what else you're doing. The other uses requests to fetch the page (urllib2 would suffice), BeautifulSoup to parse the markup, and urlparse to form absolute URLs from the relative URLs in the page you listed.
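For a quick sense of what urlparse is doing here, urljoin resolves a relative href against the root URL; the href value below is just an assumed example of what the pagination links on that page look like:

import urlparse

root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
href = '/apartments/Alabama/Hartselle/?page=2'   # assumed relative href for illustration
print urlparse.urljoin(root, href)
# http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2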
Note that both of these functions return the following list:
['http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5']
which has duplicates. You can get rid of the duplicates by changing
return links
to
return list(set(links))
for whatever method you choose.
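Note that set() does not preserve order, so the deduplicated list may come back shuffled. If order matters, a minimal order-preserving sketch (the dedupe helper name is mine):

def dedupe(links):
    # keep the first occurrence of each link, preserving the original order
    seen = set()
    unique = []
    for link in links:
        if link not in seen:
            seen.add(link)
            unique.append(link)
    return unique

and return dedupe(links) instead.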
EDIT:
I noticed that the above functions only returned the links to pages 2-5, and you'd have to navigate those pages to see that there were in fact 10 pages.
A completely different approach would be to scrape the "root" page for number of results, then predict how many pages that would result in, then build links from that.
Since there are 20 results per page, figuring out how many pages there are is straightforward. Consider:
import requests, re, math, pprint

def scrape_results(root):
    links = []
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1))               # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0))  # ceil(182/20) => 10

    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))
    return links

pprint.pprint(scrape_results(root))
This will be the fastest method of the 3, but possibly more error prone.
EDIT 2:
Maybe something like:
import re, math, pprint
import requests, urlparse
from bs4 import BeautifulSoup, SoupStrainer

def get_pages(root):
    links = []
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1))               # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0))  # ceil(182/20) => 10

    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))
    return links

def get_listings(page):
    links = []
    r = requests.get(page)
    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('data-listingid') and 'name' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))
    return links

root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'

listings = []
for page in get_pages(root):
    listings += get_listings(page)

pprint.pprint(listings)
print(len(listings))
Upvotes: 1
Reputation: 905
With re I was unsure, so I tried XPath instead.
import requests, math, csv
from scrapy import Selector

links = open("C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\2.txt","r")
read_list = links.read()
line = read_list.split("\n")

for each in line:
    lines = []
    r = requests.get(each)
    sel = Selector(text=r.text, type="html")

    # Pull the result count out of the heading, e.g. [u'182 apartments for rent']
    mat = sel.xpath('//h1//strong/text()').extract()
    mat = str(mat)
    mat1 = mat.replace(" apartments for rent']","")
    mat2 = mat1.replace("[u'","")
    mat3 = int(mat2)
    num_pages = int(math.ceil(mat3/20.0))

    # Build one line per page and append them to the CSV
    for i in range(num_pages):
        lines.append("%s/Page%d" % (each, (i+1)))

    with open('C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\test.csv', 'ab') as f:
        writer = csv.writer(f)
        for val in lines:
            writer.writerow([val])
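As a side note, the replace() chain is fairly fragile. A minimal sketch of the same count extraction done with a regex on the XPath result, assuming the heading text still looks like '182 apartments for rent' (count_from_heading is a name I made up):

import re

def count_from_heading(texts):
    # texts is the list returned by sel.xpath('//h1//strong/text()').extract()
    match = re.search(r'(\d+)', texts[0])
    return int(match.group(1)) if match else 0

# e.g. mat3 = count_from_heading(sel.xpath('//h1//strong/text()').extract())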
Upvotes: 0