Reputation: 905
Hi guys, I need to define a function in Python that gets the list of all the paginated URLs at the bottom of a page, for each link in a txt file.
Here is an example of what I need done.
Input link
http://www.apartmentguide.com/apartments/Alabama/Hartselle/
Desired Output
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=6
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=7
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=8
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=9
and so on, up to whatever limit each input URL has.
This is the function I have written so far, but it's not working. I am not good with Python either.
import requests
#from bs4 import BeautifulSoup
from scrapy import Selector as Se
import urllib2

lists = open("C:\Users\Administrator\Desktop\\3.txt","r")
read_list = lists.read()
line = read_list.split("\n")

def get_links(line):
    for each in line:
        r = requests.get(each)
        sel = Se(text=r.text, type="html")
        next_ = sel.xpath('//a[@class="next sprite"]//@href').extract()
        for next_1 in next_:
            next_2 = "http://www.apartmentguide.com" + next_1
            print next_2
            get_links(next_1)

get_links(line)
Upvotes: 1
Views: 1020
Reputation: 30210
Below are two ways to do this.
import mechanize
import requests
from bs4 import BeautifulSoup, SoupStrainer
import urlparse
import pprint

#-- Mechanize --
br = mechanize.Browser()

def get_links_mechanize(root):
    links = []
    br.open(root)
    for link in br.links():
        try:
            if dict(link.attrs)['class'] == 'page':
                links.append(link.absolute_url)
        except:
            pass
    return links

#-- Requests / BeautifulSoup / urlparse --
def get_links_bs(root):
    links = []
    r = requests.get(root)
    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('class') and 'page' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))
    return links

#with open("C:\Users\Administrator\Desktop\\3.txt","r") as f:
#    for root in f:
#        links = get_links(root)
#        # <Do something with links>

root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'

print "Mech:"
pprint.pprint( get_links_mechanize(root) )
print "Requests/BS4/urlparse:"
pprint.pprint( get_links_bs(root) )
One uses mechanize -- it's a bit smarter with URLs, but it's a lot slower and may be overkill depending on what else you're doing. The other uses requests to fetch the page (urllib2 would suffice), BeautifulSoup to parse the markup, and urlparse to form absolute URLs from the relative URLs in the page you listed.
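For a quick sense of what urlparse is doing here, urljoin resolves a relative href against the root URL; the href value below is just an assumed example of what the pagination links on that page look like:

import urlparse

root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
href = '/apartments/Alabama/Hartselle/?page=2'   # assumed relative href for illustration
print urlparse.urljoin(root, href)
# http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2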
Note that both of these functions return the following list:
['http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5']
which has duplicates. You can get rid of the duplicates by changing
return links
to
return list(set(links))
for whatever method you choose.
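Note that set() does not preserve order, so the deduplicated list may come back shuffled. If order matters, a minimal order-preserving sketch (the dedupe helper name is mine):

def dedupe(links):
    # keep the first occurrence of each link, preserving the original order
    seen = set()
    unique = []
    for link in links:
        if link not in seen:
            seen.add(link)
            unique.append(link)
    return unique

and return dedupe(links) instead.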
EDIT:
I noticed that the above functions only returned the links to pages 2-5, and you'd have to navigate those pages to see that there were in fact 10 pages.
A completely different approach would be to scrape the "root" page for number of results, then predict how many pages that would result in, then build links from that.
Since there are 20 results per page, figuring out how many pages there are is straightforward. Consider:
import requests, re, math, pprint

def scrape_results(root):
    links = []
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1))               # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0))  # ceil(182/20) => 10

    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))
    return links

pprint.pprint(scrape_results(root))
This will be the fastest method of the 3, but possibly more error prone.
EDIT 2:
Maybe something like:
import re, math, pprint
import requests, urlparse
from bs4 import BeautifulSoup, SoupStrainer

def get_pages(root):
    links = []
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1))               # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0))  # ceil(182/20) => 10

    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))
    return links

def get_listings(page):
    links = []
    r = requests.get(page)
    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('data-listingid') and 'name' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))
    return links

root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'

listings = []
for page in get_pages(root):
    listings += get_listings(page)

pprint.pprint(listings)
print(len(listings))
Upvotes: 1
Reputation: 905
With re I was unsure, so I tried XPath instead.
import requests, math, csv
from scrapy import Selector

links = open("C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\2.txt","r")
read_list = links.read()
line = read_list.split("\n")

for each in line:
    lines = []
    r = requests.get(each)
    sel = Selector(text=r.text, type="html")

    # Pull the result count out of the heading, e.g. [u'182 apartments for rent']
    mat = sel.xpath('//h1//strong/text()').extract()
    mat = str(mat)
    mat1 = mat.replace(" apartments for rent']","")
    mat2 = mat1.replace("[u'","")
    mat3 = int(mat2)
    num_pages = int(math.ceil(mat3/20.0))

    # Build one line per page and append them to the CSV
    for i in range(num_pages):
        lines.append("%s/Page%d" % (each, (i+1)))

    with open('C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\test.csv', 'ab') as f:
        writer = csv.writer(f)
        for val in lines:
            writer.writerow([val])
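As a side note, the replace() chain is fairly fragile. A minimal sketch of the same count extraction done with a regex on the XPath result, assuming the heading text still looks like '182 apartments for rent' (count_from_heading is a name I made up):

import re

def count_from_heading(texts):
    # texts is the list returned by sel.xpath('//h1//strong/text()').extract()
    match = re.search(r'(\d+)', texts[0])
    return int(match.group(1)) if match else 0

# e.g. mat3 = count_from_heading(sel.xpath('//h1//strong/text()').extract())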
Upvotes: 0