Reputation: 197
I'm trying to scrape 20 pages of a website using BeautifulSoup. Each page has about 30 items, and each of those items has 8 features which I want to retrieve and append as a tuple to a list called `res`.
Now the code below is supposed to retrieve all the items and their features from the 20 pages and store them in `res`, but for some reason it only seems to retrieve the first page's items and features.
Any help is appreciated.
# Scrape the search-result pages and append one 8-field tuple per listing to `res`.
# NOTE(review): indentation was lost in this paste; the statements following each
# `for` / `if` / `else` header are presumably nested under it.
# NOTE(review): `res` is not defined in this snippet - presumably a list created elsewhere.
for i in range(30):
# Fetch result page `i`; the page index is appended as the `stranica` query value.
r = requests.get('https://www.olx.ba/pretraga?trazilica=+golf+2&kategorija=18&stranica='+ str(i))
soup = BeautifulSoup(r.text, 'lxml')
# Only elements carrying *every* class in this compound selector are matched.
all_items = soup.select('div#rezultatipretrage div.listitem.artikal.obicniArtikal.imaHover-disabled.i.index')
for item in all_items:
# Text of the item's <p class="na"> element.
naziv = item.find('p', class_='na').text
# href of the first <a> inside the item.
link = item.a['href']
# Text of the <div class="lokacijadiv">, surrounding whitespace removed.
lokacija = item.find('div', class_='lokacijadiv').text.strip()
# Text of the first <span class="desnopolje"> in the item.
godiste = item.find('span', class_='desnopolje').text
# Text of the <span class="desnopolje"> inside the second <p class="polje">;
# raises IndexError when the item has fewer than two such <p> elements.
gorivo = item.find_all('p', class_='polje')[1].find('span', class_='desnopolje').text
# The price stays the literal string 'PO DOGOVORU' when that is the span text;
# otherwise it is converted to an int.
if item.find('div', class_='cijena').span.text == 'PO DOGOVORU':
cijena = 'PO DOGOVORU'
else:
# Drop the last two characters (presumably a currency suffix - confirm) ...
cijena = item.find('div', class_='cijena').span.text[:-2].strip()
# ... then remove the '.' separators so the digits parse as an int.
cijena = int(cijena.replace('.',''))
# Text of the <div class="stanje k">, surrounding whitespace removed.
stanje = item.find('div', class_='stanje k').text.strip()
# Text of the <div class="kada">.
datum = item.find('div', class_='kada').text
# One record per listing, in a fixed field order.
res.append((naziv, link, lokacija, godiste, gorivo, cijena, stanje, datum))
Upvotes: 0
Views: 213
Reputation: 195553
You need to select every `<div>` with the `listitem` class in order to get all items from the page, not only the featured cars.
I made a few changes and added some checks to your code so that it successfully scrapes all 30 pages (I put "-" as the default value for some fields, so check that your results are correct):
from bs4 import BeautifulSoup
import requests
from pprint import pprint
# Search URL for the result pages; the page index is appended as `stranica`.
SEARCH_URL = 'https://www.olx.ba/pretraga?trazilica=+golf+2&kategorija=18&stranica='
PAGE_COUNT = 30
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 10
# Placeholder stored when an optional field is absent from a listing.
MISSING = '-'
# Price text that is kept as a string instead of being converted to int.
NEGOTIABLE = 'PO DOGOVORU'


def _parse_item(item):
    """Return the 8-field tuple for one listing, or None if it has no title.

    The tuple is (naziv, link, lokacija, godiste, gorivo, cijena, stanje,
    datum). `cijena` is an int, or the string 'PO DOGOVORU' when the
    listing shows that text instead of a number.
    """
    title_tag = item.find('p', class_='na')
    if title_tag is None:
        # Entries without a title paragraph are skipped.
        return None

    naziv = title_tag.text
    link = item.a['href']
    lokacija = item.find('div', class_='lokacijadiv').text.strip()

    # Each optional element is looked up once and reused.
    year_tag = item.find('span', class_='desnopolje')
    godiste = year_tag.text if year_tag is not None else MISSING

    # The fuel value sits in the second `polje` row; fall back to the
    # placeholder when that row or its value span is missing.
    rows = item.find_all('p', class_='polje')
    fuel_tag = rows[1].find('span', class_='desnopolje') if len(rows) > 1 else None
    gorivo = fuel_tag.text if fuel_tag is not None else MISSING

    price_span = item.find('div', class_='cijena').span
    if price_span.text == NEGOTIABLE:
        cijena = NEGOTIABLE
    else:
        # Take the last child node of the span, drop its final two
        # characters and the '.' separators, then convert to int.
        raw_price = price_span.contents[-1][:-2].strip()
        cijena = int(raw_price.replace('.', ''))

    state_tag = item.find('div', class_='stanje k')
    stanje = state_tag.text.strip() if state_tag is not None else MISSING

    datum = item.find('div', class_='kada').text
    return (naziv, link, lokacija, godiste, gorivo, cijena, stanje, datum)


def _scrape(page_count=PAGE_COUNT):
    """Fetch `page_count` result pages and return the parsed listings.

    Raises requests.HTTPError when a page answers with an error status, so
    a failed request is not silently recorded as an empty page.
    """
    results = []
    # One session reuses the underlying connection across all pages.
    with requests.Session() as session:
        # NOTE(review): numbering starts at 0 as in the original loop;
        # whether page 0 duplicates page 1 on this site is unverified.
        for page in range(page_count):
            response = session.get(SEARCH_URL + str(page), timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')
            for item in soup.select('div#rezultatipretrage div.listitem'):
                row = _parse_item(item)
                if row is not None:
                    results.append(row)
    return results


res = _scrape()
pprint(res)
This prints all info from 30 pages:
[('VW GOLF 5 2.0 TDI, 2005 god. Registrovan',
'https://www.olx.ba/artikal/30396912/vw-golf-5-2-0-tdi-2005-god-registrovan/',
'Živinice',
'2005',
'Dizel',
8400,
'KORIŠTENO',
'Prije 4 dana'),
('VW GOLF 2 DIZEL TEK REGISTROVAN',
'https://www.olx.ba/artikal/30512948/vw-golf-2-dizel-tek-registrovan/',
'Ilijaš',
'1985',
'Dizel',
1550,
'KORIŠTENO',
'Jučer, 16:05'),
('Golf 5 2.0 DIZEL SDI TEK REGISTROVAN',
'https://www.olx.ba/artikal/30471980/golf-5-2-0-dizel-sdi-tek-registrovan/',
'Travnik',
'2004',
'Dizel',
7950,
'KORIŠTENO',
'Prije 5 dana'),
('Volkswagen Golf 6 2.0 TDI GTI-GTD-R LINE',
'https://www.olx.ba/artikal/30478894/volkswagen-golf-6-2-0-tdi-gti-gtd-r-line/',
'Banja Luka',
'2010',
'Dizel',
19500,
'KORIŠTENO',
'Prije 7 dana'),
('VW GOLF 5,2.0 TDI,103 KW,04 G.P,6 BRZ.MOTOR U KVARU',
'https://www.olx.ba/artikal/30485008/vw-golf-5-2-0-tdi-103-kw-04-g-p-6-brz-motor-u-kvaru/',
'Prnjavor',
'2004',
'Dizel',
5555,
'KORIŠTENO',
'Prije 4 dana'),
('VW Golf 6 2.0 TDI XENON-NAVI-KAMERA-KOZA',
'https://www.olx.ba/artikal/30448040/vw-golf-6-2-0-tdi-xenon-navi-kamera-koza/',
'Banja Luka',
'2010',
'Dizel',
19500,
'KORIŠTENO',
'Prije 7 dana'),
...and so on.
Upvotes: 1