Reputation: 811
I need to scrape car makes and models. I can already scrape the car-make select option list, but I can't scrape the car-model select option list, because it is only loaded after you select a car make.
Do you have any ideas how I could get the car-model select options that are loaded when you select a car make?
Here is my code.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

url = "http://autoplius.lt/redaguoti/naudoti-automobiliai/"

# Fetch the page and collect every <option> of the <select id="make_id">.
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
car_makes_select = soup.find("select", {"id": "make_id"})
car_makes = car_makes_select.select("option")

for item in car_makes:
    # The option's "value" attribute is used as the make id.
    itemMain = item.get('value')
    payload = {
        'make_id': itemMain
    }
    # NOTE: params= places make_id in the URL query string of the POST.
    form = requests.post(url, params=payload)
    soup11 = BeautifulSoup(form.text, "html.parser")
    # Look for the models <select> in whatever the POST returned.
    model_select = soup11.find("select", {"id": "model_id"})
    print(model_select)
Upvotes: 1
Views: 1040
Reputation: 180481
You need to send the fields as POST form data (`data=`), not as query-string parameters (`params=`):
The `parent_id` field is the `value` of each option inside the `make_id` select:
import requests
from bs4 import BeautifulSoup

url = "http://autoplius.lt/redaguoti/naudoti-automobiliai/"
page = requests.get(url)

# Form fields for the POST; all of them can be hard-coded except parent_id,
# which is filled in for each make inside the loop below.
data = {
    "target_id": "model_id",
    "project": "autoplius",
    "category_id": "2",
    "type": "edit",
    "my_anns": "false",
    "__block": "ann_ajax_0_plius",
    "__opcode": "ajaxGetChildsTo",
}

soup = BeautifulSoup(page.content, "html.parser")

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

# The first two options hold no ids, so the selector skips them.
car_makes = soup.select("#make_id option + option + option")

for car in car_makes:
    print(car.text)
    # Pass the make's id (the option's value) as parent_id.
    data["parent_id"] = car["value"]
    # Send the fields as form data (data=), not as query-string params.
    form = requests.post(url, data=data, headers=headers)
    soup11 = BeautifulSoup(form.text, "html.parser")
    print(soup11)
Each POST returns the model `<option>` markup as a single quoted string with escaped quotes and slashes, like:
"<option value=\"\">- Pasirinkite -<\/option><option value=\"1088\">-kita-<\/option><option value=\"16230\">208<\/option><option value=\"16231\">246<\/option><option value=\"16232\">250<\/option><option value=\"16233\">275<\/option><option value=\"16234\">288<\/option><option value=\"16235\">308<\/option><option value=\"16236\">328<\/option><option value=\"16237\">330<\/option><option value=\"1093\">348<\/option><option value=\"1089\">360<\/option><option value=\"16238\">365<\/option><option value=\"16239\">400<\/option><option value=\"16240\">412<\/option><option value=\"1086\">456<\/option><option value=\"16229\">458<\/option><option value=\"16245\">512<\/option><option value=\"1085\">550<\/option><option value=\"1084\">575<\/option><option value=\"1092\">599 GTB Fiorano<\/option><option value=\"16246\">612<\/option><option value=\"1090\">612 Scaglietti<\/option><option value=\"16247\">750<\/option><option value=\"1079\">Barchetta<\/option><option value=\"16250\">California<\/option><option value=\"16248\">Daytona<\/option><option value=\"1078\">Enzo<\/option><option value=\"1083\">F 355<\/option><option value=\"18638\">F 360<\/option><option value=\"1087\">F 40<\/option><option value=\"1091\">F 430<\/option><option value=\"1081\">F 50<\/option><option value=\"1080\">F 512<\/option><option value=\"1077\">Maranello<\/option><option value=\"1076\">Mondial<\/option><option value=\"16249\">Superamerica<\/option><option value=\"1075\">Testarossa<\/option>"
We need to tidy that up a bit and extract only the models:
# Continues from the previous snippet: requests, BeautifulSoup, soup, data,
# headers and url are reused as defined there.
car_makes = soup.select("#make_id option + option + option")

for car in car_makes:
    print(car.text)
    data["parent_id"] = car["value"]
    form = requests.post(url, data=data, headers=headers)
    # The body looks like a JSON-encoded string of <option> markup, so decode
    # it as JSON. That undoes every escape (\" \/ \uXXXX) in one step and
    # works on both Python 2 and 3, whereas calling .strip('"') on
    # form.content fails on Python 3 because content is bytes.
    soup11 = BeautifulSoup(form.json(), "html.parser")
    # Skip the first two options ("- Pasirinkite -" and "-kita-") and any
    # model whose name is purely numeric.
    print([opt.text
           for opt in soup11.select("option + option + option")
           if not opt.text.isdigit()])
If we run the code we get output like:
In [10]: for car in car_makes:
....: print(car.text)
....: data["parent_id"] = car["value"]
....: form = requests.post(url, data=data, headers=headers)
....: soup11 = BeautifulSoup(form.content.strip('"').replace('\\"','"').replace("\/", "/"), "html.parser")
....: print([opt.text for opt in soup11.select("option + option + option") if not opt.text.isdigit()])
....:
AC
[u'Ace', u'Aceca', u'Cobra']
Acura
[u'CL', u'EL', u'ILX', u'Integra', u'MDX', u'NSX', u'RDX', u'RL', u'RSX', u'SLX', u'TL', u'TLX', u'TSX', u'Vigor', u'ZDX']
Aixam
[u'A751', u'City', u'Crossline', u'Ligier', u'Scouty']
Alfa Romeo
[u'4C', u'8C', u'Alfasud', u'Alfetta', u'Arna', u'Brera', u'Crosswagon Q4', u'Giulia', u'Giulietta', u'GT', u'GTV', u'Junior', u'Mito', u'RZ/SZ', u'Spider', u'Sportwagon', u'Sprint']
Alpina
[u'B12', u'B3', u'B5', u'B6', u'B7', u'B8', u'D10', u'D3', u'Roadster S']
AMC
[u'Ambassador', u'Concord', u'Eagle', u'Gremlin', u'Javelin', u'Matador', u'Pacer', u'Rambler', u'Rebel', u'Spirit']
ARO
[u'K450', u'Spartana']
Asia
[u'Hi-Topic', u'Retona', u'Rocsta', u'Towner']
Upvotes: 1