django11
django11

Reputation: 811

Scraping data that is loaded when you select something

I need to scrape car makes and models. I can already scrape the car makes select option list, but I can't scrape the car models select option list, because it is only loaded after you select a car make.

Do you have any ideas how I could get the car models select option list, which is loaded when you select a car make?

Here is my code.

# -*- coding: utf-8 -*-
# Fetch the car-make <select> from the listing form page, then request the
# page again once per make and print whatever <select id="model_id"> comes back.
import requests
from bs4 import BeautifulSoup



url="http://autoplius.lt/redaguoti/naudoti-automobiliai/"
# Download the form page once; the make list is read from this response.
page = requests.get(url)

soup = BeautifulSoup(page.content, "html.parser")

# The <select id="make_id"> element; its <option> children are iterated below.
car_makes_select = soup.find("select", {"id": "make_id"})

car_makes = car_makes_select.select("option")


for item in car_makes:
    itemMain = item

    # Replace the <option> tag with the content of its "value" attribute
    # (.get returns None when the attribute is missing).
    itemMain = itemMain.get('value')

    payload = {
        'make_id': itemMain
    }

    # NOTE(review): params= hands the payload to requests as URL query
    # parameters rather than as POST form data. Presumably this is why the
    # model list never shows up in the response -- confirm against the
    # request the browser actually sends.
    form = requests.post(url, params=payload)

    soup11 = BeautifulSoup(form.text, "html.parser")

    # find() returns None if the response has no <select id="model_id">.
    model_select = soup11.find("select", {"id": "model_id"})

    # Python 2 print statement: dumps the whole tag (or None) for this make.
    print model_select

Upvotes: 1

Views: 1040

Answers (1)

Padraic Cunningham
Padraic Cunningham

Reputation: 180481

You need to post data:

enter image description here

The parent_id is the value in each option inside the make_id select:

import requests
from bs4 import BeautifulSoup

url = "http://autoplius.lt/redaguoti/naudoti-automobiliai/"

# Sent with every POST below: a browser User-Agent plus the
# X-Requested-With marker that identifies the request as XMLHttpRequest.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

# Form fields for the POST. All of them are constant; "parent_id" is the
# only per-make field and is filled in inside the loop.
data = {
    "target_id": "model_id",
    "project": "autoplius",
    "category_id": "2",
    "type": "edit",
    "my_anns": "false",
    "__block": "ann_ajax_0_plius",
    "__opcode": "ajaxGetChildsTo",
}

# Load the form page and pick the make options out of it. The selector
# matches every <option> under #make_id from the third one on, because the
# first two options hold no ids.
landing_page = requests.get(url)
soup = BeautifulSoup(landing_page.content, "html.parser")
car_makes = soup.select("#make_id option + option + option")

for make_option in car_makes:
    print(make_option.text)
    # The option's value is the id the endpoint expects as parent_id.
    data["parent_id"] = make_option["value"]
    # The fields go in the request body (data=), not in the query string.
    response = requests.post(url, data=data, headers=headers)
    print(BeautifulSoup(response.text, "html.parser"))

That post returns data in a format like:

"<option value=\"\">- Pasirinkite -<\/option><option value=\"1088\">-kita-<\/option><option value=\"16230\">208<\/option><option value=\"16231\">246<\/option><option value=\"16232\">250<\/option><option value=\"16233\">275<\/option><option value=\"16234\">288<\/option><option value=\"16235\">308<\/option><option value=\"16236\">328<\/option><option value=\"16237\">330<\/option><option value=\"1093\">348<\/option><option value=\"1089\">360<\/option><option value=\"16238\">365<\/option><option value=\"16239\">400<\/option><option value=\"16240\">412<\/option><option value=\"1086\">456<\/option><option value=\"16229\">458<\/option><option value=\"16245\">512<\/option><option value=\"1085\">550<\/option><option value=\"1084\">575<\/option><option value=\"1092\">599 GTB Fiorano<\/option><option value=\"16246\">612<\/option><option value=\"1090\">612 Scaglietti<\/option><option value=\"16247\">750<\/option><option value=\"1079\">Barchetta<\/option><option value=\"16250\">California<\/option><option value=\"16248\">Daytona<\/option><option value=\"1078\">Enzo<\/option><option value=\"1083\">F 355<\/option><option value=\"18638\">F 360<\/option><option value=\"1087\">F 40<\/option><option value=\"1091\">F 430<\/option><option value=\"1081\">F 50<\/option><option value=\"1080\">F 512<\/option><option value=\"1077\">Maranello<\/option><option value=\"1076\">Mondial<\/option><option value=\"16249\">Superamerica<\/option><option value=\"1075\">Testarossa<\/option>"

We need to tidy that up a bit and extract only the models:

# Every make <option> from the third one on; the first two hold no ids.
car_makes = soup.select("#make_id option + option + option")

for car in car_makes:
    print(car.text)
    # The selected make's id is the only form field that varies per request.
    data["parent_id"] = car["value"]
    form = requests.post(url, data=data, headers=headers)
    # The body is a JSON-encoded string of <option> markup. Decoding it as
    # JSON undoes every escape (\" and \/ as well as \uXXXX) and behaves the
    # same whether the raw body is exposed as bytes or as text.
    try:
        markup = form.json()
    except ValueError:
        # Body is not valid JSON: parse the text as-is instead of failing.
        markup = form.text
    soup11 = BeautifulSoup(markup, "html.parser")
    # Skip the first two options ("- Pasirinkite -" and "-kita-" in the sample
    # response). NOTE: the isdigit() filter also drops model names that consist
    # only of digits (e.g. "208").
    print([opt.text for opt in soup11.select("option + option + option") if not opt.text.isdigit()])

If we run the code we get output like:

In [10]: for car in car_makes:
   ....:         print(car.text)
   ....:         data["parent_id"] = car["value"]
   ....:         form = requests.post(url, data=data, headers=headers)
   ....:         soup11 = BeautifulSoup(form.content.strip('"').replace('\\"','"').replace("\/", "/"), "html.parser")
   ....:         print([opt.text for opt in soup11.select("option + option + option") if not opt.text.isdigit()])
   ....:     
AC
[u'Ace', u'Aceca', u'Cobra']
Acura
[u'CL', u'EL', u'ILX', u'Integra', u'MDX', u'NSX', u'RDX', u'RL', u'RSX', u'SLX', u'TL', u'TLX', u'TSX', u'Vigor', u'ZDX']
Aixam
[u'A751', u'City', u'Crossline', u'Ligier', u'Scouty']
Alfa Romeo
[u'4C', u'8C', u'Alfasud', u'Alfetta', u'Arna', u'Brera', u'Crosswagon Q4', u'Giulia', u'Giulietta', u'GT', u'GTV', u'Junior', u'Mito', u'RZ/SZ', u'Spider', u'Sportwagon', u'Sprint']
Alpina
[u'B12', u'B3', u'B5', u'B6', u'B7', u'B8', u'D10', u'D3', u'Roadster S']
AMC
[u'Ambassador', u'Concord', u'Eagle', u'Gremlin', u'Javelin', u'Matador', u'Pacer', u'Rambler', u'Rebel', u'Spirit']
ARO
[u'K450', u'Spartana']
Asia
[u'Hi-Topic', u'Retona', u'Rocsta', u'Towner']

Upvotes: 1

Related Questions