Clara HL
Clara HL

Reputation: 41

Issue when trying to select an option from a list for scraping - Python

I am trying to scrape the table contained in the following page: https://predictioncenter.org/casp14/results.cgi?view=tables&target=T1024&model=1&groups_id=

At the top of the table, I want to change the model from "1" to "- All -". I wrote the following lines of code:

# Simplified attempt: the option's display text "- All -" (spaces included) is
# written directly into the `model` query parameter before loading the page.
link = f"https://predictioncenter.org/casp14/results.cgi?view=tables&target=T1024&model=- All -&groups_id="
browser.get(link)

but this isn't working.

When I replace model=- All - with model=1, the code works, so I suspect there is something going on with my - All - option, but I can't figure out what.

Full code below with the loop through all Targets and Model options (the version above was simplified):

from bs4 import BeautifulSoup,NavigableString, Tag 
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import csv
import os
import numpy as np

# Script from the question: open predictioncenter.org in Chrome, navigate to
# the T1024 results page, read the available "target" and "model" options from
# the page, then load one results URL per target.
os.chdir('THE DIRECTORY WHERE YOUR CHROMEDRIVER IS')
# NOTE(review): these ChromeOptions (headless, logging switch excluded) are
# built but never passed to webdriver.Chrome() below, and the name `options`
# is rebound to a list of <option> tags further down.
options = webdriver.ChromeOptions()
options.add_argument("headless")
options.add_experimental_option('excludeSwitches', ['enable-logging'])


browser = webdriver.Chrome(executable_path='THE DIRECTORY WHERE YOUR CHROMEDRIVER IS/chromedriver')
browser.get("https://predictioncenter.org/") #open page in browser

# NOTE(review): `df` is not used again in the lines shown.
df = pd.DataFrame()

# Navigate by clicking three links in sequence; each lookup takes the first
# match. The result of click() is stored in `x`, which is never read afterwards.
x = browser.find_elements(By.XPATH, "//a[contains(@id, 'ygtvlabelel6')]")[0].click()
x = browser.find_elements(By.XPATH, "//a[contains(@href, 'results.cgi')]")[0].click()
x = browser.find_elements(By.XPATH, "//a[contains(@id, 'a_T1024')]")[0].click()   

# Parse the current page so the <select> elements can be inspected.
content = browser.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content,"html.parser")
    
#Get all possible options 
# Collects the visible text of every <option> under <select name="target">.
# This assignment replaces the ChromeOptions object bound to `options` above.
options = soup.find("select",{"name":"target"}).findAll("option")
list_prot = []
for i in options:
    name = i.text
    list_prot.append(name)

# Same for <select name="model">: the option *text* is stored, not the
# option's `value` attribute.
type_model = soup.find("select",{"name":"model"}).findAll("option")    
model_t=[]
for i in type_model:
    name = i.text
    model_t.append(name)

# First model option's display text (presumably "- All -", per the question
# text); it is interpolated unencoded into the URL below.
mod=model_t[0]

# NOTE(review): `i` is never incremented in the lines shown, so print(i)
# outputs 0 on every iteration; `final` is not used in the lines shown.
i=0
final=pd.DataFrame()
for target in list_prot:
    print(i)
    link = f"https://predictioncenter.org/casp14/results.cgi?view=tables&target={target}&model={mod}&groups_id="
    browser.get(link)

Upvotes: 0

Views: 59

Answers (2)

Md. Fazlul Hoque
Md. Fazlul Hoque

Reputation: 16187

You can select '- All -' from the model dropdown using Selenium: click on the dropdown, then select the desired value with the select_by_index() method, and it should work as expected.

Full Working code:

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select


# pandas >= 2.1 deprecates passing a literal HTML string to read_html();
# wrapping the markup in StringIO is the documented replacement.
from io import StringIO

# Start Chrome using a driver binary resolved by webdriver_manager.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

URL = 'https://predictioncenter.org/casp14/results.cgi?view=tables&target=T1024&model=1&groups_id='

# Everything after the driver is created runs inside try/finally so the
# browser is always closed, even when a wait times out or parsing fails.
try:
    driver.get(URL)
    driver.maximize_window()
    time.sleep(5)

    # Click the third cell of the first row of the element with class "table"
    # (per the answer text, this is where the model dropdown is opened).
    WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '.table > tbody tr:first-child > td:nth-child(3)')
        )
    ).click()
    time.sleep(2)

    # Pick the first entry of <select id="model">.
    dropdown = Select(driver.find_element(By.CSS_SELECTOR, 'select#model'))
    time.sleep(2)
    dropdown.select_by_index(0)
    time.sleep(2)  # fixed pause before reading the page source

    soup = BeautifulSoup(driver.page_source, "html.parser")
    table = soup.select_one('.table_results')
    if table is None:
        # Fail with a clear message instead of letting read_html choke on "None".
        raise RuntimeError("No element matching '.table_results' found in the page source")

    df = pd.read_html(StringIO(str(table)))[0]
    print(df)
finally:
    driver.quit()  # close browser

Output:

    0             1        2                 3           4   ...       24       25       26      27     28
0    General       General  General           General     General  ...  Handed.       TM       TM   FlexE    ASE
1          #         Model      GR#           GR Name      Charts  ...  Handed.  TMscore  TMalign   FlexE    ASE
2          #           NaN      NaN               NaN         NaN  ...      NaN      NaN      NaN     NaN    NaN
3        NaN         Model      NaN               NaN         NaN  ...      NaN      NaN      NaN     NaN    NaN
4        NaN           NaN      NaN               NaN         NaN  ...      NaN      NaN      NaN     NaN    NaN
..       ...           ...      ...               ...         ...  ...      ...      ...      ...     ...    ...
592     508.  T1024TS170_4    170 s  BhageerathH-Plus  A  D  I  G  ...     0.53     0.23     0.42  201.20    NaN
593     509.  T1024TS063_5    063 s          ACOMPMOD  A  D  I  G  ...     0.50     0.17     0.27  746.50  92.81
594     510.  T1024TS305_1    305 s        CAO-SERVER  A  D  I  G  ...     0.48     0.22     0.30  151.85  18.11
595     511.  T1024TS342_2      342             CUTSP  A  D  I  G  ...     0.49     0.15     0.28  410.17    NaN
596     512.  T1024TS217_5      217           CAO-QA1  A  D  I  G  ...     0.50     0.20     0.32  167.41  17.24

[597 rows x 29 columns]

Upvotes: 1

Barry the Platipus
Barry the Platipus

Reputation: 10460

Here is one way of getting that table containing -All- results:

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

# pandas >= 2.1 deprecates passing a literal HTML string to read_html();
# wrapping the markup in StringIO is the documented replacement.
from io import StringIO

# Request headers sent with the form POST.
headers = {
    'Origin': 'https://predictioncenter.org',
    'Referer': 'https://predictioncenter.org/casp14/results.cgi',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
# Form fields posted to results.cgi. 'model' is left empty here — presumably
# the value behind the "- All -" option, given the stated goal of this answer.
payload = {
    'target': 'T1024', 
    'groups_id': '',
    'model': '',
    'submit': 'Submit',
    'order': '',
    'field': '',
    'view': 'results',
    'lga_4_view': 'brief',
    'lga_5_view': 'brief',
    'dsc_view': 'brief',
    'dali_view': 'brief',
    'molprb_view': 'brief'
}

url = 'https://predictioncenter.org/casp14/results.cgi'

# requests has no default timeout, so set one to avoid hanging indefinitely,
# and surface HTTP errors instead of parsing an error page.
r = requests.post(url, headers=headers, data=payload, timeout=30)
r.raise_for_status()

results_table = bs(r.text, 'html.parser').select_one('table[class="table_results"]')
if results_table is None:
    # Fail with a clear message instead of letting read_html choke on "None".
    raise RuntimeError('No table with class="table_results" found in the response')

df = pd.read_html(StringIO(str(results_table)))[0]
print(df)

Result in terminal:

    0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28
0   General General General General General LGA Sequence Dependent (4Å) Full    LGA Sequence Dependent (4Å) Full    LGA Sequence Dependent (4Å) Full    LGA Sequence Independent (4Å) Full  LGA Sequence Independent (4Å) Full  MAMMOTH Dali Full   Molprobity Full lDDT    SphGr   CAD CAD RPF QCS QCS SOV CE  CoDM    DFM Handed. TM  TM  FlexE   ASE
1   #   Model   GR# GR Name Charts  GDT_TS  NP_P    Z-M1-GDT    AL0_P   AL4_P   Z-score Z-Score MP-Score    Global score    SG  AA  SS  RPF QCS contS   SOV CE  CoDM    DFM Handed. TMscore TMalign FlexE   ASE
2   1.  T1024TS427_3    427 AlphaFold2  A D I G 79.22   100.00  NaN 82.61   96.16   13.16   58.6    1.05    0.89    99.62   0.82    0.69    0.90    96.35   96.86   82.30   7.84    0.99    0.04    0.97    0.93    0.93    0.35    90.02
3   2.  T1024TS427_5    427 AlphaFold2  A D I G 71.67   100.00  NaN 69.82   86.70   10.97   54.8    1.10    0.88    99.62   0.82    0.68    0.88    94.42   96.61   79.70   7.74    0.98    0.06    0.95    0.88    0.88    0.35    83.46
4   3.  T1024TS226_5    226 s   Zhang-TBM   A D I G 65.73   100.00  NaN 71.10   85.42   11.57   41.8    1.94    0.64    84.27   0.65    0.39    0.71    82.88   81.55   83.80   7.64    0.92    0.33    0.92    0.87    0.87    3.40    NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
509 508.    T1024TS170_4    170 s   BhageerathH-Plus    A D I G 9.46    100.00  NaN 1.02    6.65    0.66    2.2 0.97    0.20    12.40   0.40    0.07    0.24    25.99   41.23   40.90   4.74    0.21    2.02    0.53    0.23    0.42    201.20  NaN
510 509.    T1024TS063_5    063 s   ACOMPMOD    A D I G 9.02    100.00  NaN 0.26    0.51    -0.07   0   4.98    0.06    10.10   0.30    0.05    0.16    21.49   29.85   37.60   3.70    0.08    2.17    0.50    0.17    0.27    746.50  92.81
511 510.    T1024TS305_1    305 s   CAO-SERVER  A D I G 8.76    100.00  -3.36   0.00    0.00    -0.80   0   4.74    0.11    18.67   0.41    0.07    0.23    26.36   48.22   45.30   3.50    0.09    2.32    0.48    0.22    0.30    151.85  18.11
512 511.    T1024TS342_2    342 CUTSP   A D I G 8.76    100.00  NaN 0.00    0.00    0.75    4.4 2.98    0.20    12.66   0.42    0.08    0.26    24.47   43.00   42.50   5.33    0.15    2.23    0.49    0.15    0.28    410.17  NaN
513 512.    T1024TS217_5    217 CAO-QA1 A D I G 8.44    100.00  NaN 1.02    5.37    -1.15   0   NaN 0.08    36.92   0.42    0.00    0.10    23.51   48.36   56.20   3.70    0.21    2.00    0.50    0.20    0.32    167.41  17.24
514 rows × 29 columns

You may want to select another row for the table headers - see the relevant pandas documentation (for example, the `header` parameter of `pandas.read_html`).

Upvotes: 2

Related Questions