Reputation: 1511
I want to scrape all of the data on this site.
This part of my script will click the 'search' button necessary to produce the rows of data that I want to scrape:
from selenium import webdriver
import os
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time
import sys
import re
import requests
# Headless mode is left disabled: enabling the line below would also require
# creating an `options` object, which this script never does.
#options.add_argument("--headless")

# Start Chrome through the chromedriver binary at this path and load the
# hERGdb landing page.
driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
base_url = 'https://drugdesign.riken.jp/hERGdb/'
driver.get(base_url)

# Click the button that says search. click() returns None, so the result is
# deliberately not stored in a variable.
driver.find_element(By.CSS_SELECTOR, '[name=Structure_Search]').click()
Then I need to click on each LOT_ID, which would bring me to a page like this, which I can scrape with this code:
# Load a single compound page directly by its HGID.
base_url = 'https://drugdesign.riken.jp/hERGdb/compound.php?HGID=HG-0260086'
driver.get(base_url)

## compound information table
# Every field is read the same way: find the <th> whose text contains the
# label, then take the first <td> that follows it in the document.
cell_xpath = '//tbody/tr/th[contains(.,"{}")]/following::td[1]'
hgid = driver.find_element(By.XPATH, cell_xpath.format('HGID'))
drug_name = driver.find_element(By.XPATH, cell_xpath.format('Drug_name'))
MW = driver.find_element(By.XPATH, cell_xpath.format('MW'))
Formula = driver.find_element(By.XPATH, cell_xpath.format('Formula'))

## ID relation table
# find_elements returns a list, so a page without this table prints nothing
# instead of raising.
id_table = driver.find_elements(By.XPATH, '/html/body/div[2]/div/div/div[2]/table[2]/tbody')
for x in id_table:
    print(x.text)

## in vitro assay information table
assay_data = driver.find_elements(By.XPATH, '/html/body/div[2]/div/div/div[2]/table[3]/tbody')
for x in assay_data:
    print(x.text)
I cannot understand how to loop through ALL of the LOT_IDs on the site (e.g. only 10 are displayed per page, and there seem to be >300,000 results but only 1,000 are displayed). So the ultimate question is: how do I loop through ALL >300,000 LOT_IDs that the site reports as the results of my search, so that I can run the second part of my code (above), which runs on an individual page, on each of them?
I have been looking through SO, I've tried something like:
#table = driver.find_element_by_css_selector('//*[@id="foo-table"]/tbody/tr[1]/td[3]/a')
#print(table)
and similar using XPaths etc, but I get errors like:
selenium.common.exceptions.InvalidSelectorException: Message: invalid selector: An invalid or illegal selector was specified
(Session info: chrome=77.0.3865.90)
So if someone could fill in the middle part of my code (I think it should only be one or two lines max?) that would show me how to loop through the >300,000 LOT_IDs and click on them, to bring me to the page that I then scrape, I would appreciate it.
Upvotes: 1
Views: 87
Reputation: 33384
Here is Selenium code to get all 1000 links. Use a while loop to iterate over each page and collect the links until the Next button is disabled.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
# Start Chrome and open the hERGdb landing page.
driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
base_url = 'https://drugdesign.riken.jp/hERGdb/'
driver.get(base_url)

# Wait up to 20 seconds for the search button to become clickable, then click it.
WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, "//input[@name='Structure_Search']"))
).click()

# Walk through the paginated result table, collecting every link on each page
# until the "Next" button is disabled.
targeturl = []
while True:
    elements = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "#foo-table_filter ~ #foo-table a")
        )
    )
    targeturl.extend(ele.get_attribute('href').replace('./', '') for ele in elements)

    # A "next" button carrying the "disabled" class marks the last page.
    if driver.find_elements(By.CSS_SELECTOR, "a.paginate_button.next.disabled"):
        break

    nextbutton = driver.find_element(By.CSS_SELECTOR, "a.paginate_button.next")
    # Reading this property is done for its side effect; it presumably scrolls
    # the button into view so the click below can reach it (confirm against
    # the Selenium WebElement documentation).
    nextbutton.location_once_scrolled_into_view
    nextbutton.click()

print(len(targeturl))
print(targeturl)
Upvotes: 0
Reputation: 12255
You can get all the links using requests and BeautifulSoup. The code below prints all 1000 links:
import requests
from bs4 import BeautifulSoup
base_url = "https://drugdesign.riken.jp/hERGdb"

# Form fields posted to result.php. A list of tuples (not a dict) is used
# because several keys, such as 'Target[]', are repeated with different values.
data = [
    ('smiles_S', ''),
    ('jme_S', ''),
    ('tab_selected', 'tab_S'),
    ('query_type', 'Substructure'),
    ('Target[]', 'hERG'),
    ('Target[]', 'Cav1.2'),
    ('Target[]', 'Nav1.5'),
    ('Target[]', 'Kv1.5'),
    ('Value_type[]', 'IC50'),
    ('Value_type[]', 'inhibition'),
    ('Value_type[]', 'other'),
    ('Assay_type[]', 'binding'),
    ('Assay_type[]', 'patch clamp'),
    ('Assay_type[]', 'other'),
    ('Data_source[]', 'ChEMBL'),
    ('Data_source[]', 'PubChem_CID'),
    ('Data_source[]', 'hERG Central(PubChem_SID)'),
    ('low_MW', ''),
    ('high_MW', ''),
    ('Assay_name', ''),
    ('Structure_Search', 'Search'),
]

# Submit the search form. The timeout keeps the script from hanging forever on
# an unresponsive server, and raise_for_status() stops an HTTP error page from
# being parsed as if it were a result page.
response = requests.post(f'{base_url}/result.php', data=data, timeout=30)
response.raise_for_status()

# Select every anchor whose href starts with the relative compound-page prefix.
lots = BeautifulSoup(response.text, "html.parser").select("a[href^='./compound.php?HGID=']")
for lot in lots:
    # Drop the leading "./" so the path can be appended to base_url.
    url = str(lot['href']).replace("./", "")
    print(f"{base_url}/{url}")
Upvotes: 2