user14894074
user14894074

Reputation:

Selenium not returning the correct number of child elements?

I've been trying multiple ways to get the child elements of this section, but it seems to be wildly off. The website I'm using is https://www.thecompleteuniversityguide.co.uk/courses/details/computing-bsc/57997898 except I am to do this multiple times over multiple pages so I'm just focusing on making sure that just one webpage can return what I need. Currently this is what I have:

import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

opts = Options()
opts.add_argument('--headless')

driver = Chrome(options = opts, executable_path = 'D:\Programs\Python\chromedriver.exe')

driver.get("https://www.thecompleteuniversityguide.co.uk/courses/details/computing-bsc/57997898")

closeButton = driver.find_element_by_xpath("//a[@id='closeFilter']")
closeButton.click()
driver.find_element_by_xpath("//a[@id='acceptCookie']").click()

modules_container = driver.find_element_by_xpath("//div[@data-sub-sec='Modules']").find_element_by_class_name("cdsb_rt")
numberOfModulesByYear = len(modules_container.find_elements_by_xpath("//div[@class='mdldv']"))

for moduleYear in range(numberOfModulesByYear):
      print("_"*75)
      moduleYearButtonString = "//div[@class='mdldv' and @data-module-sections='{}']".format(str(moduleYear))
      module_year = modules_container.find_element_by_xpath(moduleYearButtonString)
      module_year_a = module_year.find_element_by_tag_name("a")
      while module_year_a.find_element_by_tag_name("span").get_attribute("class") != "icon icon-minus": 
            module_year_a.click()
            print(module_year_a.find_element_by_tag_name("span").get_attribute("class"))
      numberOfModules = module_year.find_elements_by_xpath("//div[@class='mdiv']")
      print(len(numberOfModules))

driver.close()

The output is:

___________________________________________________________________________
icon icon-minus
0
___________________________________________________________________________
icon icon-minus
10
___________________________________________________________________________
icon icon-minus
10
___________________________________________________________________________
icon icon-minus
15

However, in order of modules on the website it's 5,5,4,1 Does anyone know how to fix this? (Also appears to me that Selenium is not looking exclusively within the element for child elements and also is returning child elements in its sibling elements which is NOT what I want.

Edit: Still have no idea why it's returning sibling child elements but I figured I would hav eto wait for it load:

for moduleYear in range(numberOfModulesByYear):
      print("_"*75)
      moduleYearButtonString = "//div[@class='mdldv' and @data-module-sections='{}']".format(str(moduleYear))
      module_year = modules_container.find_element_by_xpath(moduleYearButtonString)
      module_year_a = module_year.find_element_by_tag_name("a")
      while module_year_a.find_element_by_tag_name("span").get_attribute("class") != "icon icon-minus": 
            module_year_a.click()
      while len(module_year.find_elements_by_xpath("//div[@class='mdiv']")) - previousNumberOfModules == 0:
            time.sleep(0.01)
      numberOfModules = len(module_year.find_elements_by_xpath("//div[@class='mdiv']")) - previousNumberOfModules
      previousNumberOfModules = len(module_year.find_elements_by_xpath("//div[@class='mdiv']"))
      print(numberOfModules)

Still have no idea why it's returning sibling child elements though, when I specified the exact element I want it to look inside of.

Upvotes: 0

Views: 251

Answers (2)

QHarr
QHarr

Reputation: 84465

You could do the whole thing with requests as the page uses ajax requests to get the module info for courses. If you visit a given course page, you can extract the module ids for the initial ajax request to get the modules; then extract the description ids for each module to use in a subsequent request.

I have used Session for efficiency of tcp re-use. Also, tried to abstract in hope you can use with different course id (at this point starts to feel like could re-write as class)......

import requests, pprint
from bs4 import BeautifulSoup as bs

course_id = '57997898'

url_desc = 'https://www.thecompleteuniversityguide.co.uk/cug2/ajax/get-course-modules-description.html'
url_modules = 'https://www.thecompleteuniversityguide.co.uk/cug2/ajax/get-course-modules.html'

headers = {
    'User-Agent': 'Mozilla/5.0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'https://www.thecompleteuniversityguide.co.uk/courses/details/computing-bsc/' + course_id,
}

results = {}

with requests.Session() as s:
    s.headers = headers
    r = s.get(f'https://www.thecompleteuniversityguide.co.uk/courses/details/computing-bsc/{course_id}#modules')
    soup = bs(r.content, 'lxml')
    module_ids = {i.a.text.strip():i.select_one('.mdtt')['data-module-group'] for i in soup.select('.mdldv')} #gather the module ids

    for k,v in module_ids.items():
        results[k] = {}
        data = {'moduleGroupId': v, 'courseId': course_id}
        r = s.post(url_modules,data=data) 
        soup = bs(r.text, 'lxml')
        modules = {i.select_one('.mdltxt').text.strip():i.a['data-modules-list'] 
                   for i in soup.select('.mdiv') if i.select_one('.mdltxt')}
        # print(modules)
        if modules: #check there are actually modules (last doesn't have any)
            for k2,v2 in modules.items():
                data = {'moduleGroupId': v, 'moduleId': v2}
                r = s.post(url_desc, data=data)
                soup = bs(r.content, 'lxml') 
                desc = soup.select_one('.mdesc').text.strip()
                results[k][k2] = desc
                # print(desc)
pprint.pprint(results)

Upvotes: 0

user14894074
user14894074

Reputation:

import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as BSoup

opts = Options()
opts.add_argument('--headless')

driver = Chrome(executable_path = 'D:\Programs\Python\chromedriver.exe', options = opts)

driver.get("https://www.thecompleteuniversityguide.co.uk/courses/details/computing-bsc/57997898")

closeButton = driver.find_element_by_xpath("//a[@id='closeFilter']")
closeButton.click()
driver.find_element_by_xpath("//a[@id='acceptCookie']").click()

modules_container = driver.find_element_by_xpath("//div[@data-sub-sec='Modules']").find_element_by_class_name("cdsb_rt")
numberOfModulesByYear = len(modules_container.find_elements_by_xpath("//div[@class='mdldv']"))
previousNumberOfModules = 0
numberOfModulesTab = []

print("-"*100)
for moduleYear in range(numberOfModulesByYear):
      moduleYearButtonString = "//div[@class='mdldv' and @data-module-sections='{}']".format(str(moduleYear))
      module_year = modules_container.find_element_by_xpath(moduleYearButtonString)
      print(module_year.get_attribute("innerHTML"))
      module_year_a = module_year.find_element_by_tag_name("a")
      time.sleep(0.5)
      while module_year_a.find_element_by_tag_name("span").get_attribute("class") == "icon icon-add": 
            module_year_a.click()
      while len(module_year.find_elements_by_xpath("//div[@class='mdiv']")) - previousNumberOfModules == 0:
            time.sleep(0.01)
      listOfModules = module_year.find_elements_by_xpath("//div[@class='mdiv']")
      numberOfModules = len(listOfModules) - previousNumberOfModules
      previousNumberOfModules = len(module_year.find_elements_by_xpath("//div[@class='mdiv']"))
      numberOfModulesTab.append(numberOfModules)
      print("-"*100)

alreadyExistingModules = []
moduleIndex = 0
numberOfDescriptions = 0

allTheModules = modules_container.find_elements_by_xpath("//div[@class='mdiv']")
for moduleNumber, module in enumerate(allTheModules):
      outputLines = False
      module_a = module.find_elements_by_tag_name("a")
      if len(module_a) > 0:
            module_a = module_a[0]
            soup = BSoup(module_a.get_attribute("innerHTML"), "html.parser")
            moduleName = soup.find("span", class_='mdltxt').getText()
            if not (moduleName in alreadyExistingModules):
                  outputLines = True
                  moduleIndex += 1
                  alreadyExistingModules.append(moduleName)
                  print(moduleName.rstrip())
                  while module_a.find_element_by_tag_name("span").get_attribute("class") == "icon icon-add":
                        driver.execute_script("arguments[0].click()", module_a)
                  while len(module.find_elements_by_xpath("//p[@class='mdesc']"))-numberOfDescriptions == 0:
                        time.sleep(0.01)
                  soup = BSoup(module.get_attribute("innerHTML"), "html.parser")
                  print(soup.find("p", class_="mdesc").getText().rstrip())
                  numberOfDescriptions += 1
      else:
            soup = BSoup(module.get_attribute("innerHTML"), "html.parser")
            moduleName = soup.find("span", class_='mtxt').getText()
            if not (moduleName in alreadyExistingModules):
                  outputLines = True
                  moduleIndex += 1
                  alreadyExistingModules.append(moduleName)
                  print(moduleName.rstrip())
                  
      if outputLines:
            print("-"*100)

driver.close()

Had to wait at certain points because java script doesn't immediately execute and sometimes executes twice... and for some reason it was returning what I didn't want so I had to use beautifulSoup to get what I wanted but it works so :)

Upvotes: 2

Related Questions