Vinayakumar R
Vinayakumar R

Reputation: 57

Extracting user comments from news website

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait(dr, x):
  element = WebDriverWait(dr, 50).until(
    EC.presence_of_all_elements_located((By.XPATH, x))
)
return element
from selenium import webdriver
browser = webdriver.Firefox()
browser.get("http://www.dinamalar.com/user_comments.asp? uid=14701&name=%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%9A%E0%AF%86%E0%AE%B2%E0%AF%8D%E0%AE%B5%E0%AE%A9%E0%AF%8D")
for elem in wait(browser, '//*[@id="commsec"]/div[2]/div[1]'):
print elem.text

This is the link i need to extract all the comments http://www.dinamalar.com/user_comments.asp?uid=14701&name=%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%9A%E0%AF%86%E0%AE%B2%E0%AF%8D%E0%AE%B5%E0%AE%A9%E0%AF%8D

But my code extracting only the first 10 comments. After clicking the button the other 10 comments are loaded dynamically. How to extract all these comments using python selenium

Upvotes: 4

Views: 125

Answers (1)

alecxe
alecxe

Reputation: 473833

The idea would be to look for how many "more ideas" elements are present on the page. Every time you click the button and load more comments, one more "more ideas" red button becomes present. Implementation:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver


browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
browser.get("http://www.dinamalar.com/user_comments.asp?uid=14701&name=%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%9A%E0%AF%86%E0%AE%B2%E0%AF%8D%E0%AE%B5%E0%AE%A9%E0%AF%8D")

# initial wait for the page to load
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".morered")))

pages = 1
while True:
    browser.find_elements_by_css_selector(".morered")[-1].click()

    # wait for more "load more" buttons to be present
    try:
        wait.until(lambda browser: len(browser.find_elements_by_css_selector(".morered")) > pages)
    except TimeoutException:
        break  # no more data loaded, exit the loop

    print("Comments loaded: %d" % len(browser.find_elements_by_css_selector(".dateg")))

    pages += 1

browser.close()

Note that I've also removed that extra space inside the URL.

Upvotes: 2

Related Questions