Sonu
Sonu

Reputation: 41

How to scrape review data present in Read more in Flipkart reviews

I am trying to scrape Flipkart to extract reviews for a product using the requests and BeautifulSoup packages. How can I extract the review text that is hidden behind the "Read more" click event in those reviews?

Upvotes: 2

Views: 1564

Answers (2)

user3415910
user3415910

Reputation: 490

I had some issues using @CSMaverick's code while accessing the READ MORE link, so I modified the code to fit my requirements.

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs

def get_source_code(browser):
    """Expand every "Read more" link on the current page and return its HTML.

    Clicks each element carrying Flipkart's "Read more" class ('_1BWGvX') so
    the truncated review bodies are fully present in the DOM, then returns the
    resulting page source for parsing.

    :param browser: a Selenium WebDriver with a reviews page already loaded
    :return: the page source (str) after all reviews have been expanded
    """
    for read_more in browser.find_elements_by_class_name('_1BWGvX'):
        read_more.click()
    return browser.page_source


def collect_reviews_attributes(html):
    """Parse a Flipkart reviews page and collect (heading, text, rating) triples.

    :param html: full HTML source of a reviews page (after all "Read more"
        links have been expanded, e.g. via get_source_code)
    :return: list of (heading, review_text, rating) tuples; zip() truncates
        to the shortest of the three lists if counts differ
    """
    soup_obj = bs(html, "html.parser")
    # BUG FIX: the original passed a *set* ({"class", "t-ZTKy"}) instead of a
    # dict, so find_all did not actually filter on the class value.  attrs
    # must map the attribute name to the expected value.
    text_tag_divs = soup_obj.find_all('div', attrs={"class": "t-ZTKy"})
    heading_tag_divs = soup_obj.find_all('p', attrs={"class": "_2-N8zT"})
    rating_tag_divs = soup_obj.find_all('div', attrs={"class": "_3LWZlK _1BLPMq"})
    text_tags = [tag.text for tag in text_tag_divs]
    heading_tags = [tag.text for tag in heading_tag_divs]
    rating_tags = [tag.text for tag in rating_tag_divs]
    return list(zip(heading_tags, text_tags, rating_tags))
    
# Accumulates (heading, text, rating) tuples across all scraped pages.
collector_list = []


browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
# Number of review pages to walk; read it from the page dynamically, or use a
# large number and stop when an exception is raised.
num_pages = 3

browser.get(url)  # open the reviews URL in the browser
page_index = 0
while page_index < num_pages:
    html = get_source_code(browser)
    collector_list.extend(collect_reviews_attributes(html))
    # "Previous" and "Next" share the same class; the last match is "Next".
    pagination_links = browser.find_elements_by_class_name('_1LKTO3')
    pagination_links[-1].click()
    page_index += 1

Upvotes: 0

Chandu
Chandu

Reputation: 2129

from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2
import re
from bs4 import BeautifulSoup
import unicodedata

def remove_non_ascii_1(text):
    """Return *text* with every non-ASCII character replaced by a space.

    ASCII characters (code point < 128) pass through unchanged; anything
    else becomes a single ' '.
    """
    cleaned = []
    for ch in text:
        cleaned.append(ch if ord(ch) < 128 else ' ')
    return ''.join(cleaned)

with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
    browser.get(site)

    # Walk review pages 1..9, appending every review to review.txt.
    # `with` guarantees the file is closed even if scraping raises.
    # (renamed from `file`, which shadowed the builtin)
    with open("review.txt", "w") as out_file:
        for count in range(1, 10):
            # Pagination buttons; find the one whose label matches `count`.
            nav_btns = browser.find_elements_by_class_name('_33m_Yg')

            button = None
            for btn in nav_btns:
                if int(btn.text) == count:
                    button = btn
                    break

            # BUG FIX: the original used button = "" as a sentinel and then
            # crashed with AttributeError on send_keys when the page number
            # was not among the visible buttons; skip that page instead.
            if button is None:
                continue

            button.send_keys(Keys.RETURN)
            # Wait for the review headings of the newly loaded page.
            WebDriverWait(browser, timeout=10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))

            read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
            for rm in read_more_btns:
                # Scroll the link into view, then back up past the sticky
                # header so the click is not intercepted.
                browser.execute_script("return arguments[0].scrollIntoView();", rm)
                browser.execute_script("window.scrollBy(0, -150);")
                rm.click()

            soup = BeautifulSoup(browser.page_source, "lxml")
            ans = soup.find_all("div", class_="_3DCdKt")

            for tag in ans:
                # BUG FIX: unicode() is Python 2 only; str() is the Python 3
                # equivalent here.  Curly quotes are normalized to ASCII "'".
                title = str(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
                title = remove_non_ascii_1(title)
                content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
                content = remove_non_ascii_1(content)
                # (Removed no-op .encode('ascii','ignore') calls whose return
                # values were discarded in the original.)
                # Strip the prettify() wrapper tags around the review body.
                content = content[15:-7]

                # Two vote counters per review: first is up, second is down.
                votes = tag.find_all("span", class_="_1_BQL8")
                upvotes = int(votes[0].string)
                downvotes = int(votes[1].string)

                out_file.write("Review Title : %s\n\n" % title)
                out_file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
                out_file.write("Review Content :\n%s\n\n\n\n" % content)

Usage:

  1. Install the requirements by running pip install bs4 selenium.
  2. Add geckodriver to the PATH (see Mozilla's geckodriver installation instructions).
  3. Put the link of the product in site variable inside the script.
  4. Run the script by running python scrape.py.
  5. Reviews will be saved in the file review.txt.

Upvotes: 2

Related Questions