Reputation: 176
I'm trying to scrape Instagram IGTV data (e.g., video title, views, likes, comments, etc.). At first I was using only BeautifulSoup, but I was only able to get the first 12 video details. Then I started using Selenium, and now I'm able to get the first 24 video details. However, I need to scrape all of the videos.
The code below gives me hyperlinks for the first 24 videos; I then scrape the video details from each of those hyperlinks:
"""Scrape the IGTV video links from an Instagram channel page.

Renders the page with headless Chrome (the link grid is built by
JavaScript, so a plain HTTP fetch would miss it), then parses the
rendered HTML with BeautifulSoup.
"""
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url = 'https://www.instagram.com/agt/channel/?hl=en'

# Headless Chrome: no visible browser window needed.
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# The 'chrome_options' keyword is deprecated (removed in Selenium 4);
# pass the Options object via 'options' instead.
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(3)  # crude wait for the page's JS to render the first batch of links
page = driver.page_source
driver.quit()

soup = BeautifulSoup(page, 'html.parser')

# Build absolute URLs from every video anchor on the page.
# NOTE(review): '_bz0w' is an obfuscated, auto-generated Instagram class
# name and can change whenever Instagram redeploys — verify it still matches.
video_links = ['https://www.instagram.com' + a['href']
               for a in soup.find_all('a', class_='_bz0w', href=True)]
print(video_links)
Please suggest how I can get all of the video details.
Upvotes: 0
Views: 472
Reputation: 1956
You probably need to scroll down to load more results. You can do something like:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
to do so.
Combining this with an answer found elsewhere, we can keep scrolling down until the end of the page is reached:
"""Scrape ALL IGTV video links from an Instagram channel page.

Instagram lazy-loads the video grid, so the script repeatedly scrolls
the headless browser to the bottom of the page until the document
height stops growing (no more content to load), and only then parses
the fully rendered HTML.
"""
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url = 'https://www.instagram.com/agt/channel/?hl=en'

# Headless Chrome: no visible browser window needed.
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# The 'chrome_options' keyword is deprecated (removed in Selenium 4);
# pass the Options object via 'options' instead.
driver = webdriver.Chrome(options=options)
driver.get(url)

SCROLL_PAUSE_TIME = 1  # seconds to wait for each lazy-load batch to render

# Record the initial document height so we can detect when scrolling
# stops producing new content.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll to the bottom to trigger the next lazy-load batch.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Give the page time to fetch and render the new items.
    time.sleep(SCROLL_PAUSE_TIME)
    # If the document height did not grow, we've reached the end of the feed.
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

page = driver.page_source
driver.quit()

soup = BeautifulSoup(page, 'html.parser')

# Build absolute URLs from every video anchor on the (now fully loaded) page.
# NOTE(review): '_bz0w' is an obfuscated, auto-generated Instagram class
# name and can change whenever Instagram redeploys — verify it still matches.
video_links = ['https://www.instagram.com' + a['href']
               for a in soup.find_all('a', class_='_bz0w', href=True)]
print(len(video_links))
Upvotes: 1