bilakos

Reputation: 145

Problem with Instagram scraping using Selenium when trying to append URLs to a list of URLs

Guys, I may have a tricky problem here. I'm trying to build a bot that downloads all the photo/video URLs of an Instagram account, appends them to a list, and finally saves them to a file. While testing it, I noticed that the list always contained exactly 51 URLs: each time the program collected new URLs, they replaced the previous 51 in the list, and the older URLs disappeared, instead of being added on top of the existing ones so the list would keep growing. Why is this happening? I need your knowledge, guys :)

The code of the bot is below:

#Here is run.py, from which I'm running the program

import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import autoit
from selenium.webdriver.common.keys import Keys
import requests
import coockies
import PopUpsClose
import login
import link
import url_extraxction


def main():
    #Emulate a mobile device so Instagram loads like it does on a smartphone
    mobile_emulation = {
        "deviceMetrics": { "width": 360, "height": 640, "pixelRatio": 3.0 },
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19" }
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(options = chrome_options)

    browser.get('https://www.instagram.com/accounts/login/')

    coockies.close_coockies(browser)
    login.Insta_login(browser)
    PopUpsClose.pop_up(browser)

    ######################################
    #Read the profile URL from the file
    url = link.page_link(browser)
    browser.get(url)
    sleep(5)

    #Scroll down the page and collect the URLs
    url_extraxction.extract(browser, url)

  


if __name__ == '__main__':
    main()

Here is the login function

from time import sleep

def Insta_login(browser):
    #login.txt holds the username on the first line and the password on the second
    with open(r'C:\Users\bilakos\Desktop\PYTHON_PROJECTS\InstaAutoPhotoUpload\login.txt', 'r') as login_file:
        username = login_file.readline().rstrip("\n")
        password = login_file.readline().rstrip("\n")

    sleep(2)
    browser.find_element_by_xpath("""//*[@id="loginForm"]/div[1]/div[3]/div/label/input""").send_keys(username_)
    browser.find_element_by_xpath("""//*[@id="loginForm"]/div[1]/div[4]/div/label/input""").send_keys(password) 
    sleep(2)
    browser.find_element_by_xpath("""/html/body/div[1]/section/main/div[1]/div/div/div/form/div[1]/div[6]/button/div""").click()

    sleep(10)

Here is the coockies function

def close_coockies(browser):
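    #Find and click the accept button on the cookies banner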
    coockies_accept = browser.find_element_by_xpath("""/html/body/div[2]/div/div/div/div[2]/button[1]""")
    coockies_accept.click()

Here is the PopUpsClose function

from time import sleep

def pop_up(browser):
    #Here it finds the button to close the 1st pop-up
    not_now_button = browser.find_element_by_xpath("""/html/body/div[1]/section/main/div/div/div/button""")
    not_now_button.click()
    sleep(10)
    #Here it finds the button to close the 2nd pop-up
    not_now_button2 = browser.find_element_by_xpath("""/html/body/div[4]/div/div/div/div[3]/button[2]""")
    not_now_button2.click()
    sleep(2)

And last is the url_extraxction function, in which I have the problem:

from time import sleep
import requests
import os


def extract(browser, url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 OPR/73.0.3856.329"}
    requests.get(url, headers = header)
    #SCROLL DOWN
    print("This process maybe it will take like 5 minutes.\n", "Don't close the program......")
    last_height = 0
    proceed = False
    while True:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(1)

        #GET THE URLS
        elements = browser.find_elements_by_xpath('//a[@href]')
        links = []
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)
        print(links)
        sleep(2)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        proceed = True
    sleep(10)

    #Create a folder with the name of the profile
    if proceed:
        name = browser.find_element_by_class_name("_7UhW9.fKFbl.yUEEX.KV-D4.fDxYl")
        text = name.text
        print("Wait while a folder is created for the extracted links.\nPlease don't close the program.")
        sleep(5)
        path = "C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\" + text
        sleep(2)
        try:
            os.mkdir(path)
        except FileExistsError:
            print('The folder already exists.')
        link_extraction = open(path + '\\extracted_links.txt', 'w')
        sleep(2)
        print("The extracted_links.txt file is created.")
        for i in links:
            link_extraction.write(i + '\n')
        link_extraction.close()
        sleep(2)
        print('The links were transferred successfully to the file.')
    

Inside the url_extraxction function there is a #GET THE URLS comment, and right after it is where the problem occurs.

Upvotes: 0

Views: 198

Answers (1)

Lesmana

Reputation: 27053

In your while loop you are redefining the list every time you scroll, so in effect you are only saving the links from the last scroll to the file.

def extract(browser, url):
    ...
    while True:
        # scroll down
        ...
        links = [] # <--- (1) ---
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls) # <--- (2) ---
        print(links)
        ...
        # check if at end and if yes then break out of loop

At (1) you define a new list, and at (2) you append to it. But in the next iteration of the while loop you define a new list at (1) again, so the previously collected items are lost.

To keep the results, you must define the list outside of the while loop:

def extract(browser, url):
    ...
    links = [] # <--- (1) ---
    while True:
        # scroll down
        ...
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls) # <--- (2) ---
        print(links)
        ...
        # check if at end and if yes then break out of loop
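To see the difference without any Selenium involved, here is a minimal standalone sketch (scroll_batches is a hypothetical stand-in for the links found after each scroll):

# hypothetical stand-in for the links found after each scroll
scroll_batches = [['a', 'b'], ['b', 'c'], ['c', 'd']]

# wrong: the list is recreated on every pass, so only the last batch survives
for batch in scroll_batches:
    links = []
    for url in batch:
        if url not in links:
            links.append(url)
print(links)  # ['c', 'd']

# right: the list is created once and accumulates across all passes
links = []
for batch in scroll_batches:
    for url in batch:
        if url not in links:
            links.append(url)
print(links)  # ['a', 'b', 'c', 'd']

As a side note, the check "if urls not in links" is a linear scan, so for very large profiles keeping a set of already-seen URLs next to the list would make the membership test much faster.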

Upvotes: 1
