Reputation: 145
Guys, I may have a tricky problem over here. I was trying to make a bot that downloads all the photo/video URLs of an Instagram account, appends them to a list, and at the end saves them to a file. But while checking whether it was working, I found that the list always contained 51 URLs: every time the program appended new URLs while running, the list's contents were replaced by the newest 51 URLs and the previous ones were removed, instead of the new URLs being added on top of the existing ones. Why is such a thing happening? I need your knowledge, guys :)
The code of the bot is below:
#Here is the run.py from where I'm running the program
import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import autoit
from selenium.webdriver.common.keys import Keys
import requests
import coockies
import PopUpsClose
import login
import link
import url_extraxction

def main():
    # Creates a mobile emulation profile so Instagram starts as it would on a smartphone
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    browser.get('https://www.instagram.com/accounts/login/')
    coockies.close_coockies(browser)
    login.Insta_login(browser)
    PopUpsClose.pop_up(browser)
    ######################################
    # Takes the profile URL from the file
    url = link.page_link(browser)
    browser.get(url)
    sleep(5)
    # Scrolls down the page and collects the URLs
    url_extraxction.extract(browser, url)

main()
Here is the login function
from time import sleep

def Insta_login(browser):
    login_file = open(r'C:\Users\bilakos\Desktop\PYTHON_PROJECTS\InstaAutoPhotoUpload\login.txt', 'r')
    username = login_file.readline()
    while username != '':
        password = login_file.readline()
        username_ = username.rstrip("\n")
        password = password.rstrip("\n")
        username = login_file.readline()
        sleep(2)
        browser.find_element_by_xpath("""//*[@id="loginForm"]/div[1]/div[3]/div/label/input""").send_keys(username_)
        browser.find_element_by_xpath("""//*[@id="loginForm"]/div[1]/div[4]/div/label/input""").send_keys(password)
        sleep(2)
        browser.find_element_by_xpath("""/html/body/div[1]/section/main/div[1]/div/div/div/form/div[1]/div[6]/button/div""").click()
        sleep(10)
    login_file.close()
Here is the coockies function
def close_coockies(browser):
    coockies_accept = browser.find_element_by_xpath("""/html/body/div[2]/div/div/div/div[2]/button[1]""")
    coockies_accept.click()
Here is the PopUpsClose function
from time import sleep

def pop_up(browser):
    # Finds the element that closes the 1st pop-up
    not_now_button = browser.find_element_by_xpath("""/html/body/div[1]/section/main/div/div/div/button""")
    not_now_button.click()
    sleep(10)
    # Finds the element that closes the 2nd pop-up
    not_now_button2 = browser.find_element_by_xpath("""/html/body/div[4]/div/div/div/div[3]/button[2]""")
    not_now_button2.click()
    sleep(2)
And last is the url_extraxction function, where I have the problem:
from time import sleep
import requests
import os

def extract(browser, url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 OPR/73.0.3856.329"}
    requests.get(url, headers=header)
    #SCROLL DOWN
    print("This process may take about 5 minutes.\n", "Don't close the program......")
    last_height = 0
    proceed = ''
    while True:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(1)
        #GET THE URLS
        elements = browser.find_elements_by_xpath('//a[@href]')
        links = []
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)
        print(links)
        sleep(2)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    if False:
        proceed = False
    else:
        proceed = True
    sleep(10)
    #Create a folder with the name of the profile
    if proceed == True:
        name = browser.find_element_by_class_name("_7UhW9.fKFbl.yUEEX.KV-D4.fDxYl")
        text = name.text
        print("Wait to create a Folder to pass the extracted links.\nPlease don't close the program.")
        print('' * 2)
        sleep(5)
        path = "C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\" + text
        sleep(2)
        try:
            os.mkdir(path)
            link_extraction = open('C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\' + text
                                   + '\\extracted_links.txt', 'w')
            sleep(2)
            print("The extracted_links.txt file is created.")
            print('' * 2)
            for i in links:
                link_extraction.write(i + '\n')
            link_extraction.close()
            sleep(2)
            print('The links transferred successfully to the file.')
        except FileExistsError:
            print('The file already exists.')
            link_extraction = open('C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\' + text
                                   + '\\extracted_links.txt', 'w')
            sleep(2)
            print("The extracted_links.txt file is created.")
            print('' * 2)
            for i in links:
                link_extraction.write(i + '\n')
            link_extraction.close()
            sleep(2)
            print('The links transferred successfully to the file.')
Inside the url_extraxction function I have a #GET THE URLS comment, and right after that is where the problem occurs.
Upvotes: 0
Views: 198
Reputation: 27053
In your while loop you are redefining the list every time you scroll, so in effect you are only saving the last scroll to the file.
def extract(browser, url):
    ...
    while True:
        # scroll down
        ...
        links = []  # <--- (1) ---
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)  # <--- (2) ---
        print(links)
        ...
        # check if at end and if yes then break out of loop
At (1) you are defining a new list. At (2) you are appending to the list. But in the next iteration of the while loop you define a new list again at (1), and the previous items are lost.
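A minimal standalone sketch shows the same effect; the hypothetical batches here stand in for the hrefs collected on each scroll:
for batch in (["a", "b"], ["c", "d"], ["e"]):
    links = []              # (1) a brand-new empty list on every pass
    for item in batch:
        links.append(item)  # (2) appends land in the fresh list
print(links)                # prints ['e'] -- everything before the last pass is gone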
To keep the results you must define the list outside of the while loop.
def extract(browser, url):
    ...
    links = []  # <--- (1) ---
    while True:
        # scroll down
        ...
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)  # <--- (2) ---
        print(links)
        ...
        # check if at end and if yes then break out of loop
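As a side note, not part of the fix itself: `urls not in links` rescans the whole list for every link, which gets slower as the list grows. If that ever matters, a set gives O(1) membership checks while the list keeps the insertion order. A minimal sketch, with the hypothetical batches again standing in for the hrefs gathered per scroll:
links = []   # ordered results, defined once, outside the loop
seen = set()  # set mirror of links, used only for fast duplicate checks
for batch in (["p1", "p2"], ["p2", "p3"]):
    for href in batch:
        if href not in seen:  # set lookup instead of scanning the list
            seen.add(href)
            links.append(href)
print(links)  # prints ['p1', 'p2', 'p3'] -- duplicates dropped, order kept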
Upvotes: 1