Reputation: 200
I have a Selenium setup for capturing a specific HTTP request. This request is sent only when I click on a specific React element of a website, which is why I'm using Selenium; I couldn't find another way.
I must renew my IP each time I want to capture this specific HTTP request. To achieve this I use Tor. When I start my Python script it works very well: Tor sets a new IP and the script scrapes what I want. I have added a try/except to my script so that, if it does not work the first time, it retries up to 10 times.
The problem is that when my script retries, the IP no longer rotates.
How can I achieve this?
import time
from random import randint
from time import sleep
import os
import subprocess
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from seleniumwire import webdriver
from selenium.webdriver.firefox.options import Options
from fake_useragent import UserAgent
# selenium-wire settings: route both plain and TLS traffic through the local
# proxy on port 8088 (the port handed to Tor's --HTTPTunnelPort in
# firefox_init). An empty 'no_proxy' means no host is exempted.
options_wire = dict(
    proxy=dict(
        http='http://localhost:8088',
        https='https://localhost:8088',
        no_proxy='',
    ),
)
def firefox_init():
    """Restart Tor with an HTTP tunnel on port 8088 and pick a user agent.

    Any running ``tor`` process is killed first, then a new one is started
    in the background with ``--HTTPTunnelPort 8088``.

    Returns:
        The random user-agent string produced by ``UserAgent().random``.
    """
    # Best effort: the exit status is ignored, so "no tor process running"
    # is not an error.
    os.system("killall tor")
    time.sleep(1)  # presumably gives the old process time to exit

    user_agent = UserAgent().random

    # Pass an argument list instead of a shell string: no shell is spawned,
    # so the child process is Tor itself rather than an intermediate shell.
    subprocess.Popen(["tor", "--HTTPTunnelPort", "8088"])
    # Fixed grace period only; this does not verify that Tor is ready.
    time.sleep(2)
    return user_agent
def profile_firefox():
    """Build the Firefox profile used for one scraping attempt.

    Calling this function also restarts Tor, because the user-agent override
    is obtained from ``firefox_init()``.

    Returns:
        The configured ``FirefoxProfile``.
    """
    profile = FirefoxProfile()
    # 2 = block image loading.
    profile.set_preference('permissions.default.image', 2)
    # Boolean preference: pass a real bool, not the string 'false'.
    profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
    profile.set_preference("general.useragent.override", firefox_init())
    # The Firefox preference lives under "browser.", not "driver."; the
    # misspelled name was silently ignored, so private browsing never started.
    profile.set_preference("browser.privatebrowsing.autostart", True)
    profile.update_preferences()
    return profile
def options_firefox():
    """Return Firefox ``Options`` with ``headless`` set to False."""
    firefox_options = Options()
    setattr(firefox_options, "headless", False)
    return firefox_options
def firefox_closing(driver):
    """Quit the browser, wait three seconds, then kill every ``tor`` process.

    Args:
        driver: the WebDriver instance to shut down; only ``quit()`` is used.

    The Tor cleanup runs even when ``driver.quit()`` raises, so a failed
    browser shutdown no longer leaves Tor running. The exception from
    ``quit()`` is still propagated afterwards.
    """
    try:
        driver.quit()
    finally:
        time.sleep(3)
        # Best effort: the exit status of killall is ignored.
        os.system('killall tor')
def _find_api_request(requests):
    """Return ``(headers, payload)`` for the first API request with a body.

    Args:
        requests: iterable of captured requests (``driver.requests``).

    Returns:
        A ``(headers, payload)`` tuple, or ``None`` when no request to the
        API endpoint with a non-empty body was captured.
    """
    for request in requests:
        if request.path != "https://api.*********.***/*******/*********":
            continue
        body = request.body or b""
        # Decode the body instead of slicing the repr of a bytes object:
        # the repr escapes non-ASCII bytes and switches quote style when the
        # body contains a single quote, which broke the old "b'" split.
        if isinstance(body, bytes):
            payload = body.decode("utf-8", errors="replace")
        else:
            payload = str(body)
        if payload:
            return request.headers, payload
    return None


def headless(url):
    """Load *url* through Tor, click the form button and capture the API call.

    Up to 10 attempts are made. Each attempt builds a new profile with
    ``profile_firefox()`` (which restarts Tor), drives Firefox, and always
    shuts the browser and Tor down exactly once with ``firefox_closing()``.

    Args:
        url: address of the page to open.

    Returns:
        A ``(headers, payload)`` tuple for the captured API request.

    Raises:
        RuntimeError: if no attempt captured the API request.
    """
    max_attempts = 10
    button_xpath = (
        '//*[@id="*******"]/main/div/div/div[1]/div[2]/form/div/div[2]/div[1]/button'
    )

    for attempt in range(1, max_attempts + 1):
        profile = profile_firefox()
        options = options_firefox()
        driver = webdriver.Firefox(
            seleniumwire_options=options_wire,
            firefox_profile=profile,
            options=options,
            executable_path='******/headless_browser/geckodriver',
        )

        captured = None
        failed = False
        try:
            driver.set_window_position(0, 0)
            driver.set_window_size(randint(1024, 2060), randint(1024, 4100))
            # Page load is inside the try so that a proxy or network failure
            # triggers a retry instead of aborting with the browser left open.
            driver.get(url)
            time.sleep(randint(3, 8))
            # A single lookup is enough: a missing element raises, which is
            # handled below like any other failed attempt.
            driver.find_element_by_xpath(button_xpath).click()
            time.sleep(randint(3, 6))
            captured = _find_api_request(driver.requests)
        except Exception as exc:
            # Retry boundary: report the failure and move on. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit through.
            failed = True
            print("Attempt {}/{} failed: {!r}".format(attempt, max_attempts, exc))
        finally:
            # Runs once per attempt, on success and on failure alike.
            firefox_closing(driver)

        if captured is not None:
            header, payload = captured
            print(header, payload)
            return header, payload
        if failed:
            time.sleep(5)  # pause before retrying after an error

    raise RuntimeError(
        "API request was not captured after {} attempts".format(max_attempts)
    )
# Script entry: run the scraper against the Tor check page.
url = "https://check.torproject.org/?lang=fr"
headless(url)
Thank you
Upvotes: 0
Views: 4137
Reputation: 200
So, to achieve this I used another proxy; selenium-wire is very good, but it needs to be fixed.
I used BrowserMob Proxy and set an upstream proxy for it to work with. The result is that you can catch every HTTP request or response and parse it, and the IP rotates every time, using Tor's HTTPTunnelPort configuration.
# Upstream proxy settings for BrowserMob Proxy: HTTP and HTTPS both point at
# localhost:8088, the Tor HTTPTunnelPort used in the question.
proxy_params = {'httpProxy': 'localhost:8088', 'httpsProxy': 'localhost:8088'}
# NOTE(review): `server` is not defined in this snippet; presumably a started
# BrowserMob Proxy server object. Confirm before use.
proxy_b = server.create_proxy(params=proxy_params)
Thanks
Upvotes: 0
Reputation: 104
Well, I can't tell why it's not renewing the IP address, since you kill the Tor process. Even if you ran Tor as a systemd service, it would certainly renew when you restarted the service. But I can give you some pointers:
# NOTE(review): presumably turns off fake_useragent's local cache and its
# cache server; confirm against the installed fake_useragent version.
ua = UserAgent(cache=False, use_cache_server=False)
Upvotes: 1