robots.txt
robots.txt

Reputation: 137

Can't make use of different proxies to perform subsequent requests

I've written a script in Python using proxies to scrape the links of different posts, traversing different pages of a webpage. My goal here is to make two subsequent requests using different proxies from a list.

The script takes a random proxy from the list and sends a request via the make_requests() function, then picks another proxy from the list and makes a second request, using the newly populated links, via the make_ano_requests() function.

Finally, get_title() function prints the result.

However, if any proxy doesn't work then it gets kicked out from the list by either of the two functions make_requests() or make_ano_requests().

When I run the script, it seems to be working, but somewhere within its execution the script gets stuck and never accomplishes the task. How can I accomplish the task?

This is what I've written so far (proxyVault contains fake proxies here):

import random
import requests
from random import choice
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Listing page of the tag; also used as the base when resolving question links.
base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'

# Listing pages 1 through 4, newest first, 50 questions per page.
lead_urls = [
    f'{base_url}?sort=newest&page={page}&pagesize=50'
    for page in range(1, 5)
]

# Question URLs gathered by make_requests().
linkList = []

# Candidate proxies as "host:port"; entries are removed once they fail.
proxyVault = [
    '103.110.37.244:36022', '180.254.218.229:8080', '110.74.197.207:50632',
    '1.20.101.95:49001', '200.10.193.90:8080', '173.164.26.117:3128',
    '103.228.118.66:43002', '178.128.231.201:3128', '1.2.169.54:55312',
    '181.52.85.249:31487', '97.64.135.4:8080', '190.96.214.123:53251',
    '52.144.107.142:31923', '45.5.224.145:52035', '89.218.22.178:8080',
    '192.241.143.186:80', '113.53.29.218:38310', '36.78.131.182:39243',
]

def make_requests(url):
    """Collect question links from one listing page through a random proxy.

    Every attempt draws a proxy at random from ``proxyVault``. On success the
    absolute URLs of the questions found on the page are appended to
    ``linkList``. When the attempt ends in ``requests.exceptions.ProxyError``
    the proxy is dropped from ``proxyVault`` and another attempt is made.

    Raises:
        IndexError: from ``choice`` once ``proxyVault`` has been emptied.
    """
    while True:
        proxy_url = choice(proxyVault)
        proxy = {'https': f'http://{proxy_url}'}
        try:
            # No timeout is given here, so nothing bounds how long this waits.
            res = requests.get(url, proxies=proxy)
            soup = BeautifulSoup(res.text, "lxml")
            anchors = soup.select(".summary .question-hyperlink")
            linkList.extend([urljoin(base_url, anchor.get("href")) for anchor in anchors])
        except requests.exceptions.ProxyError:
            if proxy_url in proxyVault:
                proxyVault.remove(proxy_url)
                print(f'kicked out bad proxy by first func: {proxy_url}')
            # Try again with whatever proxies are left.
            continue
        return None

def make_ano_requests(url):
    """Fetch one question page through a random proxy and print its title.

    Every attempt draws a proxy at random from ``proxyVault``; the page text
    is handed to ``get_title``. When the attempt ends in
    ``requests.exceptions.ProxyError`` the proxy is dropped from
    ``proxyVault`` and another attempt is made.

    Raises:
        IndexError: from ``choice`` once ``proxyVault`` has been emptied.
    """
    while True:
        proxy_url = choice(proxyVault)
        proxy = {'https': f'http://{proxy_url}'}
        try:
            # No timeout is given here, so nothing bounds how long this waits.
            res = requests.get(url, proxies=proxy)
            get_title(res.text)
        except requests.exceptions.ProxyError:
            if proxy_url in proxyVault:
                proxyVault.remove(proxy_url)
                print(f'kicked out bad proxy by second func: {proxy_url}')
            # Try again with whatever proxies are left.
            continue
        return None

def get_title(response):
    """Print the question title found in the HTML text ``response``.

    The title is the text of the link inside ``h1[itemprop='name']``.
    """
    document = BeautifulSoup(response, "lxml")
    title_link = document.select_one("h1[itemprop='name'] a")
    print(title_link.text)

if __name__ == '__main__':
    # First pass: walk the listing pages and fill linkList.
    for listing_url in lead_urls:
        make_requests(listing_url)

    # Second pass: visit each collected question and print its title.
    for question_url in linkList:
        make_ano_requests(question_url)

Upvotes: 2

Views: 5185

Answers (2)

Frans
Frans

Reputation: 837

You could speed up the proxy filter process by using asyncio and aiohttp. Something like this:

import aiohttp
import asyncio
import random
import requests
from random import choice
from bs4 import BeautifulSoup
from urllib.parse import urljoin


# Tag listing page; fetched through every proxy and used to resolve links.
base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'

# Listing pages 1 through 4, newest first, 50 questions per page.
lead_urls = [
    f'https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={page}&pagesize=50'
    for page in range(1, 5)
]

# Question URLs collected from the listing page.
linkList = []

# Candidate proxies as "host:port"; failing entries are removed at run time.
proxyVault = [
    '103.110.37.244:36022',
    '180.254.218.229:8080',
    '110.74.197.207:50632',
    '1.20.101.95:49001',
    '200.10.193.90:8080',
    '173.164.26.117:3128',
    '103.228.118.66:43002',
    '178.128.231.201:3128',
    '1.2.169.54:55312',
    '181.52.85.249:31487',
    '97.64.135.4:8080',
    '190.96.214.123:53251',
    '52.144.107.142:31923',
    '45.5.224.145:52035',
    '89.218.22.178:8080',
    '192.241.143.186:80',
    '113.53.29.218:38310',
    '36.78.131.182:39243',
]

def make_ano_requests(url):
    """Fetch one question page through a random proxy and print its title.

    A proxy is drawn at random from ``proxyVault`` for every attempt. A proxy
    that cannot be reached, or that does not answer within the timeout, is
    removed from ``proxyVault`` and another one is tried. The page text of
    the first successful response is passed to ``get_title``.

    Args:
        url: address of the question page.

    Raises:
        IndexError: when no proxy is left in ``proxyVault``.
    """
    # A timeout is passed to requests.get, so timeouts have to be handled
    # here as well; ProxyError is a subclass of ConnectionError.
    bad_proxy_errors = (requests.exceptions.ConnectionError,
                        requests.exceptions.Timeout)

    # A loop instead of recursion: no extra stack frame per dead proxy.
    while True:
        if not proxyVault:
            raise IndexError('proxyVault is empty: every proxy has been kicked out')
        proxy_url = choice(proxyVault)
        proxy = {'https': f'http://{proxy_url}'}
        try:
            res = requests.get(url, proxies=proxy, timeout=5)
        except bad_proxy_errors:
            if proxy_url in proxyVault:
                proxyVault.remove(proxy_url)
                print(f'kicked out bad proxy by second func: {proxy_url}')
            continue
        get_title(res.text)
        return None


def get_title(response):
    """Parse the HTML text ``response`` and print the question title.

    The title is read from the link nested in ``h1[itemprop='name']``.
    """
    heading_link = BeautifulSoup(response, "lxml").select_one("h1[itemprop='name'] a")
    title = heading_link.text
    print(title)


async def fetch(session, url, proxy_url, timeout=5):
    """Download ``url`` through one proxy and return the response body.

    Args:
        session: open ``aiohttp.ClientSession`` used for the request.
        url: address to request.
        proxy_url: ``host:port`` of the proxy to route the request through.
        timeout: total number of seconds allowed for the request.

    Returns:
        The response body as text, or ``None`` when the request failed. A
        proxy whose request failed is also removed from ``proxyVault``.
    """
    proxy = f'http://{proxy_url}'

    try:
        async with session.get(url, proxy=proxy,
                               timeout=aiohttp.ClientTimeout(total=timeout)) as response:
            return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # Any client-side failure or timeout marks the proxy as unusable.
        # Letting one escape would abort the whole gather() over all proxies.
        print(f'kicked out bad proxy by first func: {proxy_url}')
        if proxy_url in proxyVault:
            proxyVault.remove(proxy_url)
        return None


async def make_requests():
    """Probe every proxy in ``proxyVault`` concurrently and collect links.

    ``base_url`` is requested once through each proxy via ``fetch``. Every
    non-empty response is parsed and the absolute URLs of the questions it
    lists are appended to ``linkList``.
    """
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, base_url, proxy) for proxy in proxyVault]
        responses = await asyncio.gather(*tasks)

    print(f'Usefull proxies: {proxyVault}')

    # Every working proxy returns the same listing page, so each link is
    # recorded only once.
    seen = set(linkList)
    for res in responses:
        if not res:
            continue
        # fetch() already returns the body as text, so it is parsed directly.
        soup = BeautifulSoup(res, "lxml")
        for item in soup.select(".summary .question-hyperlink"):
            link = urljoin(base_url, item.get("href"))
            if link not in seen:
                seen.add(link)
                linkList.append(link)


if __name__ == '__main__':
    # asyncio.run() creates the event loop, runs the coroutine to completion
    # and closes the loop; asyncio.get_event_loop() is deprecated when no
    # loop is running.
    asyncio.run(make_requests())

    # Visit every collected question through the proxies that are left.
    for single_link in linkList:
        make_ano_requests(single_link)

Upvotes: 1

SimonF
SimonF

Reputation: 1885

It is quite possible that your requests.get calls are causing it to "hang", because they have no timeouts. As the documentation says:

Nearly all production code should use this parameter in nearly all requests. Failure to do so can cause your program to hang indefinitely

So I suggest changing it to res = requests.get(url, proxies=proxy, timeout=1) to prevent it from hanging.

It is, however, really slow. To speed it up, I would suggest removing the second request and, instead of collecting the links in the first request, collecting the strings [item.string for item in soup.select(".summary .question-hyperlink")], which more often than not are the same as the titles.

Edit: added code for catching timeouts in requests.get:

import random
import requests
from random import choice
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib3

# Tag listing page; also the base against which question links are resolved.
base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'

# Listing pages 1 through 4, newest first, 50 questions per page.
lead_urls = [
    base_url + f'?sort=newest&page={page}&pagesize=50'
    for page in range(1, 5)
]

# Question URLs gathered by make_requests().
linkList = []

# Candidate proxies as "host:port"; entries are removed once they fail.
proxyVault = [
    '103.110.37.244:36022', '180.254.218.229:8080',
    '110.74.197.207:50632', '1.20.101.95:49001',
    '200.10.193.90:8080', '173.164.26.117:3128',
    '103.228.118.66:43002', '178.128.231.201:3128',
    '1.2.169.54:55312', '181.52.85.249:31487',
    '97.64.135.4:8080', '190.96.214.123:53251',
    '52.144.107.142:31923', '45.5.224.145:52035',
    '89.218.22.178:8080', '192.241.143.186:80',
    '113.53.29.218:38310', '36.78.131.182:39243',
]

def make_requests(url):
    """Collect question links from one listing page through a random proxy.

    A proxy is drawn at random from ``proxyVault`` for every attempt. A proxy
    whose request fails with one of the handled errors is removed from
    ``proxyVault`` and another one is tried. On success the absolute URLs of
    the questions found on the page are appended to ``linkList``.

    Args:
        url: address of the listing page.

    Raises:
        IndexError: when no proxy is left in ``proxyVault``.
    """
    proxy_failures = (requests.exceptions.ProxyError,
                      requests.exceptions.Timeout,
                      requests.exceptions.ConnectionError,
                      urllib3.exceptions.MaxRetryError)

    # A loop instead of recursion: no extra stack frame per dead proxy, and
    # running out of proxies is reported with an explicit message.
    while True:
        if not proxyVault:
            raise IndexError('proxyVault is empty: every proxy has been kicked out')
        proxy_url = choice(proxyVault)
        proxy = {'https': f'http://{proxy_url}'}
        try:
            res = requests.get(url, proxies=proxy, timeout=1)
        except proxy_failures:
            if proxy_url in proxyVault:
                proxyVault.remove(proxy_url)
                print(f'kicked out bad proxy by first func: {proxy_url}')
            continue

        soup = BeautifulSoup(res.text, "lxml")
        linkList.extend([urljoin(base_url, item.get("href")) for item in soup.select(".summary .question-hyperlink")])
        return None

def make_ano_requests(url):
    """Fetch one question page through a random proxy and print its title.

    A proxy is drawn at random from ``proxyVault`` for every attempt. A proxy
    whose request fails with one of the handled errors is removed from
    ``proxyVault`` and another one is tried. The page text of the first
    successful response is passed to ``get_title``.

    Args:
        url: address of the question page.

    Raises:
        IndexError: when no proxy is left in ``proxyVault``.
    """
    proxy_failures = (requests.exceptions.ProxyError,
                      requests.exceptions.Timeout,
                      requests.exceptions.ConnectionError,
                      urllib3.exceptions.MaxRetryError)

    # A loop instead of recursion: no extra stack frame per dead proxy, and
    # running out of proxies is reported with an explicit message.
    while True:
        if not proxyVault:
            raise IndexError('proxyVault is empty: every proxy has been kicked out')
        proxy_url = choice(proxyVault)
        proxy = {'https': f'http://{proxy_url}'}
        try:
            res = requests.get(url, proxies=proxy, timeout=1)
        except proxy_failures:
            if proxy_url in proxyVault:
                proxyVault.remove(proxy_url)
                print(f'kicked out bad proxy by second func: {proxy_url}')
            continue

        get_title(res.text)
        return None

def get_title(response):
    """Print the title of the question contained in the HTML text ``response``.

    The title is taken from the link inside ``h1[itemprop='name']``.
    """
    page = BeautifulSoup(response, "lxml")
    name_anchor = page.select_one("h1[itemprop='name'] a")
    print(name_anchor.text)

if __name__ == '__main__':
    # Step 1: gather question links from every listing page.
    for page_url in lead_urls:
        make_requests(page_url)

    # Step 2: open each gathered question and print its title.
    for post_url in linkList:
        make_ano_requests(post_url)

Upvotes: 3

Related Questions