HamidBee
HamidBee

Reputation: 279

How to Bypass HTTP 403 Error When Scraping CoinGecko with Python?

I am trying to scrape the Bitcoin markets section from CoinGecko using Python. However, I keep encountering an HTTP 403 error. I have tried using the requests library with custom headers to mimic a real browser, but I still get the same error.

Here is the code I am using:

import requests
import pandas as pd

# Base URL for Bitcoin markets on CoinGecko; fetch_page() appends "?page=N"
# to it for every page that is requested.
base_url = "https://www.coingecko.com/en/coins/bitcoin"

# Function to fetch a single page
def fetch_page(url, page, timeout=30):
    """Download one paginated page and return its HTML text.

    Args:
        url: Base page URL; ``?page=<page>`` is appended to it.
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text, or ``None`` when the request fails or the
        server answers with a status code other than 200.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    try:
        # Without a timeout, requests may wait forever on a stalled connection.
        response = requests.get(
            f"{url}?page={page}", headers=headers, timeout=timeout
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "return None"
        # convention as a non-200 status, so the caller can stop cleanly.
        print(f"Failed to fetch page {page}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch page {page}: Status code {response.status_code}")
        return None
    return response.text

# Function to extract market data from a page
def extract_markets(html):
    """Parse the first HTML table in *html* into a DataFrame.

    Args:
        html: HTML document text.

    Returns:
        The first table found in the document, or an empty DataFrame when
        the document contains no tables.
    """
    from io import StringIO  # local import keeps this snippet self-contained

    try:
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        dfs = pd.read_html(StringIO(html))
    except ValueError:
        # read_html signals "no tables found" by raising ValueError rather
        # than by returning an empty list.
        return pd.DataFrame()
    return dfs[0] if dfs else pd.DataFrame()

# Main function to scrape all pages
def scrape_all_pages(base_url, max_pages=10):
    """Collect the market table from up to *max_pages* consecutive pages.

    Pages are requested starting at 1. Scraping stops early as soon as a
    page cannot be fetched or yields an empty table.

    Args:
        base_url: URL passed to ``fetch_page`` for every page.
        max_pages: Highest page number to request.

    Returns:
        One DataFrame with the rows of all collected pages (index reset),
        or an empty DataFrame when nothing was collected.
    """
    frames = []
    for page_number in range(1, max_pages + 1):
        print(f"Scraping page {page_number}...")
        markup = fetch_page(base_url, page_number)
        # Only parse when the download succeeded.
        table = extract_markets(markup) if markup is not None else None
        if table is None or table.empty:
            break
        frames.append(table)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Scrape pages 1..max_pages of base_url and combine them into one DataFrame
max_pages = 10  # Upper bound on pages requested; raise it to scrape more
df = scrape_all_pages(base_url, max_pages)

# Display the combined DataFrame (empty if the first page could not be scraped)
print(df)

error:

Scraping page 1...
Failed to fetch page 1: Status code 403
Empty DataFrame
Columns: []
Index: []

I also tried a solution suggested on Stack Overflow, but it did not resolve the issue.

Could someone suggest a workaround or a more effective way to scrape this data? Any help would be greatly appreciated. Thank you in advance.

Upvotes: 0

Views: 328

Answers (1)

datawookie
datawookie

Reputation: 6564

As suggested in response to your other question, your requests are being identified as bot traffic, hence the 403 error. Use Playwright to access the site via a real browser.

import time
from io import StringIO

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import pandas as pd

# Specific URL for getting Bitcoin markets (the paginated spot-markets
# listing); "?page=N" is appended to it when each page is fetched.
URL = "https://www.coingecko.com/en/coins/1/markets/spot"

# Start Playwright and launch a visible (headless=False) Chromium window.
# slow_mo=2000 asks Playwright to slow its operations down (the value is in
# milliseconds according to the Playwright launch() documentation).
playwright = sync_playwright().start()
browser = playwright.chromium.launch(headless=False, slow_mo=2000)
# Browser context with a fixed 1280x900 viewport.
context = browser.new_context(
    viewport={"width": 1280, "height": 900}
)

# Single page object; fetch_page() reuses it for every navigation.
page = context.new_page()

def fetch_page(url):
    """Navigate the shared Playwright page to *url* and return its HTML.

    Args:
        url: Address to open; it is also printed before navigating.

    Returns:
        The page content as reported by ``page.content()`` after a fixed
        five-second pause.
    """
    print(url)
    page.goto(url)
    # Fixed pause after navigation before the content is read.
    time.sleep(5)
    html = page.content()
    return html

def scrape_all_pages(url, max_pages=10):
    """Fetch pages 1..max_pages of *url* and combine every table found.

    Args:
        url: Base URL; ``?page=<n>`` is appended for each page.
        max_pages: Highest page number to request.

    Returns:
        A single DataFrame built from all tables on all fetched pages
        (index reset), or an empty DataFrame when no table was found.
    """
    markets = []
    # Named page_num (not "page") so the loop counter cannot be confused with
    # the module-level Playwright page object that fetch_page() relies on.
    for page_num in range(1, max_pages + 1):
        html = fetch_page(f"{url}?page={page_num}")

        try:
            markets.extend(pd.read_html(StringIO(html)))
        except ValueError as e:
            # read_html raises ValueError when a page contains no tables;
            # skip that page instead of aborting the whole run.
            print(f"No tables found on page {page_num}: {e}")

    # pd.concat raises on an empty list, so handle "nothing scraped" here.
    if not markets:
        return pd.DataFrame()
    return pd.concat(markets, ignore_index=True)

max_pages = 10
try:
    df = scrape_all_pages(URL, max_pages)
finally:
    # Release every Playwright resource, even when scraping fails: the page,
    # the browser process, and the driver started via sync_playwright().start().
    page.close()
    browser.close()
    playwright.stop()

# Drop rows in which every column is NaN.
df = df.dropna(how='all')

print(df)

Top of output:

       #             Exchange Unnamed: 2       Pair       Price Spread    +2% Depth    -2% Depth      24h Volume Volume % Last Updated  Trust Score
1    1.0              Binance        CEX   BTC/USDT  $61,578.60  0.01%  $14,564,938  $19,766,330  $1,226,281,740    5.72%     Recently          NaN
2    2.0    Coinbase Exchange        CEX    BTC/USD  $61,570.63  0.01%  $15,912,548  $14,809,605    $667,341,947    3.12%     Recently          NaN
3    3.0               Kraken        CEX    BTC/USD  $61,584.00  0.01%  $13,621,680  $13,100,698     $50,592,315    0.24%     Recently          NaN
4    4.0              Gate.io        CEX   BTC/USDT  $61,584.36  0.01%  $12,523,800  $11,856,866    $202,923,100    0.95%     Recently          NaN
5    5.0              Binance        CEX  BTC/FDUSD  $61,568.21  0.01%   $8,355,196   $8,489,839  $1,901,656,552    8.88%     Recently          NaN
6    6.0                  OKX        CEX   BTC/USDT  $61,588.33  0.01%   $4,552,443  $13,952,016    $398,284,635    1.86%     Recently          NaN
7    7.0               Bitget        CEX   BTC/USDT  $61,580.27  0.01%   $8,598,359   $8,848,635    $239,239,284    1.12%     Recently          NaN
8    8.0               Kraken        CEX    BTC/EUR  $61,589.72  0.01%   $7,703,734   $7,050,064     $27,293,519    0.13%     Recently          NaN
9    9.0                Bybit        CEX   BTC/USDT  $61,583.98  0.01%   $2,208,077   $1,347,924  $1,103,150,476    5.15%     Recently          NaN
10  10.0               Pionex        CEX   BTC/USDT  $61,588.25  0.01%  $17,409,820  $15,637,094    $224,747,215    1.05%     Recently          NaN
12  11.0              Binance        CEX   WBTC/BTC  $61,700.03  0.02%   $5,446,446  $20,784,820      $8,023,065    0.04%     Recently          NaN
13  12.0  Crypto.com Exchange        CEX    BTC/USD  $61,588.55  0.01%   $2,539,978   $5,433,264    $359,030,532    1.68%     Recently          NaN
14  13.0  Crypto.com Exchange        CEX   BTC/USDT  $61,581.55  0.01%   $2,635,507   $5,917,342    $267,921,739    1.25%     Recently          NaN
15  14.0              Binance        CEX    ETH/BTC   $3,449.52  0.02%   $7,954,561   $9,115,420     $73,631,305    0.34%     Recently          NaN
16  15.0                LBank        CEX   BTC/USDT  $61,588.16  0.01%  $12,152,948  $12,658,338    $401,602,782    1.87%     Recently          NaN
17  16.0                 MEXC        CEX   BTC/USDT  $61,578.24  0.01%   $1,693,904   $2,007,183    $477,125,505    2.23%     Recently          NaN
18  17.0           CoinTR Pro        CEX   BTC/USDT  $61,578.39  0.01%   $9,294,010   $3,947,978    $158,178,596    0.74%     Recently          NaN
19  18.0             Bitfinex        CEX   BTC/USDT  $61,586.63  0.02%   $5,122,254   $5,538,887      $7,127,016    0.03%     Recently          NaN
20  19.0              Binance        CEX    BNB/BTC     $581.02  0.01%   $1,040,939   $5,591,340     $13,706,709    0.06%     Recently          NaN
21  20.0                Dcoin        CEX   BTC/USDT  $61,583.99  0.02%   $5,939,915   $5,770,785     $20,137,964    0.09%     Recently          NaN

Here's an asynchronous implementation too.

import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

# Paginated spot-markets listing; "?page=N" is appended for each page.
URL = "https://www.coingecko.com/en/coins/1/markets/spot"

async def fetch_page(page, url):
    """Open *url* in the given Playwright page and return the page HTML.

    Args:
        page: Playwright page object used for the navigation.
        url: Address to open; it is logged before navigating.

    Returns:
        The page content read after a fixed five-second pause.
    """
    print(f"Fetching: {url}")
    await page.goto(url)
    # Fixed, non-blocking pause after navigation before reading the content.
    await asyncio.sleep(5)
    html = await page.content()
    return html

async def scrape_all_pages(url, max_pages=10):
    """Fetch pages 1..max_pages of *url* in a browser and combine the tables.

    Args:
        url: Base URL; ``?page=<n>`` is appended for each page.
        max_pages: Highest page number to request.

    Returns:
        A single DataFrame built from all tables on all fetched pages
        (index reset), or an empty DataFrame when no table was found.
    """
    markets = []
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=False, slow_mo=2000)
        try:
            context = await browser.new_context(viewport={"width": 1280, "height": 900})
            page = await context.new_page()

            for page_num in range(1, max_pages + 1):
                full_url = f"{url}?page={page_num}"
                html = await fetch_page(page, full_url)

                try:
                    dfs = pd.read_html(StringIO(html))
                except ValueError as e:
                    print(f"No tables found on page {page_num}: {e}")
                else:
                    markets.extend(dfs)
        finally:
            # Always shut the browser down, even if a navigation fails; per
            # the Playwright docs this also closes its open pages.
            await browser.close()

    # pd.concat raises on an empty list, so handle "nothing scraped" here.
    if not markets:
        return pd.DataFrame()
    return pd.concat(markets, ignore_index=True)

async def main():
    """Scrape ten pages of markets and print the combined table."""
    page_limit = 10
    markets = await scrape_all_pages(URL, page_limit)

    # Rows in which every column is NaN are dropped before printing.
    print(markets.dropna(how="all"))

if __name__ == "__main__":
    asyncio.run(main())

Upvotes: 1

Related Questions