Reputation: 279
I am trying to scrape the Bitcoin markets section from CoinGecko using Python, but I keep running into an HTTP 403 error. I have tried using the requests library with custom headers to mimic a real browser, yet I still get the same error.
Here is the code I am using:
import requests
import pandas as pd

# Base URL for Bitcoin markets on CoinGecko
base_url = "https://www.coingecko.com/en/coins/bitcoin"

# Function to fetch a single page
def fetch_page(url, page):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    response = requests.get(f"{url}?page={page}", headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}: Status code {response.status_code}")
        return None
    return response.text

# Function to extract market data from a page
def extract_markets(html):
    dfs = pd.read_html(html)
    return dfs[0] if dfs else pd.DataFrame()

# Main function to scrape all pages
def scrape_all_pages(base_url, max_pages=10):
    all_markets = []
    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")
        html = fetch_page(base_url, page)
        if html is None:
            break
        df = extract_markets(html)
        if df.empty:
            break
        all_markets.append(df)
    return pd.concat(all_markets, ignore_index=True) if all_markets else pd.DataFrame()

# Scrape data and store in a DataFrame
max_pages = 10  # Adjust this to scrape more pages if needed
df = scrape_all_pages(base_url, max_pages)

# Display the DataFrame
print(df)
Error output:
Scraping page 1...
Failed to fetch page 1: Status code 403
Empty DataFrame
Columns: []
Index: []
I also tried a solution suggested on Stack Overflow, but it did not resolve the issue.
Could someone suggest a workaround or a more effective way to scrape this data? Any help would be greatly appreciated. Thank you in advance.
Upvotes: 0
Views: 328
Reputation: 6564
As suggested in response to your other question, your requests are being identified as bot traffic, hence the 403 error. Use Playwright to access the site through a real browser instead.
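If you don't have Playwright set up yet, you'll need both the Python package and a browser binary: pip install playwright, followed by playwright install chromium.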
import time
from io import StringIO

from playwright.sync_api import sync_playwright
import pandas as pd

# 🚨 Specific URL for getting Bitcoin markets.
URL = "https://www.coingecko.com/en/coins/1/markets/spot"

playwright = sync_playwright().start()
browser = playwright.chromium.launch(headless=False, slow_mo=2000)
context = browser.new_context(
    viewport={"width": 1280, "height": 900}
)
page = context.new_page()

def fetch_page(url):
    print(url)
    page.goto(url)
    time.sleep(5)  # give the page time to finish rendering
    return page.content()

def scrape_all_pages(url, max_pages=10):
    markets = []
    for page_num in range(1, max_pages + 1):
        html = fetch_page(f"{url}?page={page_num}")
        dfs = pd.read_html(StringIO(html))
        markets.extend(dfs)
    return pd.concat(markets, ignore_index=True)

max_pages = 10
df = scrape_all_pages(URL, max_pages)

page.close()
browser.close()
playwright.stop()

df = df.dropna(how="all")  # drop rows that are entirely NaN
print(df)
Top of output:
# Exchange Unnamed: 2 Pair Price Spread +2% Depth -2% Depth 24h Volume Volume % Last Updated Trust Score
1 1.0 Binance CEX BTC/USDT $61,578.60 0.01% $14,564,938 $19,766,330 $1,226,281,740 5.72% Recently NaN
2 2.0 Coinbase Exchange CEX BTC/USD $61,570.63 0.01% $15,912,548 $14,809,605 $667,341,947 3.12% Recently NaN
3 3.0 Kraken CEX BTC/USD $61,584.00 0.01% $13,621,680 $13,100,698 $50,592,315 0.24% Recently NaN
4 4.0 Gate.io CEX BTC/USDT $61,584.36 0.01% $12,523,800 $11,856,866 $202,923,100 0.95% Recently NaN
5 5.0 Binance CEX BTC/FDUSD $61,568.21 0.01% $8,355,196 $8,489,839 $1,901,656,552 8.88% Recently NaN
6 6.0 OKX CEX BTC/USDT $61,588.33 0.01% $4,552,443 $13,952,016 $398,284,635 1.86% Recently NaN
7 7.0 Bitget CEX BTC/USDT $61,580.27 0.01% $8,598,359 $8,848,635 $239,239,284 1.12% Recently NaN
8 8.0 Kraken CEX BTC/EUR $61,589.72 0.01% $7,703,734 $7,050,064 $27,293,519 0.13% Recently NaN
9 9.0 Bybit CEX BTC/USDT $61,583.98 0.01% $2,208,077 $1,347,924 $1,103,150,476 5.15% Recently NaN
10 10.0 Pionex CEX BTC/USDT $61,588.25 0.01% $17,409,820 $15,637,094 $224,747,215 1.05% Recently NaN
12 11.0 Binance CEX WBTC/BTC $61,700.03 0.02% $5,446,446 $20,784,820 $8,023,065 0.04% Recently NaN
13 12.0 Crypto.com Exchange CEX BTC/USD $61,588.55 0.01% $2,539,978 $5,433,264 $359,030,532 1.68% Recently NaN
14 13.0 Crypto.com Exchange CEX BTC/USDT $61,581.55 0.01% $2,635,507 $5,917,342 $267,921,739 1.25% Recently NaN
15 14.0 Binance CEX ETH/BTC $3,449.52 0.02% $7,954,561 $9,115,420 $73,631,305 0.34% Recently NaN
16 15.0 LBank CEX BTC/USDT $61,588.16 0.01% $12,152,948 $12,658,338 $401,602,782 1.87% Recently NaN
17 16.0 MEXC CEX BTC/USDT $61,578.24 0.01% $1,693,904 $2,007,183 $477,125,505 2.23% Recently NaN
18 17.0 CoinTR Pro CEX BTC/USDT $61,578.39 0.01% $9,294,010 $3,947,978 $158,178,596 0.74% Recently NaN
19 18.0 Bitfinex CEX BTC/USDT $61,586.63 0.02% $5,122,254 $5,538,887 $7,127,016 0.03% Recently NaN
20 19.0 Binance CEX BNB/BTC $581.02 0.01% $1,040,939 $5,591,340 $13,706,709 0.06% Recently NaN
21 20.0 Dcoin CEX BTC/USDT $61,583.99 0.02% $5,939,915 $5,770,785 $20,137,964 0.09% Recently NaN
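If you want to keep the results rather than just print them, the combined frame can be written straight to disk; the filename here is just an example.

# Persist the scraped markets table (example filename).
df.to_csv("bitcoin_markets.csv", index=False)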
Here's an asynchronous implementation too.
import asyncio
from io import StringIO

from playwright.async_api import async_playwright
import pandas as pd

URL = "https://www.coingecko.com/en/coins/1/markets/spot"

async def fetch_page(page, url):
    print(f"Fetching: {url}")
    await page.goto(url)
    await asyncio.sleep(5)  # give the page time to finish rendering
    return await page.content()

async def scrape_all_pages(url, max_pages=10):
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=False, slow_mo=2000)
        context = await browser.new_context(viewport={"width": 1280, "height": 900})
        page = await context.new_page()
        markets = []
        for page_num in range(1, max_pages + 1):
            full_url = f"{url}?page={page_num}"
            html = await fetch_page(page, full_url)
            try:
                dfs = pd.read_html(StringIO(html))
                markets.extend(dfs)
            except ValueError as e:
                print(f"No tables found on page {page_num}: {e}")
        await page.close()
        await browser.close()
    return pd.concat(markets, ignore_index=True) if markets else pd.DataFrame()

async def main():
    max_pages = 10
    df = await scrape_all_pages(URL, max_pages)
    df = df.dropna(how="all")  # drop rows that are entirely NaN
    print(df)

if __name__ == "__main__":
    asyncio.run(main())
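As an aside: if driving a real browser is heavier than you need, CoinGecko also publishes a free public REST API, and its /coins/{id}/tickers endpoint returns the same markets data as JSON, so no HTML scraping is required. Here is a minimal sketch, assuming the free tier's rate limits are acceptable for your use case and picking out only a handful of the returned fields:

import requests
import pandas as pd

# Public CoinGecko API endpoint for a coin's market tickers (paginated).
API_URL = "https://api.coingecko.com/api/v3/coins/bitcoin/tickers"

def fetch_tickers(page=1):
    response = requests.get(API_URL, params={"page": page})
    response.raise_for_status()
    return response.json()["tickers"]

# Flatten a few fields of interest into a DataFrame.
rows = [
    {
        "exchange": t["market"]["name"],
        "pair": f"{t['base']}/{t['target']}",
        "price": t["last"],
        "volume": t["volume"],
        "trust_score": t["trust_score"],
    }
    for t in fetch_tickers()
]
print(pd.DataFrame(rows))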
Upvotes: 1