Python Web Scraping / Beautiful Soup, with list of keywords at the end of URL

Question

I'm trying to build a web scraper to get wine reviews from Vivino.com. I have a large list of wines and want the scraper to search for each of them, starting from this base URL:

url = ("https://www.vivino.com/search/wines?q=")

Then it should cycle through the list, scraping the rating text (e.g. "4.5 - 203 reviews"), the name of the wine, and the attached link to its page.

I found about 20 lines of code at https://www.kashifaziz.me/web-scraping-python-beautifulsoup.html/ for building a web scraper, and was trying to combine it with:

url = "https://www.vivino.com/search/wines?q="

# List of keywords, made by splitting the input on whitespace.
# (The name must match the one used by the loop below.)
keywords = input().split()

# Go through the keywords, issuing one search request per keyword.
for key in keywords:
    # Everything else is the same logic as before.
    r = requests.get(url + key)

    print("URL :", url + key)
    if 'The specified profile could not be found.' in r.text:
        print("This is available")
    else:
        # The newline is written as an escape so the literal stays on one line.
        print('\nSorry that one is taken')

Also, where would I include the list of keywords?

I'd love any help with this! I'm trying to get better at python but not sure I'm at this level yet haha.

Thank you.

Andrej Kesely · Accepted Answer

This script traverses all result pages for the selected keyword and extracts the title, price, rating, number of ratings and link for each wine:

import re
import requests
from time import sleep
from bs4 import BeautifulSoup

# Search URL template; {kw} and {page} are filled in with str.format().
url = 'https://www.vivino.com/search/wines?q={kw}&start={page}'

# URL that get_wines() queries for the prices of the wines on a result page.
prices_url = 'https://www.vivino.com/prices'

# Browser-like headers sent with each search page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'
    ),
}

def get_wines(kw):
    """Yield the wines found by searching Vivino for *kw*, page by page.

    Result pages are requested one after another, starting at page 1, until a
    page contains no ``.default-wine-card`` element. For every non-empty page
    the prices URL is queried once with the ``data-vintage`` ids found on that
    page, and the generator pauses three seconds before the next page.

    Args:
        kw: Search keyword substituted into the search URL template.

    Yields:
        Tuples ``(title, price, average, ratings, link)`` of strings.
        ``price``, ``average`` and ``ratings`` are ``'-'`` when the
        corresponding value could not be found.
    """
    # The prices response is expected to contain JavaScript statements like
    #   $('.vintage-price-id-<id>').find( '.wine-price-value' ).text( '<price>' );
    # The literal parentheses must be escaped, otherwise nothing ever matches
    # and every price silently falls back to '-'.
    price_re = re.compile(
        r"\$\('\.vintage-price-id-(\d+)'\)"
        r"\.find\( '\.wine-price-value' \)"
        r"\.text\( '(.*?)' \);"
    )

    with requests.session() as s:
        page = 1
        while True:
            response = s.get(url.format(kw=kw, page=page), headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Select the cards once and reuse them for the emptiness check,
            # the price request and the result loop.
            wine_cards = soup.select('.default-wine-card')
            if not wine_cards:
                break

            params = {'vintages[]': [wc['data-vintage'] for wc in wine_cards]}
            prices_js = s.get(prices_url, params=params, headers={
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0',
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01'
                }).text

            # Maps vintage id -> price text for the wines on this page.
            wine_prices = dict(price_re.findall(prices_js))

            for wine_card in wine_cards:
                title = wine_card.select_one('.header-smaller').get_text(strip=True, separator=' ')
                price = wine_prices.get(wine_card['data-vintage'], '-')

                average = wine_card.select_one('.average__number')
                average = average.get_text(strip=True) if average else '-'

                ratings = wine_card.select_one('.text-micro')
                ratings = ratings.get_text(strip=True) if ratings else '-'

                link = 'https://www.vivino.com' + wine_card.a['href']

                yield title, price, average, ratings, link

            # Pause between pages before requesting the next one.
            sleep(3)
            page += 1

# Search for a single keyword and print each result as four lines
# (title, price, "average / ratings", link) followed by an 80-dash ruler.
kw = 'angel'
for title, price, average, ratings, link in get_wines(kw):
    print(title, price, average + ' / ' + ratings, link, '-' * 80, sep='\n')

Prints:

Angél ica Zapata Malbec Alta
-
4,4 / 61369 ratings
https://www.vivino.com/wines/1469874
--------------------------------------------------------------------------------
Château d'Esclans Whispering Angel Rosé
16,66
4,1 / 38949 ratings
https://www.vivino.com/wines/1473981
--------------------------------------------------------------------------------
Angél ica Zapata Cabernet Sauvignon Alta
-
4,3 / 27699 ratings
https://www.vivino.com/wines/1471376
--------------------------------------------------------------------------------

... and so on.

EDIT: To select only specific wines, you can put the keywords inside a list and then check each wine in a loop:

import re
import requests
from time import sleep
from bs4 import BeautifulSoup

# Template for a search result page: {kw} is the keyword, {page} fills the
# `start` query parameter.
url = 'https://www.vivino.com/search/wines?q={kw}&start={page}'

# URL requested by get_wines() to obtain prices for a page of results.
prices_url = 'https://www.vivino.com/prices'

# Headers for the search page requests (a desktop Firefox User-Agent).
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'),
])

def get_wines(kw):
    """Yield the wines found by searching Vivino for *kw*, page by page.

    Result pages are requested one after another, starting at page 1, until a
    page contains no ``.default-wine-card`` element. For every non-empty page
    the prices URL is queried once with the ``data-vintage`` ids found on that
    page. No delay is inserted between page requests.

    Args:
        kw: Search keyword substituted into the search URL template.

    Yields:
        Tuples ``(title, price, average, ratings, link)`` of strings.
        ``price``, ``average`` and ``ratings`` are ``'-'`` when the
        corresponding value could not be found.
    """
    # The prices response is expected to contain JavaScript statements like
    #   $('.vintage-price-id-<id>').find( '.wine-price-value' ).text( '<price>' );
    # The literal parentheses must be escaped, otherwise nothing ever matches
    # and every price silently falls back to '-'.
    price_re = re.compile(
        r"\$\('\.vintage-price-id-(\d+)'\)"
        r"\.find\( '\.wine-price-value' \)"
        r"\.text\( '(.*?)' \);"
    )

    with requests.session() as s:
        page = 1
        while True:
            response = s.get(url.format(kw=kw, page=page), headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Select the cards once and reuse them for the emptiness check,
            # the price request and the result loop.
            wine_cards = soup.select('.default-wine-card')
            if not wine_cards:
                break

            params = {'vintages[]': [wc['data-vintage'] for wc in wine_cards]}
            prices_js = s.get(prices_url, params=params, headers={
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0',
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01'
                }).text

            # Maps vintage id -> price text for the wines on this page.
            wine_prices = dict(price_re.findall(prices_js))

            for wine_card in wine_cards:
                title = wine_card.select_one('.header-smaller').get_text(strip=True, separator=' ')
                price = wine_prices.get(wine_card['data-vintage'], '-')

                average = wine_card.select_one('.average__number')
                average = average.get_text(strip=True) if average else '-'

                ratings = wine_card.select_one('.text-micro')
                ratings = ratings.get_text(strip=True) if ratings else '-'

                link = 'https://www.vivino.com' + wine_card.a['href']

                yield title, price, average, ratings, link

            page += 1

# Wine names to look up; each one is used as a separate search keyword.
wines = [
    '10 SPAN VINEYARDS CABERNET SAUVIGNON CENTRAL COAST',
    '10 SPAN VINEYARDS CHARDONNAY CENTRAL COAST',
]

# Print every result of every search as four lines (title, price,
# "average / ratings", link) followed by an 80-dash ruler.
for wine in wines:
    for title, price, average, ratings, link in get_wines(wine):
        print(title, price, average + ' / ' + ratings, link, '-' * 80, sep='\n')

Prints:

10 Span Vineyards Cabernet Sauvignon
-
3,7 / 557 ratings
https://www.vivino.com/wines/4535453
--------------------------------------------------------------------------------
10 Span Vineyards Chardonnay
-
3,7 / 150 ratings
https://www.vivino.com/wines/5815131
--------------------------------------------------------------------------------

Python Web Scraping / Beautiful Soup, with list of keywords at the end of URL

Answers (2)

Related Questions