Reputation: 47
I'm trying to build a web scraper to get wine reviews from Vivino.com. I have a large list of wines and want it to search for each one using:
# Base search URL; the search term is appended directly after "q=".
url = "https://www.vivino.com/search/wines?q="
Then it should cycle through the list, scraping the rating text (e.g. "4.5 - 203 reviews"), the name of the wine, and the link to its page.
I found 20 lines of code at https://www.kashifaziz.me/web-scraping-python-beautifulsoup.html/ for building a web scraper, and was trying to combine it with:
import requests

# Base search URL; each keyword is appended directly after "q=".
url = "https://www.vivino.com/search/wines?q="

# Keywords to look up, obtained by splitting one line of input on whitespace.
# (The loop below iterates ``keywords``, so the list must be bound to that
# exact name; binding it to ``keyword`` raises NameError.)
keywords = input().split()

# Request the search page once per keyword and report on the response.
for key in keywords:
    search_url = url + key
    r = requests.get(search_url)
    print("URL :", search_url)
    # NOTE(review): this marker text and the two messages below do not look
    # related to wine search results -- confirm what should be checked here.
    if 'The specified profile could not be found.' in r.text:
        print("This is available")
    else:
        print('\nSorry that one is taken')
Also, where would I include the list of keywords?
I'd love any help with this! I'm trying to get better at python but not sure I'm at this level yet haha.
Thank you.
Upvotes: 0
Views: 1706
Reputation: 195613
This script traverses all result pages for the selected keyword and extracts the title, price, rating, reviews and link of each wine:
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
# Search-results URL template: ``kw`` is the search keyword and ``page``
# fills the ``start`` query parameter.
url = (
    'https://www.vivino.com/search/wines'
    '?q={kw}&start={page}'
)

# Endpoint queried for the prices of the wines shown on a result page.
prices_url = "https://www.vivino.com/prices"

# Headers sent with the search-page requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) '
        'Gecko/20100101 Firefox/75.0'
    ),
}
def get_wines(kw):
    """Yield every wine found by searching Vivino for *kw*.

    Result pages are requested one after another, pausing three seconds
    between pages, until a page contains no ``.default-wine-card`` elements.
    For each page the prices are requested separately from ``prices_url``;
    that response is JavaScript text from which the values are extracted
    with a regular expression and matched to the cards by vintage id.

    Args:
        kw: Search keyword. It is URL-encoded before being placed in the
            query string, so it may contain spaces, ``&``, ``#`` and so on.

    Yields:
        ``(title, price, average, ratings, link)`` tuples of strings.
        ``price``, ``average`` and ``ratings`` are ``'-'`` when no value
        was found for the card.
    """
    from urllib.parse import quote_plus

    # Escape once up front; an unescaped '&' or '#' in the keyword would
    # otherwise cut the query string short.
    quoted_kw = quote_plus(str(kw))

    # Loop invariants: the price request headers and the price regex.
    prices_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'X-Requested-With': 'XMLHttpRequest',
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    }
    price_pattern = re.compile(
        r"\$\('\.vintage-price-id-(\d+)'\)\.find\( '\.wine-price-value' \)\.text\( '(.*?)' \);"
    )

    with requests.session() as s:
        page = 1
        while True:
            response = s.get(url.format(kw=quoted_kw, page=page), headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Select the cards once per page and reuse the result below.
            wine_cards = soup.select('.default-wine-card')
            if not wine_cards:
                # A page without cards marks the end of the results.
                break

            params = {'vintages[]': [wc['data-vintage'] for wc in wine_cards]}
            prices_js = s.get(prices_url, params=params, headers=prices_headers).text
            # Maps vintage id -> price text.
            wine_prices = dict(price_pattern.findall(prices_js))

            for wine_card in wine_cards:
                title = wine_card.select_one('.header-smaller').get_text(strip=True, separator=' ')
                price = wine_prices.get(wine_card['data-vintage'], '-')
                average = wine_card.select_one('.average__number')
                average = average.get_text(strip=True) if average else '-'
                ratings = wine_card.select_one('.text-micro')
                ratings = ratings.get_text(strip=True) if ratings else '-'
                link = 'https://www.vivino.com' + wine_card.a['href']
                yield title, price, average, ratings, link

            # Pause between result pages.
            sleep(3)
            page += 1
# Example run: print every search result for a single keyword, one field per
# line, followed by an 80-character separator line.
kw = 'angel'
for title, price, average, ratings, link in get_wines(kw):
    print(title, price, f'{average} / {ratings}', link, '-' * 80, sep='\n')
Prints:
Angél ica Zapata Malbec Alta
-
4,4 / 61369 ratings
https://www.vivino.com/wines/1469874
--------------------------------------------------------------------------------
Château d'Esclans Whispering Angel Rosé
16,66
4,1 / 38949 ratings
https://www.vivino.com/wines/1473981
--------------------------------------------------------------------------------
Angél ica Zapata Cabernet Sauvignon Alta
-
4,3 / 27699 ratings
https://www.vivino.com/wines/1471376
--------------------------------------------------------------------------------
... and so on.
EDIT: To select only one wine, you can put the keywords inside a list and then check each wine in a loop:
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
# Search-results URL template: ``kw`` is the search keyword and ``page``
# fills the ``start`` query parameter.
url = (
    'https://www.vivino.com/search/wines'
    '?q={kw}&start={page}'
)

# Endpoint queried for the prices of the wines shown on a result page.
prices_url = "https://www.vivino.com/prices"

# Headers sent with the search-page requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) '
        'Gecko/20100101 Firefox/75.0'
    ),
}
def get_wines(kw):
    """Yield every wine found by searching Vivino for *kw*.

    Result pages are requested back to back (no delay between requests)
    until a page contains no ``.default-wine-card`` elements. For each page
    the prices are requested separately from ``prices_url``; that response
    is JavaScript text from which the values are extracted with a regular
    expression and matched to the cards by vintage id.

    Args:
        kw: Search keyword. It is URL-encoded before being placed in the
            query string, so it may contain spaces, ``&``, ``#`` and so on.

    Yields:
        ``(title, price, average, ratings, link)`` tuples of strings.
        ``price``, ``average`` and ``ratings`` are ``'-'`` when no value
        was found for the card.
    """
    from urllib.parse import quote_plus

    # Escape once up front; an unescaped '&' or '#' in the keyword would
    # otherwise cut the query string short.
    quoted_kw = quote_plus(str(kw))

    # Loop invariants: the price request headers and the price regex.
    prices_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'X-Requested-With': 'XMLHttpRequest',
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    }
    price_pattern = re.compile(
        r"\$\('\.vintage-price-id-(\d+)'\)\.find\( '\.wine-price-value' \)\.text\( '(.*?)' \);"
    )

    with requests.session() as s:
        page = 1
        while True:
            response = s.get(url.format(kw=quoted_kw, page=page), headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Select the cards once per page and reuse the result below.
            wine_cards = soup.select('.default-wine-card')
            if not wine_cards:
                # A page without cards marks the end of the results.
                break

            params = {'vintages[]': [wc['data-vintage'] for wc in wine_cards]}
            prices_js = s.get(prices_url, params=params, headers=prices_headers).text
            # Maps vintage id -> price text.
            wine_prices = dict(price_pattern.findall(prices_js))

            for wine_card in wine_cards:
                title = wine_card.select_one('.header-smaller').get_text(strip=True, separator=' ')
                price = wine_prices.get(wine_card['data-vintage'], '-')
                average = wine_card.select_one('.average__number')
                average = average.get_text(strip=True) if average else '-'
                ratings = wine_card.select_one('.text-micro')
                ratings = ratings.get_text(strip=True) if ratings else '-'
                link = 'https://www.vivino.com' + wine_card.a['href']
                yield title, price, average, ratings, link

            page += 1
# Wine names to look up; each one is used as a separate search keyword.
wines = [
    '10 SPAN VINEYARDS CABERNET SAUVIGNON CENTRAL COAST',
    '10 SPAN VINEYARDS CHARDONNAY CENTRAL COAST',
]

# Print every result for every wine, one field per line, followed by an
# 80-character separator line.
for wine in wines:
    for title, price, average, ratings, link in get_wines(wine):
        print(title, price, f'{average} / {ratings}', link, '-' * 80, sep='\n')
Prints:
10 Span Vineyards Cabernet Sauvignon
-
3,7 / 557 ratings
https://www.vivino.com/wines/4535453
--------------------------------------------------------------------------------
10 Span Vineyards Chardonnay
-
3,7 / 150 ratings
https://www.vivino.com/wines/5815131
--------------------------------------------------------------------------------
Upvotes: 0
Reputation: 420
import requests
# Read one line of input and split it on whitespace to get the keywords.
keywords = input().split()

# Request the search page for each keyword in turn.
for key in keywords:
    url = f"https://www.vivino.com/search/wines?q={key}"
    r = requests.get(url)
    print("URL :", url)
    # Pick the message according to whether the marker text appears in the
    # response body.
    message = (
        "This is available"
        if 'The specified profile could not be found.' in r.text
        else '\nSorry that one is taken'
    )
    print(message)
For the list of keywords, you can use a text file with one keyword per line.
Upvotes: 0