Reputation: 117
I am writing a web crawler to extract information from a website. However, since I am using Beautiful Soup 4 to extract a large amount of data on Windows, the crawling speed is extremely slow. Can anyone tell me how to use multithreading in my case? Thank you so much if you want to help. My code is as below:
import requests
from html.parser import HTMLParser
from bs4 import BeautifulSoup
import re
import time
import sys
import json
HTML_PARSER = "html.parser"
def get_shop_link_list(links):
    global food_id
    list_req = requests.get(links)
    if list_req.status_code == requests.codes.ok:
        soup = BeautifulSoup(list_req.content, HTML_PARSER)
        link_tag = soup.find_all('h2', {'class': 'heavy break-word'})
        for h2 in link_tag:
            print(food_id, end="@")
            link = h2.find('a')
            parse_shop_information(link['href'])
            food_id = food_id + 1
            print("")
        # turn to the next page
        try:
            next_page = soup.find('a', {'class': 'button -small'})
            get_shop_link_list(next_page['href'])
        except (KeyError, TypeError):  # TypeError: find() returned None, i.e. no next page
            pass

def parse_shop_information(shop_link):
    req = requests.get(shop_link)
    if req.status_code == requests.codes.ok:
        soup = BeautifulSoup(req.content, HTML_PARSER)
        # restaurant name
        shop_header_tags = soup.find('div', {'class': 'cassette'})
        japname_tag = shop_header_tags.find('p', {'class': 'small'})
        print(japname_tag.get_text(" ", strip=True).encode(encoding="utf-8", errors="strict").decode(sys.stdin.encoding).strip(), end="@")
        name_tag = shop_header_tags.find('h1', {'class': 'jumbo break-word'})
        print(name_tag.get_text(" ", strip=True).encode(encoding="utf-8", errors="strict").decode(sys.stdin.encoding).strip(), end="@")
        # basic information
        shop_body_tags = soup.find('ul', {'class': 'icon-list -space sentence'})
        information = shop_body_tags.find_all('li')
        for li in information:
            print(li.get_text("|", strip=True).encode(encoding="utf-8", errors="strict").strip(), end="{")
        # detail information
        restaurant_detail = soup.find_all("table", {"class": "table sentence"})
        basic_detail = restaurant_detail[0].find_all('tr')
        address = "No record"
        address_pic = "No record"
        access = "No record"
        parking = "No record"
        service = "No record"
        cards = "No record"
        for tr in basic_detail:
            if tr.find('th').get_text(" ", strip=True) == 'Address':
                address = tr.find('p').get_text(" ", strip=True)
                address_pic = tr.find('img')["src"]
            if tr.find('th').get_text(" ", strip=True) == 'Access':
                access = tr.find('td').get_text(" ", strip=True)
            if tr.find('th').get_text(" ", strip=True) == 'Parking':
                parking = tr.find('td').get_text(" ", strip=True)
            if tr.find('th').get_text(" ", strip=True) == 'Service charge':
                service = tr.find('td').get_text(" ", strip=True)
            if tr.find('th').get_text(" ", strip=True) == 'Cards Accepted':
                cards = tr.find('td').get_text(" ", strip=True)
        print(address.strip(), end="@")
        print(address_pic.strip(), end="@")
        print(access.strip(), end="@")
        print(parking.strip(), end="@")
        print(service.strip(), end="@")
        print(cards.strip(), end="@")
        try:
            facility_detail = restaurant_detail[1].find_all('tr')
            seating = "No record"
            MPS = "No record"
            RMPS = "No record"
            Smoking = "No record"
            WAR = "No record"
            KF = "No record"
            LS = "No record"
            WP = "No record"
            Other = "No record"
            for tr in facility_detail:
                if tr.find('th').get_text(" ", strip=True) == 'Seating Capacity':
                    seating = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Banquet Maximum Party Size':
                    MPS = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Reservation Maximum Party Size':
                    RMPS = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Smoking':
                    Smoking = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Wheelchair Accessible Restrooms':
                    WAR = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Kid Friendly':
                    KF = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Language Support':
                    LS = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Wi-Fi / Plug-in':
                    WP = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Other':
                    Other = tr.find('td').get_text(" ", strip=True)
            print(seating.strip(), end="@")
            print(MPS.strip(), end="@")
            print(RMPS.strip(), end="@")
            print(Smoking.strip(), end="@")
            print(WAR.strip(), end="@")
            print(KF.strip(), end="@")
            print(LS.strip(), end="@")
            print(WP.strip(), end="@")
            print(Other.strip(), end="@")
        except IndexError:
            for _ in range(9):
                print("No record", end="@")
        try:
            other_detail = restaurant_detail[2].find_all('tr')
            menu = "No record"
            lunch = "No record"
            dress_code = "No record"
            Delivery = "No record"
            for tr in other_detail:
                if tr.find('th').get_text(" ", strip=True) == 'Lunch Service':
                    lunch = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Dress Code':
                    dress_code = tr.find('td').get_text(" ", strip=True)
                if tr.find('th').get_text(" ", strip=True) == 'Menu':
                    menu = tr.find('td').get_text(" ", strip=True)  # was assigned to lunch by mistake
                if tr.find('th').get_text(" ", strip=True) == 'Delivery / Catering':
                    Delivery = tr.find('td').get_text(" ", strip=True)  # was assigned to dress_code by mistake
            print(menu.strip(), end="@")
            print(lunch.strip(), end="@")
            print(dress_code.strip(), end="@")
            print(Delivery.strip(), end="@")
        except IndexError:
            for _ in range(4):
                print("No record", end="@")
        try:
            main_col_tag = soup.find('div', {'class': 'global-navigation'})
            main_col = main_col_tag.find_all('li')
            for li in main_col:
                if li.find('a').get_text() == "Menu":
                    print("{", end="")
                    sub_menu(shop_link)
                    print("}", end="")
        except (IndexError, AttributeError):
            print("No record", end="@")

def sub_menu(link):
    list_req = requests.get(link)
    if list_req.status_code == requests.codes.ok:
        soup = BeautifulSoup(list_req.content, HTML_PARSER)
        all_menu_tag = soup.find('ul', {'class': '-sub-menu hide'})
        menus = all_menu_tag.find_all('li')
        for i, li in enumerate(menus):
            menu_type = li.find('a').get_text()  # renamed from `type` to avoid shadowing the builtin
            print("\"", end="")
            print(menu_type, end="\":[")
            link = li.find('a')
            sub_menu_json(link['href'])
            if i != len(menus) - 1:
                print("]", end=",")
            else:
                print("]", end="")

def sub_menu_json(link):
    list_req = requests.get(link)
    if list_req.status_code == requests.codes.ok:
        soup = BeautifulSoup(list_req.content, HTML_PARSER)
        eachfood = soup.find_all('div', {'class': 'cassette normal-colored'})
        for i, div in enumerate(eachfood):
            food_jap_name = "No record"
            food_eng_name = "No record"
            food_price = "No record"
            tax_inclusion = "No record"
            description = "No record"
            if div.find('div', {'class': 'small'}):
                food_jap_name = div.find('div', {'class': 'small'}).get_text(" ", strip=True)
            if div.find('h3', {'class': 'huge'}):
                food_eng_name = div.find('h3', {'class': 'huge'}).get_text(" ", strip=True)
            if div.find('h3', {'class': 'huge abit-spacing'}):
                food_eng_name = div.find('h3', {'class': 'huge abit-spacing'}).get_text(" ", strip=True)  # was re-reading the plain 'huge' tag
            if div.find('p', {'class': 'small spacing'}):
                food_price = div.find('p', {'class': 'small spacing'}).get_text(" ", strip=True)
            if div.find('span', {'class': '-value'}):
                food_price = div.find('span', {'class': '-value'}).get_text(" ", strip=True)
            if div.find('p', {'class': 'text-right small'}):
                tax_inclusion = div.find('p', {'class': 'text-right small'}).get_text(" ", strip=True)
            if div.find('div', {'class': 'panel -light-silver -in'}):
                description = div.find('div', {'class': 'panel -light-silver -in'}).get_text(" ", strip=True)
            if div.find('div', {'class': 'sake-detail'}):
                description = div.find('div', {'class': 'sake-detail'}).get_text(" ", strip=True)
print("{\"JpnFoodname:\":",end="\"")
print(food_jap_name.encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding).strip(),end="\",")
print("\"EngFoodname\":",end="\"")
print(food_eng_name.encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding).strip(),end="\",")
print("\"Price\":",end="\"")
print(food_price.strip().encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding).strip(),end="\",")
print("\"TaxIncludeExclude\":",end="\"")
print(tax_inclusion.strip().encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding).strip(),end="\",")
print("\"Description\":",end="\"")
print(description.strip().encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding).strip(),end="\",")
if div.find('a') or div.find('img'):
print("\"ImgUrl:\":",end="\"")
if div.find('img'):
food_pic=div.find('img')["src"]
if div.find('a'):
food_pic=div.find('a')['href']
print(food_pic,end="\"}")
else:
print("\"ImgUrl:\":\"No record\"",end="}")
if i != len(eachfood)-1:
print(',',end="")
if __name__ == '__main__':
    food_id = 1  # a `global` statement is unnecessary at module level
    get_shop_link_list("https://gurunavi.com/en/reg/pf_tokyo/rs/srh/?p=461")
Upvotes: 1
Views: 955
Reputation: 4961
Here's an outline of what you can do: split the work into individual jobs, add the jobs to a job queue, and start as many processes as you need (you pass the job queue to each one).
Each process takes one job from the queue as long as there are still jobs to process.
This way the number of workers is configurable.
import multiprocessing
import queue  # Python 3 name of the old Queue module

def getData(tasksQ):
    while True:
        try:
            job = tasksQ.get(block=False)
        except queue.Empty:
            break
        else:
            pass  # <do_work>: handle one job here

if __name__ == '__main__':  # required on Windows, where child processes re-import this module
    tasks = multiprocessing.Queue()
    for job in getJob():  # getJob() is a placeholder: yield whatever units of work you have
        tasks.put(job)
    noOfProcesses = 10
    processes = []
    for i in range(noOfProcesses):
        p = multiprocessing.Process(target=getData, args=(tasks,))
        processes.append(p)
    for p in processes:
        p.start()
    for p in processes:
        p.join()
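Tying this back to the question: one job would be one shop URL, and <do_work> would be a call to parse_shop_information. A sketch of the worker with that substitution (illustrative; it assumes the shop URLs are collected from the listing pages first rather than parsed inline):

def getData(tasksQ):
    # Drain shop URLs from the shared queue until it is empty.
    while True:
        try:
            shop_url = tasksQ.get(block=False)
        except queue.Empty:
            break
        parse_shop_information(shop_url)  # the question's per-shop parser

Because each worker is a separate process, anything the workers produce should go through a results queue or per-worker files rather than shared state.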
Hope this helps.
Upvotes: 1