I am trying to scrape the Yellow Pages for some company info. Everything is going smoothly so far, but I just cannot get the text in the dd and dt elements on a particular company's page. Could you be so kind as to help me with it? Every suggestion is much appreciated! Thanks.
Here is my code. (I first open the search results, then get the link to each individual company's page and parse what's in there. The problem is that I cannot get the info stored in the dd elements on the individual company's page.)
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen

out_filename = "expeditors_in_NJ.csv"
headers = "business_name, business_type, years_in_business, business_website, business_address, business_phone, general_info, work_hours\n"

f = open(out_filename, "w")
f.write(headers)

for i in range(0, 50):
    # Fetch one page of search results.
    page_url = "https://www.yellowpages.com/search?search_terms=expeditors&geo_location_terms=NJ&page=" + str(i + 1) + "&sort=&source=real-user"
    req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlopen(req)
    page_soup = soup(uClient, "html.parser")
    uClient.close()

    containers = page_soup.findAll("div", {"class": "v-card"})
    for container in containers:
        business_name = container.find("a", {"class": "business-name"}).span.text
        link = str(container.find("a", {"class": "business-name"}).get('href'))

        # Follow the link to the individual company's page.
        container_url = "https://www.yellowbook.com" + link
        req_ = Request(container_url, headers={'User-Agent': 'Mozilla/5.0'})
        uClient = urlopen(req_)
        container_soup = soup(uClient, "html.parser")
        uClient.close()

        # This is the part that fails: info_list always comes back empty.
        info_list = container_soup.findAll("dd")

        try:
            business_type = container.find("div", {"class": "categories"}).text
        except AttributeError:
            business_type = str(None)
        try:
            years_in_business = str(container.find("div", {"class": "count"}).text)
        except AttributeError:
            years_in_business = str(None)
        try:
            business_website = container.find("a", {"class": "track-visit-website"}).get('href')
        except AttributeError:
            business_website = str(None)
        try:
            business_address = container.find("div", {"class": "street-address"}).text + " " + container.find("div", {"class": "locality"}).text
        except AttributeError:
            business_address = str(None)
        try:
            business_phone = container.find("div", {"class": "phones phone primary"}).text
        except AttributeError:
            business_phone = str(None)
        try:
            general_info = info_list[0].text
        except IndexError:
            general_info = str(None)
        try:
            work_hours = info_list[1].text
        except IndexError:
            work_hours = str(None)

        print("business name: " + business_name + "\n")
        print("business type: " + business_type + "\n")
        print("years_in_business: " + years_in_business + "\n")
        print("business_website: " + business_website + "\n")
        print("business_address: " + business_address + "\n")
        print("business_phone: " + business_phone + "\n")
        print("general_info: " + general_info + "\n")
        print("work_hours: " + work_hours + "\n")

        f.write(business_name.replace(",", "|") + ", " +
                business_type.replace(",", "|").replace("/", "|") + ", " +
                years_in_business.replace(",", "|").replace("/", "|") + ", " +
                business_website.replace(",", "|").replace("/", "|") + ", " +
                business_address.replace(",", "|").replace("/", "|") + ", " +
                business_phone.replace(",", "|").replace("/", "|") + ", " +
                general_info.replace(",", "|").replace("/", "|") + ", " +
                work_hours.replace(",", "|").replace("/", "|") +
                "\n")

f.close()
If you want to modify the code a great deal or do it in a completely different way, please give some explanation so I can understand it. I am new to programming. Thanks a lot.
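Here is a different way to do it. The code below uses httpx for the HTTP requests and trio to run them concurrently: a CapacityLimiter keeps at most six requests in flight at once, each search results page spawns one scrape task per business it lists, and every result is sent through a memory channel to a single rec task, which is the only thing that writes to the CSV file, so the concurrent workers never contend for the file handle.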
import httpx
import trio
from bs4 import BeautifulSoup
import csv

# Allow at most 6 requests in flight at any one time.
limit = trio.CapacityLimiter(6)


async def scrape(client, item, sender):
    # item is a (business_name, relative_link) tuple from the search page.
    async with limit, sender:
        r = await client.get(f'https://www.yellowpages.com{item[1]}')
        soup = BeautifulSoup(r.text, 'lxml')
        try:
            bw = soup.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            bw = None
        try:
            phone = soup.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone = None
        try:
            biy = soup.select_one('.number').text
        except AttributeError:
            biy = None
        result = [item[0], bw, biy, phone]
        print(result)
        await sender.send(result)


async def worker(client, num, sender, nurse):
    async with limit, sender:
        params = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user"
        }
        r = await client.get('https://www.yellowpages.com/search', params=params)
        soup = BeautifulSoup(r.text, 'lxml')
        # Collect a (name, link) pair for every business on this results page.
        goal = [(i.span.text, i['href'])
                for i in soup.select('.business-name')]
        for what in goal:
            # Spawn one detail-page scraper per business.
            nurse.start_soon(scrape, client, what, sender.clone())


async def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        client.headers.update(headers)
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        async with sender:
            # Only the first results page; widen the range to scrape more.
            for item in range(1, 2):
                nurse.start_soon(worker, client, item, sender.clone(), nurse)


async def rec(receiver):
    # The single consumer: everything sent on the channel ends up in result.csv.
    with open('result.csv', 'w', buffering=1, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Business Name', 'Website',
                         'Years In Business', 'Phone'])
        async with receiver:
            async for value in receiver:
                writer.writerow(value)


if __name__ == "__main__":
    trio.run(main)
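Note that the code above pulls the website, phone, and years-in-business fields rather than the dd/dt text the question asked about. If the company page renders that info as a definition list, one way to get it is to walk the dt tags and take each one's following dd sibling. Here is a minimal sketch of that pairing; the HTML is made up for illustration, and the real page's structure and class names may differ:

from bs4 import BeautifulSoup

# Stand-in HTML: the markup on the live page will differ.
html = """
<dl>
  <dt>General Info</dt>
  <dd>Freight forwarding and customs brokerage.</dd>
  <dt>Hours</dt>
  <dd>Mon - Fri 9:00 am - 5:00 pm</dd>
</dl>
"""

soup = BeautifulSoup(html, 'html.parser')

# Pair each dt label with the dd element that immediately follows it.
details = {
    dt.get_text(strip=True): dt.find_next_sibling('dd').get_text(strip=True)
    for dt in soup.find_all('dt')
}
print(details)
# {'General Info': 'Freight forwarding and customs brokerage.', 'Hours': 'Mon - Fri 9:00 am - 5:00 pm'}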