ZY Cao

Reputation: 33

Web scraping using Python BeautifulSoup: how to scrape the text in dd and dt elements?

I am trying to scrape the Yellow Pages for some company info. Everything is going smoothly so far, but I just cannot get the text in the dd and dt elements on a particular company's page. Could you help me with it? Every suggestion is much appreciated. Thanks.
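
Conceptually, what I want is each dd value paired with its dt label, as in this toy example (the markup here is made up just to show what I mean; on the real company page my code comes back with nothing):

from bs4 import BeautifulSoup

# Hypothetical markup, only to illustrate the dt/dd pairing I am after.
html = """
<dl>
  <dt>General Info</dt><dd>Freight forwarding and customs brokerage.</dd>
  <dt>Regular Hours</dt><dd>Mon - Fri 9:00 am - 5:00 pm</dd>
</dl>
"""

page = BeautifulSoup(html, "html.parser")
for dt in page.find_all("dt"):
    dd = dt.find_next_sibling("dd")  # the value that belongs to this label
    print(dt.get_text(strip=True), "->", dd.get_text(strip=True) if dd else None)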

Here is my code. (I first enter the website and get the search results, then follow the link to each company's page and parse what's there. The problem is that I cannot get the info stored in the dd elements on the individual company's page.)

from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen

out_filename = "expeditors_in_NJ.csv"
headers = "business_name, business_type, years_in_business, business_website, business_address, business_phone, general_info, work_hours\n"
f = open(out_filename, "w")
f.write(headers)

for i in range(0,50):
    page_url = "https://www.yellowpages.com/search?search_terms=expeditors&geo_location_terms=NJ&page=" + str(i+1) + "&sort=&source=real-user"
    req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlopen(req)
    page_soup = soup(uClient, "html.parser")
    uClient.close()
    containers = page_soup.find_all("div", {"class": "v-card"})  # one card per search result
    for container in containers: 
        business_name = container.find("a", {"class":"business-name"}).span.text
        link = str(container.find("a",{"class":"business-name"}).get('href'))
        container_url = "https://www.yellowpages.com" + link  # detail pages live on the same domain as the search
        req_ = Request(container_url, headers={'User-Agent': 'Mozilla/5.0'})
        uClient = urlopen(req_)
        container_soup = soup(uClient, "html.parser")
        uClient.close()
        info_list = container_soup.find_all("dd")  # all <dd> values on the company's detail page
        
        try:
            business_type = container.find("div",{"class":"categories"}).text
        except:
            business_type = str(None)
        try: 
            years_in_business = str(container.find("div",{"class":"count"}).text)
        except: 
            years_in_business = str(None)
        try: 
            business_website = container.find("a",{"class":"track-visit-website"}).get('href')
        except: 
            business_website = str(None)
        try:
            business_address = container.find("div",{"class":"street-address"}).text + " " + container.find("div",{"class":"locality"}).text
        except:
            business_address = str(None)
        try: 
            business_phone = container.find("div",{"class":"phones phone primary"}).text
        except: 
            business_phone = str(None)
        try:
            general_info = info_list[0].text
        except:
            general_info = str(None)
        try:
            work_hours = info_list[1].text
        except:
            work_hours = str(None)
        
        print("business name: " + business_name + "\n")
        print("business type: " + business_type + "\n")
        print("years_in_business: " + years_in_business + "\n")
        print("business_website: " + business_website + "\n")
        print("business_address: " + business_address + "\n")
        print("business_phone: " + business_phone + "\n")
        print("general_info: " + general_info + "\n")
        print("work_hours: " + work_hours + "\n")

        
        f.write(business_name.replace(",", "|") + ", " + 
                business_type.replace(",", "|").replace("/", "|") + ", " + 
                years_in_business.replace(",", "|").replace("/", "|") + ", " + 
                business_website.replace(",", "|").replace("/", "|") + ", " + 
                business_address.replace(",", "|").replace("/", "|") + ", " + 
                business_phone.replace(",", "|").replace("/", "|") + ", " + 
                general_info.replace(",", "|").replace("/", "|") + ", " + 
                work_hours.replace(",", "|").replace("/", "|") +
                "\n")

f.close()

If you want to modify the code a great deal or do it in a completely different way, please give some explanation so I can understand. I am new to programming. Thanks a lot.

Upvotes: 0

Views: 207

Answers (1)

import httpx
import trio
from bs4 import BeautifulSoup
import csv

# Allow at most six requests in flight at any one time.
limit = trio.CapacityLimiter(6)


async def scrape(client, item, sender):
    # Fetch one company's detail page and pull out the fields we want.
    # item is a (business name, detail-page href) tuple produced by worker().
    async with limit, sender:
        r = await client.get(f'https://www.yellowpages.com{item[1]}')
        soup = BeautifulSoup(r.text, 'lxml')

        try:
            bw = soup.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            bw = None
        try:
            phone = soup.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone = None
        try:
            biy = soup.select_one('.number').text
        except AttributeError:
            biy = None

        result = [item[0], bw, biy, phone]
        print(result)
        await sender.send(result)


async def worker(client, num, sender, nurse):
    # Fetch one page of search results and spawn a scrape() task per listing.
    async with limit, sender:
        params = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user"
        }
        r = await client.get('https://www.yellowpages.com/search', params=params)
        soup = BeautifulSoup(r.text, 'lxml')
        # (business name, detail-page href) for every listing on this results page
        goal = [(i.span.text, i['href'])
                for i in soup.select('.business-name')]
        for what in goal:
            nurse.start_soon(scrape, client, what, sender.clone())


async def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        client.headers.update(headers)

        # rec() consumes results from this channel and streams them into the CSV.
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)

        async with sender:
            for item in range(1, 2):  # only the first results page; widen the range for more
                nurse.start_soon(worker, client, item, sender.clone(), nurse)


async def rec(receiver):
    # Receive scraped rows from the channel and write them to result.csv as they arrive.
    with open('result.csv', 'w', buffering=1, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Business Name', 'Website',
                        'Years In Business', 'Phone'])
        async with receiver:
            async for value in receiver:
                writer.writerow(value)

if __name__ == "__main__":
    trio.run(main)
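
The question also asked specifically about the dd and dt elements (general info, hours) on the detail page, which the code above does not cover. A rough sketch, assuming those pages expose the info as a definition list (the markup and the 'General Info'/'Regular Hours' labels are assumptions, not verified against the live site), would be to pair each dt label with its dd value and look fields up by name inside scrape():

def extract_dl_pairs(soup):
    # Return a {dt label: dd text} dict for every dt/dd pair on the page.
    pairs = {}
    for dt in soup.find_all('dt'):
        dd = dt.find_next_sibling('dd')
        if dd:
            pairs[dt.get_text(strip=True)] = dd.get_text(' ', strip=True)
    return pairs

# Inside scrape(), after building soup:
#   info = extract_dl_pairs(soup)
#   general_info = info.get('General Info')    # label names are assumptions
#   work_hours = info.get('Regular Hours')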

Upvotes: 1
