Web Scraping using python Beautifulsoup. How to scrape the text in the dd and dt element?

Question

I am trying to scrape the yellow pages for some company info. Everything is going smoothly so far. But I just cannnot get the text in the dd and dt element in the particular company's page. Could you be so kind and help me with it? Every suggestion is much appreciated! Thanks.

Here is my code: (I frist enter the website and get the search results. Then, I get the link for the individual companies' webpage and parse what's in there. The problem is that I cannot get the info stored in the dd elements in the individual company's page.)

from bs4 import BeautifulSoup as soup
import urllib.request
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from random import randint
import time
import socket

out_filename = "expeditors_in_NJ.csv"
headers = "business_name, business_type, business_website, business_phone, general_info, business_history, service_and_products, work_hours   
"
f = open(out_filename, "w")
f.write(headers)

for i in range(0,50):
    page_url = "https://www.yellowpages.com/search?search_terms=expeditors&geo_location_terms=NJ&page=" + str(i+1) + "&sort=&source=real-user"
    req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlopen(req)
    page_soup = soup(uClient, "html.parser")
    uClient.close()
    containers = page_soup.findAll("div", {"class", "v-card"})
    for container in containers: 
        business_name = container.find("a", {"class":"business-name"}).span.text
        link = str(container.find("a",{"class":"business-name"}).get('href'))
        container_url = "https://www.yellowbook.com" + link
        req_ = Request(container_url, headers={'User-Agent': 'Mozilla/5.0'})
        uClient = urlopen(req_)
        container_soup = soup(uClient, "html.parser")
        uClient.close()
        info_list = container_soup.findAll("dd")
        
        try:
            business_type = container.find("div",{"class":"categories"}).text
        except:
            business_type = str(None)
        try: 
            years_in_business = str(container.find("div",{"class":"count"}).text)
        except: 
            years_in_business = str(None)
        try: 
            business_website = container.find("a",{"class":"track-visit-website"}).get('href')
        except: 
            business_website = str(None)
        try:
            business_address = container.find("div",{"class":"street-address"}).text + " " + container.find("div",{"class":"locality"}).text
        except:
            business_address = str(None)
        try: 
            business_phone = container.find("div",{"class":"phones phone primary"}).text
        except: 
            business_phone = str(None)
        try:
            general_info = info_list[0].text
        except:
            general_info = str(None)
        try:
            work_hours = info_list[1].text
        except:
            work_hours = str(None)
        
        print("business name: " + business_name + "
")
        print("business type: " + business_type + "
")
        print("years_in_business: " + years_in_business + "
")
        print("business_website: " + business_website + "
")
        print("business_address: " + business_address + "
")
        print("business_phone: " + business_phone + "
")
        print("general_info: " + general_info + "
")
        print("work_hours: " + work_hours + "
")

        
        f.write(business_name.replace(",", "|") + ", " + 
                business_type.replace(",", "|").replace("/", "|") + ", " + 
                years_in_business.replace(",", "|").replace("/", "|") + ", " + 
                business_website.replace(",", "|").replace("/", "|") + ", " + 
                business_address.replace(",", "|").replace("/", "|") + ", " + 
                business_phone.replace(",", "|").replace("/", "|") + ", " + 
                general_info.replace(",", "|").replace("/", "|") +
                work_hours.replace(",", "|").replace("/", "|") +
                "
")

f.close()

If you want to modify the code in a great deal or do it in an completely different way, please give some explaination so I can uderstand. I am new to programming. Thanks a lot.

Web Scraping using python Beautifulsoup. How to scrape the text in the dd and dt element?

Answers (1)

Related Questions