Reputation: 63
How do I scrape data from the following page to get the company, name, address, city, state, zip, phone, email, and website as separate columns? https://directory.justice.org/SearchResult.asp?access=public&firstmiddlename=&middlename=&lastname=&maidenname=&firmname=&city=&provstateid=&zip=&countryid=&keyword=&areaofpractice=&areaofpractice2=Personal+Injury&sectiontype=&memtype=&sb=&gender=Any
I want to split each attorney's details, which are separated by br tags, into separate fields. I am stuck after finding the attorneys' details: how do I assign the text between the br tags to a value such as name, address, etc.?
import pandas as pd
from bs4 import BeautifulSoup, Tag
import requests
import re
# Accumulator for the parsed attorney records; this snippet does not fill it yet.
data = []

# Fetch the public directory search results for the "Personal Injury" practice area.
# NOTE: the query parameter is "&sectiontype="; the pasted URL had "&sect"
# collapsed into the "§" character, which glued it onto the areaofpractice2 value.
res = requests.get("https://directory.justice.org/SearchResult.asp?access=public&firstmiddlename=&middlename=&lastname=&maidenname=&firmname=&city=&provstateid=&zip=&countryid=&keyword=&areaofpractice=&areaofpractice2=Personal+Injury&sectiontype=&memtype=&sb=&gender=Any")
soup = BeautifulSoup(res.text, 'lxml')

# Select every <div style="float:left">; presumably one per attorney entry --
# verify against the page markup.
lawyers = soup.find_all('div', {'style': 'float:left'})
Upvotes: 1
Views: 146
Reputation: 63
import pandas as pd
from bs4 import BeautifulSoup, Tag
import requests
import re
# Output column order for the resulting DataFrame.
COLUMNS = ['Company', 'Name', 'Address', 'City', 'State', 'Zip', 'Phone', 'Email', 'Website']


def _node_text(nodes, index):
    """Return the stripped text of ``nodes[index]``, or "" if the entry is shorter."""
    if index >= len(nodes):
        return ""
    node = nodes[index]
    # Tags expose their text through .text; plain text nodes are str subclasses.
    raw = node.text if isinstance(node, Tag) else str(node)
    return raw.strip()


def _split_city_state_zip(line):
    """Split a combined locality line into a ``(city, state, zip)`` tuple.

    Assumes the directory prints the line as "City, ST 12345" (optionally
    ZIP+4) -- TODO confirm against the live page.  If the text does not match
    that layout, the whole line is returned as the city and the other two
    parts are empty, so no scraped text is dropped.
    """
    # Collapse runs of whitespace (including non-breaking spaces) to one space.
    line = " ".join(str(line).split())
    match = re.match(
        r"^(?P<city>.+?),?\s+(?P<state>[A-Za-z]{2})\s+(?P<zip>\d{5}(?:-\d{4})?)$",
        line,
    )
    if match is None:
        return line, "", ""
    return match.group("city"), match.group("state"), match.group("zip")


data = []

# NOTE: the query parameter is "&sectiontype="; the pasted URL had "&sect"
# collapsed into the "§" character, which glued it onto the areaofpractice2 value.
res = requests.get("https://directory.justice.org/SearchResult.asp?access=public&firstmiddlename=&middlename=&lastname=&maidenname=&firmname=&city=&provstateid=&zip=&countryid=&keyword=&areaofpractice=&areaofpractice2=Personal+Injury&sectiontype=&memtype=&sb=&gender=Any")
soup = BeautifulSoup(res.text, 'lxml')

# Select every <div style="float:left">; presumably one per attorney entry --
# verify against the page markup.
lawyer = soup.find_all('div', {'style': 'float:left'})

for item in lawyer:
    nodes = item.contents
    # The child positions below are carried over unchanged from the original
    # snippet; they depend on the page's markup and should be re-checked if
    # the columns come out shifted.
    lawyer_city, lawyer_state, lawyer_zip = _split_city_state_zip(_node_text(nodes, 6))
    full_dict = {
        'Company': _node_text(nodes, 0),
        'Name': _node_text(nodes, 2),
        'Address': _node_text(nodes, 4),
        'City': lawyer_city,
        'State': lawyer_state,
        'Zip': lawyer_zip,
        'Phone': _node_text(nodes, 8),
        # The original filled this column from the phone value by mistake.
        'Email': _node_text(nodes, 11),
        'Website': _node_text(nodes, 13),
    }
    data.append(full_dict)

# Passing columns keeps the header even when no entries were found.
df = pd.DataFrame(data, columns=COLUMNS)
print(df)
Upvotes: 0
Reputation: 24940
Try it this way:
from bs4 import BeautifulSoup, Tag, NavigableString
import pandas as pd
import requests
# Fetch the public directory search results for the "Personal Injury" practice area.
# NOTE: the query parameter is "&sectiontype="; the pasted URL had "&sect"
# collapsed into the "§" character, which glued it onto the areaofpractice2 value.
res = requests.get("https://directory.justice.org/SearchResult.asp?access=public&firstmiddlename=&middlename=&lastname=&maidenname=&firmname=&city=&provstateid=&zip=&countryid=&keyword=&areaofpractice=&areaofpractice2=Personal+Injury&sectiontype=&memtype=&sb=&gender=Any")
soup = BeautifulSoup(res.text, 'lxml')

# Select every <div style="float:left">; presumably one per attorney entry --
# verify against the page markup.
lawyers = soup.find_all('div', {'style': 'float:left'})

# One list of text fragments per selected div, in document order.
roster = []
for law in lawyers:
    data = []
    for item in law:
        if isinstance(item, Tag):
            # Keep a tag only when it carries visible text; this skips
            # empty elements such as <br>.
            text = item.text.strip()
            if text:
                data.append(text)
        elif isinstance(item, NavigableString):
            # Bare text nodes are always kept, so a whitespace-only node
            # becomes an empty string in the row.
            data.append(item.strip())
    roster.append(data)

# Rows may differ in length; pandas pads the shorter ones with missing values.
df = pd.DataFrame(roster)
# Shows the first rows only in an interactive session or notebook.
df.head()
Upvotes: 2