Reputation: 97
I have been trying to webscrape a realtor website using BeautifulSoup and encountered 2 difficulties that I cannot seem to fix.
Difficulties:
for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
for tag2 in tag.findAll('div', attrs={'class':'announcement-block__date'}):
Thank you in advance for any assistance you could provide.
Python Code:
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd
def get_headers():
    """Return the browser-like HTTP request headers used for every page fetch.

    A fresh dict is built on each call, so callers may modify the result
    without affecting later requests.
    """
    accept = ('text/html,application/xhtml+xml,application/xml;q=0.9,'
              'image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3;q=0.9')
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.122 Safari/537.36')
    return {
        'accept': accept,
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': user_agent,
    }
# SSL context with hostname checking and certificate verification disabled.
# NOTE(review): ctx is never passed to urlopen() below, so it has no effect on
# the requests made by this script.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

count = 1  # for pagination

# One accumulator list per output column.
title = []
description = []
date = []

urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']

# Class attribute of the <div> that wraps a single listing.
container_class = 'announcement-block-text-container announcement-block__text-container'

# (tag name, CSS class, destination list) for each scraped field.
# Each field is collected independently of the others, so a listing that lacks
# one of these elements contributes nothing to that field's list and the three
# lists can end up with different lengths.
field_specs = (
    ('a', 'announcement-block__title', title),
    ('div', 'announcement-block__description', description),
    ('div', 'announcement-block__date', date),
)

for base_url in urls:
    count = 1
    page_url = base_url
    while count < 2:  # will get only 1st page
        print(page_url)
        req = Request(page_url, headers=get_headers())  # send all headers
        htmlfile = urlopen(req)
        htmltext = htmlfile.read()
        soup = bsoup(htmltext, 'html.parser')

        containers = soup.findAll('div', attrs={'class': container_class})
        for tag_name, css_class, bucket in field_specs:
            for container in containers:
                for node in container.findAll(tag_name, attrs={'class': css_class}):
                    text = node.get_text().strip()
                    # Empty text is recorded as the 'N/A' placeholder.
                    bucket.append(text or 'N/A')

        # Go to next page
        count += 1
        page = '?page=' + str(count)
        page_url = base_url + page

# zip() stops at the shortest list, so the frame has as many rows as the
# shortest of title / description / date.
rows = list(zip(title, description, date))
data_frame = pd.DataFrame(rows, columns=['Title', 'Description', 'Date'])
Upvotes: 0
Views: 880
Reputation: 310
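You get only 66 rows because your date[] list contains only 66 elements, and zip() stops at the shortest of its inputs. To keep the three lists aligned, you need to extract all three fields for each listing at once, inside a single for loop.
Your if/else checks do nothing: there are no announcement-block__date divs with empty content on the page.
When a listing has no date, the div appears to be missing altogether, so the inner loop never runs for that listing and nothing (not even 'N/A') is appended.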
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd
def get_headers():
    """Build the HTTP headers sent with each request.

    The values imitate a desktop Chrome browser. A new dict is created on
    every call.
    """
    headers = dict([
        ('accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,'
         'image/webp,image/apng,*/*;q=0.8,'
         'application/signed-exchange;v=b3;q=0.9'),
        ('accept-language', 'en-US,en;q=0.9'),
        ('cache-control', 'max-age=0'),
        ('upgrade-insecure-requests', '1'),
        ('user-agent',
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
         'AppleWebKit/537.36 (KHTML, like Gecko) '
         'Chrome/80.0.3987.122 Safari/537.36'),
    ])
    return headers
# SSL context with hostname checking and certificate verification disabled.
# NOTE(review): ctx is never passed to urlopen() below, so requests still use
# the default (verifying) TLS settings. Pass context=ctx only if skipping
# certificate verification is really intended.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

count = 1  # for pagination

# One list per output column; all three are appended to together, once per
# listing, so they always have the same length.
info = {
    'title': [],
    'description': [],
    'date': []
}

urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']

for x in urls:
    count = 1
    y = x  # keep the base URL; x is overwritten with the paginated URL
    while count < 2:  # will get only 1st page
        print(x)
        req = Request(x, headers=get_headers())  # send all headers
        # Read the body inside a with-block so the HTTP response is closed
        # promptly instead of being left open.
        with urlopen(req) as htmlfile:
            htmltext = htmlfile.read()
        soup = bsoup(htmltext, 'html.parser')

        # Look up all three fields inside each listing container. A missing
        # element makes find() return None, which is recorded as 'N/A' so the
        # rows stay aligned.
        for tag in soup.find_all('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            title = tag.find('a', attrs={'class': 'announcement-block__title'})
            description = tag.find('div', attrs={'class': 'announcement-block__description'})
            date = tag.find('div', attrs={'class': 'announcement-block__date'})
            info['title'].append(title.get_text().strip() if title else 'N/A')
            info['description'].append(description.get_text().strip() if description else 'N/A')
            info['date'].append(date.get_text().strip() if date else 'N/A')

        # Go to next page
        count = count + 1
        page = '?page=' + str(count)
        x = y + page

data_frame = pd.DataFrame(
    list(zip(info['title'], info['description'], info['date'])),
    columns=['Title', 'Description', 'Date'],
)
print(len(info['title']), len(info['description']), len(info['date']))
print(data_frame)
About your second question, a similar question has already been answered here
Upvotes: 1