Reputation: 93
I'm trying to scrape only unique URLs from a website and write them to a file as absolute links. When I initially scrape the site, I retrieve 253 links. However, when I use set() to keep only the unique links and convert them into absolute links, it returns only 1 URL. I played around with it and removed the unique_urls function, thinking that could be the cause, but it did the same thing. I'm new to Python and still grasping loops, so it could easily be something I've overlooked. Any ideas? Thank you.
import bs4
import requests
from bs4 import BeautifulSoup, SoupStrainer
import csv

url = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(url)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
results = soup.find_all("a")
print('Number of links retrieved: ', len(results))

def unique_urls(tags, url):
    cleaned_urls = str(link.get("href"))

for link in results:
    link = link.get("href")
    if link.startswith('/'):
        cleaned_urls.add('https://www.census.gov' + hrefs)
    elif link.startswith('#'):
        cleaned_urls.add(hrefs)
    else:
        unique_urls.add(cleaned_urls)

print('Number of links retrieved: ', len)
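For context, the loop above has a few issues: cleaned_urls is created as a str (strings have no .add() method), the name hrefs is never defined, and unique_urls is a function, not a set. A minimal sketch of what the loop appears to be aiming for, using urllib.parse.urljoin from the standard library to build the absolute links (variable names here are illustrative, not from the original post):

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "https://www.census.gov/programs-surveys/popest.html"
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

cleaned_urls = set()  # must be a set, not a str, so .add() works and duplicates collapse
for link in soup.find_all("a"):
    href = link.get("href")
    if href is None or href.startswith('#'):
        continue  # skip <a> tags without an href and in-page anchors
    cleaned_urls.add(urljoin(url, href))  # resolves relative paths like '/data' against the base URL

print('Number of unique absolute links:', len(cleaned_urls))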
Upvotes: 0
Views: 939
Reputation: 141
Use the code below to get only the unique URLs:
import requests
from bs4 import BeautifulSoup
import re

url = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(url)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
results = soup.find_all("a")
print('Number of links retrieved: ', len(results))

list_urls = []
for link in results:
    link = link.get("href")
    if link is None or link == "#content":
        continue  # skip <a> tags without an href and the in-page "#content" anchor
    if re.match(r"https://", link):  # keep only absolute https:// URLs
        list_urls.append(link)

print("******* total urls *********")
print("total count of urls: ", len(list_urls))
print("*********** after unique *****")
unique_urls = set(list_urls)  # a set stores each URL only once
print("length of unique urls:", len(unique_urls))
Upvotes: 1
Reputation: 955
Is this what you require? I actually get only 92 valid and unique URLs.
import requests
from bs4 import BeautifulSoup
from functools import partial
from operator import is_not
import re

# filter_null(iterable) keeps only the items that are not None
filter_null = partial(filter, partial(is_not, None))

checklink = []
url = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(url)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
results = soup.find_all("a")
print('Number of links retrieved: ', len(results))

for result in results:
    checklink.append(result.get('href'))

L = list(filter_null(checklink))               # drop <a> tags that had no href
regex = re.compile(r'http')
selected_files = list(filter(regex.match, L))  # keep only absolute http(s) links
distinctTitle = list(set(selected_files))      # de-duplicate
print('Number of unique and cleaned links retrieved: ', len(distinctTitle))
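The functools/operator pipeline above can also be expressed with plain comprehensions, which may be easier to read; this sketch should produce the same distinctTitle set under the same assumptions:

# Equivalent filtering with comprehensions instead of partial/is_not:
hrefs = [a.get('href') for a in results]  # raw hrefs; may contain None
absolute = [h for h in hrefs if h is not None and h.startswith('http')]
distinctTitle = list(set(absolute))       # de-duplicate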
Upvotes: 0