user9762321

Reputation: 93

Scraping unique links from website with Python only retrieving 1 link

I'm trying to scrape only unique URLs from a website and write them to a file as absolute links. When I initially scrape the site, I retrieve 253 links. However, when I use set() to keep only unique links and add code to turn them into absolute links, it returns only 1 URL. I played around with it and removed the unique_urls function, thinking that could be the cause, but it did the same thing. I'm new to Python and still grasping loops, so it could easily be something I've overlooked. Any ideas? Thank you.

import bs4
import requests
from bs4 import BeautifulSoup, SoupStrainer
import csv

url = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(url)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
results = soup.find_all("a")

print ('Number of links retrieved: ', len (results))

def unique_urls(tags,url):
    cleaned_urls = str(link.get("href"))

for link in results:
    link = link.get("href")
    if link.startswith('/'):
        cleaned_urls.add('https://www.census.gov' + hrefs)
    elif link.startswith('#'):
        cleaned_urls.add(hrefs)
    else: 
        unique_urls.add(cleaned_urls)

print ('Number of links retrieved: ', len )

Upvotes: 0

Views: 939

Answers (2)

venkatesh

Reputation: 141

Use the code below to get the unique URLs:

import requests
from bs4 import BeautifulSoup
import re

url = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(url)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
results = soup.find_all("a")

print('Number of links retrieved: ', len(results))

list_urls = []
for link in results:
    href = link.get("href")  # href is None for <a> tags without one
    if href is None or href == "#content":
        continue
    if re.match(r"https://", href):  # keep only absolute https links
        list_urls.append(href)

print("******* total urls *********")
print("total count of urls:", len(list_urls))
print("*********** after unique *****")
unique_urls = set(list_urls)  # a set keeps each url only once
print("length of unique urls:", len(unique_urls))

Upvotes: 1

Prakhar Jhudele

Reputation: 955

Is this what you require? I actually get only 92 valid and unique URLs.

import requests
from bs4 import BeautifulSoup
from functools import partial
from operator import is_not
import re

# a reusable filter that drops None values from an iterable
filter_null = partial(filter, partial(is_not, None))

checklink = []

url = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(url)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
results = soup.find_all("a")

print ('Number of links retrieved: ', len (results))

for result in results:
    checklink.append(result.get('href'))

# drop None hrefs, keep only links that start with http, then deduplicate
L = list(filter_null(checklink))
regex = re.compile(r'http')
selected_files = list(filter(regex.match, L))
distinctTitle = list(set(selected_files))

print('Number of unique and cleaned links retrieved:', len(distinctTitle))
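
Neither answer actually writes the URLs to a file, which the question also asks for. A minimal sketch, assuming the distinctTitle list built above (the filename is arbitrary):

# write one URL per line; plain text is enough for a single column
with open('unique_urls.txt', 'w') as f:
    for u in sorted(distinctTitle):
        f.write(u + '\n')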

Upvotes: 0
