Reputation: 45
I'm trying to write a scraper that randomly chooses a wiki article link from a page, goes there, grabs another, and keeps looping. I want to exclude links with "Category:", "File:", or "List" in the href. I'm pretty sure the links I want are all inside of p tags, but when I include "p" in find_all, I get an "int object is not subscriptable" error.
The code below returns wiki pages but does not exclude the things I want to filter out.
This is a learning journey for me. All help is appreciated.
import requests
from bs4 import BeautifulSoup
import random
import time
# Fetch the page at `url`, print its title and URL, then choose a link from the
# page body and call scrapeWikiArticle again on it.
# NOTE(review): indentation was lost in this copy, so the nesting of the
# statements below cannot be confirmed from the text alone.
def scrapeWikiArticle(url):
response = requests.get(
url=url,
)
soup = BeautifulSoup(response.content, 'html.parser')
# The page heading element; its text is printed and later appended to the log file.
title = soup.find(id="firstHeading")
print(title.text)
print(url)
# Every <a> tag found under the element with id "bodyContent".
allLinks = soup.find(id="bodyContent").find_all("a")
# Shuffle in place so the loop below visits the links in random order.
random.shuffle(allLinks)
# Integer placeholder: if the loop never assigns a link, the later
# linkToScrape['href'] subscripts an int and raises a TypeError.
linkToScrape = 0
for link in allLinks:
# Here i am trying to select hrefs with /wiki/ in them and exclude hrefs with "Category:" etc. It does select for wikis but does not exclude anything.
# str.find returns the index of the first match, or -1 when the substring is
# absent, so the "== 1" tests below are true only when the substring starts
# at index 1 of the href.
if link['href'].find("/wiki/") == -1:
if link['href'].find("Category:") == 1:
if link['href'].find("File:") == 1:
if link['href'].find("List") == 1:
continue
# Use this link to scrape
linkToScrape = link
# Append the current page title to savedArticles.txt, followed by ", ".
articleTitles = open("savedArticles.txt", "a+")
articleTitles.write(title.text + ", ")
articleTitles.close()
# Pause for 6 seconds before leaving the loop.
time.sleep(6)
break
# Continue with the chosen link; the href is appended to the site root.
scrapeWikiArticle("https://en.wikipedia.org" + linkToScrape['href'])
# Start from the Anarchism article.
scrapeWikiArticle("https://en.wikipedia.org/wiki/Anarchism")
Upvotes: 0
Views: 254
Reputation: 84465
Use `:not()` to handle the list of exclusions within the href, alongside the `*=` contains operator. This will filter out hrefs containing (`*=`) the specified substrings. Precede this with an attribute = value selector requiring that the href contains (`*=`) `/wiki/`. I have specified a case-insensitive match via `i` for the first two exclusions, which can be removed:
import requests
from bs4 import BeautifulSoup as bs
# Anchors inside #bodyContent whose href contains "/wiki/" and none of the
# excluded substrings; the trailing `i` makes that comparison case-insensitive.
selector = (
    '#bodyContent a[href*="/wiki/"]'
    ':not([href*="Category:" i], [href*="File:" i], [href*="List"])'
)

r = requests.get('https://en.wikipedia.org/wiki/2018_FIFA_World_Cup#Prize_money')
soup = bs(r.content, 'lxml')  # 'html.parser' can be used instead of 'lxml'

# Collect the href of every anchor the selector matches.
links = [anchor['href'] for anchor in soup.select(selector)]
Upvotes: 1
Reputation: 1498
You need to modify the for loop. `.attrs` is used to access the attributes of a tag. To exclude a link when its href value contains a particular keyword, compare the result of `find()` with `!= -1`.
Modified code:
import requests
from bs4 import BeautifulSoup
import random
import time
_WIKI_ROOT = "https://en.wikipedia.org"
_EXCLUDED_SUBSTRINGS = ("Category:", "File:", "List")


def _is_article_href(href):
    """Return True if *href* is a site-relative /wiki/ link with no excluded substring."""
    # startswith (rather than a substring search) rejects absolute and
    # protocol-relative URLs that merely contain "/wiki/"; prefixing those
    # with the site root would build a malformed address.
    if not href.startswith("/wiki/"):
        return False
    return not any(marker in href for marker in _EXCLUDED_SUBSTRINGS)


def _scrape_once(url):
    """Fetch one page, record its title, and return the next URL to visit.

    Returns None when the page lacks the expected elements or offers no
    acceptable link, which ends the walk.
    """
    # A timeout keeps a stalled connection from hanging the walk forever.
    response = requests.get(url=url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find(id="firstHeading")
    body = soup.find(id="bodyContent")
    if title is None or body is None:
        return None

    # Anchors without an href are treated as an empty string and rejected.
    candidates = [
        link.attrs["href"]
        for link in body.find_all("a")
        if _is_article_href(link.attrs.get("href", ""))
    ]
    if not candidates:
        return None

    # A uniform pick among the acceptable links is equivalent to taking the
    # first acceptable link after shuffling the whole list.
    next_href = random.choice(candidates)

    # As before, the title is only recorded once a follow-up link exists.
    # An explicit encoding avoids errors on titles outside the platform default.
    with open("savedArticles.txt", "a+", encoding="utf-8") as article_titles:
        article_titles.write(title.text + ", ")

    # Pause between requests.
    time.sleep(6)
    return _WIKI_ROOT + next_href


def scrapeWikiArticle(url):
    """Walk from *url* through randomly chosen article links.

    Each visited page's title is appended to ``savedArticles.txt``. The walk
    ends when a page yields no acceptable link.

    Args:
        url: Absolute URL of the first article to visit.
    """
    # Iterate instead of recursing: one stack frame per hop would eventually
    # exceed the interpreter's recursion limit on a long walk.
    while url is not None:
        url = _scrape_once(url)


scrapeWikiArticle("https://en.wikipedia.org/wiki/Anarchism")
Upvotes: 1
Reputation: 2811
This section seems problematic.
# Excerpt quoted from the question: the last three checks compare the result
# of find() with 1, although find() returns -1 when the substring is absent.
if link['href'].find("/wiki/") == -1:
if link['href'].find("Category:") == 1:
if link['href'].find("File:") == 1:
if link['href'].find("List") == 1:
continue
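`find` returns the index of the substring you are looking for, or -1 when it is not found, so comparing the result with 1 only matches a substring that starts at index 1. The checks also need to be combined with `or` rather than chained one after another.
So if `/wiki/` is not found, or `Category:`, `File:`, etc. appears in the href, then continue.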
if link['href'].find("/wiki/") == -1 or \
link['href'].find("Category:") != -1 or \
link['href'].find("File:") != -1 or \
link['href'].find("List")!= -1 :
print("skipped " + link["href"])
continue
Saint Petersburg
https://en.wikipedia.org/wiki/St._Petersburg
National Diet Library
https://en.wikipedia.org/wiki/NDL_(identifier)
Template talk:Authority control files
https://en.wikipedia.org/wiki/Template_talk:Authority_control_files
skipped #searchInput
skipped /w/index.php?title=Template_talk:Authority_control_files&action=edit&section=1
User: Tom.Reding
https://en.wikipedia.org/wiki/User:Tom.Reding
skipped http://toolserver.org/~dispenser/view/Main_Page
Iapetus (moon)
https://en.wikipedia.org/wiki/Iapetus_(moon)
87 Sylvia
https://en.wikipedia.org/wiki/87_Sylvia
skipped /wiki/List_of_adjectivals_and_demonyms_of_astronomical_bodies
Asteroid belt
https://en.wikipedia.org/wiki/Main_asteroid_belt
Detached object
https://en.wikipedia.org/wiki/Detached_object
Upvotes: 1