Reputation: 331
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
def scrap_hrefs(url, baseUrl):
    resp = requests.get(url, headers=header)
    respData = BeautifulSoup(resp.content, 'html.parser')
    allHrefs = respData.select('[href]')
    return allHrefs, baseUrl

def get_hrefs(allHrefs, baseUrl):
    for i in range(0, len(allHrefs)):
        if allHrefs[i]['href'].startswith('/'):
            allHrefs[i] = baseUrl + allHrefs[i]['href']
        else:
            allHrefs[i] = allHrefs[i]['href']
    return allHrefs

def clean_hrefs(allHrefs):
    links = {'links': allHrefs}
    df = pd.DataFrame(links).drop_duplicates()
    df = df[df['links'].str.contains('financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings|news|media')]
    for i in range(0, len(df)):
        if df[i]['links'].str.find('financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings') != -1:
            df[i]['segments'] = df['Finance']
        else:
            continue
    return df

def store_hrefs(df):
    df.to_csv("testing.csv", index=False)

def run_scraper(url, baseUrl):
    store_hrefs(clean_hrefs(get_hrefs(*scrap_hrefs(url, baseUrl))))

run_scraper('https://www.example.com/', 'https://www.example.com')
In the clean_hrefs() function, I want to get the first link from the data frame and check whether its content has any of the words 'finance', 'investors', 'ir', 'report', 'filings'. If it does, I want to create another column called 'segments' and assign it the id 'FINANCE'. But it's giving an error: KeyError: 0. Any help would be much appreciated!
Upvotes: 1
Views: 59
Reputation: 862511
You can set the column with a boolean mask, similar to your filtering; rows with no match get missing values:
mask = df['links'].str.contains('financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings')
df.loc[mask, 'segments'] = 'Finance'
which works like (with numpy imported as np):
df['segments'] = np.where(mask, 'Finance', np.nan)
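For example, here is a minimal sketch with made-up data (the column name links matches the question, the sample URLs are invented) showing the idea:

import numpy as np
import pandas as pd

# hypothetical sample data, for illustration only
df = pd.DataFrame({'links': ['https://example.com/investors',
                             'https://example.com/about',
                             'https://example.com/financial-reports']})

mask = df['links'].str.contains('financial|investors|report')
df.loc[mask, 'segments'] = 'Finance'
# rows 0 and 2 get 'Finance', row 1 stays NaN because it matches nothing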
EDIT:
If you want to set multiple values, you can specify the new values in a dictionary and then set the segment column like:
d = {'INVESTOR': 'financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings',
     'NEWS': 'news|media'}

for k, v in d.items():
    df.loc[df['links'].str.contains(v, na=False), 'segmentID'] = k
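A small sketch with invented URLs, assuming the same links column, shows how the loop fills the new column:

import pandas as pd

# hypothetical sample data, for illustration only
df = pd.DataFrame({'links': ['https://example.com/investors',
                             'https://example.com/news',
                             'https://example.com/contact']})

d = {'INVESTOR': 'financial|investors|report|filings',
     'NEWS': 'news|media'}

for k, v in d.items():
    df.loc[df['links'].str.contains(v, na=False), 'segmentID'] = k
# row 0 -> 'INVESTOR', row 1 -> 'NEWS', row 2 stays NaN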
Upvotes: 2