Reputation: 3
I tried to write code that calls several URLs and saves the entire scraped text to a .txt file, but I can't figure out where to add a loop without breaking everything.
This is what the code looks like right now:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from dhooks import Webhook, Embed

def getReadMe():
    with open('urls.txt', 'r') as file:
        return file.read()

def getHtml(readMe):
    ua = UserAgent()
    header = {'user-agent': ua.random}
    response = requests.get(readMe, headers=header, timeout=3)
    response.raise_for_status()
    return response.content

readMe = getReadMe()
print(readMe)

html = getHtml(readMe)
soup = BeautifulSoup(html, 'html.parser')
text = soup.find_all(text=True)

output = ''
blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    'style'
]

for t in text:
    if t.parent.name not in blacklist:
        output += '{} '.format(t)

print(output)

with open("copy.txt", "w") as file:
    file.write(str(output))
Upvotes: 0
Views: 236
Reputation: 3
A friend of a friend provided an answer, which works at least for English URLs. As soon as I find a solution for German URLs (right now they crash terribly ^^) I will post it too:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

def getHtml(url):
    ua = UserAgent()
    header = {'user-agent': ua.random}
    response = requests.get(url, headers=header, timeout=10)
    response.raise_for_status()
    return response.content

blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    'style'
]

with open('urls.txt', 'r') as fd:
    for i, line in enumerate(fd.readlines()):
        url = line.strip()
        print("scraping " + url + "...")
        html = getHtml(url)
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.find_all(text=True)
        output = ''
        for t in text:
            if t.parent.name not in blacklist:
                output += '{} '.format(t)
        # drop everything outside ASCII so the write never crashes;
        # this is also why German umlauts get lost
        output = output.encode('utf-8').decode('ascii', 'ignore')
        # one numbered output file per URL
        with open("{}.txt".format(i), "w") as out_fd:
            out_fd.write(output)
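In case it helps anyone with the same non-English problem: the ascii/'ignore' round-trip above avoids the crash by throwing the umlauts away. A minimal sketch, assuming the crashes are a UnicodeEncodeError from writing characters the platform's default file encoding cannot represent, is to open the output file as UTF-8 instead (same names as in the code above):

# assumption: the German-page crashes happen at write time, not at parse time
with open("{}.txt".format(i), "w", encoding="utf-8") as out_fd:
    out_fd.write(output)  # umlauts like ä/ö/ü/ß are kept, not stripped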
Upvotes: 0
Reputation: 677
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

def getReadMe():
    with open('urls.txt', 'r') as file:
        return file.read()

def getHtml(url):
    ua = UserAgent()
    header = {'user-agent': ua.random}
    response = requests.get(url, headers=header, timeout=3)
    response.raise_for_status()
    return response.content

readMe = getReadMe()
print(readMe)

blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    'style'
]

# splitlines() gives one URL per iteration; iterating the raw string
# would loop over single characters instead
for line in readMe.splitlines():
    url = line.strip()
    html = getHtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.find_all(text=True)
    output = ''
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    print(output)
    # mode "a" appends each page's text to the file instead of overwriting it
    with open("copy.txt", "a") as file:
        file.write(output)
Try this one and see if it works.
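One caveat whichever version you use: raise_for_status() plus a 3-second timeout means a single dead URL aborts the whole run. A sketch of wrapping the fetch so the loop skips failures instead (reusing getHtml from above; the try/except is an addition, not part of the answer):

for line in readMe.splitlines():
    url = line.strip()
    if not url:
        continue  # ignore blank lines in urls.txt
    try:
        html = getHtml(url)
    except requests.RequestException as err:
        # covers timeouts, connection errors and the HTTPError from raise_for_status()
        print("skipping {}: {}".format(url, err))
        continue
    # ... parse and append to copy.txt as above ...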
Upvotes: 1