Reijarmo

Reputation: 3

How to scrape the text body from multiple pages based on URLs in a txt file

I tried to write code that calls several URLs and then saves the entire scraped text to a txt file, but I cannot figure out where to implement a loop without breaking everything.

This is how the code looks right now:

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from dhooks import Webhook, Embed


def getReadMe():
    with open('urls.txt','r') as file:
        return file.read()

def getHtml(readMe):
    ua = UserAgent()
    header = {'user-agent':ua.random}
    response = requests.get(readMe,headers=header,timeout=3)
    response.raise_for_status() 
    return response.content

readMe = getReadMe()


print(readMe)


html = getHtml(readMe)
soup = BeautifulSoup(html, 'html.parser')
text = soup.find_all(text=True)


output =''


blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head', 
    'input',
    'script',
    'style'
    
]


for t in text:
    if t.parent.name not in blacklist:
        output += '{} '.format(t)

print(output)

with open("copy.txt", "w") as file:
    file.write(str(output))

Upvotes: 0

Views: 236

Answers (2)

Reijarmo

Reputation: 3

A friend of a friend provided me with an answer, which works at least with English URLs. As soon as I find a solution for German URLs (right now they crash terribly ^^), I will post it too:

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from dhooks import Webhook, Embed

def getHtml(url):
    ua = UserAgent()
    header = {'user-agent':ua.random}
    # pass the randomized user agent along with the request
    response = requests.get(url, headers=header, timeout=10)
    response.raise_for_status() 
    return response.content

with open('urls.txt','r') as fd:
    for i, line in enumerate(fd.readlines()):
        url = line.strip()
        print("scraping " + url + "...")

              
        html = getHtml(url)
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.find_all(text=True)

        output =''

        blacklist = [
            '[document]',
            'noscript',
            'header',
            'html',
            'meta',
            'head', 
            'input',
            'script',
            'style'
            
        ]


        for t in text:
            if t.parent.name not in blacklist:
                output += '{} '.format(t)

        with open("{}.txt".format(i), "w") as out_fd:
            out_fd.write(output)
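If the German URLs crash with a UnicodeEncodeError when the file is written, the cause is usually the platform-dependent default encoding of open() rather than the scraping itself. That is only a guess at what "crash" means here, but as a minimal sketch, the last two lines of the loop could pass an explicit encoding:

        # UTF-8 handles umlauts and other non-ASCII characters
        # that the default locale encoding may reject
        with open("{}.txt".format(i), "w", encoding="utf-8") as out_fd:
            out_fd.write(output)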

Upvotes: 0

alphaBetaGamma

Reputation: 677

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from dhooks import Webhook, Embed


def getReadMe():
    with open('urls.txt','r') as file:
        return file.read()

def getHtml(readMe):
    ua = UserAgent()
    header = {'user-agent':ua.random}
    response = requests.get(readMe,headers=header,timeout=3)
    response.raise_for_status() 
    return response.content

readMe = getReadMe()


print(readMe)

# split the file contents into lines so each iteration gets one URL
# (iterating over the string directly would loop over single characters)
for line in readMe.splitlines():
    html = getHtml(line.strip())
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.find_all(text=True)
    output =''
    blacklist = [
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head', 
        'input',
        'script',
        'style'
        
    ]
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)

    print(output)
    # opening with mode "a" appends the new data to the file instead of overwriting it
    with open("copy.txt", "a") as file:
        file.write(str(output))

Try this one and see if it works.
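As a side note that is not part of either answer: BeautifulSoup can also filter the unwanted tags itself, which avoids the manual blacklist loop. A rough sketch of the same extraction using decompose() and get_text() might look like this:

soup = BeautifulSoup(html, 'html.parser')
# remove the tags whose text should not appear in the output
for tag in soup(['script', 'style', 'noscript', 'header', 'head', 'meta', 'input']):
    tag.decompose()
# collect the remaining visible text in one call
output = soup.get_text(separator=' ', strip=True)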

Upvotes: 1
