Beautiful soup get text for multiple pages

Question

I try to scrape news pages of a German party and store all information in a dataframe ("python beginner"). There exist only a small problem, when I want to store the whole text or even the date into the dataframe. It seems like that only the last element of the text (p ... /p) will be stored in the row. I think the problem occurs because the iteration over the loop is misleading.

import pandas as pd
import requests 
from time import sleep
from random import randint
from time import time
import numpy as np
from urllib.request import urlopen

data = pd.DataFrame()
teaser = ()
title = []
content = ()
childrenUrls = []
mainPage = "https://www.fdp.de"
start_time = time()
counter = 0

#for i in list(map(lambda x: x+1, range(3))):
for i in range(3):

    counter = counter + 1
    sleep(randint(1,3))
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(counter, counter/elapsed_time))
    url = "https://www.fdp.de/seite/aktuelles?page="+str(i)
    #print(url)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    uls = soup.find_all('div', {'class': 'field-title'})

    for ul in uls:
        for li in ul.find_all('h2'):
            for link in li.find_all('a'):
                url = link.get('href')
                contents = link.text
                print(contents)
                childrenUrls = mainPage+url
                print(childrenUrls)

                childrenPages = urllib2.urlopen(childrenUrls)
                soupCP = BeautifulSoup(childrenPages, 'html.parser')

                #content1 = soupCP.findAll('p').get_text()
                #print(content1)

                for content in soupCP.findAll('p'):
                    #for message in content.get('p'):
                    content = content.text.strip()
                    print(content)

                for teaser in soupCP.find_all('div', class_ = 'field-teaser'):
                    teaser = teaser.text.strip()
                    print(date)

                for title in soupCP.find_all('title'):
                    title = title.text.strip()
                    print(ttt)

                df = pd.DataFrame(
                    {'teaser': teaser,
                     'title' : title,
                    'content' : content}, index=[counter])

                data = pd.concat([data, df])
    #join(str(v) for v in value_list)

Beautiful soup get text for multiple pages

Answers (1)

Related Questions