Vincent

Reputation: 137

Web Scraping data from multiple pages then appending it to csv file

I'm working on web scraping with Beautiful Soup to retrieve jobs from Indeed. My code works, but when it loops to the next page it overwrites the existing CSV file. I see from other posts that I may need to use pandas concat, but I can't figure out how to get it to work or where to implement it in my source code. Any suggestions to improve my code would also be greatly appreciated.

The code below scrapes pages 1-2 on Indeed.

from bs4 import BeautifulSoup
import requests, pandas as pd
from urllib.parse import urljoin


print('Getting new jobs...')

main_url = 'https://www.indeed.com/jobs?q=web+developer&l=Sacramento,+CA&sort=date'
start_from = '&start='  


for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

    home = 'https://www.indeed.com/viewjob?'
    jobsTitle, companiesName, citiesName, jobsSummary, jobsLink = [], [], [], [], []
    target = soup.find_all('div', class_=' row result')

    for div in target:

        if div:
            title = div.find('a', class_='turnstileLink').text.strip()
            jobsTitle.append(title)

            company = div.find('span', class_='company').text.strip()
            companiesName.append(company)

            city = div.find('span', class_='location').text.strip()
            citiesName.append(city)

            summary = div.find('span', class_='summary').text.strip()
            jobsSummary.append(summary)

            job_link = urljoin(home, div.find('a').get('href'))
            jobsLink.append(job_link)


    target2 = soup.find_all('div', class_='lastRow row result')
    for i in target2:
        title2 = i.find('a', class_='turnstileLink').text.strip()
        jobsTitle.append(title2)

        company2 = i.find('span', class_='company').text.strip()
        companiesName.append(company2)

        city2 = i.find('span', class_='location').text.strip()
        citiesName.append(city2)

        summary2 = i.find('span', class_='summary').text.strip()
        jobsSummary.append(summary2)

        jobLink2 = urljoin(home, i.find('a').get('href'))
        jobsLink.append(jobLink2)


    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
df

Upvotes: 2

Views: 1068

Answers (1)

jezrael

Reputation: 863481

You can create the data_record list outside the loop and pass it to the DataFrame constructor:

data_record = []
for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

...

    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])

Possible solution with concat:

dfs = []
for page in range(1, 3):
    page = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, page)  # get full url
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

...

    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
    dfs.append(df)

df_fin = pd.concat(dfs, ignore_index=True)
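
If the goal is to end up with a single CSV file, a minimal sketch is to write the combined DataFrame once after the loop finishes instead of writing inside it (the file name jobs.csv is just an example):

# write all pages to one CSV in a single call; the file name is an example
df_fin.to_csv('jobs.csv', index=False)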

Upvotes: 1
