sb2002
sb2002

Reputation: 75

How can I avoid append errors using concurrent.futures in Python?

I'm trying to create a table with the information of an API. When I analyze game by game it works fine, but when I try to analyze a large number of games using "concurrent.futures" to speed up the process it appends some wrong information to the table. Every time I run the script the errors come up in different rows, it's aleatory.

Also, I have noticed printing the game_ids analyzed, that the script doesn't look them in order. Maybe the error is in this problem.

How can I solve this issue? Thank you!

This is the code I'm using.

import requests as r
import pandas as pd
import concurrent.futures

pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)

game_id = [100, 101, 102] #This is an example, I use a large number of games

d = {'game_id'        : [],
     'atbat_num'      : [],
     'play_index'     : [],
     'batter_id'      : [],
     'batter_name'    : [],
     'pitcher_id'     : [],
     'pitcher_name'   : [],
     'runner_id'      : [],
     'runner_name'    : [],
     'event'          : [],
     'start'          : [],
     'end'            : [],
     'movementReason' : []
        }

def get_url(gids):
    url = (f'http://examplelink.com/str(gids)}/')
    req  = r.get(url)
    json = req.json()

    for i in json['allPlays']:

        if 'runners' in i:

            for p in i['runners']:

                d['game_id'].append(gids)
                if 'atBatIndex' in i:
                    d['atbat_num'].append(i['atBatIndex'])
                else: d['atbat_num'].append(None)
                if 'playIndex' in p['details']:
                    d['play_index'].append(p['details']['playIndex'])
                else: d['play_index'].append(None)
                if 'matchup' in i:
                    if 'batter' in i['matchup']:
                        d['batter_id'].append(i['matchup']['batter']['id'])                    
                    else: d['batter_id'].append(None)
                else: d['batter_id'].append(None)
                if 'matchup' in i:
                    if 'batter' in i['matchup']:
                        d['batter_name'].append(i['matchup']['batter']['fullName'])
                    else: d['batter_name'].append(None)
                else: d['batter_name'].append(None)
                if 'matchup' in i:
                    if 'pitcher' in i['matchup']:
                        d['pitcher_id'].append(i['matchup']['pitcher']['id'])
                    else: d['pitcher_id'].append(None)
                else: d['pitcher_id'].append(None)
                if 'matchup' in i:
                    if 'pitcher' in i['matchup']:
                        d['pitcher_name'].append(i['matchup']['pitcher']['fullName'])
                    else: d['pitcher_name'].append(None)
                else: d['pitcher_name'].append(None)
                if 'details' in p:
                    if 'runner' in p['details']:
                        if 'id' in p['details']['runner']:
                            d['runner_id'].append(p['details']['runner']['id'])
                        else: d['runner_id'].append(None)
                    else: d['runner_id'].append(None)
                else: d['runner_id'].append(None)
                if 'details' in p:
                    if 'runner' in p['details']:
                        if 'fullName' in p['details']['runner']:
                            d['runner_name'].append(p['details']['runner']['fullName'])
                        else: d['runner_name'].append(None)
                    else: d['runner_name'].append(None)
                else: d['runner_name'].append(None)
                if 'details' in p:
                    d['event'].append(p['details']['event'])
                else: d['event'].append(None)
                if 'movement' in p:
                    d['start'].append(p['movement']['start'])
                else: d['start'].append(None)
                if 'movement' in p:
                    d['end'].append(p['movement']['end'])
                else: d['end'].append(None)
                if 'details' in p:
                    d['movementReason'].append(p['details']['movementReason'])
                else: d['movementReason'].append(None)

    print(f'Game {gids} analyzed')          

    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(get_url, game_id)

table = pd.DataFrame(d)

export_csv = table.to_csv ('runner.csv', index = None, header=True)

Upvotes: 0

Views: 176

Answers (1)

Oluwafemi Sule
Oluwafemi Sule

Reputation: 38952

Executor.map has func invoked concurrently so there isn't guarantee that the ordering of the results are according to the iterable.

I am assuming that the data is structured this way so that it can be later presented as tabular data using pandas library.

I suggest that you use a different data structure that doesn't care about the ordering such as a list. pandas.DataFrame data parameter can be dict of lists or list of dicts

d = []
game_id = [100, 101, 102] #This is an example, I use a large number of games

def get_url(gid):
    url = f"http://examplelink.com/{gid}/"
    req = r.get(url)
    json = req.json()

    for i in json["allPlays"]:
        for p in i.get("runners", []):
            matchup = i.get("matchup", {})
            batter = matchup.get("batter", {})
            pitcher = matchup.get("pitcher", {})
            details = p.get("details", {})
            runner = details.get("runner", {})
            event = details.get("event", {})

            d.append(
                dict(
                    game_id=gid,
                    atbat_num=i.get("atBatIndex"),
                    play_index=details.get("playIndex"),
                    batter_id=batter.get("id"),
                    batter_name=batter.get("fullName"),
                    pitcher=pitcher.get("id"),
                    pitcher_name=pitcher.get("fullName"),
                    runner_id=runner.get("id"),
                    runner_name=runner.get("fullName"),
                    event=details.get("event"),
                    start=p.get("movement", {}).get("start"),
                    end=p.get("movement", {}).get("end"),
                    movementReason=details.get("movementReason"),
                )
            )

    print(f"Game {gid} analyzed")

with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(get_url, game_id)

Upvotes: 1

Related Questions