Reputation: 75
I'm trying to create a table with the information of an API. When I analyze game by game it works fine, but when I try to analyze a large number of games using "concurrent.futures" to speed up the process it appends some wrong information to the table. Every time I run the script the errors come up in different rows, it's aleatory.
Also, I have noticed printing the game_ids analyzed, that the script doesn't look them in order. Maybe the error is in this problem.
How can I solve this issue? Thank you!
This is the code I'm using.
import requests as r
import pandas as pd
import concurrent.futures
pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
game_id = [100, 101, 102] #This is an example, I use a large number of games
d = {'game_id' : [],
'atbat_num' : [],
'play_index' : [],
'batter_id' : [],
'batter_name' : [],
'pitcher_id' : [],
'pitcher_name' : [],
'runner_id' : [],
'runner_name' : [],
'event' : [],
'start' : [],
'end' : [],
'movementReason' : []
}
def get_url(gids):
url = (f'http://examplelink.com/str(gids)}/')
req = r.get(url)
json = req.json()
for i in json['allPlays']:
if 'runners' in i:
for p in i['runners']:
d['game_id'].append(gids)
if 'atBatIndex' in i:
d['atbat_num'].append(i['atBatIndex'])
else: d['atbat_num'].append(None)
if 'playIndex' in p['details']:
d['play_index'].append(p['details']['playIndex'])
else: d['play_index'].append(None)
if 'matchup' in i:
if 'batter' in i['matchup']:
d['batter_id'].append(i['matchup']['batter']['id'])
else: d['batter_id'].append(None)
else: d['batter_id'].append(None)
if 'matchup' in i:
if 'batter' in i['matchup']:
d['batter_name'].append(i['matchup']['batter']['fullName'])
else: d['batter_name'].append(None)
else: d['batter_name'].append(None)
if 'matchup' in i:
if 'pitcher' in i['matchup']:
d['pitcher_id'].append(i['matchup']['pitcher']['id'])
else: d['pitcher_id'].append(None)
else: d['pitcher_id'].append(None)
if 'matchup' in i:
if 'pitcher' in i['matchup']:
d['pitcher_name'].append(i['matchup']['pitcher']['fullName'])
else: d['pitcher_name'].append(None)
else: d['pitcher_name'].append(None)
if 'details' in p:
if 'runner' in p['details']:
if 'id' in p['details']['runner']:
d['runner_id'].append(p['details']['runner']['id'])
else: d['runner_id'].append(None)
else: d['runner_id'].append(None)
else: d['runner_id'].append(None)
if 'details' in p:
if 'runner' in p['details']:
if 'fullName' in p['details']['runner']:
d['runner_name'].append(p['details']['runner']['fullName'])
else: d['runner_name'].append(None)
else: d['runner_name'].append(None)
else: d['runner_name'].append(None)
if 'details' in p:
d['event'].append(p['details']['event'])
else: d['event'].append(None)
if 'movement' in p:
d['start'].append(p['movement']['start'])
else: d['start'].append(None)
if 'movement' in p:
d['end'].append(p['movement']['end'])
else: d['end'].append(None)
if 'details' in p:
d['movementReason'].append(p['details']['movementReason'])
else: d['movementReason'].append(None)
print(f'Game {gids} analyzed')
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(get_url, game_id)
table = pd.DataFrame(d)
export_csv = table.to_csv ('runner.csv', index = None, header=True)
Upvotes: 0
Views: 176
Reputation: 38952
Executor.map has func
invoked concurrently so there isn't guarantee that the ordering of the results are according to the iterable.
I am assuming that the data is structured this way so that it can be later presented as tabular data using pandas library.
I suggest that you use a different data structure that doesn't care about the ordering such as a list. pandas.DataFrame
data parameter can be dict of lists or list of dicts
d = []
game_id = [100, 101, 102] #This is an example, I use a large number of games
def get_url(gid):
url = f"http://examplelink.com/{gid}/"
req = r.get(url)
json = req.json()
for i in json["allPlays"]:
for p in i.get("runners", []):
matchup = i.get("matchup", {})
batter = matchup.get("batter", {})
pitcher = matchup.get("pitcher", {})
details = p.get("details", {})
runner = details.get("runner", {})
event = details.get("event", {})
d.append(
dict(
game_id=gid,
atbat_num=i.get("atBatIndex"),
play_index=details.get("playIndex"),
batter_id=batter.get("id"),
batter_name=batter.get("fullName"),
pitcher=pitcher.get("id"),
pitcher_name=pitcher.get("fullName"),
runner_id=runner.get("id"),
runner_name=runner.get("fullName"),
event=details.get("event"),
start=p.get("movement", {}).get("start"),
end=p.get("movement", {}).get("end"),
movementReason=details.get("movementReason"),
)
)
print(f"Game {gid} analyzed")
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(get_url, game_id)
Upvotes: 1