Reputation: 169
I have a program that simulates an entire baseball season, but does a lot of calculations per game, so each game takes around 30 seconds to run. With 2430 games in a season, the program takes about 20 hours to run, per season. Obviously I'd like to speed this up, so the most immediate solution seems like multiprocessing. I could manually split it up into groups of ~600 and run four processes, but I'd like to figure out how the multiprocessing module works.
Here's what I've tried so far, but obviously it doesn't work.
def test_func():
algorithm_selection = 1
# Create sqlite database connection
conn = sqlite3.connect('C:/F5 Prediction Engine/sqlite3/Version 2/statcast_db.db')
c = conn.cursor()
season = input('Year to simulate: ')
c.execute('SELECT * FROM gamelogs_' + season)
season_games = c.fetchall()
game_num = 0
for game in season_games:
game_num = game_num + 1
#Get away lineup in terms of MLB IDs
away_lineup = ConvertLineup(game[105], game[108], game[111], game[114], game[117], game[120], game[123], game[126], game[129])
#Get home lineup in terms of MLB IDs
home_lineup = ConvertLineup(game[132], game[135], game[138], game[141], game[144], game[147], game[150], game[153], game[156])
#Get away starting pitcher and hand in terms of MLB ID
away_pitcher_results = GetPitcherIDandHand(game[101])
away_pitcher_id = away_pitcher_results[0][0]
away_pitcher_hand = away_pitcher_results[0][1]
#Get home starting pitcher and hand in terms of MLB ID
home_pitcher_results = GetPitcherIDandHand(game[103])
home_pitcher_id = home_pitcher_results[0][0]
home_pitcher_hand = home_pitcher_results[0][1]
#Get the date of the game
today_date = game[0]
if algorithm_selection == 1:
#Check if the current game has already been evaluated and entered into the database
c.execute('SELECT * FROM pemstein_results_' + season + ' WHERE date = "' + game[0] + '" AND away_team = "' + game[3] + '" AND home_team = "' + game[6] + \
'" AND away_team_score = "' + game[9] + '" AND home_team_score = "' + game[10] + '"')
check_results = c.fetchall()
if len(check_results) == 0:
exp_slgs = PemsteinSimulation(home_pitcher_id, away_pitcher_id, season, home_pitcher_hand, away_pitcher_hand, home_lineup, away_lineup, game[0])
if exp_slgs[2] == 0: #if both pitches had at least 300 PAs to use for simulation
c.execute([long string to insert results into database])
conn.commit()
print('Game ' + str(game_num) + ' finished.')
if exp_slgs[2] == 1: #if one of the pitches did not have enough PAs to qualify
c.execute([long string to insert results into database])
conn.commit()
print('Game ' + str(game_num) + ' finished.')
if len(check_results) > 0:
print('Game ' + str(game_num) + ' has already been evaluated.')
from multiprocessing import Process
import os
processes = []
for i in range(0, os.cpu_count()):
print('Registering process %d' % i)
processes.append(Process(target=test))
for process in processes:
process.start()
for process in processes:
process.join()
==================
#Child Process
def simulate_games(games_list, counter, lock):
while(1):
# Create sqlite database connection
conn = sqlite3.connect('C:/F5 Prediction Engine/sqlite3/Version 2/statcast_db.db')
c = conn.cursor()
#acquire the lock which grants access to the shared variable
with lock:
#check the termination condition
if counter >= len(games_list):
break
#get the game_num and game to simulate
game_num = counter.value
game_to_simulate = game_list[counter.value]
#update the counter for the next process
counter.value += 1
#Do simulation
game_num = 0
game_num = game_num + 1
#Get away lineup in terms of MLB IDs
away_lineup = ConvertLineup(game_to_simulate[105], game_to_simulate[108], game_to_simulate[111], game_to_simulate[114], game_to_simulate[117], game_to_simulate[120], game_to_simulate[123], game_to_simulate[126], game_to_simulate[129])
#Get home lineup in terms of MLB IDs
home_lineup = ConvertLineup(game_to_simulate[132], game_to_simulate[135], game_to_simulate[138], game_to_simulate[141], game_to_simulate[144], game_to_simulate[147], game_to_simulate[150], game_to_simulate[153], game_to_simulate[156])
#Get away starting pitcher and hand in terms of MLB ID
away_pitcher_results = GetPitcherIDandHand(game[101])
away_pitcher_id = away_pitcher_results[0][0]
away_pitcher_hand = away_pitcher_results[0][1]
#Get home starting pitcher and hand in terms of MLB ID
home_pitcher_results = GetPitcherIDandHand(game[103])
home_pitcher_id = home_pitcher_results[0][0]
home_pitcher_hand = home_pitcher_results[0][1]
#Get the date of the game
today_date = game_to_simulate[0]
if algorithm_selection == 1:
#Check if the current game has already been evaluated and entered into the database
c.execute('SELECT * FROM pemstein_results_' + season + ' WHERE date = "' + game_to_simulate[0] + '" AND away_team = "' + game_to_simulate[3] + '" AND home_team = "' + game_to_simulate[6] + \
'" AND away_team_score = "' + game_to_simulate[9] + '" AND home_team_score = "' + game_to_simulate[10] + '"')
check_results = c.fetchall()
if len(check_results) == 0:
exp_slgs = PemsteinSimulation(home_pitcher_id, away_pitcher_id, season, home_pitcher_hand, away_pitcher_hand, home_lineup, away_lineup, game_to_simulate[0])
if exp_slgs[2] == 0: #if both pitches had at least 300 PAs to use for simulation
c.execute('long sql')
conn.commit()
print('Game ' + str(game_num) + ' finished.')
if exp_slgs[2] == 1: #if one of the pitches did not have enough PAs to qualify
c.execute('long sql')
conn.commit()
print('Game ' + str(game_num) + ' finished.')
if len(check_results) > 0:
print('Game ' + str(game_num) + ' has already been evaluated.')
if __name__ == "__main__":
# Create sqlite database connection
conn = sqlite3.connect('C:/F5 Prediction Engine/sqlite3/Version 2/statcast_db.db')
c = conn.cursor()
#Query all games for season to be simulated
season = int(input('Year to simulate: '))
c.execute('SELECT * FROM gamelogs_' + str(season))
season_games = c.fetchall()
algorithmSelection = 1
if algorithmSelection == 1:
PemsteinSQLresults(str(season))
counter = mp.Value('i', 0)
lock = mp.Lock()
children = []
for i in range(os.cpu_count()):
children.append(mp.Process(target=simulate_games, args=(season_games, counter, lock)))
for child in children:
child.start()
for child in children:
child.join()
Error:
Traceback (most recent call last):
File "C:\F5 Prediction Engine\Version 2\SimulateSeason v2.py", line 126, in <module>
child.start()
File "C:\Python\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "C:\Python\lib\multiprocessing\context.py", line 223, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "C:\Python\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "C:\Python\lib\multiprocessing\popen_spawn_win32.py", line 65, in __init__
reduction.dump(process_obj, to_child)
File "C:\Python\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
BrokenPipeError: [Errno 32] Broken pipe
So I went to this website to review some things, and tried a new script with the following code that I copied from the site:
import mp
def worker(num):
"""thread worker function"""
print('Worker:' + num)
return
if __name__ == '__main__':
jobs = []
for i in range(5):
p = mp.Process(target=worker, args=(i,))
jobs.append(p)
p.start()
But it likewise doesn't do anything. The site says it should print Worker:0
Worker:1
etc, but I'm getting no prints. Is it possible there's something wrong locally on my machine?
Upvotes: 0
Views: 307
Reputation: 565
It seems to me that you have simply tried to instantiate a new process for each cpu and had them run the same function that you wrote at first, however if you want to work with processes you would have to adapt it and handle process synchonization.
As an example you could have a master process which prompts the user for the season year, fetches all the games for that year and then the child processes would read from the resulting array. See the following example:
# Parent Process
import multiprocessing as mp
# establish db connection [ ... ]
season = int(input("Year to simulate: "))
c.execute('SELECT * FROM gamelogs_' + season)
season_games = c.fetchall()
counter = mp.Value("i", 0)
lock = mp.Lock()
children = []
for i in range(os.cpu_count()):
children.append(mp.Process(target=simulate_games, args=(season_games, counter, lock,)))
for child in children:
child.start()
for child in children:
child.join()
# Child Process
def simulate_games(games_list, counter, lock):
while(1):
# acquire the lock which grants the access to the shared variable
with lock:
# check the termination condition
if counter.value >= len(games_list):
break
# get the game_num and the game to simulate
game_num = counter.value
game_to_simulate = games_list[counter.value]
# update counter for the next process
counter.value += 1
# Do simulation here
What we have above is a parent process which is basically preparing some data and creating new child processes.
The counter is implemented by means of a special class, i.e Value, which is used for sharing scalar values among processes; Lock is basically a mutex, which we use to synchronize the access to the counter variable and avoid concurrent access: note that you could have used the Lock which is automatically created inside of the counter shared variable, but I thought it would be easier to understand by separating the two.
The children processes by first acquiring the lock, read the counter value and increment it, then proceed to their normal behavior, thus simulating the games
Upvotes: 1