Cmagelssen
Cmagelssen

Reputation: 660

Create a random group allocation in python

I have a Python script that ranks skiers on performance (column: "GJENNOMSNITT") and then creates two groups matched on GJENNOMSNITT: group 1 and group 2. I use the following code for this:

    def allokereGrupper(df1):
        """Split the skiers into two groups by alternating down the ranking.

        Rows are ranked by ascending 'GJENNOMSNITT'; rows at even positions
        (0, 2, 4, ...) form group 1 and rows at odd positions form group 2.
        Both groups are printed and returned as a ``(group1, group2)`` tuple.
        """
        rangert = df1.sort_values(by='GJENNOMSNITT', ascending=True)
        # Parity of each row's position in the ranking: 0, 1, 0, 1, ...
        paritet = np.arange(len(rangert)) % 2
        group1, group2 = (rangert.loc[paritet == gruppe] for gruppe in (0, 1))
        print("gruppe 1:")
        print(group1)
        print("gruppe 2:")
        print(group2)
        return group1, group2

The problem with this script is that the better skier of each consecutive pair will always be in group 1 because of mask == 0. Instead, I want this to be random. I have been coding in JavaScript for 4 months and I am not able to come up with a good solution to this problem in Python. Can someone help me?

Here is all my code and you should have access to the csv file that I am reading

resultater.py

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

class Resultat:
    """Namespace for the steps that turn the raw timing CSV into two groups.

    The class holds no state: every step is a static method that takes its
    input explicitly and returns the result, called as ``Resultat.<step>(...)``.
    """

    @staticmethod
    def lastInnOgRydd(path, LagreCsv=False):
        """Read the timing CSV and drop the rows that cannot be used.

        The first two lines of the file are skipped. Every 'DNF' and
        'GARBAGE GARBAGE' cell is turned into NaN, after which rows lacking a
        'FINISH' or 'NAME' value are dropped.

        Args:
            path: Anything ``pandas.read_csv`` accepts (path, URL, buffer).
            LagreCsv: When true, the rows whose 'FINISH' is 'DNF' are first
                written to 'DNF.csv', with 'DNF' replaced by 1.

        Returns:
            The cleaned DataFrame.
        """
        df = pd.read_csv(path, skiprows=2, decimal=".")
        if LagreCsv:
            dnf = df[df['FINISH'] == 'DNF'].replace('DNF', 1)
            dnf.to_csv('DNF.csv')
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        # TODO: there is probably a better way to get rid of the garbage rows.
        df = df.replace(['DNF', 'GARBAGE GARBAGE'], np.nan)
        df = df.dropna(subset=['FINISH', 'NAME'])
        return df

    @staticmethod
    def endreDataType(df):
        """Convert the time columns to float and 'COMMENT' codes to labels.

        'FINISH', 'INTER 1' and 'SECTION IM4-FINISH' are expected to hold
        strings with a decimal comma; the comma is swapped for a point and the
        column is cast to float. 'COMMENT' is cast to int, then to str, and the
        codes 11/22/33/55/99 are replaced by their course labels.

        The frame is modified in place and also returned.
        """
        for kolonne in ('FINISH', 'INTER 1', 'SECTION IM4-FINISH'):
            df[kolonne] = (
                df[kolonne].str.replace(',', '.', regex=False).astype(float)
            )

        # Going through int first strips a trailing ".0" from float-typed codes.
        koder = df['COMMENT'].astype(int).astype(str)
        etiketter = (
            ('11', 'COURSE 1'),
            ('22', 'COURSE 2'),
            ('33', 'COURSE 3'),
            ('55', 'UTKJORING'),
            ('99', 'STRAIGHT-GLIDING'),
        )
        for kode, etikett in etiketter:
            koder = koder.str.replace(kode, etikett, regex=False)
        df['COMMENT'] = koder
        # The former pd.to_numeric(...) calls were dropped: their results were
        # discarded and the three columns are already float at this point.
        return df

    @staticmethod
    def navnendringCommentTilCourse(df):
        """Rename the 'COMMENT' column to 'COURSE' in place and return the frame."""
        df.rename(columns={'COMMENT': 'COURSE'}, inplace=True)
        return df

    @staticmethod
    def finnBesteRunder(df):
        """Average each skier's two fastest runs per course.

        For every ('BIB#', 'COURSE') pair the mean of the two smallest 'FINISH'
        values is taken. The result is reshaped to one row per 'BIB#' with one
        column per course, and 'GJENNOMSNITT' is added as the mean of the
        'COURSE 1', 'COURSE 2' and 'COURSE 3' columns (NaN if any is missing).

        The intermediate long-format table is printed.
        """
        bestruns = (
            df.groupby(['BIB#', 'COURSE'])['FINISH']
            .apply(lambda x: x.nsmallest(2).mean())
            .reset_index()
        )
        print(bestruns)
        # Keyword arguments: pandas 2.0 no longer accepts these positionally.
        df1 = bestruns.pivot(
            index='BIB#', columns='COURSE', values='FINISH'
        ).reset_index()
        df1['GJENNOMSNITT'] = (
            df1['COURSE 1'].add(df1['COURSE 2']).add(df1['COURSE 3']).div(3)
        )
        # TODO: a 'PRESTASJON' column (mean divided by 'STRAIGHT-GLIDING') is
        # left out for now but has to be part of the proper analysis.
        return df1

    @staticmethod
    def allokereGrupper(df1):
        """Split the skiers into two groups by alternating down the ranking.

        Rows are ranked by ascending 'GJENNOMSNITT'; even positions go to
        group 1 and odd positions to group 2. The split is deterministic, so
        the top-ranked row always ends up in group 1.

        Both groups are printed and returned as ``(group1, group2)``.
        """
        df1 = df1.sort_values(by='GJENNOMSNITT', ascending=True)
        mask = np.arange(len(df1)) % 2
        group1 = df1.loc[mask == 0]
        print("gruppe 1:")
        print(group1)
        group2 = df1.loc[mask == 1]
        print("gruppe 2:")
        print(group2)
        return group1, group2

main.py

from moduler import Resultat

# Location of the raw timing CSV.
path = "http://www.cmagelssen.no/pilot2.csv"

# Load and clean the file, then feed the result through the remaining steps
# in order; after the last step `df` holds what allokereGrupper returned.
df = Resultat.lastInnOgRydd(path)
for steg in (
    Resultat.endreDataType,
    Resultat.navnendringCommentTilCourse,
    Resultat.finnBesteRunder,
    Resultat.allokereGrupper,
):
    df = steg(df)

Upvotes: 2

Views: 565

Answers (1)

Arne
Arne

Reputation: 10545

So you want the two groups to be matched in the sense that for every consecutive pair of skiers in the ranking list (df1) it is to be decided randomly (with equal probabilities) whether the higher ranked skier is allocated to group 1 and the lower ranked one to group 2 or vice versa.

A straightforward if not the most efficient way to achieve this is to use Python's standard random module to shuffle each pair of mask values after assigning the repeating 0-1 sequence to mask:

import numpy as np
import random


def allokereGrupper(df, seed=None):
    """Randomly split ranked skiers into two matched groups.

    The rows are ranked by ascending 'GJENNOMSNITT'. For each consecutive
    pair in the ranking, a coin flip decides which of the two goes to group 1
    and which to group 2. With an odd number of rows, the unpaired last row is
    also assigned by a coin flip instead of always landing in group 1.

    Args:
        df: DataFrame with a 'GJENNOMSNITT' column.
        seed: Optional seed for a private ``random.Random`` instance, making
            the allocation reproducible. With the default ``None`` the
            module-level ``random`` functions are used, as before.

    Returns:
        Tuple ``(group1, group2)`` of DataFrames; both are also printed.
    """
    df = df.sort_values(by='GJENNOMSNITT', ascending=True)
    mask = np.arange(len(df)) % 2

    # A private generator gives reproducible runs without touching the
    # global random state; without a seed, keep using the global one.
    rng = random.Random(seed) if seed is not None else random

    # For each complete pair of mask values, randomly decide whether to swap
    # them. mask[i:i+2] is a view, so the shuffle changes mask itself.
    for i in range(0, len(df) - 1, 2):
        rng.shuffle(mask[i:i+2])

    # An unpaired last row has no partner to swap with; draw its group directly.
    if len(df) % 2:
        mask[-1] = rng.randrange(2)

    group1 = df.loc[mask == 0]
    print("gruppe 1:")
    print(group1)
    group2 = df.loc[mask == 1]
    print("gruppe 2:")
    print(group2)
    return group1, group2

Note that I changed the name of the argument to df, to make it more explicit that this is a general function. To make it even more general, you could pass the name of the column to sort by as an argument too.

Upvotes: 1

Related Questions