oakca

Reputation: 1568

Applying reinforcement learning to an environment with a combination of continuous and discrete action spaces

I have a custom gym environment whose action space has 3 continuous components and 1 discrete component. I would like to apply a reinforcement learning algorithm, but I am not sure which one to use.

Below is the environment code. The action space sets some parameters within the gas network (applied via self.func), and the observation space consists of the pressure results of the nodes and the velocity results of the elements:

import numpy as np
import gymnasium as gym
import simtools as st


class GasNetworkEnv(gym.Env):
    def __init__(self, map_, qcorr_bounds, pset_bounds, cs_ctrl_bounds,
                 obs_size, func, func_args):
        super(GasNetworkEnv, self).__init__()
        # action space: three continuous components (qcorr, pset, cs_ctrl)
        # and one discrete component (cs_state)
        self.action_space = gym.spaces.Dict({
            'qcorr': gym.spaces.Box(
                low=np.array([qcorr_bounds[0]]),
                high=np.array([qcorr_bounds[1]]),
                dtype=np.float64),
            'pset': gym.spaces.Box(
                low=np.array([pset_bounds[0]]),
                high=np.array([pset_bounds[1]]),
                dtype=np.float64),
            'cs_ctrl': gym.spaces.Box(
                low=np.repeat(cs_ctrl_bounds[0], len(map_)),
                high=np.repeat(cs_ctrl_bounds[1], len(map_)),
                dtype=np.float64),
            'cs_state': gym.spaces.MultiBinary(
                sum(len(map_[k].no) for k in map_))})
        # observation space: node pressures followed by element velocities
        self.observation_space = gym.spaces.Box(
            low=-1e5, high=1e5, shape=(obs_size,), dtype=np.float64)
        self.func = func
        self.func_args = func_args

    def step(self, action):
        # call the objective function; it returns a score we want to
        # minimize, so the reward is its negative
        node_results, element_results, score = self.func(action, self.func_args)
        reward = -score

        # observation: node pressures followed by element velocities
        observation = np.concatenate((node_results, element_results))

        # termination conditions (currently: no termination)
        terminated = False
        truncated = False

        return observation, reward, terminated, truncated, {}

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)

        # random initial observation, drawn from the seeded RNG
        initial_observation = self.np_random.uniform(
            low=self.observation_space.low,
            high=self.observation_space.high,
            size=self.observation_space.shape)

        return initial_observation, {}
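
For completeness, here is roughly how I exercise the environment in isolation. The dummy objective function, the bounds and the fake map_ entries below are placeholders purely for illustration; the real objective function and map come from simtools:

import numpy as np
from types import SimpleNamespace


def dummy_func(action, func_args):
    # stand-in for the real solver: fake node pressures, element
    # velocities and a score to be minimized
    node_results = np.zeros(4)
    element_results = np.zeros(6)
    score = float(np.sum(np.abs(action['cs_ctrl'])))
    return node_results, element_results, score


map_ = {'cs1': SimpleNamespace(no=[0, 1]), 'cs2': SimpleNamespace(no=[2])}

env = GasNetworkEnv(map_, qcorr_bounds=(0.0, 1.0), pset_bounds=(30.0, 70.0),
                    cs_ctrl_bounds=(0.0, 1.0), obs_size=10,
                    func=dummy_func, func_args=None)

obs, info = env.reset(seed=0)
action = env.action_space.sample()  # dict: three Box arrays + one MultiBinary vector
obs, reward, terminated, truncated, info = env.step(action)

Sampling the action space returns a dict whose 'qcorr', 'pset' and 'cs_ctrl' entries are float arrays and whose 'cs_state' entry is a binary vector, which is exactly the mixed continuous/discrete structure the algorithm would need to handle.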

Upvotes: 0

Views: 35

Answers (0)
