Controller816

Reputation: 11

Using Pytorch Sequential for reinforcement model without gym

I'm trying to get an AI to play a game, but I want to do it in real time. Because of that, I'm not using gym to create an environment.

I want to take a screenshot, preprocess it, then pass it through the model before updating the model based on whether or not the agent died in-game.

EDIT: I've made some progress, so here is the updated code

from PIL import ImageGrab, Image
import numpy as np
import torch
from torch import nn
from torchvision import transforms
import torch.optim as optim
from torch.distributions import Categorical
# import keyboard and pyautogui for playing the game

_bbox = (800, 200, 1200, 900) # (left, top, right, bottom)
dead = False


def get():
    global dead
    screenshot = ImageGrab.grab(bbox=_bbox)

    # Check if dead: compare the mean of each colour channel against thresholds
    # (the death screen is mostly red, so high R and low G/B)
    r_mean, g_mean, b_mean = np.array(screenshot).mean(axis=(0, 1))
    if r_mean >= 190 and g_mean < 50 and b_mean < 15:
        dead = True # edit

    return screenshot.copy()


def compute_discounted_rewards(rewards, gamma=0.99): # Edit
    """This function raises an error when called, so I don't actually use it below, but I still wanted to show it here."""
    discounted_rewards = []
    R = 0
    for r in reversed(rewards):
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    discounted_rewards = torch.tensor(discounted_rewards)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-5)
    return discounted_rewards
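
# My guess (not verified) about the error above: the rewards are plain Python
# ints, so torch.tensor(discounted_rewards) is an integer tensor, and .std()
# / the normalisation need floats. Something like
#   discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
# might fix it. Also, .std() of a single-element tensor is nan, which would
# break the normalisation when an episode only has one step.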

# define preprocessor
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(256),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
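
# Note: preprocess() returns a (3, 256, 256) tensor for a single PIL image,
# while the Conv2d layers below expect a batch dimension, i.e. (N, 3, 256, 256).
# That is why unsqueeze(0) is used in the loop further down.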

# define model (am I doing this right?)
model = nn.Sequential(
          nn.Conv2d(3,256,5),
          nn.ReLU(),
          nn.MaxPool2d(2, 2),
          nn.Conv2d(256,512,5),
          nn.MaxPool2d(2, 2),
          nn.ReLU(),
          nn.Flatten(),
          nn.Linear(512*61*61, 512), # flattened conv output (512 channels x 61 x 61) for a 256x256 input
          nn.LeakyReLU(),
          nn.Linear(512, 512),
          nn.LeakyReLU(),
          nn.Linear(512, 3),
          nn.Softmax(1)
        )
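
# Rough shape trace for a 256x256 input (my own calculation, please correct me
# if I'm wrong): Conv2d(3,256,5) -> 256x252x252, MaxPool2d(2,2) -> 256x126x126,
# Conv2d(256,512,5) -> 512x122x122, MaxPool2d(2,2) -> 512x61x61,
# so Flatten() gives 512*61*61 features going into the first Linear layer.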

# Everything below here was added in the edit
optimizer = optim.Adam(model.parameters(), lr=1E-2)
ep_reward = list()

for j in range(1): # number of episodes
    dead = False
    i = 0
    log_probs = list()
    rewards = list()
    while not dead: # episode ends when the agent dies
        inputs = preprocess(get()).unsqueeze(0) # add a batch dimension for the model
        outputs = model(inputs)
        # pred = torch.argmax(outputs.data[0]).item() # 0 = no moves, 1 = left, 2 = right
        m = Categorical(outputs)
        action = m.sample()

        log_probs.append(m.log_prob(action))
        rewards.append(1 if not dead else 0) # reward of 1 for surviving the step, 0 on death
        i += 1
        break # Temp: only run one step per episode while testing
    dead = False
    
    ep_reward.append(sum(rewards))
    # discounted_rewards = compute_discounted_rewards(rewards) # errors, see above
    discounted_rewards = rewards # use the raw rewards instead for now
    policy_loss = list() # model_loss
    for log_prob, Gt in zip(log_probs, discounted_rewards):
        policy_loss.append(-log_prob*Gt)
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

Most of this is code I've copied from an assortment of sources, though I try to understand what it all does. I don't know whether this will work, so please let me know. How does the optimizer affect the model? They aren't linked in any way I can tell. Is it something PyTorch does behind the scenes?
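My current guess (please correct me if wrong) is that passing model.parameters() to Adam is what links them: the optimizer stores references to the model's own parameter tensors and step() updates them in place, roughly like this:

params = list(model.parameters())       # the same tensor objects the model uses
optimizer = optim.Adam(params, lr=1E-2) # the optimizer only stores references to them
# policy_loss.backward() fills each param.grad,
# and optimizer.step() then updates the parameter tensors in place,
# so the model automatically "sees" the new weights.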

Upvotes: 0

Views: 44

Answers (0)
