Reputation: 26
I am trying to adapt Llama 2 to solve a regression task by utilizing the last hidden state of the model given the entire input sequence.
For example, if the question "What is the answer to 2+2?" is asked, it should answer 4 (a dummy problem, just to explain the issue).
To that end, I use it in a PyTorch model like so:
import torch
import torch.nn as nn
from transformers import LlamaModel, LlamaTokenizer

class TransformerModel(nn.Module):
    def __init__(self, model_name: str, additional_layer_size: int = 1):
        super(TransformerModel, self).__init__()
        self.transformer = LlamaModel.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir="hugginface_cache/models")
        self.tokenizer = LlamaTokenizer.from_pretrained(model_name, cache_dir="hugginface_cache/tokenizer")
        # Add an additional layer with one output
        self.additional_layer = nn.Linear(self.transformer.config.hidden_size, additional_layer_size)

    def forward(self, input_text):
        # Tokenize input text
        input_ids = self.tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
        print("input_ids:", input_ids)
        # Get the outputs from the transformer
        outputs = self.transformer(input_ids)
        # Use the entire last hidden state as input to the additional layer
        last_hidden_state = outputs.last_hidden_state
        print("last_hidden_state shape:", last_hidden_state.size())
        # Apply the additional layer
        additional_output = self.additional_layer(last_hidden_state)
        return additional_output

model_url = "meta-llama/Llama-2-7b-hf"
model = TransformerModel(model_url)
However, for the input "Hello world!" the output is a tensor of size (1, 4, 1).
I can verify that the tokenizer splits the string into 4 tokens, which I expect is what causes this. However, I am not certain how to fix it.
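For example, roughly the following check prints the 4 tokens for the input above (the exact tokens may differ with the tokenizer version):

from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", cache_dir="hugginface_cache/tokenizer")
ids = tokenizer("Hello world!", return_tensors="pt").input_ids
print(ids.shape)                                # torch.Size([1, 4])
print(tokenizer.convert_ids_to_tokens(ids[0]))  # BOS token + 3 text tokens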
Upvotes: 0
Views: 743
Reputation: 26
Below is the code that appears to be working for me. Note that one still has to handle the padding introduced by batching (see the sketch after the code).
import torch
import torch.nn as nn
from transformers import LlamaModel

class TokenAverageHead(nn.Module):
    def __init__(self, additional_layer_size, hidden_size, id_for_eos_token):
        super(TokenAverageHead, self).__init__()
        self.average_pool_layer = nn.AdaptiveAvgPool1d(1)
        self.additional_layer = nn.Linear(hidden_size, additional_layer_size, dtype=torch.bfloat16)
        # Kept so padding/EOS positions can later be excluded when batching
        self.id_for_eos_token = id_for_eos_token

    def forward(self, last_hidden_state):
        # Average the last hidden state over the tokens:
        # (batch, seq_len, hidden) -> (batch, hidden, seq_len) -> (batch, hidden, 1) -> (batch, hidden)
        last_hidden_state = last_hidden_state.transpose(1, 2)
        average_last_hidden_state = self.average_pool_layer(last_hidden_state)
        average_last_hidden_state = average_last_hidden_state.squeeze(2)
        # Apply the additional layer
        additional_output = self.additional_layer(average_last_hidden_state)
        return additional_output


class AverageTokenTransformer(nn.Module):
    def __init__(self, model_name: str, id_for_eos_token: int, additional_layer_size: int = 1):
        super(AverageTokenTransformer, self).__init__()
        self.id_for_eos_token = id_for_eos_token
        self.transformer = LlamaModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir="hugginface_cache/models")
        # Freeze the base model; only the head is trained
        for param in self.transformer.parameters():
            param.requires_grad = False
        # Add an additional head with one output
        self.additional_head = TokenAverageHead(additional_layer_size, self.transformer.config.hidden_size, id_for_eos_token)

    def forward(self, input_ids):
        outputs = self.transformer(input_ids)
        # Apply the additional head to the last hidden state
        output = self.additional_head(outputs.last_hidden_state)
        return output


class LastTokenHead(nn.Module):
    def __init__(self, additional_layer_size: int, hidden_size: int, id_for_eos_token: int):
        super(LastTokenHead, self).__init__()
        self.id_for_eos_token = id_for_eos_token
        self.linear_layer = nn.Linear(hidden_size, additional_layer_size, dtype=torch.bfloat16)

    def forward(self, last_hidden_state):
        # Use only the hidden state of the last token as the sequence representation
        last_token_hidden_state = last_hidden_state[:, -1, :]
        output = self.linear_layer(last_token_hidden_state)
        return output


class LastTokenTransformer(nn.Module):
    def __init__(self, model_name: str, id_for_eos_token: int, additional_layer_size: int = 1):
        super(LastTokenTransformer, self).__init__()
        self.transformer = LlamaModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir="hugginface_cache/models")
        # Freeze the base model; only the head is trained
        for param in self.transformer.parameters():
            param.requires_grad = False
        # Add an additional head with one output
        self.additional_head = LastTokenHead(additional_layer_size, self.transformer.config.hidden_size, id_for_eos_token)

    def forward(self, input_ids):
        outputs = self.transformer(input_ids)
        # Apply the additional head to the last hidden state
        additional_output = self.additional_head(outputs.last_hidden_state)
        return additional_output
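As a rough sketch of the padding handling mentioned above (this is an assumption, not part of the classes themselves: the Llama tokenizer has no pad token by default, so the EOS token is reused for padding and masked out before pooling, which replaces the plain average in TokenAverageHead):

from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", cache_dir="hugginface_cache/tokenizer")
tokenizer.pad_token = tokenizer.eos_token  # assumption: reuse EOS as pad token

batch = tokenizer(["What is the answer to 2+2?", "Hello world!"],
                  padding=True, return_tensors="pt")
input_ids = batch.input_ids            # (batch, seq_len)
attention_mask = batch.attention_mask  # 1 for real tokens, 0 for padding

model = AverageTokenTransformer("meta-llama/Llama-2-7b-hf", tokenizer.eos_token_id)
outputs = model.transformer(input_ids, attention_mask=attention_mask)
hidden = outputs.last_hidden_state     # (batch, seq_len, hidden)

# Padding-aware mean over tokens instead of the plain AdaptiveAvgPool1d
mask = attention_mask.unsqueeze(-1).to(hidden.dtype)           # (batch, seq_len, 1)
pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)          # (batch, hidden)
predictions = model.additional_head.additional_layer(pooled)   # (batch, 1)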
Upvotes: 0