Lukas Fehring

Reputation: 26

Llama2 Language Model for Regression (huggingface)

I am trying to adapt Llama2 to solve a regression task by utilizing the last hidden state of the model given the entire input sequence.

For example, if the model is asked "What is the answer to 2+2?", it should answer 4 (a dummy problem to illustrate the issue).

To that end, I use it in a PyTorch model as follows:

import torch
import torch.nn as nn
from transformers import LlamaModel, LlamaTokenizer

class TransformerModel(nn.Module):
    def __init__(self, model_name:str, additional_layer_size:int = 1):
        super(TransformerModel, self).__init__()
        self.transformer = LlamaModel.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir="hugginface_cache/models")
        self.tokenizer = LlamaTokenizer.from_pretrained(model_name, cache_dir="hugginface_cache/tokenizer")

        # Add an additional layer with one output
        self.additional_layer = nn.Linear(self.transformer.config.hidden_size, additional_layer_size)
        
    def forward(self, input_text):
        # Tokenize input text
        input_ids = self.tokenizer(input_text, return_tensors="pt").input_ids.to(self.transformer.device)
        print("input_ids:", input_ids)

        # Get the outputs from the transformer
        outputs = self.transformer(input_ids)
        
        # Use the entire last hidden state as input to the additional layer
        last_hidden_state = outputs.last_hidden_state
        print('last_hidden_state_shape:', last_hidden_state.size())

        # Apply the additional layer
        additional_output = self.additional_layer(last_hidden_state)

        return additional_output


model_url = "meta-llama/Llama-2-7b-hf"

model = TransformerModel(model_url)

However, for the given input ("Hello world!"), the output is a tensor of size (1, 4, 1).

I can verify that the tokenizer splits the string into 4 tokens, which I expect is the cause of the problem. However, I am not certain how to fix it.
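
For illustration, the shape can be reproduced without the transformer: nn.Linear acts only on the last (hidden) dimension, so it produces one value per token rather than one per sequence. A minimal sketch, assuming Llama-2-7b's hidden size of 4096:

import torch
import torch.nn as nn

last_hidden_state = torch.randn(1, 4, 4096)  # one sequence of 4 tokens
head = nn.Linear(4096, 1)

print(head(last_hidden_state).shape)              # torch.Size([1, 4, 1])
print(head(last_hidden_state.mean(dim=1)).shape)  # torch.Size([1, 1]) after pooling over tokens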

Upvotes: 0

Views: 743

Answers (1)

Lukas Fehring

Reputation: 26

Below is the code that appears to be working for me. However, one should still handle the padding generated by batching (see the masked-pooling sketch after the code).

import torch
import torch.nn as nn
from transformers import LlamaModel


class TokenAverageHead(nn.Module):
    def __init__(self, additional_layer_size, hidden_size, id_for_eos_token=None):
        super(TokenAverageHead, self).__init__()
        self.average_pool_layer = nn.AdaptiveAvgPool1d(1)
        self.additional_layer = nn.Linear(hidden_size, additional_layer_size, dtype=torch.bfloat16)
        self.id_for_eos_token = id_for_eos_token

    def forward(self, last_hidden_state):
        # Average the last hidden state over the tokens:
        # (batch, seq_len, hidden) -> (batch, hidden, seq_len) -> (batch, hidden, 1) -> (batch, hidden)
        last_hidden_state = last_hidden_state.transpose(1, 2)
        average_last_hidden_state = self.average_pool_layer(last_hidden_state)
        average_last_hidden_state = average_last_hidden_state.squeeze(2)

        # Apply the additional layer
        additional_output = self.additional_layer(average_last_hidden_state)

        return additional_output


class AverageTokenTransformer(nn.Module):
    def __init__(self, model_name: str, additional_layer_size: int = 1, id_for_eos_token=None):
        super(AverageTokenTransformer, self).__init__()
        self.id_for_eos_token = id_for_eos_token
        self.transformer = LlamaModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir="hugginface_cache/models")

        # Freeze the backbone so that only the head is trained
        for param in self.transformer.parameters():
            param.requires_grad = False

        # Add an additional head with one output
        self.additional_head = TokenAverageHead(additional_layer_size, self.transformer.config.hidden_size, id_for_eos_token)

    def forward(self, input_ids):
        outputs = self.transformer(input_ids)

        # Apply the additional head to the last hidden state
        output = self.additional_head(outputs.last_hidden_state)

        return output


class LastTokenHead(nn.Module):
    def __init__(self, additional_layer_size: int, hidden_size: int, id_for_eos_token=None):
        super(LastTokenHead, self).__init__()
        self.id_for_eos_token = id_for_eos_token
        self.linear_layer = nn.Linear(hidden_size, additional_layer_size, dtype=torch.bfloat16)

    def forward(self, last_hidden_state):
        # Use only the hidden state of the last token as the sequence representation
        last_token_hidden_state = last_hidden_state[:, -1, :]
        output = self.linear_layer(last_token_hidden_state)

        return output


class LastTokenTransformer(nn.Module):
    def __init__(self, model_name: str, additional_layer_size: int = 1, id_for_eos_token=None):
        super(LastTokenTransformer, self).__init__()
        self.id_for_eos_token = id_for_eos_token
        self.transformer = LlamaModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir="hugginface_cache/models")

        # Freeze the backbone so that only the head is trained
        for param in self.transformer.parameters():
            param.requires_grad = False

        # Add an additional head with one output
        self.additional_head = LastTokenHead(additional_layer_size, self.transformer.config.hidden_size, id_for_eos_token)

    def forward(self, input_ids):
        outputs = self.transformer(input_ids)

        # Apply the additional head to the last hidden state
        additional_output = self.additional_head(outputs.last_hidden_state)

        return additional_output
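
As noted above, the average pooling ignores padding. A minimal sketch of a padding-aware variant (my own addition, not part of the code above) that weights the average by the attention mask returned by the tokenizer:

class MaskedAverageHead(nn.Module):
    def __init__(self, additional_layer_size, hidden_size):
        super(MaskedAverageHead, self).__init__()
        self.additional_layer = nn.Linear(hidden_size, additional_layer_size, dtype=torch.bfloat16)

    def forward(self, last_hidden_state, attention_mask):
        # Zero out padded positions, then average over the real tokens only
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1)
        averaged = summed / counts

        return self.additional_layer(averaged)

The wrapping transformer would then pass both tensors through, e.g. outputs = self.transformer(input_ids, attention_mask=attention_mask) followed by self.additional_head(outputs.last_hidden_state, attention_mask).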

Upvotes: 0
