Reputation: 11
I have a model implemented in PyTorch that applies a final fully connected layer before running the softmax function. The architecture is designed to solve a 4-class Speech Emotion Recognition task: given an audio track, it transforms it into its spectrogram and uses it to predict the emotion among happiness, sadness, neutrality and anger.
To improve the performance of the model I am following some of the tricks described in the paper https://arxiv.org/abs/2104.07288. Unlike the architecture presented in that paper, however, my model adapts the implementation of the Compact Convolutional Transformer found on GitHub at https://github.com/SHI-Labs/Compact-Transformers/blob/main/src/cct.py.
As in the paper, my model also suffers from a "class collapse" problem: even after balancing the dataset, it tends to predict the anger and sadness classes well and the other two badly.
To solve this problem, the paper applies a particular weight regularization technique to the fully connected layer, described in section 2.4.
Unfortunately, I cannot figure out how to modify my fully connected layer in PyTorch to implement this type of regularization.
Code of the model:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Module, ModuleList, Linear, LayerNorm, Dropout, Parameter, init
# TransformerEncoderLayer below is the custom encoder block from the
# Compact-Transformers repo (it accepts attention_dropout and drop_path_rate),
# not torch.nn.TransformerEncoderLayer.


class CCT(nn.Module):
    def __init__(self,
                 img_size=224,
                 embedding_dim=768,
                 n_input_channels=3,
                 n_conv_layers=1,
                 kernel_size=7,
                 stride=2,
                 padding=3,
                 pooling_kernel_size=3,
                 pooling_stride=2,
                 pooling_padding=1,
                 dropout=0.,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 num_layers=14,
                 num_heads=6,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 positional_embedding='learnable',
                 *args, **kwargs):
        super(CCT, self).__init__()

        self.tokenizer = Tokenizer(n_input_channels=n_input_channels,
                                   n_output_channels=embedding_dim,
                                   kernel_size=kernel_size,
                                   stride=stride,
                                   padding=padding,
                                   pooling_kernel_size=pooling_kernel_size,
                                   pooling_stride=pooling_stride,
                                   pooling_padding=pooling_padding,
                                   max_pool=True,
                                   activation=nn.ReLU,
                                   n_conv_layers=n_conv_layers,
                                   conv_bias=False)

        self.classifier = TransformerClassifier(
            sequence_length=self.tokenizer.sequence_length(n_channels=n_input_channels,
                                                           height=img_size,
                                                           width=img_size),
            embedding_dim=embedding_dim,
            seq_pool=True,
            dropout=dropout,
            attention_dropout=attention_dropout,
            stochastic_depth=stochastic_depth,
            num_layers=num_layers,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            num_classes=num_classes,
            positional_embedding=positional_embedding
        )

    def forward(self, x):
        x = self.tokenizer(x)
        return self.classifier(x)
class Tokenizer(nn.Module):
    def __init__(self,
                 kernel_size, stride, padding,
                 pooling_kernel_size=3, pooling_stride=2, pooling_padding=1,
                 n_conv_layers=1,
                 n_input_channels=3,
                 n_output_channels=64,
                 in_planes=64,
                 activation=None,
                 max_pool=True,
                 conv_bias=False):
        super(Tokenizer, self).__init__()

        n_filter_list = [n_input_channels] + \
                        [in_planes for _ in range(n_conv_layers - 1)] + \
                        [n_output_channels]

        self.conv_layers = nn.Sequential(
            *[nn.Sequential(
                nn.Conv2d(n_filter_list[i], n_filter_list[i + 1],
                          kernel_size=(kernel_size, kernel_size),
                          stride=(stride, stride),
                          padding=(padding, padding), bias=conv_bias),
                nn.Identity() if activation is None else activation(),
                nn.MaxPool2d(kernel_size=pooling_kernel_size,
                             stride=pooling_stride,
                             padding=pooling_padding) if max_pool else nn.Identity()
            )
                for i in range(n_conv_layers)
            ])

        self.flattener = nn.Flatten(2, 3)
        self.apply(self.init_weight)

    def sequence_length(self, n_channels=3, height=224, width=224):
        return self.forward(torch.zeros((1, n_channels, height, width))).shape[1]

    def forward(self, x):
        return self.flattener(self.conv_layers(x)).transpose(-2, -1)

    @staticmethod
    def init_weight(m):
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight)
class TransformerClassifier(Module):
    def __init__(self,
                 seq_pool=True,
                 embedding_dim=768,
                 num_layers=12,
                 num_heads=12,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 dropout=0.1,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 positional_embedding='learnable',
                 sequence_length=None):
        super().__init__()
        positional_embedding = positional_embedding if \
            positional_embedding in ['sine', 'learnable', 'none'] else 'sine'
        dim_feedforward = int(embedding_dim * mlp_ratio)
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length
        self.seq_pool = seq_pool
        self.num_tokens = 0

        assert sequence_length is not None or positional_embedding == 'none', \
            f"Positional embedding is set to {positional_embedding} and" \
            f" the sequence length was not specified."

        if not seq_pool:
            sequence_length += 1
            self.class_emb = Parameter(torch.zeros(1, 1, self.embedding_dim),
                                       requires_grad=True)
            self.num_tokens = 1
        else:
            self.attention_pool = Linear(self.embedding_dim, 1)

        if positional_embedding != 'none':
            if positional_embedding == 'learnable':
                self.positional_emb = Parameter(torch.zeros(1, sequence_length, embedding_dim),
                                                requires_grad=True)
                init.normal_(self.positional_emb, std=0.2)
            else:
                self.positional_emb = Parameter(self.sinusoidal_embedding(sequence_length, embedding_dim),
                                                requires_grad=False)
        else:
            self.positional_emb = None

        self.dropout = Dropout(p=dropout)
        dpr = [x.item() for x in torch.linspace(0, stochastic_depth, num_layers)]
        self.blocks = ModuleList([
            TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads,
                                    dim_feedforward=dim_feedforward, dropout=dropout,
                                    attention_dropout=attention_dropout, drop_path_rate=dpr[i])
            for i in range(num_layers)])
        self.norm = LayerNorm(embedding_dim)

        # This is the final fully connected layer I would like to regularize.
        self.fc = Linear(embedding_dim, num_classes)
        # init_weight and sinusoidal_embedding are static helpers defined in the
        # Compact-Transformers repo; they are omitted from this snippet.
        self.apply(self.init_weight)
    def forward(self, x):
        if self.positional_emb is None and x.size(1) < self.sequence_length:
            # Pad the token sequence up to sequence_length. (The snippet originally
            # referenced self.n_channels here, which is never defined on this class;
            # padding to self.sequence_length appears to be the intent.)
            x = F.pad(x, (0, 0, 0, self.sequence_length - x.size(1)), mode='constant', value=0)

        if not self.seq_pool:
            cls_token = self.class_emb.expand(x.shape[0], -1, -1)
            x = torch.cat((cls_token, x), dim=1)

        if self.positional_emb is not None:
            x += self.positional_emb

        x = self.dropout(x)

        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)

        if self.seq_pool:
            x = torch.matmul(F.softmax(self.attention_pool(x), dim=1).transpose(-1, -2), x).squeeze(-2)
        else:
            x = x[:, 0]

        x = self.fc(x)
        return x
Can someone help me?
Upvotes: 0
Views: 1019
Reputation: 1098
I am not sure which regularization the paper uses, but here is a basic example that applies L1 regularization to a specific layer (e.g. layer 0):
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(5, 5),
    nn.ReLU(),
    nn.Linear(5, 2)
)

x = torch.randn(5, 5)
target = torch.ones(5, dtype=torch.long)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)

for epoch in range(10):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, target)

    # This will be your weight regularization: choose your layer (e.g. model[0])
    # and apply the norm you want to that layer's weights.
    l1_norm = torch.norm(model[0].weight, p=1)
    loss += l1_norm

    loss.backward()
    optimizer.step()

    print('Epoch {}, loss {}, norm layer {}'.format(
        epoch, loss.item(), l1_norm.item()))
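In practice the manual penalty above is usually scaled by a small coefficient (e.g. loss += 0.01 * l1_norm) so it does not overwhelm the cross-entropy term.
Since your CCT model is posted above, the same idea can be scoped to its final fully connected layer, which is classifier.fc. Below is a minimal sketch, not the specific technique from section 2.4 of the paper you cite, that applies a generic L2 penalty (weight decay) only to the last linear layer through optimizer parameter groups. The toy Sequential model stands in for your network, and the 1e-4 decay value is an arbitrary placeholder to tune:
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(5, 5),
    nn.ReLU(),
    nn.Linear(5, 2)
)

# For your model this would be `cct_model.classifier.fc` instead of `model[2]`.
final_fc = model[2]
final_fc_ids = {id(p) for p in final_fc.parameters()}
other_params = [p for p in model.parameters() if id(p) not in final_fc_ids]

optimizer = optim.Adam([
    {'params': other_params, 'weight_decay': 0.0},             # no regularization here
    {'params': final_fc.parameters(), 'weight_decay': 1e-4},   # L2 penalty on the fc layer only
], lr=1e-2)

# The training loop stays exactly as above; the optimizer applies the decay to
# the fc weights at every step, so no extra term is added to the loss.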
Upvotes: 0