Reputation: 11
I have a model implemented in PyTorch that applies a final fully connected layer before running the softmax function. The architecture is designed to solve a 4-class Speech Emotion Recognition task: given an audio track, it transforms it into its spectrogram and uses it to predict the emotion among happiness, sadness, neutrality and anger.
To improve the performance of the model I am following some of the tricks described in the paper https://arxiv.org/abs/2104.07288. Unlike the architecture presented in that paper, however, my model adapts the implementation of the Compact Convolutional Transformer found on GitHub at https://github.com/SHI-Labs/Compact-Transformers/blob/main/src/cct.py.
As in the paper, my model also suffers from a "class collapse" problem: even after balancing the dataset, it tends to predict the anger and sadness classes well and the other two badly.
To solve this problem, the paper applies a particular weight regularization technique to the fully connected layer, described in section 2.4.
Unfortunately, I cannot figure out how to modify my fully connected layer in PyTorch to implement this type of regularization.
Code of the model:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Module, ModuleList, Linear, LayerNorm, Dropout, Parameter, init
# TransformerEncoderLayer below is the custom encoder block from the
# Compact-Transformers repo (it accepts attention_dropout and drop_path_rate),
# not torch.nn.TransformerEncoderLayer.


class CCT(nn.Module):
    def __init__(self,
                 img_size=224,
                 embedding_dim=768,
                 n_input_channels=3,
                 n_conv_layers=1,
                 kernel_size=7,
                 stride=2,
                 padding=3,
                 pooling_kernel_size=3,
                 pooling_stride=2,
                 pooling_padding=1,
                 dropout=0.,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 num_layers=14,
                 num_heads=6,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 positional_embedding='learnable',
                 *args, **kwargs):
        super(CCT, self).__init__()

        self.tokenizer = Tokenizer(n_input_channels=n_input_channels,
                                   n_output_channels=embedding_dim,
                                   kernel_size=kernel_size,
                                   stride=stride,
                                   padding=padding,
                                   pooling_kernel_size=pooling_kernel_size,
                                   pooling_stride=pooling_stride,
                                   pooling_padding=pooling_padding,
                                   max_pool=True,
                                   activation=nn.ReLU,
                                   n_conv_layers=n_conv_layers,
                                   conv_bias=False)

        self.classifier = TransformerClassifier(
            sequence_length=self.tokenizer.sequence_length(n_channels=n_input_channels,
                                                           height=img_size,
                                                           width=img_size),
            embedding_dim=embedding_dim,
            seq_pool=True,
            dropout=dropout,
            attention_dropout=attention_dropout,
            stochastic_depth=stochastic_depth,
            num_layers=num_layers,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            num_classes=num_classes,
            positional_embedding=positional_embedding
        )

    def forward(self, x):
        x = self.tokenizer(x)
        return self.classifier(x)
class Tokenizer(nn.Module):
    def __init__(self,
                 kernel_size, stride, padding,
                 pooling_kernel_size=3, pooling_stride=2, pooling_padding=1,
                 n_conv_layers=1,
                 n_input_channels=3,
                 n_output_channels=64,
                 in_planes=64,
                 activation=None,
                 max_pool=True,
                 conv_bias=False):
        super(Tokenizer, self).__init__()

        n_filter_list = [n_input_channels] + \
                        [in_planes for _ in range(n_conv_layers - 1)] + \
                        [n_output_channels]

        self.conv_layers = nn.Sequential(
            *[nn.Sequential(
                nn.Conv2d(n_filter_list[i], n_filter_list[i + 1],
                          kernel_size=(kernel_size, kernel_size),
                          stride=(stride, stride),
                          padding=(padding, padding), bias=conv_bias),
                nn.Identity() if activation is None else activation(),
                nn.MaxPool2d(kernel_size=pooling_kernel_size,
                             stride=pooling_stride,
                             padding=pooling_padding) if max_pool else nn.Identity()
            )
                for i in range(n_conv_layers)
            ])

        self.flattener = nn.Flatten(2, 3)
        self.apply(self.init_weight)

    def sequence_length(self, n_channels=3, height=224, width=224):
        return self.forward(torch.zeros((1, n_channels, height, width))).shape[1]

    def forward(self, x):
        return self.flattener(self.conv_layers(x)).transpose(-2, -1)

    @staticmethod
    def init_weight(m):
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight)
class TransformerClassifier(Module):
    def __init__(self,
                 seq_pool=True,
                 embedding_dim=768,
                 num_layers=12,
                 num_heads=12,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 dropout=0.1,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 positional_embedding='learnable',
                 sequence_length=None):
        super().__init__()
        positional_embedding = positional_embedding if \
            positional_embedding in ['sine', 'learnable', 'none'] else 'sine'
        dim_feedforward = int(embedding_dim * mlp_ratio)
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length
        self.seq_pool = seq_pool
        self.num_tokens = 0

        assert sequence_length is not None or positional_embedding == 'none', \
            f"Positional embedding is set to {positional_embedding} and" \
            f" the sequence length was not specified."

        if not seq_pool:
            sequence_length += 1
            self.class_emb = Parameter(torch.zeros(1, 1, self.embedding_dim),
                                       requires_grad=True)
            self.num_tokens = 1
        else:
            self.attention_pool = Linear(self.embedding_dim, 1)

        if positional_embedding != 'none':
            if positional_embedding == 'learnable':
                self.positional_emb = Parameter(torch.zeros(1, sequence_length, embedding_dim),
                                                requires_grad=True)
                init.normal_(self.positional_emb, std=0.2)
            else:
                self.positional_emb = Parameter(self.sinusoidal_embedding(sequence_length, embedding_dim),
                                                requires_grad=False)
        else:
            self.positional_emb = None

        self.dropout = Dropout(p=dropout)
        dpr = [x.item() for x in torch.linspace(0, stochastic_depth, num_layers)]
        self.blocks = ModuleList([
            TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads,
                                    dim_feedforward=dim_feedforward, dropout=dropout,
                                    attention_dropout=attention_dropout, drop_path_rate=dpr[i])
            for i in range(num_layers)])
        self.norm = LayerNorm(embedding_dim)

        # This is the final fully connected layer I would like to regularize.
        self.fc = Linear(embedding_dim, num_classes)
        # init_weight and sinusoidal_embedding are static helpers defined in the
        # Compact-Transformers repo; they are omitted from this snippet.
        self.apply(self.init_weight)
    def forward(self, x):
        if self.positional_emb is None and x.size(1) < self.sequence_length:
            # Pad the token sequence up to sequence_length. (The snippet originally
            # referenced self.n_channels here, which is never defined on this class;
            # padding to self.sequence_length appears to be the intent.)
            x = F.pad(x, (0, 0, 0, self.sequence_length - x.size(1)), mode='constant', value=0)

        if not self.seq_pool:
            cls_token = self.class_emb.expand(x.shape[0], -1, -1)
            x = torch.cat((cls_token, x), dim=1)

        if self.positional_emb is not None:
            x += self.positional_emb

        x = self.dropout(x)

        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)

        if self.seq_pool:
            x = torch.matmul(F.softmax(self.attention_pool(x), dim=1).transpose(-1, -2), x).squeeze(-2)
        else:
            x = x[:, 0]

        x = self.fc(x)
        return x
Can someone help me?
Upvotes: 0
Views: 1019
Reputation: 1098
I am not sure which regularization the paper uses, but here is a basic example that applies L1 regularization to a specific layer (e.g. layer 0):
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(5, 5),
    nn.ReLU(),
    nn.Linear(5, 2)
)

x = torch.randn(5, 5)
target = torch.ones(5, dtype=torch.long)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)

for epoch in range(10):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, target)

    # This will be your weight regularization: choose your layer (e.g. model[0])
    # and apply the norm you want to that layer's weights.
    l1_norm = torch.norm(model[0].weight, p=1)
    loss += l1_norm

    loss.backward()
    optimizer.step()

    print('Epoch {}, loss {}, norm layer {}'.format(
        epoch, loss.item(), l1_norm.item()))
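In practice the manual penalty above is usually scaled by a small coefficient (e.g. loss += 0.01 * l1_norm) so it does not overwhelm the cross-entropy term.
Since your CCT model is posted above, the same idea can be scoped to its final fully connected layer, which is classifier.fc. Below is a minimal sketch, not the specific technique from section 2.4 of the paper you cite, that applies a generic L2 penalty (weight decay) only to the last linear layer through optimizer parameter groups. The toy Sequential model stands in for your network, and the 1e-4 decay value is an arbitrary placeholder to tune:
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(5, 5),
    nn.ReLU(),
    nn.Linear(5, 2)
)

# For your model this would be `cct_model.classifier.fc` instead of `model[2]`.
final_fc = model[2]
final_fc_ids = {id(p) for p in final_fc.parameters()}
other_params = [p for p in model.parameters() if id(p) not in final_fc_ids]

optimizer = optim.Adam([
    {'params': other_params, 'weight_decay': 0.0},             # no regularization here
    {'params': final_fc.parameters(), 'weight_decay': 1e-4},   # L2 penalty on the fc layer only
], lr=1e-2)

# The training loop stays exactly as above; the optimizer applies the decay to
# the fc weights at every step, so no extra term is added to the loss.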
Upvotes: 0