Reputation: 1100
I'm adding alphabets to captcha recognition, but pytorch's CTC seems to not working properly when alphabets are added.
At first, I modified BLANK_LABEL
to 62 since there are 62 labels(0-9, a-z, A-Z), but it gives me runtime error blank must be in label range
. I also tried BLANK_LABEL=0
and then assigning 1~63 as nonblank labels but it outputs NaN as loss.
This is the colab link for the current version of my code: here
below are just core parts of the code.
Constants:
DATASET_PATH = "/home/ik1ne/Downloads/numbers"
MODEL_PATH = "/home/ik1ne/Downloads"
BATCH_SIZE = 50
TRAIN_BATCHES = 180
TEST_BATCHES = 20
TOTAL_BATCHES = TRAIN_BATCHES+TEST_BATCHES
TOTAL_DATASET = BATCH_SIZE*TOTAL_BATCHES
BLANK_LABEL = 63
dataset generation:
!pip install captcha
from captcha.image import ImageCaptcha
import itertools
import os
import random
import string
if not os.path.exists(DATASET_PATH):
os.makedirs(DATASET_PATH)
characters = "0123456789"+string.ascii_lowercase + string.ascii_uppercase
while(len(list(Path(DATASET_PATH).glob('*'))) < TOTAL_BATCHES):
captcha_str = "".join(random.choice(characters) for x in range(6))
if captcha_str in list(Path(DATASET_PATH).glob('*')):
continue
ImageCaptcha().write(captcha_str, f"{DATASET_PATH}/{captcha_str}.png")
dataset:
def convert_strseq_to_numseq(s):
for c in s:
if c >= '0' and c <= '9':
return int(c)
elif c>='a' and c <='z':
return ord(c)-ord('a')+10
else:
return ord(c)-ord('A')+36
class CaptchaDataset(Dataset):
"""CAPTCHA dataset."""
def __init__(self, root_dir, transform=None):
self.root_dir = root_dir
self.image_paths = list(Path(root_dir).glob('*'))
self.transform = transform
def __getitem__(self, index):
image = Image.open(self.image_paths[index])
if self.transform:
image = self.transform(image)
label_sequence = [convert_strseq_to_numseq(c) for c in self.image_paths[index].stem]
return (image, torch.tensor(label_sequence))
def __len__(self):
return len(self.image_paths)
model:
class StackedLSTM(nn.Module):
def __init__(self, input_size=60, output_size=11, hidden_size=512, num_layers=2):
super(StackedLSTM, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = nn.Dropout()
self.fc = nn.Linear(hidden_size, output_size)
self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
def forward(self, inputs, hidden):
batch_size, seq_len, input_size = inputs.shape
outputs, hidden = self.lstm(inputs, hidden)
outputs = self.dropout(outputs)
outputs = torch.stack([self.fc(outputs[i]) for i in range(width)])
outputs = F.log_softmax(outputs, dim=2)
return outputs, hidden
def init_hidden(self, batch_size):
weight = next(self.parameters()).data
return (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
net = StackedLSTM().to(device)
training:
net.train() # set network to training phase
epochs = 30
# for each pass of the training dataset
for epoch in range(epochs):
train_loss, train_correct, train_total = 0, 0, 0
h = net.init_hidden(BATCH_SIZE)
# for each batch of training examples
for batch_index, (inputs, targets) in enumerate(train_dataloader):
inputs, targets = inputs.to(device), targets.to(device)
h = tuple([each.data for each in h])
BATCH_SIZE, channels, height, width = inputs.shape
# reshape inputs: NxCxHxW -> WxNx(HxC)
inputs = (inputs
.permute(3, 0, 2, 1)
.contiguous()
.view((width, BATCH_SIZE, -1)))
optimizer.zero_grad() # zero the parameter gradients
outputs, h = net(inputs, h) # forward pass
# compare output with ground truth
input_lengths = torch.IntTensor(BATCH_SIZE).fill_(width)
target_lengths = torch.IntTensor([len(t) for t in targets])
loss = criterion(outputs, targets, input_lengths, target_lengths)
loss.backward() # backpropagation
nn.utils.clip_grad_norm_(net.parameters(), 10) # clip gradients
optimizer.step() # update network weights
# record statistics
prob, max_index = torch.max(outputs, dim=2)
train_loss += loss.item()
train_total += len(targets)
for i in range(BATCH_SIZE):
raw_pred = list(max_index[:, i].cpu().numpy())
pred = [c for c, _ in groupby(raw_pred) if c != BLANK_LABEL]
target = list(targets[i].cpu().numpy())
if pred == target:
train_correct += 1
# print statistics every 10 batches
if (batch_index + 1) % 10 == 0:
print(f'Epoch {epoch + 1}/{epochs}, ' +
f'Batch {batch_index + 1}/{len(train_dataloader)}, ' +
f'Train Loss: {(train_loss/1):.5f}, ' +
f'Train Accuracy: {(train_correct/train_total):.5f}')
train_loss, train_correct, train_total = 0, 0, 0
Upvotes: 4
Views: 2516
Reputation: 45
This error will occur when the index of blank is larger than the total number of classes, which equals number of chars + blank
. What's more, the index starts from 0
, instead of 1
, so if you have 62
characters in total, their index should be 0-61
and the index of blank should be 62
instead of 63
. (Or you can set blank as 0
, other characters from 1-62
)
You should also check the shape of the output tensor, it should has shape [T, B, C]
, where T
is the time step length, B
is the batch size, C
is the class num, remember to add blank in to the class num or you will meet the problem
Upvotes: 1
Reputation: 97
Most probably there is some problem with net shape when it's sent to CTC loss, but you should have provided the dataset to us to see the net's shape. It should be (T,N,C) , where T=input length, N=batch size, C=number of classes. And as I understand blank symbol id should in the 0..C range. Also, you should add blank symbol, for example '-' to the alphabet.
Upvotes: 0