lrthistlethwaite

Reputation: 546

Adding Linear layers to Thinc Model Example - Understanding Data Dimensions Through Model Architecture

I'm trying to learn the inner workings of models trained with spaCy, which are Thinc models under the hood. I'm working from this tutorial and modifying the model to see what breaks and what works. Instead of tagging, I'm adapting it to fit a NER dataset I have with 16 classes. I want to add several layers after the TransformersTokenizer + Transformer layers already outlined in the tutorial, but I'm getting lots of dimension ValueErrors. It's also important to me that the TransformersTagger layer output the last hidden layer of the given transformer model, which I'm not confident this code is doing. Here's the error I'm getting:

ValueError: Attempt to change dimension 'nI' for model 'linear' from 512 to 16

And here is my full code adaptation to date. Admittedly, I don't like that there's a Softmax(num_ner_classes) before the Linear() layer, but I can't get anything else to work with with_array() after the Transformer layer:

@dataclass
class TokensPlus:
    batch_size: int
    tok2wp: List[Ints1d]
    input_ids: torch.Tensor
    token_type_ids: torch.Tensor
    attention_mask: torch.Tensor

    def __init__(self, inputs: List[List[str]], wordpieces: BatchEncoding):
        self.input_ids = wordpieces["input_ids"]
        self.attention_mask = wordpieces["attention_mask"]
        self.token_type_ids = wordpieces["token_type_ids"]
        self.batch_size = self.input_ids.shape[0]
        self.tok2wp = []
        for i in range(self.batch_size):
            print(i, inputs[i])
            spans = [wordpieces.word_to_tokens(i, j) for j in range(len(inputs[i]))]
            print(spans)
            self.tok2wp.append(self.get_wp_starts(spans))

    def get_wp_starts(self, spans: List[Optional[TokenSpan]]) -> Ints1d:
        """Calculate an alignment mapping each token index to its first wordpiece."""
        alignment = numpy.zeros((len(spans)), dtype="i")
        for i, span in enumerate(spans):
            if span is None:
                raise ValueError(
                    "Token did not align to any wordpieces. Was the tokenizer "
                    "run with is_split_into_words=True?"
                )
            else:
                alignment[i] = span.start
        return alignment

@thinc.registry.layers("transformers_tokenizer.v1")
def TransformersTokenizer(name: str) -> Model[List[List[str]], TokensPlus]:
    def forward(model, inputs: List[List[str]], is_train: bool):
        tokenizer = model.attrs["tokenizer"]
        wordpieces = tokenizer(
            inputs,
            is_split_into_words=True,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_length=True,
            return_tensors="pt",
            padding="longest"
        )
        return TokensPlus(inputs, wordpieces), lambda d_tokens: []

    return Model("tokenizer", forward, attrs={"tokenizer": AutoTokenizer.from_pretrained(name)})

def convert_transformer_inputs(model, tokens: TokensPlus, is_train):
    kwargs = {
        "input_ids": tokens.input_ids,
        "attention_mask": tokens.attention_mask,
        "token_type_ids": tokens.token_type_ids,
    }
    return ArgsKwargs(args=(), kwargs=kwargs), lambda dX: []

def convert_transformer_outputs(model: Model, inputs_outputs: Tuple[TokensPlus, Tuple[torch.Tensor]], is_train: bool) -> Tuple[List[Floats2d], Callable]:
    tplus, trf_outputs = inputs_outputs
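    # Note: trf_outputs[0] is the transformer's last_hidden_state, i.e. the
    # final hidden layer asked about above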
    wp_vectors = torch2xp(trf_outputs[0])
    tokvecs = [wp_vectors[i, idx] for i, idx in enumerate(tplus.tok2wp)]

    def backprop(d_tokvecs: List[Floats2d]) -> ArgsKwargs:
        # Restore entries for BOS and EOS markers
        d_wp_vectors = model.ops.alloc3f(*trf_outputs[0].shape, dtype="f")
        for i, idx in enumerate(tplus.tok2wp):
            d_wp_vectors[i, idx] += d_tokvecs[i]
        return ArgsKwargs(
            args=(trf_outputs[0],),
            kwargs={"grad_tensors": xp2torch(d_wp_vectors)},
        )

    return tokvecs, backprop

@thinc.registry.layers("transformers_encoder.v1")
def Transformer(name: str = "bert-large-cased") -> Model[TokensPlus, List[Floats2d]]:
    return PyTorchWrapper(
        AutoModel.from_pretrained(name),
        convert_inputs=convert_transformer_inputs,
        convert_outputs=convert_transformer_outputs,
    )

@thinc.registry.layers("TransformersNer.v1")
def TransformersNer(name: str, num_ner_classes: int = 16) -> Model[List[List[str]], List[Floats2d]]:
    return chain(
        TransformersTokenizer(name),
        Transformer(name),
        with_array(Softmax(num_ner_classes)),
        Linear(512, 1024)
    )

How do I best determine how to pipe the output of the PyTorch-wrapped Transformer layer into a Linear() layer and further layers down the chain? I've been using the model visualization below, but even when I run model.initialize() on the first examples of my data, there are still a lot of (?, ?) dimensions.

import pydot

def visualize_model(model):
    def get_label(layer):
        layer_name = layer.name
        nO = layer.get_dim("nO") if layer.has_dim("nO") else "?"
        nI = layer.get_dim("nI") if layer.has_dim("nI") else "?"
        return f"{layer.name}|({nO}, {nI})".replace(">", ">")
    dot = pydot.Dot()
    dot.set("rankdir", "LR")
    dot.set_node_defaults(shape="record", fontname="arial", fontsize="10")
    dot.set_edge_defaults(arrowsize="0.7")
    nodes = {}
    for i, layer in enumerate(model.layers):
        label = get_label(layer)
        node = pydot.Node(layer.id, label=label)
        dot.add_node(node)
        nodes[layer.id] = node
        if i == 0:
            continue
        from_node = nodes[model.layers[i - 1].id]
        to_node = nodes[layer.id]
        if not dot.get_edge(from_node, to_node):
            dot.add_edge(pydot.Edge(from_node, to_node))
    print(dot)

Produces:

digraph G {
rankdir=LR;
node [fontname=arial, fontsize=10, shape=record];
edge [arrowsize="0.7"];
176 [label="tokenizer|(?, ?)"];
177 [label="pytorch|(?, ?)"];
176 -> 177;
179 [label="with_array(softmax)|(16, 1024)"];
177 -> 179;
180 [label="linear|(512, 1024)"];
179 -> 180;
}
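
For reference, here is roughly how I'm initializing with sample data before visualizing (a sketch; train_X and train_Y stand in for my tokenized sentences and per-token label arrays, which aren't shown here):

model = TransformersNer("bert-large-cased", num_ner_classes=16)
model.initialize(X=train_X[:5], Y=train_Y[:5])

# After initialize, layers that could infer their dims report them;
# the rest still show "?"
for layer in model.layers:
    nI = layer.get_dim("nI") if layer.has_dim("nI") else "?"
    nO = layer.get_dim("nO") if layer.has_dim("nO") else "?"
    print(layer.name, nI, nO)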

Upvotes: 0

Views: 52

Answers (1)

lrthistlethwaite

Reputation: 546

OKAY, so here's how you traverse a Thinc model chain and inspect how the data shapes change between layers: call each layer in model.layers on the previous layer's output and look at the type and shape of what comes back (see the traversal snippet further down). What I ended up doing was modifying convert_transformer_outputs to return the data array of the Ragged representation of tokvecs instead of a list of variable-length embedding arrays (different sentences have different numbers of tokens):

    ...
    tokvecs = [wp_vectors[i, idx] for i, idx in enumerate(tplus.tok2wp)]
    tokvecs_ragged = list2ragged().predict(tokvecs).data
    ...
    return tokvecs_ragged, backprop
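
One caveat (a sketch I haven't battle-tested): list2ragged comes from thinc.api, and with this change the backprop callback receives one concatenated gradient array instead of a list, so it has to be split back per sentence before restoring the wordpiece entries:

    def backprop(d_tokvecs: Floats2d) -> ArgsKwargs:
        # Restore entries for BOS and EOS markers
        d_wp_vectors = model.ops.alloc3f(*trf_outputs[0].shape, dtype="f")
        # Split the concatenated gradient back into per-sentence pieces
        lengths = model.ops.asarray1i([len(idx) for idx in tplus.tok2wp])
        for i, d_toks in enumerate(model.ops.unflatten(d_tokvecs, lengths)):
            d_wp_vectors[i, tplus.tok2wp[i]] += d_toks
        return ArgsKwargs(
            args=(trf_outputs[0],),
            kwargs={"grad_tensors": xp2torch(d_wp_vectors)},
        )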

Ragged is more memory-efficient than Padded, and its data array feeds straight into a Linear layer, so I don't need with_array() at all anymore (its job is essentially to present list-, ragged-, or padded-shaped input to the wrapped layer as a single array). This also explains the original ValueError: putting Softmax(num_ner_classes) before Linear pinned that position in the chain to a width of 16, which conflicted with the dimensions Linear declared. Now I get the shape I want, and I can save my Softmax for the very end of my architecture (I have more layers I intend to add). It just took me a second to look at the docs; the Chain Combinator section was the most revealing. Here's how I traversed the chain to check the shapes:

    tok = model.layers[0]    # Tokenizer
    out, dX = tok(train_X[:5], is_train=False)
    print(out.input_ids.shape)
    trans = model.layers[1]  # Transformer
    out, dY = trans(out, is_train=False)
    print(out.shape)
    lin = model.layers[2] # Linear
    out, dZ = lin(out, is_train=False)
    print(out.shape)  # a single Floats2d now, not a list
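
And for completeness, the revised chain now looks roughly like this (a sketch under the assumptions above: the Transformer layer is the modified one returning the concatenated array, and 1024 matches bert-large-cased's hidden width):

    model = chain(
        TransformersTokenizer(name),
        Transformer(name),  # modified to return one concatenated Floats2d
        Linear(512, 1024),
        Softmax(num_ner_classes),
    )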

Upvotes: 0
