plm0998

Reputation: 35

BERT Text Classification

I am new to BERT and am trying to learn BERT fine-tuning for text classification via a Coursera course: https://www.coursera.org/projects/fine-tune-bert-tensorflow/

Based on the course, I would like to compare the text classification performance of BERT-12 and BERT-24, using the 'SGD' and 'Adam' optimizers respectively.

I found that when I use BERT-12, the results look normal. However, when I switch to BERT-24, the accuracy is good (9X%) but the recall and precision are extremely low (close to zero).

May I know if there is anything wrong with my code?

Also, in order to improve precision and recall, should I add more dense layers or change the activation functions? And what learning rate values should I use?

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import sys
sys.path.append('models')
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization

import pandas as pd
from sklearn.model_selection import train_test_split

df= pd.read_csv('https://archive.org/download/fine-tune-bert-tensorflow-train.csv/train.csv.zip', compression='zip', low_memory=False)

train_data_ratio = 0.1
val_data_ratio = 0.1
rand_seed = 42

train_df, remaining = train_test_split(df, random_state=rand_seed, train_size=train_data_ratio, stratify=df.target.values)
valid_df, _ = train_test_split(remaining, random_state=rand_seed, train_size=val_data_ratio, stratify=remaining.target.values)

# build the tf.data datasets on the CPU
with tf.device('/cpu:0'):
  train_data = tf.data.Dataset.from_tensor_slices((train_df['question_text'].values, train_df['target'].values))
  valid_data = tf.data.Dataset.from_tensor_slices((valid_df['question_text'].values, valid_df['target'].values))

"""
Each line of the dataset is composed of the review text and its label
- Data preprocessing consists of transforming text to BERT input features:
input_word_ids, input_mask, segment_ids
- In the process, tokenizing the text is done with the provided BERT model tokenizer
"""

label_list = [0,1] # Label categories
max_seq_length = 128 # maximum length of (token) input sequences
train_batch_size= 32
learning_rate = 0.001 
num_layer = 24 # change between bert-12 and bert-24 to compare the diff
epochs = 4
optimizer = 'SGD'

assert num_layer in [12, 24] 
if num_layer == 12:
    train_batch_size = 32
elif num_layer == 24:
    train_batch_size = 4 

assert optimizer in ['SGD', 'Adam'] 
if optimizer == 'Adam':
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
elif optimizer == 'SGD':
    opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)


# Get BERT layer and tokenizer:
bert_12 = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
bert_24 = "https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/2"

if num_layer == 12:
    bert_layer = hub.KerasLayer(bert_12, trainable=True)
elif num_layer == 24:
    bert_layer = hub.KerasLayer(bert_24, trainable=True)
    
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy() # vocab file path, from tensor to numpy
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy() # whether the model expects lower-cased input
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)


# from data to features that can be understood by bert

def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
  example = classifier_data_lib.InputExample(guid=None,
                                             text_a=text.numpy(),
                                             text_b=None,
                                             label=label.numpy())
  feature=classifier_data_lib.convert_single_example(0,example,label_list,max_seq_length, tokenizer)

  return (feature.input_ids, feature.input_mask, feature.segment_ids, feature.label_id)
  
def to_feature_map(text, label):
  input_ids, input_mask, segment_ids, label_id = tf.py_function(to_feature, inp=[text, label], 
                                Tout=[tf.int32, tf.int32, tf.int32, tf.int32])


  input_ids.set_shape([max_seq_length])
  input_mask.set_shape([max_seq_length])
  segment_ids.set_shape([max_seq_length])
  label_id.set_shape([])

  x = {
        'input_word_ids': input_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids
    }
  return (x, label_id)
  
with tf.device('/cpu:0'):
  # train
  train_data = (train_data.map(to_feature_map,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
                          #.cache()
                          .shuffle(1000)
                          .batch(train_batch_size, drop_remainder=True)
                          .prefetch(tf.data.experimental.AUTOTUNE))

  # valid
  valid_data = (valid_data.map(to_feature_map,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
                          .batch(train_batch_size, drop_remainder=True)
                          .prefetch(tf.data.experimental.AUTOTUNE)) 
  

# Building the model
def create_model():
  input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                      name="input_word_ids")
  input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                  name="input_mask")
  input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                  name="input_type_ids")

  pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, input_type_ids])

  drop = tf.keras.layers.Dropout(0.4)(pooled_output)
  output = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(drop)

  model = tf.keras.Model(
    inputs={
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': input_type_ids
    },
    outputs=output)
  return model
  
  
model = create_model()
model.compile(optimizer=opt, # pass the configured optimizer instance, not the string
              loss=tf.keras.losses.BinaryCrossentropy(),
              #metrics=[tf.keras.metrics.BinaryAccuracy()])
              metrics=[tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])

history = model.fit(train_data,
                    validation_data=valid_data,
                    epochs=epochs,
                    verbose=1)
                    
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric])
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])
  plt.show()
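
# Plot the training curves; the metric keys below assume the default names produced by
# tf.keras.metrics.Recall() / tf.keras.metrics.Precision() in model.compile above.
plot_graphs(history, 'recall')
plot_graphs(history, 'precision')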
  


Thank you very much!

Upvotes: 2

Views: 791

Answers (1)

ML_Engine

Reputation: 1195

Maybe try adding precision and recall to a custom callback so you can inspect what's going on. I've added a debug point (pdb.set_trace()) so the process will pause once the first epoch has ended, and you can step through each point to investigate the data.

from sklearn.metrics import precision_score, recall_score
import pdb


class Callbacks(tf.keras.callbacks.Callback):
    def __init__(self, valid_data):
        super(Callbacks, self).__init__()
        self.valid_data = valid_data
        

    def on_epoch_end(self, epoch, logs={}):

        pdb.set_trace()

        # Collect predictions and labels over the (batched) validation dataset
        val_y_true = []
        val_y_pred = []
        for val_x, val_y in self.valid_data:  # val_x holds the bert inputs, val_y the labels
            val_scores = self.model.predict(val_x)  # Get predictions for this batch
            # Single sigmoid output: threshold at 0.5 - you might need to alter this
            val_y_pred.extend((val_scores[:, 0] > 0.5).astype(int))
            val_y_true.extend(val_y.numpy())
        
        # Calculate precision and recall
        precision = precision_score(val_y_true, val_y_pred, average='weighted')
        recall = recall_score(val_y_true, val_y_pred, average='weighted')
        
        # Add scores to logs to see in training output
        logs['precision'] = precision
        logs['recall'] = recall

To pass the validation data to the callback you'll need to add something like the below to your fit function:

cbs = Callbacks(valid_data)

model.fit(...., callbacks=[cbs])

Upvotes: 1
