hou五五

Reputation: 1

Convert text to a 32-dimensional vector using ALBERT-v2 weights

The problem: converting text to a 32-dimensional vector gives inconsistent results. I use Hugging Face to download the ALBERT-v2 weights to my local machine and run inference on text. Why does the same text produce a different vector on the GPU when batch_size is set to different values? Here is the code:

import tensorflow as tf
import numpy as np
import random
from transformers import AlbertTokenizer, TFAlbertModel
from tqdm import tqdm


seed = 42
tf.random.set_seed(seed)
random.seed(seed)
np.random.seed(seed)

tokenizer = AlbertTokenizer.from_pretrained('albert')
albert_model = TFAlbertModel.from_pretrained("albert", output_hidden_states=True)
albert_model.trainable = False  # disable training mode

embedding_dim = 32

dense_layer = tf.keras.layers.Dense(embedding_dim, activation='linear', use_bias=False)
dense_layer.build((None, 768)) 
dense_layer.kernel.assign(tf.random.normal((768, embedding_dim)))
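# Note: the kernel is overwritten with a seeded random matrix, so the same
# projection is applied to every batch within a run.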


def text_to_embedding(texts, max_length=60):
    inputs = tokenizer(texts, return_tensors='tf', padding='max_length', truncation=True, max_length=max_length)
    
    with tf.device('/GPU:0'):
        outputs = albert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        last_hidden_state = outputs.last_hidden_state
        # hidden_states = outputs.hidden_states  # outputs of every layer
        # print(hidden_states)
    
    # Mean-pool over the sequence axis. Note this also averages the hidden
    # states at padded positions, since attention_mask is not applied here.
    mean_output = tf.reduce_mean(last_hidden_state, axis=1)
    reduced_output = dense_layer(mean_output)
    
    return reduced_output.numpy()  # return a NumPy array
    

# Sample texts (Chinese; roughly: "no experience described", "no work history",
# "-", "nothing at all", "no work history - no experience described")
texts = ['无经验描述', '无工作经历 ', '-', '什么都没有', '无工作经历-无经验描述']

max_length = 60
batch_size = 2  # or 10
batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]

for batch in tqdm(batches):
    embedding = text_to_embedding(batch, max_length=max_length)
    print(embedding)
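
To make the mismatch concrete, here is a minimal check (embed_all is a helper added purely for illustration; it reuses text_to_embedding from above) that embeds the same texts with batch_size 1 and with batch_size 2 and prints the largest elementwise difference:

def embed_all(texts, batch_size, max_length=60):
    # Re-batch the texts and stack the per-batch embeddings into one array
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    return np.concatenate([text_to_embedding(b, max_length=max_length) for b in batches], axis=0)

diff = np.abs(embed_all(texts, batch_size=1) - embed_all(texts, batch_size=2))
print(diff.max())  # magnitude of the per-text discrepancy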

Why do the results differ when batch_size is 1 versus 2?
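
One way to probe whether this is ordinary float32 non-determinism (different batch shapes can lead cuBLAS/cuDNN to pick different kernels, and therefore different summation orders) is to request deterministic ops before the model is built. A sketch, assuming a TensorFlow release that provides tf.config.experimental.enable_op_determinism (2.9+, as far as I know):

import tensorflow as tf

# Must run before any ops execute, i.e. before the model is created.
tf.keras.utils.set_random_seed(42)
tf.config.experimental.enable_op_determinism()
# Note: this makes each op reproducible for identical input shapes; it does
# not by itself force float32 reductions to agree across different batch shapes.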

Upvotes: 0

Views: 15

Answers (0)
