Reputation: 1
Inconsistent text-to-32-dimensional embeddings: I downloaded the Albert-v2 weights from Hugging Face to my local machine and run inference on text. Why does the same text produce a different vector depending on the batch_size when running on the GPU?
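For reference, the local "albert" directory that the code loads from was produced by downloading the pretrained weights from the Hugging Face hub and saving them locally, roughly like this (a sketch; "albert-base-v2" is assumed as the model id here):

from transformers import AlbertTokenizer, TFAlbertModel

# One-time download of the pretrained ALBERT weights, saved to a local
# directory named "albert" that the inference code below loads from.
AlbertTokenizer.from_pretrained("albert-base-v2").save_pretrained("albert")
TFAlbertModel.from_pretrained("albert-base-v2").save_pretrained("albert")

Here is the inference code: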
import tensorflow as tf
import numpy as np
import random
from transformers import AlbertTokenizer, TFAlbertModel
from tqdm import tqdm

seed = 42
tf.random.set_seed(seed)
random.seed(seed)
np.random.seed(seed)

tokenizer = AlbertTokenizer.from_pretrained('albert')
albert_model = TFAlbertModel.from_pretrained("albert", output_hidden_states=True)
albert_model.trainable = False  # disable training mode

# Random projection from the 768-dim ALBERT output down to 32 dimensions
embedding_dim = 32
dense_layer = tf.keras.layers.Dense(embedding_dim, activation='linear', use_bias=False)
dense_layer.build((None, 768))
dense_layer.kernel.assign(tf.random.normal((768, embedding_dim)))

def text_to_embedding(texts, max_length=60):
    inputs = tokenizer(texts, return_tensors='tf', padding='max_length', truncation=True, max_length=max_length)
    with tf.device('/GPU:0'):
        outputs = albert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        last_hidden_state = outputs.last_hidden_state
        # hidden_states = outputs.hidden_states  # outputs of all layers
        # print(hidden_states)
        mean_output = tf.reduce_mean(last_hidden_state, axis=1)  # mean-pool over tokens
        reduced_output = dense_layer(mean_output)
    return reduced_output.numpy()  # return a NumPy array

# Chinese test phrases: "no experience description", "no work history",
# "-", "nothing at all", "no work history - no experience description"
texts = ['无经验描述', '无工作经历 ', '-', '什么都没有', '无工作经历-无经验描述']
max_length = 60
batch_size = 2  # or 10
batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
for batch in tqdm(batches):
    embedding = text_to_embedding(batch, max_length=max_length)
    print(embedding)
Why are the results different between batch_size=1 and batch_size=2?
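To quantify how large the difference is, here is a small check (a sketch reusing the text_to_embedding function and texts list above) that embeds the same texts with batch_size=1 and with batch_size=2 and compares the results elementwise; if the mismatch were only on the order of 1e-6, that would look like float32 accumulation-order noise on the GPU rather than a logic error:

def embed_in_batches(texts, batch_size, max_length=60):
    # Illustrative helper: embed the texts in chunks of batch_size and stack the rows.
    parts = [text_to_embedding(texts[i:i + batch_size], max_length=max_length)
             for i in range(0, len(texts), batch_size)]
    return np.concatenate(parts, axis=0)

emb_bs1 = embed_in_batches(texts, batch_size=1)
emb_bs2 = embed_in_batches(texts, batch_size=2)
print("max abs diff:", np.max(np.abs(emb_bs1 - emb_bs2)))
print("allclose (atol=1e-5):", np.allclose(emb_bs1, emb_bs2, atol=1e-5))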
Upvotes: 0
Views: 15