Reputation: 68
I have the following problem loading a transformer model. The strange thing is that it works on Google Colab and even on another computer I tried, so it seems to be a version/cache problem, but I couldn't find it.
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-8-0b8b6a3eea75> in <module>
1 from sentence_transformers import SentenceTransformer
2 from sentence_transformers.util import cos_sim
----> 3 model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')
4
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sentence_transformers\SentenceTransformer.py in __init__(self, model_name_or_path, modules, device, cache_folder)
88
89 if os.path.exists(os.path.join(model_path, 'modules.json')): #Load as SentenceTransformer model
---> 90 modules = self._load_sbert_model(model_path)
91 else: #Load with AutoModel
92 modules = self._load_auto_model(model_path)
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sentence_transformers\SentenceTransformer.py in _load_sbert_model(self, model_path)
820 for module_config in modules_config:
821 module_class = import_from_string(module_config['type'])
--> 822 module = module_class.load(os.path.join(model_path, module_config['path']))
823 modules[module_config['name']] = module
824
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sentence_transformers\models\Transformer.py in load(input_path)
122 with open(sbert_config_path) as fIn:
123 config = json.load(fIn)
--> 124 return Transformer(model_name_or_path=input_path, **config)
125
126
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sentence_transformers\models\Transformer.py in __init__(self, model_name_or_path, max_seq_length, model_args, cache_dir, tokenizer_args, do_lower_case, tokenizer_name_or_path)
28 config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
29 self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
---> 30 self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, cache_dir=cache_dir, **tokenizer_args)
31
32 #No max_seq_length set. Try to infer from model
~\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\auto\tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
566 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
567 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 568 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
569 else:
570 if tokenizer_class_py is not None:
~\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1730 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
1731
-> 1732 return cls._from_pretrained(
1733 resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
1734 )
~\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs)
1848 # Instantiate tokenizer.
1849 try:
-> 1850 tokenizer = cls(*init_inputs, **init_kwargs)
1851 except OSError:
1852 raise OSError(
~\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\xlm_roberta\tokenization_xlm_roberta_fast.py in __init__(self, vocab_file, tokenizer_file, bos_token, eos_token, sep_token, cls_token, unk_token, pad_token, mask_token, **kwargs)
132 mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
133
--> 134 super().__init__(
135 vocab_file,
136 tokenizer_file=tokenizer_file,
~\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\tokenization_utils_fast.py in __init__(self, *args, **kwargs)
105 elif fast_tokenizer_file is not None and not from_slow:
106 # We have a serialization from tokenizers which let us directly build the backend
--> 107 fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
108 elif slow_tokenizer is not None:
109 # We need to convert a slow tokenizer to build the backend
Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 1 column 317584
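For reference, this is the minimal check I run on both machines to compare the libraries behind the tokenizer (just a version printout; both packages expose __version__):
import tokenizers
import transformers
# Compare these between the failing machine and Colab; the enum-parsing
# error above is raised while the Rust backend deserializes tokenizer.json
# (see TokenizerFast.from_file in the traceback).
print("tokenizers:", tokenizers.__version__)
print("transformers:", transformers.__version__)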
To give you more details, I also ran into another problem, only on this computer, with a different model:
model = SentenceTransformer('etalab-ia/dpr-question_encoder-fr_qa-camembert')
ValueError: unable to parse C:\Users\david.rouyre/.cache\torch\sentence_transformers\etalab-ia_dpr-question_encoder-fr_qa-camembert\tokenizer_config.json as a URL or as a local path
So I checked the cache path and there was no tokenizer_config.json, only tokenizer.json (after renaming the file, it worked).
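In case it helps reproduce, this is roughly how I inspected that cache folder (path taken from the error message above; adjust the user directory for your machine):
import os
cache_dir = os.path.expanduser(
    "~/.cache/torch/sentence_transformers/etalab-ia_dpr-question_encoder-fr_qa-camembert"
)
# Showed tokenizer.json but no tokenizer_config.json
print(os.listdir(cache_dir))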
The package (same version in Colab):
Name: sentence-transformers
Version: 2.0.0
Summary: Sentence Embeddings using BERT / RoBERTa / XLM-R
Home-page: https://github.com/UKPLab/sentence-transformers
Author: Nils Reimers
Author-email: info@nils-reimers.de
License: Apache License 2.0
Location: c:\users\david.rouyre\appdata\local\programs\python\python39\lib\site-packages
Requires: transformers, tqdm, torch, torchvision, numpy, scikit-learn, scipy, nltk, sentencepiece, huggingface-hub
Required-by:
I tried clearing the cache, uninstalling all dependencies with pip (transformers, tqdm, torch, torchvision, numpy, scikit-learn, scipy, nltk, sentencepiece, huggingface-hub), uninstalling sentence-transformers, and reinstalling it.
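By "clearing the cache" I mean deleting the whole sentence-transformers cache folder so everything is downloaded fresh; a minimal sketch, assuming the default cache location (it matches the path in the error above):
import os
import shutil
cache = os.path.expanduser("~/.cache/torch/sentence_transformers")
if os.path.isdir(cache):
    shutil.rmtree(cache)  # next SentenceTransformer(...) call re-downloads the model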
Upvotes: 3
Views: 5917
Reputation: 59
Which tokenizers version do you have installed? For me, upgrading the tokenizers package helped:
pip3 install tokenizers==0.10.3
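After upgrading, you can sanity-check the installed version and retry the load (a minimal check; tokenizers exposes __version__):
import tokenizers
print(tokenizers.__version__)  # should print 0.10.3 after the upgrade

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')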
Upvotes: -1