33
44import json
55import os
6- import sys
76from itertools import product
87
98from transformers import AutoTokenizer , AutoConfig
109import numpy as np
1110
1211from scripts .supported_models import SUPPORTED_MODELS
1312
14- # Handle protobuf compatibility issues by setting the environment variable if not already set
15- # This is one of the workarounds mentioned in the error message
16- if 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION' not in os .environ :
17- os .environ ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION' ] = 'python'
18-
19- # Check if we should run in local mode (safer settings)
20- LOCAL_MODE = '--local' in sys .argv
21- if LOCAL_MODE :
22- print ("Running in local mode with safer settings" )
23-
2413# List of tokenizers where the model isn't yet supported, but the tokenizer is
2514ADDITIONAL_TOKENIZERS_TO_TEST = {
2615 'falcon' : [
6958
7059 # TODO: remove when https://github.com/huggingface/transformers/issues/28096 is addressed
7160 'RajuKandasamy/tamillama_tiny_30m' ,
72-
73- # TODO: remove when need for trust_remote_code can be addressed in CI
74- 'monologg/kobert' ,
75-
76- # TODO: remove when protobuf compatibility issues are resolved
77- 'dangvantuan/sentence-camembert-large' ,
7861]
7962
8063MAX_TESTS = {
@@ -286,21 +269,15 @@ def generate_tokenizer_tests():
286269 tokenizer = AutoTokenizer .from_pretrained (
287270 tokenizer_name ,
288271 use_fast = False ,
289- trust_remote_code = True ,
290272 )
291273 decoder_tokenizer = AutoTokenizer .from_pretrained (
292274 tokenizer_name ,
293275 use_fast = True ,
294- trust_remote_code = True ,
295276 )
296277
297278 else :
298- # In local mode, always use slow tokenizers to avoid protobuf issues
299- use_fast = not LOCAL_MODE
300279 decoder_tokenizer = tokenizer = AutoTokenizer .from_pretrained (
301- tokenizer_name ,
302- trust_remote_code = True ,
303- use_fast = use_fast )
280+ tokenizer_name )
304281
305282 except (KeyError , EnvironmentError ):
306283 # If a KeyError/EnvironmentError is raised from the AutoTokenizer, it
@@ -347,42 +324,29 @@ def generate_tokenizer_tests():
347324
348325 for tokenizer_id in TOKENIZERS_WITH_CHAT_TEMPLATES :
349326 print (f'Generating chat templates for { tokenizer_id } ' )
327+ tokenizer = AutoTokenizer .from_pretrained (
328+ tokenizer_id ,
350329
351- try :
352- # In local mode, use safer settings
353- use_fast = not LOCAL_MODE or 'llama' not in tokenizer_id
330+ # TODO: Remove once https://github.com/huggingface/transformers/pull/26678 is fixed
331+ use_fast = 'llama' not in tokenizer_id ,
332+ )
333+ tokenizer_results = []
334+ for key in TOKENIZERS_WITH_CHAT_TEMPLATES [tokenizer_id ]:
335+ messages = CHAT_MESSAGES_EXAMPLES [key ]
354336
355- tokenizer = AutoTokenizer .from_pretrained (
356- tokenizer_id ,
357- use_fast = use_fast ,
358- trust_remote_code = True ,
359- )
360- tokenizer_results = []
361- for key in TOKENIZERS_WITH_CHAT_TEMPLATES [tokenizer_id ]:
362- messages = CHAT_MESSAGES_EXAMPLES [key ]
363-
364- for add_generation_prompt , tokenize in product ([True , False ], [True , False ]):
365- try :
366- result = tokenizer .apply_chat_template (
367- messages ,
368- add_generation_prompt = add_generation_prompt ,
369- tokenize = tokenize ,
370- )
371- tokenizer_results .append (dict (
372- messages = messages ,
373- add_generation_prompt = add_generation_prompt ,
374- tokenize = tokenize ,
375- target = result ,
376- ))
377- except ValueError as e :
378- print (f" - Skipping template for { tokenizer_id } with { key } : { str (e )} " )
379- continue
337+ for add_generation_prompt , tokenize in product ([True , False ], [True , False ]):
338+ tokenizer_results .append (dict (
339+ messages = messages ,
340+ add_generation_prompt = add_generation_prompt ,
341+ tokenize = tokenize ,
342+ target = tokenizer .apply_chat_template (
343+ messages ,
344+ add_generation_prompt = add_generation_prompt ,
345+ tokenize = tokenize ,
346+ ),
347+ ))
380348
381- if tokenizer_results :
382- template_results [tokenizer_id ] = tokenizer_results
383- except Exception as e :
384- print (f" - Error processing tokenizer { tokenizer_id } : { str (e )} " )
385- continue
349+ template_results [tokenizer_id ] = tokenizer_results
386350
387351 return dict (
388352 tokenization = tokenization_results ,
@@ -399,7 +363,7 @@ def generate_config_tests():
399363 print (' -' , config_name )
400364 try :
401365 # Load config
402- config = AutoConfig .from_pretrained (config_name , trust_remote_code = True )
366+ config = AutoConfig .from_pretrained (config_name )
403367 except Exception :
404368 # Something went wrong, skip this config
405369 continue
@@ -464,5 +428,4 @@ def main():
464428
465429
466430if __name__ == "__main__" :
467- main ()
468-
431+ main ()
0 commit comments