+import pytest
+
 from whisper.tokenizer import get_tokenizer


-def test_tokenizer():
+@pytest.mark.parametrize("multilingual", [True, False])
+def test_tokenizer(multilingual):
+    tokenizer = get_tokenizer(multilingual=multilingual)
+    assert tokenizer.sot in tokenizer.sot_sequence
+    assert len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens)
+    assert all(c < tokenizer.timestamp_begin for c in tokenizer.all_language_tokens)
+
+
+def test_multilingual_tokenizer():
     gpt2_tokenizer = get_tokenizer(multilingual=False)
     multilingual_tokenizer = get_tokenizer(multilingual=True)

@@ -20,5 +30,5 @@ def test_split_on_unicode():
     tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
     words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)

-    assert words == [" elle", " est", " l", "'", "� ", "é", "rit", "oire"]
+    assert words == [" elle", " est", " l", "'", "\ufffd ", "é", "rit", "oire"]
     assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
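
For reference, a minimal sketch of what the updated tests exercise, assuming the openai-whisper package is installed; every attribute and method name below is taken from the diff above, and the two index-4 assertions follow directly from the expected lists in the second hunk.

from whisper.tokenizer import get_tokenizer

# Mirrors the parametrized test: the same invariants hold for both tokenizer variants.
for multilingual in (True, False):
    tokenizer = get_tokenizer(multilingual=multilingual)
    assert tokenizer.sot in tokenizer.sot_sequence
    assert len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens)
    assert all(c < tokenizer.timestamp_begin for c in tokenizer.all_language_tokens)

# Mirrors test_split_on_unicode: the splitter groups tokens into Unicode word pieces,
# and the piece produced by token 246 is U+FFFD plus a space -- which the second hunk
# now writes as the escape "\ufffd " instead of a literal replacement character.
tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
words, word_tokens = get_tokenizer(multilingual=True).split_tokens_on_unicode(tokens)
assert words[4] == "\ufffd "
assert word_tokens[4] == [246]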