Commit 746aaae

remove tiktoken pin (#1759)
1 parent b9f17e1 commit 746aaae

File tree: 2 files changed, +13 −3 lines

requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -3,4 +3,4 @@ numpy
 torch
 tqdm
 more-itertools
-tiktoken==0.3.3
+tiktoken
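Removing the exact pin lets the resolver install any available tiktoken release instead of forcing 0.3.3. Had a guard against future breaking releases been wanted, the usual middle ground would be a bounded specifier such as tiktoken>=0.3.3,<1.0 (hypothetical; not what this commit ships).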

tests/test_tokenizer.py

Lines changed: 12 additions & 2 deletions

@@ -1,7 +1,17 @@
+import pytest
+
 from whisper.tokenizer import get_tokenizer
 
 
-def test_tokenizer():
+@pytest.mark.parametrize("multilingual", [True, False])
+def test_tokenizer(multilingual):
+    tokenizer = get_tokenizer(multilingual=False)
+    assert tokenizer.sot in tokenizer.sot_sequence
+    assert len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens)
+    assert all(c < tokenizer.timestamp_begin for c in tokenizer.all_language_tokens)
+
+
+def test_multilingual_tokenizer():
     gpt2_tokenizer = get_tokenizer(multilingual=False)
     multilingual_tokenizer = get_tokenizer(multilingual=True)
 
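The new decorator turns test_tokenizer into two collected tests, reported by pytest as test_tokenizer[True] and test_tokenizer[False]. A minimal standalone sketch of the mechanism (hypothetical example, not from this commit):

    import pytest

    # pytest collects one test per listed value and passes it in as "flag",
    # so this single function runs as test_flag[True] and test_flag[False].
    @pytest.mark.parametrize("flag", [True, False])
    def test_flag(flag):
        assert isinstance(flag, bool)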
@@ -20,5 +30,5 @@ def test_split_on_unicode():
     tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
     words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)
 
-    assert words == [" elle", " est", " l", "'", "�", "é", "rit", "oire"]
+    assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"]
     assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
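The expected word list now spells the replacement character as the "\ufffd" escape rather than a literal character, which is unambiguous in source and survives copy-paste. The character appears at all because, as the test shows, token 246's bytes evidently do not form complete UTF-8 on their own; decoding such a fragment leniently substitutes U+FFFD. A standalone sketch of that behavior (illustrative, not repo code):

    # The first byte of a two-byte UTF-8 sequence is invalid on its own;
    # decoding with errors="replace" yields the Unicode replacement character.
    fragment = "é".encode("utf-8")[:1]
    decoded = fragment.decode("utf-8", errors="replace")
    assert decoded == "\ufffd"  # often rendered as �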
