
Commit 5f9ac65

Fix truncated words list when the replacement character is decoded (#1089)
1 parent ba88b8e · commit 5f9ac65
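Context for the fix: split_tokens_on_unicode decodes a growing token buffer and previously flushed a word only when the decode contained no U+FFFD, assuming the replacement character always signals an incomplete multi-byte sequence. When the text genuinely contains U+FFFD, that assumption never clears, so the returned words list comes back truncated. A minimal illustration (not from the commit) of why a partial decode yields the same character:

    # "é" is two bytes in UTF-8; decoding only the first byte yields U+FFFD,
    # indistinguishable from a replacement character that is really in the text.
    partial = "é".encode("utf-8")[:1]                 # b'\xc3', incomplete
    print(partial.decode("utf-8", errors="replace"))  # '�' (U+FFFD)
    print(b"\xef\xbf\xbd".decode("utf-8"))            # '�', a genuine U+FFFD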

File tree

2 files changed: +21 −1 lines changed

tests/test_tokenizer.py

Lines changed: 10 additions & 0 deletions

@@ -12,3 +12,13 @@ def test_tokenizer():
     assert gpt2_tokenizer.decode(gpt2_tokens) == text
     assert multilingual_tokenizer.decode(multilingual_tokens) == text
     assert len(gpt2_tokens) > len(multilingual_tokens)
+
+
+def test_split_on_unicode():
+    multilingual_tokenizer = get_tokenizer(multilingual=True)
+
+    tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
+    words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)
+
+    assert words == [" elle", " est", " l", "'", "�", "é", "rit", "oire"]
+    assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
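In this input, token 246 decodes to a genuine "�" on its own, which is exactly the case the old check mishandled: every word after it was swallowed into the accumulator and never returned. A quick way to exercise the new path, assuming a local whisper checkout:

    from whisper.tokenizer import get_tokenizer

    tokenizer = get_tokenizer(multilingual=True)
    words, _ = tokenizer.split_tokens_on_unicode(
        [8404, 871, 287, 6, 246, 526, 3210, 20378]
    )
    print(words)  # all eight words, including the genuine "�"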

whisper/tokenizer.py

Lines changed: 11 additions & 1 deletion

@@ -279,17 +279,27 @@ def split_to_word_tokens(self, tokens: List[int]):
         return self.split_tokens_on_spaces(tokens)

     def split_tokens_on_unicode(self, tokens: List[int]):
+        decoded_full = self.decode_with_timestamps(tokens)
+        replacement_char = "\ufffd"
+
         words = []
         word_tokens = []
         current_tokens = []
+        unicode_offset = 0

         for token in tokens:
             current_tokens.append(token)
             decoded = self.decode_with_timestamps(current_tokens)
-            if "\ufffd" not in decoded:
+
+            if (
+                replacement_char not in decoded
+                or decoded_full[unicode_offset + decoded.index(replacement_char)]
+                == replacement_char
+            ):
                 words.append(decoded)
                 word_tokens.append(current_tokens)
                 current_tokens = []
+                unicode_offset += len(decoded)

         return words, word_tokens
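To make the new condition concrete, here is a self-contained sketch of the same idea over raw UTF-8 byte chunks rather than model tokens (split_bytes_on_unicode is a hypothetical name, not part of whisper): a chunk's decode is flushed either when it is clean, or when its U+FFFD also appears at the same offset in the full decode, meaning the replacement character is genuinely part of the text.

    def split_bytes_on_unicode(chunks: list[bytes]) -> list[str]:
        # Decode everything up front; character positions in `full`
        # serve as ground truth for the offset comparison below.
        full = b"".join(chunks).decode("utf-8", errors="replace")
        replacement_char = "\ufffd"

        words: list[str] = []
        current = b""
        offset = 0  # position of the next word within `full`
        for chunk in chunks:
            current += chunk
            decoded = current.decode("utf-8", errors="replace")
            # Flush when the decode is clean, or when its U+FFFD lines up
            # with a U+FFFD in the full decode (i.e., it is genuine).
            if (
                replacement_char not in decoded
                or full[offset + decoded.index(replacement_char)]
                == replacement_char
            ):
                words.append(decoded)
                offset += len(decoded)
                current = b""
        return words

    chunks = [b" l", b"'", b"\xef\xbf\xbd", b"\xc3", b"\xa9"]
    print(split_bytes_on_unicode(chunks))  # [' l', "'", '�', 'é']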
