Skip to content

Commit 0775327

Browse files
gh-89855: Improve support of non-ASCII identifiers in IDLE
Replace ASCII-only identifier character sets and lookup tables in autocomplete, autoexpand, editor, hyperparser and undo with the more general ("a" + char).isidentifier() test or a Unicode-aware regular expression. Non-ASCII identifiers (e.g. "café" or "Ångström") are now handled correctly when autocompleting, autoexpanding, highlighting errors and merging undo operations.
1 parent 2303eea commit 0775327

9 files changed

Lines changed: 112 additions & 46 deletions

File tree

Lib/idlelib/autocomplete.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,8 @@
2828
TRY_A = False, False, False, ATTRS # '.' for attributes.
2929
TRY_F = False, False, False, FILES # '/' in quotes for file name.
3030

31-
# This string includes all chars that may be in an identifier.
32-
# TODO Update this here and elsewhere.
33-
ID_CHARS = string.ascii_letters + string.digits + "_"
31+
# all ASCII chars that may be in an identifier
32+
_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
3433

3534
SEPS = f"{os.sep}{os.altsep if os.altsep else ''}"
3635
TRIGGERS = f".{SEPS}"
@@ -134,7 +133,11 @@ def open_completions(self, args):
134133
elif hp.is_in_code() and (not mode or mode==ATTRS):
135134
self._remove_autocomplete_window()
136135
mode = ATTRS
137-
while i and (curline[i-1] in ID_CHARS or ord(curline[i-1]) > 127):
136+
while i:
137+
c = curline[i-1]
138+
if c not in _ASCII_ID_CHARS:
139+
if c <= '\x7f' or not ('a' + c).isidentifier():
140+
break
138141
i -= 1
139142
comp_start = curline[i:j]
140143
if i and curline[i-1] == '.': # Need object with attributes.

Lib/idlelib/autoexpand.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,10 @@
1313
There is only one instance of Autoexpand.
1414
'''
1515
import re
16-
import string
1716

17+
_LAST_WORD_RE = re.compile(r'\b\w+\Z')
1818

1919
class AutoExpand:
20-
wordchars = string.ascii_letters + string.digits + "_"
21-
2220
def __init__(self, editwin):
2321
self.text = editwin.text
2422
self.bell = self.text.bell
@@ -85,10 +83,8 @@ def getwords(self):
8583
def getprevword(self):
8684
"Return the word prefix before the cursor."
8785
line = self.text.get("insert linestart", "insert")
88-
i = len(line)
89-
while i > 0 and line[i-1] in self.wordchars:
90-
i = i-1
91-
return line[i:]
86+
m = _LAST_WORD_RE.search(line)
87+
return m[0] if m else ''
9288

9389

9490
if __name__ == '__main__':

Lib/idlelib/editor.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import os
44
import platform
55
import re
6-
import string
76
import sys
87
import tokenize
98
import traceback
@@ -809,14 +808,12 @@ def ResetColorizer(self):
809808
if self.line_numbers is not None:
810809
self.line_numbers.update_colors()
811810

812-
IDENTCHARS = string.ascii_letters + string.digits + "_"
813-
814811
def colorize_syntax_error(self, text, pos):
815812
text.tag_add("ERROR", pos)
816813
char = text.get(pos)
817-
if char and char in self.IDENTCHARS:
814+
if char and ('a' + char).isidentifier():
818815
text.tag_add("ERROR", pos + " wordstart", pos)
819-
if '\n' == text.get(pos): # error at line end
816+
if char == '\n': # error at line end
820817
text.mark_set("insert", pos)
821818
else:
822819
text.mark_set("insert", pos + "+1c")

Lib/idlelib/hyperparser.py

Lines changed: 11 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,6 @@
1414
# all ASCII chars that may be the first char of an identifier
1515
_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")
1616

17-
# lookup table for whether 7-bit ASCII chars are valid in a Python identifier
18-
_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
19-
# lookup table for whether 7-bit ASCII chars are valid as the first
20-
# char in a Python identifier
21-
_IS_ASCII_ID_FIRST_CHAR = \
22-
[(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]
23-
2417

2518
class HyperParser:
2619
def __init__(self, editwin, index):
@@ -166,53 +159,47 @@ def _eat_identifier(cls, str, limit, pos):
166159
167160
Keywords are not treated as identifiers, except for True, False and None.
168161
"""
169-
is_ascii_id_char = _IS_ASCII_ID_CHAR
170-
171162
# Start at the end (pos) and work backwards.
172163
i = pos
173164

174165
# Go backwards as long as the characters are valid ASCII
175166
# identifier characters. This is an optimization, since it
176167
# is faster in the common case where most of the characters
177168
# are ASCII.
178-
while i > limit and (
179-
ord(str[i - 1]) < 128 and
180-
is_ascii_id_char[ord(str[i - 1])]
181-
):
169+
while i > limit and str[i - 1] in _ASCII_ID_CHARS:
182170
i -= 1
183171

184172
# If the above loop ended due to reaching a non-ASCII
185173
# character, continue going backwards using the most generic
186174
# test for whether a string contains only valid identifier
187175
# characters.
188-
if i > limit and ord(str[i - 1]) >= 128:
189-
while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
176+
if i > limit and str[i - 1] > '\x7f':
177+
while i - 4 >= limit and ('a' + str[i - 4:i]).isidentifier():
190178
i -= 4
191-
if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
179+
if i - 2 >= limit and ('a' + str[i - 2:i]).isidentifier():
192180
i -= 2
193-
if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
181+
if i - 1 >= limit and ('a' + str[i - 1]).isidentifier():
194182
i -= 1
195183

196184
# The identifier candidate starts here. If it isn't a valid
197185
# identifier, don't eat anything. At this point that is only
198186
# possible if the first character isn't a valid first
199187
# character for an identifier.
200-
if not str[i:pos].isidentifier():
188+
if i < pos and not str[i].isidentifier():
201189
return 0
202190
elif i < pos:
203191
# All characters in str[i:pos] are valid ASCII identifier
204192
# characters, so it is enough to check that the first is
205193
# valid as the first character of an identifier.
206-
if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
194+
if str[i] not in _ASCII_ID_FIRST_CHARS:
207195
return 0
208196

209197
# All keywords are valid identifiers, but should not be
210198
# considered identifiers here, except for True, False and None.
211-
if i < pos and (
212-
iskeyword(str[i:pos]) and
213-
str[i:pos] not in cls._ID_KEYWORDS
214-
):
215-
return 0
199+
if i < pos:
200+
word = str[i:pos]
201+
if iskeyword(word) and word not in cls._ID_KEYWORDS:
202+
return 0
216203

217204
return pos - i
218205

Lib/idlelib/idle_test/test_autoexpand.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,23 @@ def test_get_prevword(self):
7474
text.delete('1.0', 'end')
7575
equal(previous(), '')
7676

77+
def test_get_prevword_non_ascii(self):
78+
# gh-89855: word prefixes may contain non-ASCII identifier characters.
79+
text = self.text
80+
previous = self.auto_expand.getprevword
81+
equal = self.assertEqual
82+
83+
text.insert('insert', 'Ångström')
84+
equal(previous(), 'Ångström')
85+
86+
text.delete('1.0', 'end')
87+
text.insert('insert', 'їжак_naïve1')
88+
equal(previous(), 'їжак_naïve1')
89+
90+
# A non-word character ends the prefix.
91+
text.insert('insert', '+')
92+
equal(previous(), '')
93+
7794
def test_before_only(self):
7895
previous = self.auto_expand.getprevword
7996
expand = self.auto_expand.expand_word_event

Lib/idlelib/idle_test/test_editor.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,5 +237,51 @@ def test_rclick(self):
237237
pass
238238

239239

240+
class ColorizeSyntaxErrorTest(unittest.TestCase):
241+
"Test EditorWindow.colorize_syntax_error highlighting."
242+
243+
@classmethod
244+
def setUpClass(cls):
245+
requires('gui')
246+
cls.root = Tk()
247+
cls.root.withdraw()
248+
cls.text = Text(cls.root)
249+
250+
@classmethod
251+
def tearDownClass(cls):
252+
cls.text.destroy()
253+
del cls.text
254+
cls.root.destroy()
255+
del cls.root
256+
257+
def tearDown(self):
258+
self.text.tag_delete('ERROR')
259+
self.text.delete('1.0', 'end')
260+
261+
def colorize(self, line, pos):
262+
self.text.insert('1.0', line)
263+
# colorize_syntax_error() only uses its text and pos arguments.
264+
Editor.colorize_syntax_error(None, self.text, pos)
265+
266+
def test_ascii_identifier(self):
267+
# The whole word is highlighted, not just the error character.
268+
self.colorize('abc', '1.2')
269+
self.assertEqual(self.text.index('ERROR.first'), '1.0')
270+
self.assertEqual(self.text.index('ERROR.last'), '1.3')
271+
272+
def test_non_ascii_identifier(self):
273+
# gh-89855: the highlight extends back to the word start even when both
274+
# the word's first character and the error character are non-ASCII.
275+
self.colorize('їжак', '1.1') # 'ж', in a word starting with 'ї'
276+
self.assertEqual(self.text.index('ERROR.first'), '1.0')
277+
self.assertEqual(self.text.index('ERROR.last'), '1.2')
278+
279+
def test_non_identifier_char(self):
280+
# A non-identifier character highlights only itself.
281+
self.colorize('a+b', '1.1') # the '+'
282+
self.assertEqual(self.text.index('ERROR.first'), '1.1')
283+
self.assertEqual(self.text.index('ERROR.last'), '1.2')
284+
285+
240286
if __name__ == '__main__':
241287
unittest.main(verbosity=2)

Lib/idlelib/idle_test/test_undo.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"Test undo, coverage 77%."
22
# Only test UndoDelegator so far.
33

4-
from idlelib.undo import UndoDelegator
4+
from idlelib.undo import InsertCommand, UndoDelegator
55
import unittest
66
from test.support import requires
77
requires('gui')
@@ -131,5 +131,26 @@ def test_addcmd(self):
131131
self.assertLessEqual(len(self.delegator.undolist), max_undo)
132132

133133

134+
class InsertCommandTest(unittest.TestCase):
135+
"Test how InsertCommand classifies characters for merging insertions."
136+
137+
def test_classify(self):
138+
classify = InsertCommand('1.0', 'x').classify
139+
# ASCII identifier characters are alphanumeric.
140+
for c in 'aZ5_':
141+
with self.subTest(c=c):
142+
self.assertEqual(classify(c), 'alphanumeric')
143+
# gh-89855: so are non-ASCII identifier characters.
144+
for c in 'éñü\N{CYRILLIC SMALL LETTER UKRAINIAN IE}\N{GREEK SMALL LETTER ALPHA}':
145+
with self.subTest(c=c):
146+
self.assertEqual(classify(c), 'alphanumeric')
147+
self.assertEqual(classify('\n'), 'newline')
148+
# Everything else, including non-ASCII non-identifier characters,
149+
# is punctuation.
150+
for c in ' +.²':
151+
with self.subTest(c=c):
152+
self.assertEqual(classify(c), 'punctuation')
153+
154+
134155
if __name__ == '__main__':
135156
unittest.main(verbosity=2, exit=False)

Lib/idlelib/undo.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import string
2-
31
from idlelib.delegator import Delegator
42

53
# tkinter import not needed because module does not create widgets,
@@ -251,10 +249,8 @@ def merge(self, cmd):
251249
self.chars = self.chars + cmd.chars
252250
return True
253251

254-
alphanumeric = string.ascii_letters + string.digits + "_"
255-
256252
def classify(self, c):
257-
if c in self.alphanumeric:
253+
if ('a' + c).isidentifier():
258254
return "alphanumeric"
259255
if c == "\n":
260256
return "newline"
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Improve support of non-ASCII identifiers in IDLE
2+
(autoexpanding, autocompletion, undo, etc).
3+

0 commit comments

Comments
 (0)