diff --git a/Lib/idlelib/autocomplete.py b/Lib/idlelib/autocomplete.py index 032d31225315fb..4d4b71c46e8ecc 100644 --- a/Lib/idlelib/autocomplete.py +++ b/Lib/idlelib/autocomplete.py @@ -28,9 +28,8 @@ TRY_A = False, False, False, ATTRS # '.' for attributes. TRY_F = False, False, False, FILES # '/' in quotes for file name. -# This string includes all chars that may be in an identifier. -# TODO Update this here and elsewhere. -ID_CHARS = string.ascii_letters + string.digits + "_" +# all ASCII chars that may be in an identifier +_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_") SEPS = f"{os.sep}{os.altsep if os.altsep else ''}" TRIGGERS = f".{SEPS}" @@ -134,7 +133,11 @@ def open_completions(self, args): elif hp.is_in_code() and (not mode or mode==ATTRS): self._remove_autocomplete_window() mode = ATTRS - while i and (curline[i-1] in ID_CHARS or ord(curline[i-1]) > 127): + while i: + c = curline[i-1] + if c not in _ASCII_ID_CHARS: + if c <= '\x7f' or not ('a' + c).isidentifier(): + break i -= 1 comp_start = curline[i:j] if i and curline[i-1] == '.': # Need object with attributes. diff --git a/Lib/idlelib/autoexpand.py b/Lib/idlelib/autoexpand.py index 92f5c84eb6f401..206d36994ce9e8 100644 --- a/Lib/idlelib/autoexpand.py +++ b/Lib/idlelib/autoexpand.py @@ -13,12 +13,10 @@ There is only one instance of Autoexpand. ''' import re -import string +_LAST_WORD_RE = re.compile(r'\b\w+\Z') class AutoExpand: - wordchars = string.ascii_letters + string.digits + "_" - def __init__(self, editwin): self.text = editwin.text self.bell = self.text.bell @@ -85,10 +83,8 @@ def getwords(self): def getprevword(self): "Return the word prefix before the cursor." line = self.text.get("insert linestart", "insert") - i = len(line) - while i > 0 and line[i-1] in self.wordchars: - i = i-1 - return line[i:] + m = _LAST_WORD_RE.search(line) + return m[0] if m else '' if __name__ == '__main__': diff --git a/Lib/idlelib/editor.py b/Lib/idlelib/editor.py index 239bf5af470567..d03987aa19cf4e 100644 --- a/Lib/idlelib/editor.py +++ b/Lib/idlelib/editor.py @@ -3,7 +3,6 @@ import os import platform import re -import string import sys import tokenize import traceback @@ -809,14 +808,12 @@ def ResetColorizer(self): if self.line_numbers is not None: self.line_numbers.update_colors() - IDENTCHARS = string.ascii_letters + string.digits + "_" - def colorize_syntax_error(self, text, pos): text.tag_add("ERROR", pos) char = text.get(pos) - if char and char in self.IDENTCHARS: + if char and ('a' + char).isidentifier(): text.tag_add("ERROR", pos + " wordstart", pos) - if '\n' == text.get(pos): # error at line end + if char == '\n': # error at line end text.mark_set("insert", pos) else: text.mark_set("insert", pos + "+1c") diff --git a/Lib/idlelib/hyperparser.py b/Lib/idlelib/hyperparser.py index 76144ee8fb30f5..c510f4dc79d93f 100644 --- a/Lib/idlelib/hyperparser.py +++ b/Lib/idlelib/hyperparser.py @@ -14,13 +14,6 @@ # all ASCII chars that may be the first char of an identifier _ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_") -# lookup table for whether 7-bit ASCII chars are valid in a Python identifier -_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)] -# lookup table for whether 7-bit ASCII chars are valid as the first -# char in a Python identifier -_IS_ASCII_ID_FIRST_CHAR = \ - [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)] - class HyperParser: def __init__(self, editwin, index): @@ -166,8 +159,6 @@ def _eat_identifier(cls, str, limit, pos): This ignores non-identifier eywords are not identifiers. """ - is_ascii_id_char = _IS_ASCII_ID_CHAR - # Start at the end (pos) and work backwards. i = pos @@ -175,44 +166,40 @@ def _eat_identifier(cls, str, limit, pos): # identifier characters. This is an optimization, since it # is faster in the common case where most of the characters # are ASCII. - while i > limit and ( - ord(str[i - 1]) < 128 and - is_ascii_id_char[ord(str[i - 1])] - ): + while i > limit and str[i - 1] in _ASCII_ID_CHARS: i -= 1 # If the above loop ended due to reaching a non-ASCII # character, continue going backwards using the most generic # test for whether a string contains only valid identifier # characters. - if i > limit and ord(str[i - 1]) >= 128: - while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier(): + if i > limit and str[i - 1] > '\x7f': + while i - 4 >= limit and ('a' + str[i - 4:i]).isidentifier(): i -= 4 - if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier(): + if i - 2 >= limit and ('a' + str[i - 2:i]).isidentifier(): i -= 2 - if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier(): + if i - 1 >= limit and ('a' + str[i - 1]).isidentifier(): i -= 1 # The identifier candidate starts here. If it isn't a valid # identifier, don't eat anything. At this point that is only # possible if the first character isn't a valid first # character for an identifier. - if not str[i:pos].isidentifier(): + if i < pos and not str[i].isidentifier(): return 0 elif i < pos: # All characters in str[i:pos] are valid ASCII identifier # characters, so it is enough to check that the first is # valid as the first character of an identifier. - if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]: + if str[i] not in _ASCII_ID_FIRST_CHARS: return 0 # All keywords are valid identifiers, but should not be # considered identifiers here, except for True, False and None. - if i < pos and ( - iskeyword(str[i:pos]) and - str[i:pos] not in cls._ID_KEYWORDS - ): - return 0 + if i < pos: + word = str[i:pos] + if iskeyword(word) and word not in cls._ID_KEYWORDS: + return 0 return pos - i diff --git a/Lib/idlelib/idle_test/test_autoexpand.py b/Lib/idlelib/idle_test/test_autoexpand.py index e734a8be714a2a..ac17fe8caf03e3 100644 --- a/Lib/idlelib/idle_test/test_autoexpand.py +++ b/Lib/idlelib/idle_test/test_autoexpand.py @@ -74,6 +74,23 @@ def test_get_prevword(self): text.delete('1.0', 'end') equal(previous(), '') + def test_get_prevword_non_ascii(self): + # gh-89855: word prefixes may contain non-ASCII identifier characters. + text = self.text + previous = self.auto_expand.getprevword + equal = self.assertEqual + + text.insert('insert', 'Ångström') + equal(previous(), 'Ångström') + + text.delete('1.0', 'end') + text.insert('insert', 'їжак_naïve1') + equal(previous(), 'їжак_naïve1') + + # A non-word character ends the prefix. + text.insert('insert', '+') + equal(previous(), '') + def test_before_only(self): previous = self.auto_expand.getprevword expand = self.auto_expand.expand_word_event diff --git a/Lib/idlelib/idle_test/test_editor.py b/Lib/idlelib/idle_test/test_editor.py index e28ee549f180aa..7a309fbe04b768 100644 --- a/Lib/idlelib/idle_test/test_editor.py +++ b/Lib/idlelib/idle_test/test_editor.py @@ -237,5 +237,51 @@ def test_rclick(self): pass +class ColorizeSyntaxErrorTest(unittest.TestCase): + "Test EditorWindow.colorize_syntax_error highlighting." + + @classmethod + def setUpClass(cls): + requires('gui') + cls.root = Tk() + cls.root.withdraw() + cls.text = Text(cls.root) + + @classmethod + def tearDownClass(cls): + cls.text.destroy() + del cls.text + cls.root.destroy() + del cls.root + + def tearDown(self): + self.text.tag_delete('ERROR') + self.text.delete('1.0', 'end') + + def colorize(self, line, pos): + self.text.insert('1.0', line) + # colorize_syntax_error() only uses its text and pos arguments. + Editor.colorize_syntax_error(None, self.text, pos) + + def test_ascii_identifier(self): + # The whole word is highlighted, not just the error character. + self.colorize('abc', '1.2') + self.assertEqual(self.text.index('ERROR.first'), '1.0') + self.assertEqual(self.text.index('ERROR.last'), '1.3') + + def test_non_ascii_identifier(self): + # gh-89855: this works when the word both starts with and has the + # error at a non-ASCII character. + self.colorize('їжак', '1.1') # 'ж', in a word starting with 'ї' + self.assertEqual(self.text.index('ERROR.first'), '1.0') + self.assertEqual(self.text.index('ERROR.last'), '1.2') + + def test_non_identifier_char(self): + # A non-identifier character highlights only itself. + self.colorize('a+b', '1.1') # the '+' + self.assertEqual(self.text.index('ERROR.first'), '1.1') + self.assertEqual(self.text.index('ERROR.last'), '1.2') + + if __name__ == '__main__': unittest.main(verbosity=2) diff --git a/Lib/idlelib/idle_test/test_undo.py b/Lib/idlelib/idle_test/test_undo.py index beb5b582039f88..fd7382cc53a13e 100644 --- a/Lib/idlelib/idle_test/test_undo.py +++ b/Lib/idlelib/idle_test/test_undo.py @@ -1,7 +1,7 @@ "Test undo, coverage 77%." # Only test UndoDelegator so far. -from idlelib.undo import UndoDelegator +from idlelib.undo import InsertCommand, UndoDelegator import unittest from test.support import requires requires('gui') @@ -131,5 +131,26 @@ def test_addcmd(self): self.assertLessEqual(len(self.delegator.undolist), max_undo) +class InsertCommandTest(unittest.TestCase): + "Test how InsertCommand classifies characters for merging insertions." + + def test_classify(self): + classify = InsertCommand('1.0', 'x').classify + # ASCII identifier characters are alphanumeric. + for c in 'aZ5_': + with self.subTest(c=c): + self.assertEqual(classify(c), 'alphanumeric') + # gh-89855: so are non-ASCII identifier characters. + for c in 'éñü\N{CYRILLIC SMALL LETTER UKRAINIAN IE}\N{GREEK SMALL LETTER ALPHA}': + with self.subTest(c=c): + self.assertEqual(classify(c), 'alphanumeric') + self.assertEqual(classify('\n'), 'newline') + # Everything else, including non-ASCII non-identifier characters, + # is punctuation. + for c in ' +.²': + with self.subTest(c=c): + self.assertEqual(classify(c), 'punctuation') + + if __name__ == '__main__': unittest.main(verbosity=2, exit=False) diff --git a/Lib/idlelib/undo.py b/Lib/idlelib/undo.py index f52446d5fcdcf8..5c6ee57a925d1c 100644 --- a/Lib/idlelib/undo.py +++ b/Lib/idlelib/undo.py @@ -1,5 +1,3 @@ -import string - from idlelib.delegator import Delegator # tkinter import not needed because module does not create widgets, @@ -251,10 +249,8 @@ def merge(self, cmd): self.chars = self.chars + cmd.chars return True - alphanumeric = string.ascii_letters + string.digits + "_" - def classify(self, c): - if c in self.alphanumeric: + if ('a' + c).isidentifier(): return "alphanumeric" if c == "\n": return "newline" diff --git a/Misc/NEWS.d/next/IDLE/2021-11-03-10-37-29.gh-issue-89855.QSuHbM.rst b/Misc/NEWS.d/next/IDLE/2021-11-03-10-37-29.gh-issue-89855.QSuHbM.rst new file mode 100644 index 00000000000000..55ab99803b34a6 --- /dev/null +++ b/Misc/NEWS.d/next/IDLE/2021-11-03-10-37-29.gh-issue-89855.QSuHbM.rst @@ -0,0 +1,3 @@ +Improve support of non-ASCII identifiers in IDLE +(autoexpanding, autocompletion, undo, etc). +