From 3cf86c838e974e811c2a9e7f417eaed59818a13f Mon Sep 17 00:00:00 2001
From: Josh Stockin
Date: Fri, 17 Mar 2023 13:59:24 -0500
Subject: [PATCH 1/2] Prevent attempts at nesting charsets in regex filter

Not sure how I didn't catch this when testing a couple years ago.
Basically, nesting character sets is impossible in Python's regex, which
is what my filter code tried to do because it wasn't checking for IN
tokens. This adds that check and performs confusable insertion in place
for those sets.

Also cleans up some of the code in that area.
---
 futaba/cogs/filter/filter.py | 62 ++++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 20 deletions(-)

diff --git a/futaba/cogs/filter/filter.py b/futaba/cogs/filter/filter.py
index 506d461c..7db24207 100644
--- a/futaba/cogs/filter/filter.py
+++ b/futaba/cogs/filter/filter.py
@@ -103,32 +103,54 @@ def build_regex(text, groups):
 
         return str(pattern)
 
+    @staticmethod
+    def generate_confusable_literals(confusable: str) -> list:
+        # Generates list of confusable homoglyphs LITERALs based on input
+        # character, includes input character
+
+        groups = confusables.is_confusable(confusable, greedy=True)
+
+        confusable_literals = [(sre_parse.LITERAL, ord(confusable))]
+
+        if not groups:
+            return confusable_literals
+
+        # Append confusables to list as LITERALs
+        for homoglyph in groups[0]["homoglyphs"]:
+            confusable_literals += [
+                (sre_parse.LITERAL, ord(char)) for char in homoglyph["c"]
+            ]
+
+        return confusable_literals
+
     @staticmethod
     def convert_raw_regex_ast(regex_ast: Iterable):
         for index, value in enumerate(regex_ast):
-            # Parse lexemes for LITERALs
             if isinstance(value, tuple):
-                lexeme_tuple = value
-                if lexeme_tuple[0] == sre_parse.LITERAL:
-                    # LITERAL found, check if it's a confusable homoglyph...
-                    groups = confusables.is_confusable(
-                        chr(lexeme_tuple[1]), greedy=True
+                # (TOKEN_NAME, TOKEN_VALUE) likely
+                (
+                    token_name,
+                    token_value,
+                    *_,
+                ) = value
+                if token_name == sre_parse.IN:
+                    # (IN, [ (LITERAL, ord) ... ])
+                    # sets cannot be nested, need to insert confusables in place
+                    confusable_literals = []
+                    for literal in token_value:
+                        confusable_literals += Filter.generate_confusable_literals(
+                            chr(literal[1])
+                        )
+                    token_value += confusable_literals
+                if token_name == sre_parse.LITERAL:
+                    # (LITERAL, ord), replace with set
+                    confusable_literals = Filter.generate_confusable_literals(
+                        chr(token_value)
                     )
-                    if not groups:
-                        continue
-                    # Convert group into list of confusable LITERALs
-                    group = groups[0]  # one char, so only one group
-                    confusable_literals = [lexeme_tuple]
-                    for homoglyph in group["homoglyphs"]:
-                        confusable_literals += [
-                            (sre_parse.LITERAL, ord(char)) for char in homoglyph["c"]
-                        ]
-                    in_lexeme_tuple = (sre_parse.IN, confusable_literals)
-
-                    # Overwrite this lexeme
-                    regex_ast[index] = in_lexeme_tuple
+                    regex_ast[index] = (sre_parse.IN, confusable_literals)
                 else:
-                    # More possible lexemes, recurse and overwrite...
+                    # Miscellaneous token, might contain more tokens
+                    # Recurse and overwrite...
                     regex_ast[index] = tuple(Filter.convert_raw_regex_ast(list(value)))
             elif isinstance(value, sre_parse.SubPattern):
                 regex_ast[index] = Filter.convert_raw_regex_ast(value)

From d98e3c7e6201c4a6bf6a5af418b528b6188005e6 Mon Sep 17 00:00:00 2001
From: Josh Stockin
Date: Sat, 18 Mar 2023 13:21:10 -0500
Subject: [PATCH 2/2] Add support for range in character set

Also missed this. "a-z" in [a-z] is tokenized as (RANGE, (97, 122)).
This commit adds support for those tokens. Other set members that carry
no code point (e.g. NEGATE, CATEGORY) are left untouched.
---
 futaba/cogs/filter/filter.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/futaba/cogs/filter/filter.py b/futaba/cogs/filter/filter.py
index 7db24207..a786f611 100644
--- a/futaba/cogs/filter/filter.py
+++ b/futaba/cogs/filter/filter.py
@@ -137,10 +137,20 @@ def convert_raw_regex_ast(regex_ast: Iterable):
                     # (IN, [ (LITERAL, ord) ... ])
                     # sets cannot be nested, need to insert confusables in place
                     confusable_literals = []
-                    for literal in token_value:
-                        confusable_literals += Filter.generate_confusable_literals(
-                            chr(literal[1])
-                        )
+                    for set_token in token_value:
+                        if set_token[0] == sre_parse.RANGE:
+                            # (RANGE, (LOWER_BOUND, UPPER_BOUND)), e.g. a-z in [a-z]
+                            lower_bound = set_token[1][0]
+                            upper_bound = set_token[1][1]
+                            for ord_value in range(lower_bound, upper_bound + 1):
+                                confusable_literals += (
+                                    Filter.generate_confusable_literals(chr(ord_value))
+                                )
+                        elif set_token[0] == sre_parse.LITERAL:
+                            # NEGATE/CATEGORY members have no code point to expand
+                            confusable_literals += Filter.generate_confusable_literals(
+                                chr(set_token[1])
+                            )
                     token_value += confusable_literals
                 if token_name == sre_parse.LITERAL:
                     # (LITERAL, ord), replace with set