From 3cf86c838e974e811c2a9e7f417eaed59818a13f Mon Sep 17 00:00:00 2001
From: Josh Stockin <josh@joshstock.in>
Date: Fri, 17 Mar 2023 13:59:24 -0500
Subject: [PATCH 1/2] Prevent attempts at nesting charsets in regex filter

Not sure how I didn't catch this when testing a couple of years ago.
Nesting character sets is impossible in Python's regex, but that is
what my filter code tried to do because it wasn't checking for IN
tokens. This adds that check and inserts the confusables in place for
those sets. Also cleans up some of the code in that area.
---
 futaba/cogs/filter/filter.py | 62 ++++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 20 deletions(-)

diff --git a/futaba/cogs/filter/filter.py b/futaba/cogs/filter/filter.py
index 506d461c..7db24207 100644
--- a/futaba/cogs/filter/filter.py
+++ b/futaba/cogs/filter/filter.py
@@ -103,32 +103,54 @@ def build_regex(text, groups):
 
         return str(pattern)
 
+    @staticmethod
+    def generate_confusable_literals(confusable: chr) -> list:
+        # Generates list of confusable homoglyphs LITERALs based on input
+        # character, includes input character
+
+        groups = confusables.is_confusable(confusable, greedy=True)
+
+        confusable_literals = [(sre_parse.LITERAL, ord(confusable))]
+
+        if not groups:
+            return confusable_literals
+
+        # Append confusables to list as LITERALs
+        for homoglyph in groups[0]["homoglyphs"]:
+            confusable_literals += [
+                (re.sre_parse.LITERAL, ord(char)) for char in homoglyph["c"]
+            ]
+
+        return confusable_literals
+
     @staticmethod
     def convert_raw_regex_ast(regex_ast: Iterable):
         for index, value in enumerate(regex_ast):
-            # Parse lexemes for LITERALs
             if isinstance(value, tuple):
-                lexeme_tuple = value
-                if lexeme_tuple[0] == sre_parse.LITERAL:
-                    # LITERAL found, check if it's a confusable homoglyph...
-                    groups = confusables.is_confusable(
-                        chr(lexeme_tuple[1]), greedy=True
+                # (TOKEN_NAME, TOKEN_VALUE) likely
+                (
+                    token_name,
+                    token_value,
+                    *_,
+                ) = value
+                if token_name == sre_parse.IN:
+                    # (IN, [ (LITERAL, ord) ... ])
+                    # sets cannot be nested, need to insert confusables in place
+                    confusable_literals = []
+                    for literal in token_value:
+                        confusable_literals += Filter.generate_confusable_literals(
+                            chr(literal[1])
+                        )
+                    token_value += confusable_literals
+                if token_name == sre_parse.LITERAL:
+                    # (LITERAL, ord), replace with set
+                    confusable_literals = Filter.generate_confusable_literals(
+                        chr(token_value)
                     )
-                    if not groups:
-                        continue
-                    # Convert group into list of confusable LITERALs
-                    group = groups[0]  # one char, so only one group
-                    confusable_literals = [lexeme_tuple]
-                    for homoglyph in group["homoglyphs"]:
-                        confusable_literals += [
-                            (sre_parse.LITERAL, ord(char)) for char in homoglyph["c"]
-                        ]
-                    in_lexeme_tuple = (sre_parse.IN, confusable_literals)
-
-                    # Overwrite this lexeme
-                    regex_ast[index] = in_lexeme_tuple
+                    regex_ast[index] = (sre_parse.IN, confusable_literals)
                 else:
-                    # More possible lexemes, recurse and overwrite...
+                    # Miscellaneous token, might contain more tokens
+                    # Recurse and overwrite...
                     regex_ast[index] = tuple(Filter.convert_raw_regex_ast(list(value)))
             elif isinstance(value, sre_parse.SubPattern):
                 regex_ast[index] = Filter.convert_raw_regex_ast(value)

From d98e3c7e6201c4a6bf6a5af418b528b6188005e6 Mon Sep 17 00:00:00 2001
From: Josh Stockin <josh@joshstock.in>
Date: Sat, 18 Mar 2023 13:21:10 -0500
Subject: [PATCH 2/2] Add support for range in character set

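The previous patch also missed ranges inside character sets: "a-z" in
[a-z] is tokenized as (RANGE, (97, 122)) rather than as LITERALs. This
commit adds support for those tokens by walking each range and adding
the confusables for every character in it.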
---
 futaba/cogs/filter/filter.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/futaba/cogs/filter/filter.py b/futaba/cogs/filter/filter.py
index 7db24207..a786f611 100644
--- a/futaba/cogs/filter/filter.py
+++ b/futaba/cogs/filter/filter.py
@@ -137,10 +137,20 @@ def convert_raw_regex_ast(regex_ast: Iterable):
                     # (IN, [ (LITERAL, ord) ... ])
                     # sets cannot be nested, need to insert confusables in place
                     confusable_literals = []
-                    for literal in token_value:
-                        confusable_literals += Filter.generate_confusable_literals(
-                            chr(literal[1])
-                        )
+                    for set_token in token_value:
+                        if set_token[0] == sre_parse.RANGE:
+                            # (RANGE, (LOWER_BOUND, UPPER_BOUND)), e.g. a-z in [a-z]
+                            lower_bound = set_token[1][0]
+                            upper_bound = set_token[1][1]
+                            for ord_value in range(lower_bound, upper_bound + 1):
+                                confusable_literals += (
+                                    Filter.generate_confusable_literals(chr(ord_value))
+                                )
+                        else:
+                            # Must be LITERAL
+                            confusable_literals += Filter.generate_confusable_literals(
+                                chr(set_token[1])
+                            )
                     token_value += confusable_literals
                 if token_name == sre_parse.LITERAL:
                     # (LITERAL, ord), replace with set