From c4167e2f20d04add84ebfb64601ab12de6e5189f Mon Sep 17 00:00:00 2001
From: Labib-Bin-Salam
Date: Tue, 23 Jun 2026 18:47:00 +0100
Subject: [PATCH] Bugfix: call __default_token__ from an embedded transformer (Issue #1582)

When a transformer is applied during parsing via Lark(transformer=...),
tokens without a dedicated method were left untouched, whereas
Transformer.transform() falls back to __default_token__ for them.

_get_lexer_callbacks now wires up an overridden __default_token__ as the
fallback token callback, matching transform(). The base no-op is skipped,
so the common case keeps tokens untouched with no extra call per token.
---
 lark/parser_frontends.py | 10 +++++++++-
 tests/test_parser.py     | 28 ++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index e67890d42..3e6fcc433 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -7,6 +7,7 @@
 from .parsers.lalr_parser import LALR_Parser
 from .tree import Tree
 from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
+from .visitors import Transformer
 
 if TYPE_CHECKING:
     from .parsers.lalr_analysis import ParseTableBase
@@ -153,8 +154,15 @@ def _validate_frontend_args(parser, lexer) -> None:
 
 def _get_lexer_callbacks(transformer, terminals):
     result = {}
+    # Tokens without a dedicated transformer method fall back to
+    # __default_token__, mirroring Transformer.transform(). The base
+    # implementation is a no-op, so it's only wired up when overridden,
+    # to avoid a needless call per token (see issue #1582).
+    default_token = getattr(transformer, '__default_token__', None)
+    if getattr(type(transformer), '__default_token__', None) is Transformer.__default_token__:
+        default_token = None
     for terminal in terminals:
-        callback = getattr(transformer, terminal.name, None)
+        callback = getattr(transformer, terminal.name, default_token)
         if callback is not None:
             result[terminal.name] = callback
     return result
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 721e8693a..4f0eb784f 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2770,6 +2770,34 @@ def __default__(self, data, children, meta):
             b = parser.parse(s)
             assert a == b
 
+        @unittest.skipIf(PARSER != 'lalr', "Embedded token callbacks are only applied by the lalr parser")
+        def test_default_token_in_treeless_mode(self):
+            # Regression test for issue #1582: an embedded transformer did not
+            # call __default_token__ on tokens, unlike Transformer.transform().
+            grammar = r"""
+                start: expr
+
+                expr: A B
+                    | A expr B
+
+                A: "a"
+                B: "b"
+
+                %import common.WS
+                %ignore WS
+            """
+            s = 'a a a b b b'
+
+            class AbTransformer(Transformer):
+                def __default_token__(self, token):
+                    return token.update(value=str(token).upper())
+
+            parser = _Lark(grammar)
+            a = AbTransformer().transform(parser.parse(s))
+            parser = _Lark(grammar, transformer=AbTransformer())
+            b = parser.parse(s)
+            assert a == b
+
         @unittest.skipIf(PARSER != 'lalr', "strict mode is only supported in lalr for now")
         def test_strict(self):
             # Test regex collision