diff --git a/Doc/reference/grammar.rst b/Doc/reference/grammar.rst index 1037feb691f6bc..0ce8e42ddf3b0c 100644 --- a/Doc/reference/grammar.rst +++ b/Doc/reference/grammar.rst @@ -12,8 +12,17 @@ The notation used here is the same as in the preceding docs, and is described in the :ref:`notation ` section, except for an extra complication: -* ``~`` ("cut"): commit to the current alternative and fail the rule - even if this fails to parse +* ``~`` ("cut"): commit to the current alternative; fail the rule + if the alternative fails to parse + + Python mainly uses cuts for optimizations or improved error + messages. They often appear to be useless in the listing below. + + .. see gh-143054, and CutValidator in the source, if you want to change this: + + Cuts currently don't appear inside parentheses, brackets, lookaheads + and similar. + Their behavior in these contexts is deliberately left unspecified. .. literalinclude:: ../../Grammar/python.gram :language: peg diff --git a/Lib/test/test_peg_generator/test_grammar_validator.py b/Lib/test/test_peg_generator/test_grammar_validator.py index c7f20e1de802ce..857aced8ae5dcf 100644 --- a/Lib/test/test_peg_generator/test_grammar_validator.py +++ b/Lib/test/test_peg_generator/test_grammar_validator.py @@ -4,7 +4,8 @@ test_tools.skip_if_missing("peg_generator") with test_tools.imports_under_tool("peg_generator"): from pegen.grammar_parser import GeneratedParser as GrammarParser - from pegen.validator import SubRuleValidator, ValidationError, RaiseRuleValidator + from pegen.validator import SubRuleValidator, ValidationError + from pegen.validator import RaiseRuleValidator, CutValidator from pegen.testutil import parse_string from pegen.grammar import Grammar @@ -59,3 +60,18 @@ def test_raising_valid_rule(self) -> None: with self.assertRaises(ValidationError): for rule_name, rule in grammar.rules.items(): validator.validate_rule(rule_name, rule) + + def test_cut_validator(self) -> None: + grammar_source = """ + star: (OP ~ OP)* + plus: (OP ~ OP)+ + bracket: [OP ~ OP] + gather: OP.(OP ~ OP)+ + nested: [OP | NAME ~ OP] + """ + grammar: Grammar = parse_string(grammar_source, GrammarParser) + validator = CutValidator(grammar) + for rule_name, rule in grammar.rules.items(): + with self.subTest(rule_name): + with self.assertRaises(ValidationError): + validator.validate_rule(rule_name, rule) diff --git a/Lib/test/test_peg_generator/test_pegen.py b/Lib/test/test_peg_generator/test_pegen.py index d03ba07975a616..f39fcc2e0d8daf 100644 --- a/Lib/test/test_peg_generator/test_pegen.py +++ b/Lib/test/test_peg_generator/test_pegen.py @@ -755,6 +755,30 @@ def test_cut(self) -> None: ], ) + def test_cut_is_local_in_rule(self) -> None: + grammar = """ + start: + | inner + | 'x' { "ok" } + inner: + | 'x' ~ 'y' + | 'x' + """ + parser_class = make_parser(grammar) + node = parse_string("x", parser_class) + self.assertEqual(node, 'ok') + + def test_cut_is_local_in_parens(self) -> None: + # we currently don't guarantee this behavior, see gh-143054 + grammar = """ + start: + | ('x' ~ 'y' | 'x') + | 'x' { "ok" } + """ + parser_class = make_parser(grammar) + node = parse_string("x", parser_class) + self.assertEqual(node, 'ok') + def test_dangling_reference(self) -> None: grammar = """ start: foo ENDMARKER diff --git a/Tools/peg_generator/pegen/validator.py b/Tools/peg_generator/pegen/validator.py index 635eb398b41808..5e2bc238a1e966 100644 --- a/Tools/peg_generator/pegen/validator.py +++ b/Tools/peg_generator/pegen/validator.py @@ -1,3 +1,5 @@ +from typing import Any + from pegen import grammar from pegen.grammar import Alt, GrammarVisitor, Rhs, Rule @@ -44,6 +46,37 @@ def visit_Alt(self, node: Alt) -> None: ) +class CutValidator(GrammarValidator): + """Fail if Cut is not directly in a rule. + + For simplicity, we currently document that a Cut affects alternatives + of the *rule* it is in. + However, the implementation makes cuts local to enclosing Rhs + (e.g. parenthesized list of choices). + Additionally, in academic papers about PEG, repeats and optional items + are "desugared" to choices with an empty alternative, and thus contain + a Cut's effect. + + Please update documentation and tests when adding this cut, + then get rid of this validator. + + See gh-143054. + """ + + def visit(self, node: Any, parents: tuple[Any, ...] = ()) -> None: + super().visit(node, parents=(*parents, node)) + + def visit_Cut(self, node: Alt, parents: tuple[Any, ...] = ()) -> None: + parent_types = [type(p).__name__ for p in parents] + if parent_types != ['Rule', 'Rhs', 'Alt', 'NamedItem', 'Cut']: + raise ValidationError( + f"Rule {self.rulename!r} contains cut that's not on the " + "top level. " + "The intended semantics of such cases need " + "to be clarified; see the CutValidator docstring." + f"\nThe cut is inside: {parent_types}" + ) + def validate_grammar(the_grammar: grammar.Grammar) -> None: for validator_cls in GrammarValidator.__subclasses__(): validator = validator_cls(the_grammar)