diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index acb946d..a47595a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -48,6 +48,11 @@ jobs: - name: Lint with Clippy (All features) run: cargo clippy --all-features -- -D warnings + - name: Check documentation + run: cargo doc --no-deps --all-features + env: + RUSTDOCFLAGS: "-D warnings" + - name: Build run: cargo build --features protoc --verbose diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0051720..b6054a5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -141,6 +141,18 @@ Keep parser internals split into two phases: This separation keeps grammar concerns and semantic checks independent and easier to test. +For extension declaration lines, use parser-layer entrypoints in +`parser/lalrpop_line.rs`: + +- Prefer explicit helpers (`parse_extension_urn_declaration`, + `parse_extension_declaration`) at parser call sites. +- `FromStr` is also available on parser AST declaration types + (`ExtensionUrnDeclaration`, `ExtensionDeclaration`) for ergonomic tests and + small adapters. + +Keep extension handlers parser-independent: lowering converts parser AST +relation arguments into `ExtensionArgs` before invoking extension resolution. + #### Documentation Formatting ##### Rustdoc Markdown Formatting diff --git a/Justfile b/Justfile index da248ed..85c3672 100644 --- a/Justfile +++ b/Justfile @@ -23,6 +23,7 @@ test: # Run clippy to check for linting errors. check: cargo clippy --examples --tests --all-features + RUSTDOCFLAGS="-D warnings" cargo doc --no-deps --all-features examples: cargo run --example basic_usage diff --git a/adrs/lalrpop.md b/adrs/lalrpop.md index cbf2df5..f3a677f 100644 --- a/adrs/lalrpop.md +++ b/adrs/lalrpop.md @@ -75,6 +75,10 @@ Relation-specific validation (e.g., "Filter expects an expression before `=>` an **Implementation note (2026-02-26):** The current implementation parses argument entries as a single ordered list and records whether named/positional ordering is invalid, then reports that as a lowering validation error. This was chosen to keep the grammar conflict-free while preserving the same user-visible constraint. +**Implementation note (2026-02-26):** Extension declaration lines in the +`=== Extensions` section (`@...` URNs and `#...` declarations) are parsed via +the same LALRPOP -> AST pipeline and then validated in parser/lowering code. + **Benefits:** - Smaller grammar with fewer LR conflict risks. - Adding new relation types (Window, Set, HashJoin) requires only lowering code, no grammar recompilation. diff --git a/src/extensions/any.rs b/src/extensions/any.rs index 7176c2f..fa78b47 100644 --- a/src/extensions/any.rs +++ b/src/extensions/any.rs @@ -1,3 +1,10 @@ +//! Unified wrapper for the protobuf `Any` type. +//! +//! Substrait extension relations carry opaque payloads as protobuf `Any` +//! messages. The Rust ecosystem has two incompatible `Any` types +//! (`prost_types::Any` and `pbjson_types::Any`); this module provides [`Any`] +//! and [`AnyRef`] to convert between them transparently. + use prost::{Message, Name}; use crate::extensions::registry::ExtensionError; @@ -11,8 +18,8 @@ pub struct Any { } /// A reference to a protobuf `Any` type. Can be created from references to -/// [`prost_types::Any`](prost_types::Any), -/// [`pbjson_types::Any`](pbjson_types::Any), or our own [`Any`](Any) type. +/// [`prost_types::Any`], +/// [`pbjson_types::Any`], or our own [`Any`] type. #[derive(Debug, Copy, Clone, PartialEq)] pub struct AnyRef<'a> { pub type_url: &'a str, diff --git a/src/extensions/args.rs b/src/extensions/args.rs index 955f9db..c137ef6 100644 --- a/src/extensions/args.rs +++ b/src/extensions/args.rs @@ -309,8 +309,8 @@ impl ExtensionRelationType { } } -// Note: create_rel is implemented in parser/extensions.rs to avoid -// pulling in protobuf dependencies in the core args module +// Note: relation construction lives in parser/lower/extensions.rs so this +// core args module stays parser- and protobuf-agnostic. impl ExtensionArgs { /// Create a new empty ExtensionArgs diff --git a/src/extensions/simple.rs b/src/extensions/simple.rs index 9b74086..9b4b59b 100644 --- a/src/extensions/simple.rs +++ b/src/extensions/simple.rs @@ -1,3 +1,10 @@ +//! Anchor-to-name lookup table for Substrait simple extensions. +//! +//! [`SimpleExtensions`] maps extension anchors (`#10`, `@1`) to URNs and +//! function/type names. It is the shared data structure used by both the +//! parser (to resolve references while lowering) and the textifier (to +//! display human-readable names instead of numeric anchors). + use std::collections::BTreeMap; use std::collections::btree_map::Entry; use std::fmt; diff --git a/src/parser/ast.rs b/src/parser/ast.rs index dc49c60..66c39f5 100644 --- a/src/parser/ast.rs +++ b/src/parser/ast.rs @@ -1,10 +1,51 @@ -//! AST used by the parser pipeline (`line_grammar.lalrpop` -> lowering). +//! AST for the text representation of `substrait-explain`, used by the +//! [`crate::parser`] module. +//! +//! Parsing happens in two steps: +//! +//! 1. Conversion from text to the AST in the [`crate::parser`] module via +//! LALRPOP, with syntactical validation. +//! 2. Conversion from the AST to a Protobuf [`substrait::proto::Plan`], via the +//! [`crate::parser::lower`] module, with semantic validation, e.g. function +//! references, anchors, column references, argument types/shapes match. //! //! This AST is intentionally close to the text syntax and does not encode all //! semantic constraints; semantic validation happens in lowering. use std::fmt; +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ExtensionUrnDeclaration { + pub anchor: u32, + pub urn: String, +} + +impl ExtensionUrnDeclaration { + pub fn new(anchor: u32, urn: impl Into) -> Self { + Self { + anchor, + urn: urn.into(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ExtensionDeclaration { + pub anchor: u32, + pub urn_anchor: u32, + pub name: String, +} + +impl ExtensionDeclaration { + pub fn new(anchor: u32, urn_anchor: u32, name: impl Into) -> Self { + Self { + anchor, + urn_anchor, + name: name.into(), + } + } +} + #[derive(Debug, Clone, PartialEq)] pub struct Relation { pub name: RelationName, diff --git a/src/parser/errors.rs b/src/parser/errors.rs index 41c2bc9..61499e5 100644 --- a/src/parser/errors.rs +++ b/src/parser/errors.rs @@ -1,3 +1,14 @@ +//! Error types for the parser. +//! +//! Three layers of errors, matching the parsing phases: +//! +//! - [`SyntaxErrorDetail`] — LALRPOP parse failures with span and expected-token +//! info, for rendering caret diagnostics on a single line. +//! - [`MessageParseError`] — semantic errors during lowering (invalid values, +//! missing references), categorised by [`ErrorKind`]. +//! - [`ParseError`] — top-level error that attaches [`ParseContext`] (line +//! number + source text) to any of the above. + use std::fmt; use thiserror::Error; diff --git a/src/parser/extensions.rs b/src/parser/extensions.rs index 581cc51..433a673 100644 --- a/src/parser/extensions.rs +++ b/src/parser/extensions.rs @@ -1,16 +1,21 @@ +//! Incremental parser for the `=== Extensions` section. +//! +//! The structural parser feeds lines one at a time to [`ExtensionParser`], +//! which uses a state machine ([`ExtensionParserState`]) to track which +//! subsection it's in (URNs, Functions, Types, or Type Variations) and +//! dispatches each line to the appropriate LALRPOP entry point. + use std::fmt; use std::str::FromStr; use thiserror::Error; use crate::extensions::any::Any; +use crate::extensions::registry::ExtensionError; use crate::extensions::simple::{self, ExtensionKind}; -use crate::extensions::{ - ExtensionArgs, ExtensionColumn, ExtensionRegistry, ExtensionRelationType, ExtensionValue, - InsertError, RawExpression, SimpleExtensions, -}; +use crate::extensions::{ExtensionArgs, ExtensionRegistry, InsertError, SimpleExtensions}; use crate::parser::ast; -use crate::parser::errors::{MessageParseError, ParseContext, ParseError}; +use crate::parser::errors::{MessageParseError, ParseContext, ParseError, SyntaxErrorDetail}; use crate::parser::structural::IndentedLine; #[derive(Debug, Clone, Error)] @@ -100,7 +105,7 @@ impl ExtensionParser { match line { IndentedLine(0, _s) => self.parse_subsection(line), IndentedLine(1, s) => { - let urn = URNExtensionDeclaration::from_str(s)?; + let urn = parse_urn_declaration(s)?; self.extensions.add_extension_urn(urn.urn, urn.anchor)?; Ok(()) } @@ -116,7 +121,7 @@ impl ExtensionParser { match line { IndentedLine(0, _s) => self.parse_subsection(line), IndentedLine(1, s) => { - let decl = SimpleExtensionDeclaration::from_str(s)?; + let decl = parse_simple_declaration(s)?; self.extensions.add_extension( extension_kind, decl.urn_anchor, @@ -138,133 +143,37 @@ impl ExtensionParser { } } -#[derive(Debug, Clone, PartialEq)] -pub struct URNExtensionDeclaration { - pub anchor: u32, - pub urn: String, -} - -impl FromStr for URNExtensionDeclaration { - type Err = MessageParseError; - - fn from_str(s: &str) -> Result { - let trimmed = s.trim(); - let Some(after_at) = trimmed.strip_prefix('@') else { - return Err(MessageParseError::syntax( - "URNExtensionDeclaration", - format!("expected '@: ', got '{trimmed}'"), - )); - }; - - let Some((anchor, urn)) = after_at.split_once(':') else { - return Err(MessageParseError::syntax( - "URNExtensionDeclaration", - format!("missing ':' in '{trimmed}'"), - )); - }; - - let anchor = anchor - .trim() - .parse::() - .map_err(|e| MessageParseError::invalid("URNExtensionDeclaration", e))?; - let urn = urn.trim(); - if urn.is_empty() { - return Err(MessageParseError::invalid( - "URNExtensionDeclaration", - "URN cannot be empty", - )); - } - - Ok(Self { - anchor, - urn: urn.to_string(), - }) +fn parse_urn_declaration(input: &str) -> Result { + let parsed: ast::ExtensionUrnDeclaration = + parse_declaration_line(input, "URNExtensionDeclaration")?; + if parsed.urn.trim().is_empty() { + return Err(MessageParseError::invalid( + "URNExtensionDeclaration", + "URN cannot be empty", + )); } + Ok(parsed) } -#[derive(Debug, Clone, PartialEq)] -pub struct SimpleExtensionDeclaration { - pub anchor: u32, - pub urn_anchor: u32, - pub name: String, +fn parse_simple_declaration(input: &str) -> Result { + let parsed: ast::ExtensionDeclaration = + parse_declaration_line(input, "SimpleExtensionDeclaration")?; + if parsed.name.trim().is_empty() { + return Err(MessageParseError::invalid( + "SimpleExtensionDeclaration", + "name cannot be empty", + )); + } + Ok(parsed) } -impl FromStr for SimpleExtensionDeclaration { - type Err = MessageParseError; - - fn from_str(s: &str) -> Result { - let trimmed = s.trim(); - let Some(mut rest) = trimmed.strip_prefix('#') else { - return Err(MessageParseError::syntax( - "SimpleExtensionDeclaration", - format!("expected '# @: ', got '{trimmed}'"), - )); - }; - - rest = rest.trim_start(); - // Accept both compact and padded forms: - // `#5@2:name` and `# 5 @ 2: name`. - let anchor_end = rest - .find(|c: char| !c.is_ascii_digit()) - .unwrap_or(rest.len()); - if anchor_end == 0 { - return Err(MessageParseError::syntax( - "SimpleExtensionDeclaration", - format!("missing anchor in '{trimmed}'"), - )); - } - let anchor = rest[..anchor_end] - .parse::() - .map_err(|e| MessageParseError::invalid("SimpleExtensionDeclaration", e))?; - rest = rest[anchor_end..].trim_start(); - - let Some(after_at) = rest.strip_prefix('@') else { - return Err(MessageParseError::syntax( - "SimpleExtensionDeclaration", - format!("expected '@' in '{trimmed}'"), - )); - }; - rest = after_at.trim_start(); - let urn_end = rest - .find(|c: char| !c.is_ascii_digit()) - .unwrap_or(rest.len()); - if urn_end == 0 { - return Err(MessageParseError::syntax( - "SimpleExtensionDeclaration", - format!("missing urn anchor in '{trimmed}'"), - )); - } - let urn_anchor = rest[..urn_end] - .parse::() - .map_err(|e| MessageParseError::invalid("SimpleExtensionDeclaration", e))?; - rest = rest[urn_end..].trim_start(); - - let Some(after_colon) = rest.strip_prefix(':') else { - return Err(MessageParseError::syntax( - "SimpleExtensionDeclaration", - format!("missing ':' in '{trimmed}'"), - )); - }; - let raw_name = after_colon.trim(); - if raw_name.is_empty() { - return Err(MessageParseError::invalid( - "SimpleExtensionDeclaration", - "name cannot be empty", - )); - } - - let name = if raw_name.starts_with('"') && raw_name.ends_with('"') && raw_name.len() >= 2 { - ast::unescape_quoted(raw_name) - } else { - raw_name.to_string() - }; - - Ok(Self { - anchor, - urn_anchor, - name, - }) - } +fn parse_declaration_line(input: &str, target: &'static str) -> Result +where + T: FromStr, +{ + input + .parse::() + .map_err(|e| MessageParseError::syntax(target, e.describe_with_line(input))) } pub fn resolve_extension_detail( @@ -278,12 +187,10 @@ pub fn resolve_extension_detail( match detail { Ok(any) => Ok(Some(any)), - Err(crate::extensions::registry::ExtensionError::ExtensionNotFound(_)) => { - Err(ParseError::UnregisteredExtension { - name: extension_name.to_string(), - context: ParseContext::new(line_no, line.to_string()), - }) - } + Err(ExtensionError::ExtensionNotFound(_)) => Err(ParseError::UnregisteredExtension { + name: extension_name.to_string(), + context: ParseContext::new(line_no, line.to_string()), + }), Err(err) => Err(ParseError::ExtensionDetail( ParseContext::new(line_no, line.to_string()), err, @@ -291,86 +198,6 @@ pub fn resolve_extension_detail( } } -pub fn extension_value_from_arg(arg: &ast::Arg) -> Result { - match arg { - ast::Arg::Expr(ast::Expr::FieldRef(idx)) => Ok(ExtensionValue::Reference(*idx)), - ast::Arg::Expr(ast::Expr::Literal(lit)) => match (&lit.value, &lit.typ) { - (ast::LiteralValue::String(value), None) => Ok(ExtensionValue::String(value.clone())), - (ast::LiteralValue::Integer(value), None) => Ok(ExtensionValue::Integer(*value)), - (ast::LiteralValue::Float(value), None) => Ok(ExtensionValue::Float(*value)), - (ast::LiteralValue::Boolean(value), None) => Ok(ExtensionValue::Boolean(*value)), - _ => Ok(ExtensionValue::Expression(RawExpression::new( - lit.to_string(), - ))), - }, - ast::Arg::Expr(expr) => Ok(ExtensionValue::Expression(RawExpression::new( - expr.to_string(), - ))), - ast::Arg::NamedColumn(_, _) - | ast::Arg::Enum(_) - | ast::Arg::Tuple(_) - | ast::Arg::Wildcard => Ok(ExtensionValue::Expression(RawExpression::new( - arg.to_string(), - ))), - } -} - -pub fn extension_column_from_arg(arg: &ast::Arg) -> Result { - match arg { - ast::Arg::NamedColumn(name, typ) => Ok(ExtensionColumn::Named { - name: name.clone(), - type_spec: typ.to_string(), - }), - ast::Arg::Expr(ast::Expr::FieldRef(idx)) => Ok(ExtensionColumn::Reference(*idx)), - ast::Arg::Expr(expr) => Ok(ExtensionColumn::Expression(RawExpression::new( - expr.to_string(), - ))), - ast::Arg::Enum(_) | ast::Arg::Tuple(_) | ast::Arg::Wildcard => Ok( - ExtensionColumn::Expression(RawExpression::new(arg.to_string())), - ), - } -} - -impl ExtensionRelationType { - pub fn create_rel( - self, - detail: Option, - children: Vec>, - ) -> Result { - use substrait::proto::rel::RelType; - use substrait::proto::{ExtensionLeafRel, ExtensionMultiRel, ExtensionSingleRel}; - - self.validate_child_count(children.len())?; - - let rel_type = match self { - ExtensionRelationType::Leaf => RelType::ExtensionLeaf(ExtensionLeafRel { - common: None, - detail: detail.map(Into::into), - }), - ExtensionRelationType::Single => { - let input = children.into_iter().next().map(|child| *child); - RelType::ExtensionSingle(Box::new(ExtensionSingleRel { - common: None, - detail: detail.map(Into::into), - input: input.map(Box::new), - })) - } - ExtensionRelationType::Multi => { - let inputs = children.into_iter().map(|child| *child).collect(); - RelType::ExtensionMulti(ExtensionMultiRel { - common: None, - detail: detail.map(Into::into), - inputs, - }) - } - }; - - Ok(substrait::proto::Rel { - rel_type: Some(rel_type), - }) - } -} - #[cfg(test)] mod tests { use super::*; @@ -379,7 +206,7 @@ mod tests { #[test] fn test_parse_urn_extension_declaration() { let line = "@1: /my/urn1"; - let urn = URNExtensionDeclaration::from_str(line).unwrap(); + let urn = parse_urn_declaration(line).unwrap(); assert_eq!(urn.anchor, 1); assert_eq!(urn.urn, "/my/urn1"); } @@ -387,13 +214,13 @@ mod tests { #[test] fn test_parse_simple_extension_declaration() { let line = "#5@2: my_function_name"; - let decl = SimpleExtensionDeclaration::from_str(line).unwrap(); + let decl = parse_simple_declaration(line).unwrap(); assert_eq!(decl.anchor, 5); assert_eq!(decl.urn_anchor, 2); assert_eq!(decl.name, "my_function_name"); let line2 = "#10 @200: another_ext_123"; - let decl = SimpleExtensionDeclaration::from_str(line2).unwrap(); + let decl = parse_simple_declaration(line2).unwrap(); assert_eq!(decl.anchor, 10); assert_eq!(decl.urn_anchor, 200); assert_eq!(decl.name, "another_ext_123"); diff --git a/src/parser/lalrpop_line.rs b/src/parser/lalrpop_line.rs index 3372478..8f7166d 100644 --- a/src/parser/lalrpop_line.rs +++ b/src/parser/lalrpop_line.rs @@ -1,5 +1,7 @@ //! Thin wrappers around LALRPOP entry points for single-line parsing. +use std::str::FromStr; + use lalrpop_util::ParseError as LalrpopError; use lalrpop_util::lexer::Token; @@ -28,6 +30,38 @@ pub fn parse_root_names(input: &str) -> Result, SyntaxErrorDetail> { .map_err(|e| map_syntax_error(input, e)) } +pub fn parse_extension_urn_declaration( + input: &str, +) -> Result { + line_grammar::ExtensionUrnDeclParser::new() + .parse(input) + .map_err(|e| map_syntax_error(input, e)) +} + +pub fn parse_extension_declaration( + input: &str, +) -> Result { + line_grammar::ExtensionDeclParser::new() + .parse(input) + .map_err(|e| map_syntax_error(input, e)) +} + +impl FromStr for ast::ExtensionUrnDeclaration { + type Err = SyntaxErrorDetail; + + fn from_str(s: &str) -> Result { + parse_extension_urn_declaration(s) + } +} + +impl FromStr for ast::ExtensionDeclaration { + type Err = SyntaxErrorDetail; + + fn from_str(s: &str) -> Result { + parse_extension_declaration(s) + } +} + fn map_syntax_error( input: &str, error: LalrpopError, &'static str>, @@ -135,4 +169,47 @@ mod tests { ) )); } + + #[test] + fn parses_extension_urn_declaration() { + let decl = parse_extension_urn_declaration("@ 1: https://example.com/functions.yaml") + .expect("parse urn declaration"); + assert_eq!(decl.anchor, 1); + assert_eq!(decl.urn, "https://example.com/functions.yaml"); + } + + #[test] + fn parses_extension_declaration() { + let decl = parse_extension_declaration("# 10 @ 1: my_function").expect("parse declaration"); + assert_eq!(decl.anchor, 10); + assert_eq!(decl.urn_anchor, 1); + assert_eq!(decl.name, "my_function"); + } + + #[test] + fn from_str_parses_extension_urn_declaration() { + let decl: ast::ExtensionUrnDeclaration = "@ 1: https://example.com/functions.yaml" + .parse() + .expect("parse urn declaration"); + assert_eq!(decl.anchor, 1); + assert_eq!(decl.urn, "https://example.com/functions.yaml"); + } + + #[test] + fn from_str_parses_extension_declaration() { + let decl: ast::ExtensionDeclaration = + "# 10 @ 1: my_function".parse().expect("parse declaration"); + assert_eq!(decl.anchor, 10); + assert_eq!(decl.urn_anchor, 1); + assert_eq!(decl.name, "my_function"); + } + + #[test] + fn from_str_reports_syntax_details() { + let err = "@ nope: urn" + .parse::() + .expect_err("should fail"); + assert!(matches!(err.kind, SyntaxErrorKind::UnexpectedToken { .. })); + assert!(!err.expected.is_empty()); + } } diff --git a/src/parser/line_grammar.lalrpop b/src/parser/line_grammar.lalrpop index 8bc2f68..deaf745 100644 --- a/src/parser/line_grammar.lalrpop +++ b/src/parser/line_grammar.lalrpop @@ -1,8 +1,9 @@ grammar; use crate::parser::ast::{ - unescape_quoted, Arg, ArgEntry, ArgList, Expr, ExtensionType, FunctionCall, IfThenExpr, - Literal, Nullability, Relation, RelationName, TypeExpr, + ExtensionDeclaration, ExtensionUrnDeclaration, unescape_quoted, Arg, ArgEntry, ArgList, Expr, + ExtensionType, FunctionCall, IfThenExpr, Literal, Nullability, Relation, RelationName, + TypeExpr, }; // -------------------- @@ -13,6 +14,7 @@ match { r"\s*" => { }, r"-?[0-9]+\.[0-9]+" => FLOAT, r"-?[0-9]+" => INT, + r"([A-Za-z][A-Za-z0-9+.\-]*://[^\s]+)|(/[^\s]+)" => URN_VALUE, r"\$[0-9]+" => FIELD, r#""([^"\\]|\\.)*""# => QUOTED_NAME, r"'([^'\\]|\\.)*'" => STRING, @@ -28,11 +30,26 @@ pub RootNames: Vec = { "Root" "[" "]" => names, }; +pub ExtensionUrnDecl: ExtensionUrnDeclaration = { + ":" => ExtensionUrnDeclaration::new(anchor, urn), +}; + +pub ExtensionDecl: ExtensionDeclaration = { + ":" + => ExtensionDeclaration::new(anchor, urn_anchor, name), +}; + NameListOpt: Vec = { => vec![], > => names, }; +UrnValue: String = { + => value.to_string(), + => value.to_string(), + => unescape_quoted(value), +}; + pub Relation: Relation = { "[" "]" => Relation::new(name, body.0, body.1), }; diff --git a/src/parser/lower/extensions.rs b/src/parser/lower/extensions.rs new file mode 100644 index 0000000..c0cbc02 --- /dev/null +++ b/src/parser/lower/extensions.rs @@ -0,0 +1,165 @@ +//! Extension-relation lowering and AST -> `ExtensionArgs` conversion. + +use std::collections::HashSet; + +use substrait::proto::rel::RelType; +use substrait::proto::{ExtensionLeafRel, ExtensionMultiRel, ExtensionSingleRel, Rel}; + +use super::LowerCtx; +use super::validate::ensure_unique_named_arg; +use crate::extensions::any::Any; +use crate::extensions::{ + ExtensionArgs, ExtensionColumn, ExtensionRelationType, ExtensionValue, RawExpression, +}; +use crate::parser::ast; +use crate::parser::errors::{MessageParseError, ParseError}; +use crate::parser::extensions::resolve_extension_detail; + +impl From for ExtensionRelationType { + fn from(value: ast::ExtensionType) -> Self { + match value { + ast::ExtensionType::Leaf => ExtensionRelationType::Leaf, + ast::ExtensionType::Single => ExtensionRelationType::Single, + ast::ExtensionType::Multi => ExtensionRelationType::Multi, + } + } +} + +#[allow(clippy::vec_box)] +pub(crate) fn lower_extension( + ctx: &LowerCtx<'_>, + relation: &ast::Relation, + extension_type: ast::ExtensionType, + extension_name: &str, + child_relations: Vec>, +) -> Result { + let relation_type: ExtensionRelationType = extension_type.into(); + let args = build_extension_args_from_relation(ctx, relation, relation_type)?; + + relation_type + .validate_child_count(child_relations.len()) + .map_err(|e| plan_error(ctx, MessageParseError::invalid("extension_relation", e)))?; + + let detail = + resolve_extension_detail(ctx.registry, ctx.line_no, ctx.line, extension_name, &args)?; + + Ok(build_extension_relation( + relation_type, + detail, + child_relations, + )) +} + +pub(crate) fn build_extension_args_from_relation( + ctx: &LowerCtx<'_>, + relation: &ast::Relation, + relation_type: ExtensionRelationType, +) -> Result { + let mut args = ExtensionArgs::new(relation_type); + let mut seen_named = HashSet::new(); + + for arg in &relation.args.positional { + args.add_positional_arg(extension_value_from_arg(arg).map_err(|e| plan_error(ctx, e))?); + } + + for named in &relation.args.named { + ensure_unique_named_arg(ctx, &mut seen_named, &named.name, "extension_relation")?; + args.add_named_arg( + named.name.clone(), + extension_value_from_arg(&named.value).map_err(|e| plan_error(ctx, e))?, + ); + } + + for arg in &relation.outputs { + args.add_output_column(extension_column_from_arg(arg).map_err(|e| plan_error(ctx, e))?); + } + + Ok(args) +} + +pub(crate) fn extension_value_from_arg( + arg: &ast::Arg, +) -> Result { + match arg { + ast::Arg::Expr(ast::Expr::FieldRef(idx)) => Ok(ExtensionValue::Reference(*idx)), + ast::Arg::Expr(ast::Expr::Literal(lit)) => match (&lit.value, &lit.typ) { + (ast::LiteralValue::String(value), None) => Ok(ExtensionValue::String(value.clone())), + (ast::LiteralValue::Integer(value), None) => Ok(ExtensionValue::Integer(*value)), + (ast::LiteralValue::Float(value), None) => Ok(ExtensionValue::Float(*value)), + (ast::LiteralValue::Boolean(value), None) => Ok(ExtensionValue::Boolean(*value)), + _ => Ok(ExtensionValue::Expression(RawExpression::new( + lit.to_string(), + ))), + }, + ast::Arg::Expr(expr) => Ok(ExtensionValue::Expression(RawExpression::new( + expr.to_string(), + ))), + ast::Arg::NamedColumn(_, _) + | ast::Arg::Enum(_) + | ast::Arg::Tuple(_) + | ast::Arg::Wildcard => Ok(ExtensionValue::Expression(RawExpression::new( + arg.to_string(), + ))), + } +} + +pub(crate) fn extension_column_from_arg( + arg: &ast::Arg, +) -> Result { + match arg { + ast::Arg::NamedColumn(name, typ) => Ok(ExtensionColumn::Named { + name: name.clone(), + type_spec: typ.to_string(), + }), + ast::Arg::Expr(ast::Expr::FieldRef(idx)) => Ok(ExtensionColumn::Reference(*idx)), + ast::Arg::Expr(expr) => Ok(ExtensionColumn::Expression(RawExpression::new( + expr.to_string(), + ))), + ast::Arg::Enum(_) | ast::Arg::Tuple(_) | ast::Arg::Wildcard => Ok( + ExtensionColumn::Expression(RawExpression::new(arg.to_string())), + ), + } +} + +#[allow(clippy::vec_box)] +fn build_extension_relation( + relation_type: ExtensionRelationType, + detail: Option, + children: Vec>, +) -> Rel { + debug_assert!( + relation_type.validate_child_count(children.len()).is_ok(), + "child count should be validated before relation construction" + ); + + let rel_type = match relation_type { + ExtensionRelationType::Leaf => RelType::ExtensionLeaf(ExtensionLeafRel { + common: None, + detail: detail.map(Into::into), + }), + ExtensionRelationType::Single => { + let input = children.into_iter().next().map(|child| *child); + RelType::ExtensionSingle(Box::new(ExtensionSingleRel { + common: None, + detail: detail.map(Into::into), + input: input.map(Box::new), + })) + } + ExtensionRelationType::Multi => { + let inputs = children.into_iter().map(|child| *child).collect(); + RelType::ExtensionMulti(ExtensionMultiRel { + common: None, + detail: detail.map(Into::into), + inputs, + }) + } + }; + + Rel { + rel_type: Some(rel_type), + } +} + +fn plan_error(ctx: &LowerCtx<'_>, error: MessageParseError) -> ParseError { + ParseError::Plan(ctx.parse_context(), error) +} diff --git a/src/parser/lower/mod.rs b/src/parser/lower/mod.rs index 9c2af65..3ae76f1 100644 --- a/src/parser/lower/mod.rs +++ b/src/parser/lower/mod.rs @@ -1,9 +1,14 @@ //! AST -> protobuf lowering. //! -//! Relation lowering stays explicit (per relation type), while leaf AST nodes -//! (`Expr`, `Literal`, `TypeExpr`) use the `Lower` trait for uniform conversion. +//! The AST is the representation of the syntax / semantics in a Rustic form; +//! that is assembled during the `parse` stage. +//! +//! Relation lowering is explicit (per relation type), while leaf AST nodes +//! ([`ast::Expr`], [`ast::Literal`], [`ast::TypeExpr`]) use the internal +//! `Lower` trait for uniform conversion. mod expr; +mod extensions; mod literals; mod relations; mod types; @@ -15,9 +20,11 @@ use crate::extensions::{ExtensionRegistry, SimpleExtensions}; use crate::parser::ast; use crate::parser::errors::{MessageParseError, ParseContext, ParseError}; -pub(crate) trait Lower { +trait Lower { + /// Protobuf type that this trait converts into type Output; + /// Convert Self into that protobuf type fn lower(&self, ctx: &LowerCtx<'_>, message: &'static str) -> Result; } @@ -85,7 +92,7 @@ pub fn lower_relation( relations::lower_standard(&ctx, relation, name, child_relations, input_field_count) } ast::RelationName::Extension(kind, name) => { - relations::lower_extension(&ctx, relation, *kind, name, child_relations) + extensions::lower_extension(&ctx, relation, *kind, name, child_relations) } } } diff --git a/src/parser/lower/relations.rs b/src/parser/lower/relations.rs index 6eaf273..6b1572c 100644 --- a/src/parser/lower/relations.rs +++ b/src/parser/lower/relations.rs @@ -13,16 +13,13 @@ use substrait::proto::{ use super::expr::{fetch_value_expression, field_ref_expression, lower_arg_as_expression}; use super::validate::{ - ensure_no_children, ensure_no_named_args, expect_one_child, output_mapping_from_args, - parse_join_type, parse_sort_direction, + ensure_exact_child_count, ensure_exact_positional_count, ensure_no_children, + ensure_no_named_args, expect_one_child, output_mapping_from_args, parse_join_type, + parse_sort_direction, }; use super::{Lower, LowerCtx}; -use crate::extensions::{ExtensionArgs, ExtensionRelationType}; use crate::parser::ast; -use crate::parser::errors::{MessageParseError, ParseError}; -use crate::parser::extensions::{ - extension_column_from_arg, extension_value_from_arg, resolve_extension_detail, -}; +use crate::parser::errors::ParseError; #[allow(clippy::vec_box)] pub(crate) fn lower_standard( @@ -56,12 +53,13 @@ fn lower_read( ensure_no_named_args(ctx, relation, "Read")?; ensure_no_children(ctx, child_relations.len(), "Read")?; - if relation.args.positional.len() != 1 { - return ctx.invalid( - "Read", - "Read expects exactly one positional argument (table name)", - ); - } + ensure_exact_positional_count( + ctx, + relation.args.positional.len(), + 1, + "Read", + "Read expects exactly one positional argument (table name)", + )?; let table_name = match &relation.args.positional[0] { ast::Arg::Expr(ast::Expr::Identifier(name)) => name, @@ -124,9 +122,13 @@ fn lower_filter( ensure_no_named_args(ctx, relation, "Filter")?; let input = expect_one_child(ctx, &mut child_relations, "Filter")?; - if relation.args.positional.len() != 1 { - return ctx.invalid("Filter", "Filter expects one positional condition argument"); - } + ensure_exact_positional_count( + ctx, + relation.args.positional.len(), + 1, + "Filter", + "Filter expects one positional condition argument", + )?; let condition = lower_arg_as_expression(ctx, &relation.args.positional[0], "Filter")?; @@ -409,22 +411,15 @@ fn lower_join( ) -> Result { ensure_no_named_args(ctx, relation, "Join")?; - if child_relations.len() != 2 { - return ctx.invalid( - "Join", - format!( - "Join expects exactly 2 input children, found {}", - child_relations.len() - ), - ); - } + ensure_exact_child_count(ctx, child_relations.len(), 2, "Join")?; - if relation.args.positional.len() != 2 { - return ctx.invalid( - "Join", - "Join expects two positional args: join type and condition", - ); - } + ensure_exact_positional_count( + ctx, + relation.args.positional.len(), + 2, + "Join", + "Join expects two positional args: join type and condition", + )?; let join_type = match &relation.args.positional[0] { ast::Arg::Enum(value) => parse_join_type(ctx, value)?, @@ -456,68 +451,3 @@ fn lower_join( }))), }) } - -#[allow(clippy::vec_box)] -#[allow(clippy::too_many_arguments)] -pub(crate) fn lower_extension( - ctx: &LowerCtx<'_>, - relation: &ast::Relation, - extension_type: ast::ExtensionType, - extension_name: &str, - child_relations: Vec>, -) -> Result { - let relation_type = match extension_type { - ast::ExtensionType::Leaf => ExtensionRelationType::Leaf, - ast::ExtensionType::Single => ExtensionRelationType::Single, - ast::ExtensionType::Multi => ExtensionRelationType::Multi, - }; - - let mut args = ExtensionArgs::new(relation_type); - - for arg in &relation.args.positional { - args.add_positional_arg( - extension_value_from_arg(arg).map_err(|e| ParseError::Plan(ctx.parse_context(), e))?, - ); - } - - for named in &relation.args.named { - if args.named.contains_key(&named.name) { - return ctx.invalid( - "extension_relation", - format!("duplicate named argument '{}'", named.name), - ); - } - args.add_named_arg( - named.name.clone(), - extension_value_from_arg(&named.value) - .map_err(|e| ParseError::Plan(ctx.parse_context(), e))?, - ); - } - - for arg in &relation.outputs { - args.add_output_column( - extension_column_from_arg(arg).map_err(|e| ParseError::Plan(ctx.parse_context(), e))?, - ); - } - - relation_type - .validate_child_count(child_relations.len()) - .map_err(|e| { - ParseError::Plan( - ctx.parse_context(), - MessageParseError::invalid("extension_relation", e), - ) - })?; - - let detail = - resolve_extension_detail(ctx.registry, ctx.line_no, ctx.line, extension_name, &args)?; - - relation_type - .create_rel(detail, child_relations) - .map_err(|e| { - ParseError::Plan( - ctx.parse_context(), - MessageParseError::invalid("extension_relation", e), - ) - }) -} diff --git a/src/parser/lower/validate.rs b/src/parser/lower/validate.rs index 0df0ce0..0a4282e 100644 --- a/src/parser/lower/validate.rs +++ b/src/parser/lower/validate.rs @@ -1,16 +1,19 @@ //! Shared relation-level validation helpers used by lowering. +use std::collections::HashSet; + use substrait::proto::sort_field::SortDirection; use substrait::proto::{Rel, join_rel}; use super::LowerCtx; use crate::parser::ast; +use crate::parser::errors::ParseError; pub(crate) fn ensure_no_named_args( ctx: &LowerCtx<'_>, relation: &ast::Relation, message: &'static str, -) -> Result<(), crate::parser::errors::ParseError> { +) -> Result<(), ParseError> { if relation.args.named.is_empty() { return Ok(()); } @@ -21,7 +24,7 @@ pub(crate) fn ensure_no_children( ctx: &LowerCtx<'_>, child_count: usize, message: &'static str, -) -> Result<(), crate::parser::errors::ParseError> { +) -> Result<(), ParseError> { if child_count == 0 { return Ok(()); } @@ -31,12 +34,31 @@ pub(crate) fn ensure_no_children( ) } +pub(crate) fn ensure_exact_child_count( + ctx: &LowerCtx<'_>, + child_count: usize, + expected: usize, + message: &'static str, +) -> Result<(), ParseError> { + if child_count == expected { + return Ok(()); + } + + let suffix = if expected == 1 { "" } else { "ren" }; + ctx.invalid( + message, + format!( + "{message} should have exactly {expected} input child{suffix}, found {child_count}" + ), + ) +} + #[allow(clippy::vec_box)] pub(crate) fn expect_one_child( ctx: &LowerCtx<'_>, child_relations: &mut Vec>, message: &'static str, -) -> Result, crate::parser::errors::ParseError> { +) -> Result, ParseError> { match child_relations.len() { 1 => Ok(child_relations.remove(0)), n => ctx.invalid( @@ -46,11 +68,37 @@ pub(crate) fn expect_one_child( } } +pub(crate) fn ensure_exact_positional_count( + ctx: &LowerCtx<'_>, + positional_count: usize, + expected: usize, + message: &'static str, + detail: &'static str, +) -> Result<(), ParseError> { + if positional_count == expected { + return Ok(()); + } + ctx.invalid(message, detail) +} + +pub(crate) fn ensure_unique_named_arg( + ctx: &LowerCtx<'_>, + seen: &mut HashSet, + name: &str, + message: &'static str, +) -> Result<(), ParseError> { + if seen.insert(name.to_string()) { + return Ok(()); + } + + ctx.invalid(message, format!("duplicate named argument '{name}'")) +} + pub(crate) fn output_mapping_from_args( ctx: &LowerCtx<'_>, args: &[ast::Arg], message: &'static str, -) -> Result, crate::parser::errors::ParseError> { +) -> Result, ParseError> { // Output mappings are index-only in the text format and refer to field // positions after relation-specific projection/aggregation semantics. let mut mapping = Vec::with_capacity(args.len()); @@ -71,7 +119,7 @@ pub(crate) fn output_mapping_from_args( pub(crate) fn parse_sort_direction( ctx: &LowerCtx<'_>, value: &str, -) -> Result { +) -> Result { let direction = match value { "AscNullsFirst" => SortDirection::AscNullsFirst, "AscNullsLast" => SortDirection::AscNullsLast, @@ -87,7 +135,7 @@ pub(crate) fn parse_sort_direction( pub(crate) fn parse_join_type( ctx: &LowerCtx<'_>, value: &str, -) -> Result { +) -> Result { let join_type = match value { "Inner" => join_rel::JoinType::Inner, "Left" => join_rel::JoinType::Left, diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 4678ffc..f11b886 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,3 +1,17 @@ +//! Text → Substrait protobuf parsing. +//! +//! Parsing proceeds in two phases: +//! +//! 1. **Syntax** — each line is parsed into an [`ast`] node by the LALRPOP +//! grammar (invoked through `lalrpop_line`). The [`structural`] module +//! drives this, handling indentation, section headers, and the extensions +//! preamble. +//! 2. **Lowering** — the AST is converted to a [`substrait::proto::Plan`] by +//! the [`lower`] module, which performs semantic validation (anchor +//! resolution, column counts, type checking). +//! +//! The main entry point is [`Parser::parse`]. + pub mod ast; pub mod errors; pub mod extensions; diff --git a/src/textify/expressions.rs b/src/textify/expressions.rs index cd64ce5..3b2c9c9 100644 --- a/src/textify/expressions.rs +++ b/src/textify/expressions.rs @@ -1,3 +1,18 @@ +//! Expression and literal formatting — everything that appears inside the +//! `[...]` brackets of a relation line. +//! +//! Syntax conventions used in the text format: +//! +//! - `(…)` function call, e.g. `add($0, $1)` +//! - `[…]` relation brackets +//! - `<…>` type parameters, e.g. `list` +//! - `!{…}` missing/error placeholder +//! - `$…` field reference, e.g. `$0` +//! - `#…` extension anchor, e.g. `#10` +//! - `@…` URN anchor, e.g. `@1` +//! - `…:…` type annotation, e.g. `42:i64` +//! - `&…` enum value, e.g. `&AscNullsFirst` + use std::fmt::{self}; use chrono::{DateTime, NaiveDate}; @@ -18,18 +33,6 @@ use crate::extensions::SimpleExtensions; use crate::extensions::simple::ExtensionKind; use crate::textify::types::{Name, NamedAnchor, OutputType, escaped}; -// …(…) for function call -// […] for variant -// <…> for parameters -// !{…} for missing value - -// $… for field reference -// #… for anchor -// @… for URN anchor -// …::… for cast -// …:… for specifying type -// &… for enum - pub fn textify_binary(items: &[u8], ctx: &S, w: &mut W) -> fmt::Result { if ctx.options().show_literal_binaries { write!(w, "0x")?; diff --git a/src/textify/foundation.rs b/src/textify/foundation.rs index 01c5551..9ee46c6 100644 --- a/src/textify/foundation.rs +++ b/src/textify/foundation.rs @@ -91,10 +91,15 @@ impl OutputOptions { } } } +/// Collects non-fatal formatting errors so that textification can continue +/// and produce best-effort output. Errors are later returned alongside the +/// formatted text. pub trait ErrorAccumulator: Clone { fn push(&self, e: FormatError); } +/// Channel-backed [`ErrorAccumulator`]. Cloning shares the same underlying +/// channel, so errors from any clone are visible to the consumer. #[derive(Debug, Clone)] pub struct ErrorQueue { sender: mpsc::Sender, @@ -219,6 +224,9 @@ impl<'a> fmt::Display for IndentStack<'a> { } } +/// Concrete [`Scope`] implementation that carries formatting options, +/// extension lookups, error accumulation, and the current indentation level +/// through the textify call tree. #[derive(Debug, Copy, Clone)] pub struct ScopedContext<'a, Err: ErrorAccumulator> { errors: &'a Err, diff --git a/src/textify/mod.rs b/src/textify/mod.rs index a8d4b27..9b02b1b 100644 --- a/src/textify/mod.rs +++ b/src/textify/mod.rs @@ -1,4 +1,9 @@ -//! Output a plan in text format. +//! Substrait protobuf → human-readable text conversion. +//! +//! The core pattern: each protobuf type implements [`Textify`], writing itself +//! into a [`std::fmt::Write`] with a [`Scope`] context that provides formatting +//! options, extension lookups, and error accumulation. [`plan::PlanWriter`] +//! is the top-level entry point. pub mod expressions; pub mod extensions; diff --git a/src/textify/plan.rs b/src/textify/plan.rs index 61b6ddb..4e5481b 100644 --- a/src/textify/plan.rs +++ b/src/textify/plan.rs @@ -1,3 +1,10 @@ +//! Top-level plan formatting — orchestrates the conversion of a full +//! [`proto::Plan`] into its text representation. +//! +//! [`PlanWriter`] is the entry point: it extracts extensions from the plan, +//! creates a [`ScopedContext`] that carries options, error accumulation, and +//! extension lookups, then delegates to the relation and expression formatters. + use std::fmt; use substrait::proto; @@ -8,6 +15,10 @@ use crate::parser::PLAN_HEADER; use crate::textify::foundation::ErrorAccumulator; use crate::textify::{OutputOptions, ScopedContext}; +/// Formats a [`proto::Plan`] to its text representation. +/// +/// Use [`Display`](std::fmt::Display) or call [`write_extensions`](Self::write_extensions) and +/// [`write_relations`](Self::write_relations) separately for more control. #[derive(Debug, Clone)] pub struct PlanWriter<'a, E: ErrorAccumulator + Default> { options: &'a OutputOptions, @@ -18,6 +29,10 @@ pub struct PlanWriter<'a, E: ErrorAccumulator + Default> { } impl<'a, E: ErrorAccumulator + Default + Clone> PlanWriter<'a, E> { + /// Create a writer for the given plan. Returns the [`PlanWriter`] and an + /// output channel for accumulated errors, an [`ErrorAccumulator`] — output + /// is best-effort and will run to completion, despite incomplete plans, + /// unresolved extensions or references, or other issues. pub fn new( options: &'a OutputOptions, plan: &'a proto::Plan, diff --git a/src/textify/rels.rs b/src/textify/rels.rs index 762a73e..2d6aeb8 100644 --- a/src/textify/rels.rs +++ b/src/textify/rels.rs @@ -1,3 +1,11 @@ +//! Relation formatting — converts protobuf relation types to the +//! `Name[args => outputs]` text format. +//! +//! Each Substrait relation (Read, Filter, Project, …) is converted to a +//! [`Relation`] struct containing its name, [`Arguments`], and child +//! relations. Arguments and outputs are represented as [`Value`] variants, +//! which bridge between the typed protobuf world and the flat text format. + use std::borrow::Cow; use std::convert::TryFrom; use std::fmt; @@ -89,6 +97,9 @@ pub struct NamedArg<'a> { } #[derive(Debug, Clone)] +/// A single argument or output value in a relation's `[args => outputs]` +/// bracket. Covers field references, expressions, literals, enum values, +/// named columns, and error placeholders ([`Value::Missing`]). pub enum Value<'a> { Name(Name<'a>), TableName(Vec>), diff --git a/src/textify/types.rs b/src/textify/types.rs index 9f831d1..6253e34 100644 --- a/src/textify/types.rs +++ b/src/textify/types.rs @@ -1,3 +1,7 @@ +//! Type formatting — nullability suffixes, parameterized types, user-defined +//! types, and the [`Name`]/[`Anchor`] helpers used by relation and expression +//! formatting. + use std::fmt; use std::ops::Deref; @@ -91,9 +95,14 @@ impl<'a> Textify for Name<'a> { } } +/// An extension anchor (e.g. `#10`) that may be shown or hidden based on +/// [`OutputOptions::show_simple_extension_anchors`](super::OutputOptions::show_simple_extension_anchors). #[derive(Debug, Copy, Clone)] pub struct Anchor { reference: u32, + /// When `true`, the anchor is shown even at `Visibility::Required` + /// (i.e. it's needed for disambiguation). When `false`, it's only + /// shown at `Visibility::Always`. required: bool, }