Skip to content

Commit fa75d99

Browse files
authored
feat: implement parser for type derivation programs (#43)
BREAKING CHANGE: meta pattern match operation now yields a Result<bool> instead of just bool, and the binding pattern was changed in order to be able to model all variations of MIRROR nullability and inconsistent variadic argument slots
1 parent f29eb39 commit fa75d99

File tree

33 files changed

+11744
-268
lines changed

33 files changed

+11744
-268
lines changed

.github/workflows/misc.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,15 @@ jobs:
2828
- name: Check version update patch file
2929
run: python3 ci/version.py check
3030

31+
verify-antlr:
32+
name: Verify ANTLR-generated code
33+
runs-on: ubuntu-latest
34+
steps:
35+
- uses: actions/checkout@v2
36+
- name: Check
37+
working-directory: rs/antlr
38+
run: python3 generate.py --ci
39+
3140
commitlint:
3241
name: Lint commits for semantic-release
3342
runs-on: ubuntu-latest

Cargo.lock

Lines changed: 45 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

proto/substrait/validator/validator.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ message Node {
7373
// represents the parse result of the referred file.
7474
string resolved_uri = 9;
7575

76+
// This node represents an abstract syntax tree node, used for representing
77+
// complex YAML string parse results.
78+
google.protobuf.Empty ast_node = 10;
79+
7680
// No longer used. The more generic ResolvedUri type is used instead.
7781
YamlReference yaml_reference = 5 [deprecated = true];
7882
}

rs/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ serde_json = "1"
4646
# being useful elsewhere too).
4747
regex = "1.5"
4848

49+
# Used for the type derivation DSL.
50+
antlr-rust = "0.3.0-beta"
51+
4952
# Used for checking URI syntax.
5053
uriparse = "0.6"
5154

rs/antlr/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
antlr.jar
2+
__pycache__

rs/antlr/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# ANTLR code generation logic
2+
3+
The validator includes a parser for type expressions based on an ANTLR grammar.
4+
Unfortunately, the ANTLR code generator is written in Java, and would thus add
5+
a huge build dependency (a JRE) to the validator build environment. This is
6+
especially problematic for the distribution of Cargo crates, which are
7+
fundamentally source distributions that should not depend on anything other
8+
than other Rust crates. Therefore, the generated files are checked in to git
9+
and distributed with the crate, and regeneration must thus be done manually.
10+
Call the generate.py script to do so.

rs/antlr/antlr.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#!/usr/bin/env python3
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Wrapper script to download and run a suitable version of ANTLR for
5+
generating or verifying the Rust bindings for a given grammar."""
6+
7+
import urllib.request
8+
import os
9+
import sys
10+
import hashlib
11+
import logging
12+
import tempfile
13+
import shutil
14+
import filecmp
15+
import subprocess
16+
import difflib
17+
import argparse
18+
19+
20+
# NOTE: the Rust bindings for ANTLR are not (yet) official, so we need to
# download a forked ANTLR build.
ANTLR_URL = "https://github.com/rrevenantt/antlr4rust/releases/download/antlr4-4.8-2-Rust0.3.0-beta/antlr4-4.8-2-SNAPSHOT-complete.jar"
# Expected SHA-1 hex digest of the jar; main() compares the local jar against
# it unless --no-verify is given.
ANTLR_SHA1 = "775d24ac1ad5df1eb0ed0e802f0fb2a5aeace43c"
24+
25+
26+
class Failure(Exception):
    """Used for fatal errors.

    The script entry point catches this exception and turns it into exit
    status 1."""
28+
29+
30+
def fail(msg):
    """Logs msg at error level and then raises Failure(msg).

    This function never returns normally."""
    logging.error(msg)
    raise Failure(msg)
34+
35+
36+
def download_file(fname, url):
    """Downloads url to fname if fname does not already exist.

    An existing file is left untouched; no attempt is made to check that it
    matches the remote content."""
    if not os.path.isfile(fname):
        logging.info(f"Downloading {fname}...")
        # Use the url argument; the previous code ignored it and always
        # fetched the module-level ANTLR_URL.
        urllib.request.urlretrieve(url, fname)
41+
42+
43+
def verify_file_hash(fname, hash_str):
    """Verifies the SHA-1 hash of a (downloaded) file.

    hash_str is the expected hex digest. On mismatch, fail() is called with a
    message that shows both the expected and the actual digest."""
    logging.info(f"Verifying {fname}...")
    file_hash = hashlib.sha1()
    with open(fname, "rb") as f:
        # Hash in fixed-size chunks so the whole file is never held in memory.
        while chunk := f.read(8192):
            file_hash.update(chunk)
    actual = file_hash.hexdigest()
    if hash_str != actual:
        fail(f"Verification failed; hash should be {hash_str} but was {actual}")
53+
54+
55+
def verify_file_identical(new, old):
    """Verifies that two text files are identical, printing a diff if not.

    fail() is called when either path is not a regular file or when the
    contents differ; in the latter case a unified diff from old to new is
    written to stdout first."""
    logging.info(f"Verifying {new} against {old}...")
    if not os.path.isfile(new):
        fail(f"{new} does not exist")
    if not os.path.isfile(old):
        fail(f"{old} does not exist")
    # shallow=False compares file contents rather than only os.stat() data.
    if not filecmp.cmp(new, old, shallow=False):
        with open(new, "r") as f:
            new_data = f.readlines()
        with open(old, "r") as f:
            old_data = f.readlines()
        sys.stdout.writelines(difflib.unified_diff(old_data, new_data, old, new))
        fail(f"{new} is different, see diff")
69+
70+
71+
def run_antlr(antlr, grammar, output_dir, verify=False, java="java"):
    """Runs the given ANTLR JAR on the given grammar, sending outputs to
    output_dir. If verify is set, instead of copying the newly-generated files,
    this checks that there are no differences between the newly and previously
    generated files.

    antlr is the path to the ANTLR jar, grammar the path to the .g4 file, and
    java the executable used to launch the jar. fail() is called when the
    java executable cannot be started, when ANTLR exits with a nonzero status,
    when an expected output file is missing, or (in verify mode) when a
    generated file differs from its checked-in counterpart."""
    logging.info("Running ANTLR...")

    # Determine the names of the generated files that we're interested in.
    name = os.path.basename(grammar).split(".")[0].lower()
    expected_files = [f"{name}lexer.rs", f"{name}parser.rs", f"{name}listener.rs"]

    # Run in a temporary directory, because ANTLR spams random files we didn't
    # ask for in its working directory.
    with tempfile.TemporaryDirectory() as generate_dir:
        shutil.copyfile(grammar, os.path.join(generate_dir, os.path.basename(grammar)))
        try:
            result = subprocess.run(
                [
                    java,
                    "-jar",
                    os.path.realpath(antlr),
                    "-Dlanguage=Rust",
                    os.path.basename(grammar),
                ],
                cwd=generate_dir,
            )
        except OSError as e:
            # E.g. the java executable is not installed or not executable;
            # report it the same way as every other fatal error.
            fail(f"Failed to run {java}: {e}")
        # Do not copy/verify possibly partial output of a failed run.
        if result.returncode != 0:
            fail(f"ANTLR exited with status {result.returncode}")

        logging.info("Copying/verifying output files...")
        for expected_file in expected_files:
            src = os.path.join(generate_dir, expected_file)
            dest = os.path.join(output_dir, expected_file)
            if not os.path.isfile(src):
                fail(f"ANTLR failed to generate {expected_file}")
            # Prepend the license header and the lint/format opt-outs to the
            # generated file. The new contents are strictly longer than the
            # old ones, so rewriting from offset 0 needs no truncation.
            with open(src, "r+") as f:
                data = f.read()
                data = (
                    "// SPDX-License-Identifier: Apache-2.0\n"
                    "#![allow(clippy::all)]\n"
                    "#![cfg_attr(rustfmt, rustfmt_skip)]\n"
                    f"{data}"
                )
                f.seek(0)
                f.write(data)
            if verify:
                verify_file_identical(src, dest)
            else:
                if os.path.exists(dest):
                    os.unlink(dest)
                shutil.copyfile(src, dest)
119+
120+
121+
def main(*args):
    """Utility to generate Rust bindings for an ANTLR grammar."""
    # args are the command-line arguments without the program name.
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument(
        "--antlr",
        metavar="antlr.jar",
        default=os.path.join(os.path.dirname(os.path.realpath(__file__)), "antlr.jar"),
        help="alternate location for the ANTLR jar",
    )
    parser.add_argument(
        "--no-download",
        action="store_true",
        help="don't attempt to download the ANTLR jar",
    )
    parser.add_argument(
        "--no-verify",
        action="store_true",
        help="don't attempt to verify the hash of the ANTLR jar",
    )
    parser.add_argument(
        "--java", default="java", help="path to java executable to call ANTLR with"
    )
    parser.add_argument(
        "--ci-check",
        action="store_true",
        help="instead of regenerating the files, assert that the files do not need to be regenerated",
    )
    parser.add_argument("grammar", help="the .g4 grammar file to generate")
    parser.add_argument(
        "dest_dir", default=".", nargs="?", help="where to copy the generated files to"
    )
    args = parser.parse_args(args)

    logging.basicConfig(level=logging.INFO)

    # Acquire ANTLR jar.
    if args.no_download:
        if not os.path.isfile(args.antlr):
            parser.error(f"{args.antlr} does not exist and auto-download is disabled")
    else:
        download_file(args.antlr, ANTLR_URL)
    # The hash is checked for a pre-existing jar as well, not only after a
    # fresh download.
    if not args.no_verify:
        verify_file_hash(args.antlr, ANTLR_SHA1)

    # Run ANTLR.
    if not os.path.isfile(args.grammar):
        parser.error(f"{args.grammar} does not exist")
    run_antlr(
        args.antlr, args.grammar, args.dest_dir, verify=args.ci_check, java=args.java
    )
171+
172+
173+
if __name__ == "__main__":
    # Script entry point: a Failure has already been logged by fail(), so it
    # only needs to be mapped to a nonzero exit status here.
    try:
        main(*sys.argv[1:])
        logging.info("Done")
    except Failure:
        logging.info("Returning failure exit status")
        sys.exit(1)

rs/antlr/generate.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/usr/bin/env python3
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Script for regenerating or verifying all the ANTLR-generated files of the
validator."""

# The docstring above must stay the first statement of the module: only then
# does Python store it in __doc__, which main() uses as the --help description.
import sys
import logging
import argparse
import antlr
11+
12+
13+
def main(*args):
    """Regenerates the ANTLR-generated files for the type derivation grammar,
    or with --ci only verifies that they are up to date, by delegating to
    antlr.main().

    args are the command-line arguments without the program name."""
    import os  # local import; only needed to anchor the paths below

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--ci",
        action="store_true",
        help="instead of regenerating, verify that the files don't need to be regenerated",
    )
    args = parser.parse_args(args)

    logging.basicConfig(level=logging.INFO)

    # Resolve the grammar/output directory relative to this script instead of
    # the current working directory, so the script can be run from anywhere.
    derivations_dir = os.path.normpath(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "..",
            "src",
            "parse",
            "extensions",
            "simple",
            "derivations",
        )
    )

    ci = ["--ci-check"] if args.ci else []
    antlr.main(
        os.path.join(derivations_dir, "SubstraitType.g4"),
        derivations_dir,
        *ci,
    )
30+
31+
32+
if __name__ == "__main__":
    # Script entry point: antlr.Failure signals a fatal error raised by the
    # antlr module, so it is mapped to a nonzero exit status here.
    try:
        main(*sys.argv[1:])
        logging.info("Done")
    except antlr.Failure:
        logging.info("Returning failure exit status")
        sys.exit(1)

rs/src/export/html/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,7 @@ fn format_node_tree(
524524
tree::NodeType::YamlMap => format!("{brief} {}", format_span("type", "YAML map")),
525525
tree::NodeType::YamlArray => format!("{brief} {}", format_span("type", "YAML array")),
526526
tree::NodeType::YamlPrimitive(data) => format!("= {}{brief}", format_span("value", data)),
527+
tree::NodeType::AstNode => format!("{brief} {}", format_span("type", "AST node")),
527528
};
528529
let header = format!(
529530
"{} {value} {}",

rs/src/export/proto.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ impl From<&tree::NodeType> for validator::node::NodeType {
188188
validator::node::NodeType::YamlPrimitive(data.into())
189189
}
190190
tree::NodeType::ResolvedUri(uri) => validator::node::NodeType::ResolvedUri(uri.clone()),
191+
tree::NodeType::AstNode => validator::node::NodeType::AstNode(()),
191192
}
192193
}
193194
}

0 commit comments

Comments
 (0)