|
| 1 | +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt) |
| 2 | +# Source for "Build a Reasoning Model (From Scratch)": https://mng.bz/lZ5B |
| 3 | +# Code repository: https://github.com/rasbt/reasoning-from-scratch |
| 4 | + |
| 5 | +# Verify that Python source files (and optionally notebooks) use double quotes for strings. |
| 6 | + |
| 7 | +import argparse |
| 8 | +import ast |
| 9 | +import io |
| 10 | +import json |
| 11 | +import sys |
| 12 | +import tokenize |
| 13 | +from pathlib import Path |
| 14 | + |
# Directory names that cause a path to be skipped when they appear as any of
# its components (see should_skip).
EXCLUDED_DIRS = {
    # version-control directories
    ".git",
    ".hg",
    ".svn",
    # tool caches and virtual environments
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    ".tox",
    ".venv",
    "__pycache__",
    # build output and third-party packages
    "build",
    "dist",
    "node_modules",
}

# Single-letter string-literal prefixes recognised by the quote check
# (compared against the lower-cased token).
PREFIX_CHARS = set("rufb")

# Quote characters and their triple-quoted delimiters.
SINGLE_QUOTE = "'"
DOUBLE_QUOTE = "\""
TRIPLE_SINGLE = 3 * SINGLE_QUOTE
TRIPLE_DOUBLE = 3 * DOUBLE_QUOTE
| 35 | + |
| 36 | + |
def should_skip(path):
    """
    Return True when any component of `path` is named in EXCLUDED_DIRS.

    Every element of `path.parts` is considered, so the match applies to
    each directory level as well as the final component.
    """
    return not EXCLUDED_DIRS.isdisjoint(path.parts)
| 40 | + |
| 41 | + |
def collect_fstring_expr_string_positions(source):
    """
    Return set of (lineno, col_offset) for string literals that appear inside
    formatted expressions of f-strings. These should be exempt from the double
    quote check, since enforcing double quotes there is unnecessarily strict.

    `source` is parsed with `ast.parse`; if it is not valid Python (a
    SyntaxError is raised) an empty set is returned, so nothing is exempted.
    """
    try:
        tree = ast.parse(source)
    except SyntaxError:
        return set()

    positions = set()

    def is_str_literal(node):
        # Python 3.8+ represents every literal as ast.Constant.
        if isinstance(node, ast.Constant):
            return isinstance(node.value, str)
        # Python <3.8 used a dedicated ast.Str node. That class is deprecated
        # and no longer exists on recent interpreters, so merely evaluating
        # `ast.Str` can raise AttributeError; match on the class name instead
        # of touching the attribute.
        return type(node).__name__ == "Str"

    class Collector(ast.NodeVisitor):
        def visit_JoinedStr(self, node):
            for value in node.values:
                if isinstance(value, ast.FormattedValue):
                    self._collect_from_expr(value.value)
            # Continue walking to catch nested f-strings within expressions
            self.generic_visit(node)

        def _collect_from_expr(self, node):
            if is_str_literal(node):
                positions.add((node.lineno, node.col_offset))
                return
            # Not a string literal: search the sub-expressions recursively.
            for child in ast.iter_child_nodes(node):
                self._collect_from_expr(child)

    Collector().visit(tree)
    return positions
| 74 | + |
| 75 | + |
def check_quotes_in_source(source, path):
    """
    Return a list of "<path>:<line>:<col>: uses single quotes" messages, one
    for every single-quoted string literal found in `source`.

    `path` is only used to label the messages. Triple-quoted strings are
    ignored regardless of prefix, and so is anything that sits inside the
    replacement field of an f-string (plain strings reported by
    collect_fstring_expr_string_positions, plus nested f-strings).

    Errors raised by `tokenize` for malformed source propagate to the caller.
    """
    violations = []
    ignored_positions = collect_fstring_expr_string_positions(source)

    # Since Python 3.12 (PEP 701) an f-string is no longer one STRING token:
    # its prefix and opening quote arrive as FSTRING_START and it is closed by
    # FSTRING_END (TSTRING_* for 3.14 template strings). Older interpreters do
    # not define these names, hence the getattr lookups.
    start_types = tuple(
        tok
        for tok in (
            getattr(tokenize, "FSTRING_START", None),
            getattr(tokenize, "TSTRING_START", None),
        )
        if tok is not None
    )
    end_types = tuple(
        tok
        for tok in (
            getattr(tokenize, "FSTRING_END", None),
            getattr(tokenize, "TSTRING_END", None),
        )
        if tok is not None
    )

    # "t" is added so the prefix of a template string is stripped as well.
    prefix_chars = "".join(PREFIX_CHARS) + "t"

    depth = 0  # how many f-/t-strings are currently open
    tokens = tokenize.generate_tokens(io.StringIO(source).readline)
    for tok_type, tok_str, start, _, _ in tokens:
        if tok_type in end_types:
            depth -= 1
            continue
        if tok_type in start_types:
            depth += 1
            if depth > 1:
                # An f-string nested in another f-string's expression is
                # exempt, like a plain string in the same place.
                continue
        elif tok_type != tokenize.STRING:
            continue
        elif start in ignored_positions:
            continue

        # Strip the whole prefix (it may have two letters, e.g. rb or Rf)
        # before looking at the quotes. A literal always has its opening quote
        # right after the prefix, so lstrip cannot eat into the content.
        literal = tok_str.lower().lstrip(prefix_chars)

        # ignore triple-quoted strings
        if literal.startswith((TRIPLE_DOUBLE, TRIPLE_SINGLE)):
            continue

        # report if not using double quotes
        if literal.startswith(SINGLE_QUOTE):
            line, col = start
            violations.append(f"{path}:{line}:{col}: uses single quotes")
    return violations
| 102 | + |
| 103 | + |
def check_file(path):
    """
    Check a single file and return its list of violation messages.

    A path whose suffix is ".ipynb" is handed to check_notebook; any other
    path is read as UTF-8 text and passed to check_quotes_in_source. Any
    exception raised along the way is turned into a single
    "<path>: failed to check (...)" entry instead of propagating.
    """
    try:
        if path.suffix != ".ipynb":
            source_text = path.read_text(encoding="utf-8")
            return check_quotes_in_source(source_text, path)
        return check_notebook(path)
    except Exception as err:
        return [f"{path}: failed to check ({err})"]
| 113 | + |
| 114 | + |
def check_notebook(path):
    """
    Check every code cell of the Jupyter notebook at `path` and return the
    combined list of violation messages.

    The notebook is loaded as UTF-8 JSON. Each code cell is checked on its
    own, so reported line and column numbers are relative to that cell; the
    location in every message therefore reads "<path> (cell N)", where N is
    the 1-based position of the cell among all cells of the notebook.

    A cell that cannot be tokenized (for example one holding shell commands
    rather than Python) yields a "failed to check" entry for that cell and
    the remaining cells are still checked. Errors while opening or decoding
    the notebook itself propagate to the caller.
    """
    violations = []
    with open(path, encoding="utf-8") as f:
        nb = json.load(f)
    for number, cell in enumerate(nb.get("cells", []), start=1):
        if cell.get("cell_type") != "code":
            continue
        # "".join accepts both a list of lines and a single string unchanged.
        src = "".join(cell.get("source", []))
        location = f"{path} (cell {number})"
        try:
            violations.extend(check_quotes_in_source(src, location))
        except (SyntaxError, tokenize.TokenError) as e:
            # Keep going: one unparsable cell must not hide the others.
            violations.append(f"{location}: failed to check ({e})")
    return violations
| 124 | + |
| 125 | + |
def parse_args():
    """
    Parse the command line and return the resulting argparse namespace.

    The only option is the boolean flag --include-notebooks, exposed as
    `include_notebooks` and False unless the flag is given.
    """
    cli = argparse.ArgumentParser(description="Verify double-quoted string literals.")
    notebook_flag = {
        "action": "store_true",
        "help": "Also scan Jupyter notebooks (.ipynb files) for single-quoted strings.",
    }
    cli.add_argument("--include-notebooks", **notebook_flag)
    namespace = cli.parse_args()
    return namespace
| 134 | + |
| 135 | + |
def main():
    """
    Scan the current working directory tree and report single-quoted strings.

    All "*.py" files (and "*.ipynb" files when --include-notebooks is given)
    below the resolved current directory are checked, except those located in
    a directory named in EXCLUDED_DIRS. Violations are printed one per line
    followed by a count.

    Returns 1 when at least one violation was found, otherwise 0; the value is
    meant to be used as the process exit status.
    """
    args = parse_args()
    project_root = Path(".").resolve()
    py_files = sorted(project_root.rglob("*.py"))
    notebook_files = sorted(project_root.rglob("*.ipynb")) if args.include_notebooks else []

    violations = []
    for path in py_files + notebook_files:
        # Only the part of the path below the project root decides whether a
        # file is skipped. The resolved absolute path also contains the
        # root's ancestors, so a checkout under e.g. ".../build/project"
        # would otherwise skip every file and report success.
        if should_skip(path.relative_to(project_root)):
            continue
        violations.extend(check_file(path))

    if violations:
        print("\n".join(violations))
        print(f"\n{len(violations)} violations found.")
        return 1

    print("All files use double quotes correctly.")
    return 0
| 155 | + |
| 156 | + |
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
0 commit comments