Skip to content

Commit 56bbe04

Browse files
authored
Correctly tokenize nested comments in Databricks (#40)
2 parents cea440d + 71865d0 commit 56bbe04

File tree

4 files changed

+81
-76
lines changed

4 files changed

+81
-76
lines changed

src/dialect/ansi.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,9 @@ impl Dialect for AnsiDialect {
3333
fn require_interval_qualifier(&self) -> bool {
3434
true
3535
}
36+
37+
// The SQL standard explicitly states that block comments nest.
38+
fn supports_nested_comments(&self) -> bool {
39+
true
40+
}
3641
}

src/dialect/clickhouse.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,4 +94,10 @@ impl Dialect for ClickHouseDialect {
9494
fn supports_group_by_with_modifier(&self) -> bool {
9595
true
9696
}
97+
98+
// Nested block comments have been supported since 2020.
99+
// See <https://clickhouse.com/docs/whats-new/changelog/2020#backward-incompatible-change-2>
100+
fn supports_nested_comments(&self) -> bool {
101+
true
102+
}
97103
}

src/dialect/databricks.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,9 @@ impl Dialect for DatabricksDialect {
7979
fn supports_struct_literal(&self) -> bool {
8080
true
8181
}
82+
83+
// See <https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-comment>
84+
fn supports_nested_comments(&self) -> bool {
85+
true
86+
}
8287
}

src/tokenizer.rs

Lines changed: 65 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -2497,7 +2497,7 @@ mod tests {
24972497
use crate::dialect::{
24982498
BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
24992499
};
2500-
use crate::test_utils::all_dialects_where;
2500+
use crate::test_utils::{all_dialects_except, all_dialects_where};
25012501
use core::fmt::Debug;
25022502

25032503
#[test]
@@ -3247,90 +3247,79 @@ mod tests {
32473247

32483248
#[test]
32493249
fn tokenize_nested_multiline_comment() {
3250-
let dialect = GenericDialect {};
3251-
let test_cases = vec![
3252-
(
3253-
"0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
3254-
vec![
3255-
Token::Number("0".to_string(), false),
3256-
Token::Whitespace(Whitespace::MultiLineComment(
3257-
"multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
3258-
)),
3259-
Token::Whitespace(Whitespace::Space),
3260-
Token::Div,
3261-
Token::Word(Word {
3262-
value: "comment".to_string(),
3263-
quote_style: None,
3264-
keyword: Keyword::COMMENT,
3265-
}),
3266-
Token::Mul,
3267-
Token::Div,
3268-
Token::Number("1".to_string(), false),
3269-
],
3270-
),
3271-
(
3272-
"0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
3273-
vec![
3274-
Token::Number("0".to_string(), false),
3275-
Token::Whitespace(Whitespace::MultiLineComment(
3276-
"multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
3277-
)),
3278-
Token::Number("1".to_string(), false),
3279-
],
3280-
),
3281-
(
3282-
"SELECT 1/* a /* b */ c */0",
3283-
vec![
3284-
Token::make_keyword("SELECT"),
3285-
Token::Whitespace(Whitespace::Space),
3286-
Token::Number("1".to_string(), false),
3287-
Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
3288-
Token::Number("0".to_string(), false),
3289-
],
3290-
),
3291-
];
3250+
all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
3251+
"0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
3252+
vec![
3253+
Token::Number("0".to_string(), false),
3254+
Token::Whitespace(Whitespace::MultiLineComment(
3255+
"multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
3256+
)),
3257+
Token::Whitespace(Whitespace::Space),
3258+
Token::Div,
3259+
Token::Word(Word {
3260+
value: "comment".to_string(),
3261+
quote_style: None,
3262+
keyword: Keyword::COMMENT,
3263+
}),
3264+
Token::Mul,
3265+
Token::Div,
3266+
Token::Number("1".to_string(), false),
3267+
],
3268+
);
32923269

3293-
for (sql, expected) in test_cases {
3294-
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3295-
compare(expected, tokens);
3296-
}
3270+
all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
3271+
"0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
3272+
vec![
3273+
Token::Number("0".to_string(), false),
3274+
Token::Whitespace(Whitespace::MultiLineComment(
3275+
"multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
3276+
)),
3277+
Token::Number("1".to_string(), false),
3278+
],
3279+
);
3280+
3281+
all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
3282+
"SELECT 1/* a /* b */ c */0",
3283+
vec![
3284+
Token::make_keyword("SELECT"),
3285+
Token::Whitespace(Whitespace::Space),
3286+
Token::Number("1".to_string(), false),
3287+
Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
3288+
Token::Number("0".to_string(), false),
3289+
],
3290+
);
32973291
}
32983292

32993293
#[test]
33003294
fn tokenize_nested_multiline_comment_empty() {
3301-
let sql = "select 1/*/**/*/0";
3302-
3303-
let dialect = GenericDialect {};
3304-
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3305-
let expected = vec![
3306-
Token::make_keyword("select"),
3307-
Token::Whitespace(Whitespace::Space),
3308-
Token::Number("1".to_string(), false),
3309-
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
3310-
Token::Number("0".to_string(), false),
3311-
];
3312-
3313-
compare(expected, tokens);
3295+
all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
3296+
"select 1/*/**/*/0",
3297+
vec![
3298+
Token::make_keyword("select"),
3299+
Token::Whitespace(Whitespace::Space),
3300+
Token::Number("1".to_string(), false),
3301+
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
3302+
Token::Number("0".to_string(), false),
3303+
],
3304+
);
33143305
}
33153306

33163307
#[test]
33173308
fn tokenize_nested_comments_if_not_supported() {
3318-
let dialect = SQLiteDialect {};
3319-
let sql = "SELECT 1/*/* nested comment */*/0";
3320-
let tokens = Tokenizer::new(&dialect, sql).tokenize();
3321-
let expected = vec![
3322-
Token::make_keyword("SELECT"),
3323-
Token::Whitespace(Whitespace::Space),
3324-
Token::Number("1".to_string(), false),
3325-
Token::Whitespace(Whitespace::MultiLineComment(
3326-
"/* nested comment ".to_string(),
3327-
)),
3328-
Token::Mul,
3329-
Token::Div,
3330-
Token::Number("0".to_string(), false),
3331-
];
3332-
3333-
compare(expected, tokens.unwrap());
3309+
all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
3310+
"SELECT 1/*/* nested comment */*/0",
3311+
vec![
3312+
Token::make_keyword("SELECT"),
3313+
Token::Whitespace(Whitespace::Space),
3314+
Token::Number("1".to_string(), false),
3315+
Token::Whitespace(Whitespace::MultiLineComment(
3316+
"/* nested comment ".to_string(),
3317+
)),
3318+
Token::Mul,
3319+
Token::Div,
3320+
Token::Number("0".to_string(), false),
3321+
],
3322+
);
33343323
}
33353324

33363325
#[test]

0 commit comments

Comments
 (0)