Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/backend/parser/scan.l
Original file line number Diff line number Diff line change
Expand Up @@ -830,6 +830,8 @@ other .
SET_YYLLOC();
/* throw back all but the initial u/U */
yyless(1);
/* Reject identifiers containing Unicode whitespace */
check_ident_for_unicode_whitespace(yytext, yyleng);
/* and treat it as {identifier} */
ident = downcase_truncate_identifier(yytext, yyleng, true);
yylval->str = ident;
Expand Down Expand Up @@ -1075,6 +1077,9 @@ other .

SET_YYLLOC();

/* Reject identifiers containing Unicode whitespace */
check_ident_for_unicode_whitespace(yytext, yyleng);

/* Is it a keyword? */
kwnum = ScanKeywordLookup(yytext,
yyextra->keywordlist);
Expand Down
59 changes: 59 additions & 0 deletions src/backend/parser/scansup.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <ctype.h>

#include "common/unicode_category.h"
#include "mb/pg_wchar.h"
#include "parser/scansup.h"

Expand Down Expand Up @@ -104,6 +105,64 @@ truncate_identifier(char *ident, int len, bool warn)
}
}

/*
 * check_ident_for_unicode_whitespace() --- raise an error if an unquoted
 *		identifier contains a non-ASCII Unicode whitespace character.
 *
 * The flex scanner's identifier rules accept any byte in \200-\377, so the
 * bytes of a multi-byte whitespace character such as NO-BREAK SPACE (U+00A0)
 * are swallowed into the identifier.  That permits visually deceptive
 * ("Trojan Source") queries, for example
 *
 *		SELECT password is<NBSP>null FROM users;
 *
 * which reads like "password IS NULL" but actually parses as the column
 * "password" aliased to "is<NBSP>null", exposing the password value.
 *
 * ident/len: the identifier text as matched by the scanner; it need not be
 * NUL-terminated, since only the first "len" bytes are examined.
 *
 * Returns normally if nothing objectionable is found; otherwise reports
 * ERRCODE_SYNTAX_ERROR naming the offending code point.
 *
 * Scope of the check:
 *	- Only characters for which pg_u_prop_white_space() returns true are
 *	  rejected; invisible characters that function does not classify as
 *	  whitespace are not detected here.
 *	- The check runs only when the database encoding is UTF-8; identifiers in
 *	  any other encoding are returned unchecked.
 *
 * NOTE(review): other encodings can also represent whitespace with high-bit
 * bytes (e.g. byte 0xA0 is NO-BREAK SPACE in LATIN1), so the same deception
 * looks possible there --- confirm whether leaving them unchecked is
 * intentional.
 */
void
check_ident_for_unicode_whitespace(const char *ident, int len)
{
	int			db_encoding = GetDatabaseEncoding();
	int			pos = 0;

	/* Code points are only decoded for UTF-8; skip every other encoding */
	if (db_encoding != PG_UTF8)
		return;

	while (pos < len)
	{
		const char *cur = ident + pos;
		int			nbytes;
		pg_wchar	codepoint;

		/* ASCII bytes are never part of a multi-byte whitespace character */
		if (!IS_HIGHBIT_SET((unsigned char) *cur))
		{
			pos++;
			continue;
		}

		/* Stop scanning if the sequence would extend beyond the identifier */
		nbytes = pg_mblen(cur);
		if (pos + nbytes > len)
			break;

		codepoint = utf8_to_unicode((const unsigned char *) cur);

		if (pg_u_prop_white_space(codepoint))
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("identifier contains Unicode whitespace character U+%04X",
							(unsigned int) codepoint),
					 errdetail("Unicode whitespace characters are not allowed in identifiers because they are visually indistinguishable from regular spaces."),
					 errhint("Remove or replace the Unicode whitespace character.")));

		pos += nbytes;
	}
}

/*
* scanner_isspace() --- return true if flex scanner considers char whitespace
*
Expand Down
2 changes: 2 additions & 0 deletions src/include/parser/scansup.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ extern char *downcase_identifier(const char *ident, int len,

extern void truncate_identifier(char *ident, int len, bool warn);

extern void check_ident_for_unicode_whitespace(const char *ident, int len);

extern bool scanner_isspace(char ch);

#endif /* SCANSUP_H */
23 changes: 23 additions & 0 deletions src/test/regress/expected/unicode.out
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,26 @@ ORDER BY num;

SELECT is_normalized('abc', 'def'); -- run-time error
ERROR: invalid normalization form: def
-- Test that Unicode whitespace in unquoted identifiers is rejected.
-- This prevents "Trojan Source" attacks where visually identical queries
-- parse with different semantics.
-- Normal identifiers with non-Latin letters should still work:
SELECT 1 AS тест;
тест
------
1
(1 row)

-- U+00A0 NO-BREAK SPACE via Unicode escape in a string (should work as data):
SELECT U&'\00A0' = ' ' AS nbsp_is_not_regular_space;
nbsp_is_not_regular_space
---------------------------
f
(1 row)

-- The following line contains U+00A0 (NBSP) between "is" and "null".
-- It should produce an error about Unicode whitespace in identifiers.
SELECT 1 is null;
ERROR: identifier contains Unicode whitespace character U+00A0
DETAIL: Unicode whitespace characters are not allowed in identifiers because they are visually indistinguishable from regular spaces.
HINT: Remove or replace the Unicode whitespace character.
14 changes: 14 additions & 0 deletions src/test/regress/sql/unicode.sql
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,17 @@ FROM
ORDER BY num;

SELECT is_normalized('abc', 'def'); -- run-time error

-- Test that Unicode whitespace in unquoted identifiers is rejected.
-- This prevents "Trojan Source" attacks where visually identical queries
-- parse with different semantics.

-- Normal identifiers with non-Latin letters should still work:
SELECT 1 AS тест;

-- U+00A0 NO-BREAK SPACE via Unicode escape in a string (should work as data):
SELECT U&'\00A0' = ' ' AS nbsp_is_not_regular_space;

-- The following line contains U+00A0 (NBSP) between "is" and "null".
-- It should produce an error about Unicode whitespace in identifiers.
SELECT 1 is null;