NikolayS · NikolayS · May 26, 2026
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
@@ -830,6 +830,8 @@ other			.
 					SET_YYLLOC();
 					/* throw back all but the initial u/U */
 					yyless(1);
+					/* Unicode whitespace check; a no-op here: yyless(1) left only u/U */
+					check_ident_for_unicode_whitespace(yytext, yyleng);
 					/* and treat it as {identifier} */
 					ident = downcase_truncate_identifier(yytext, yyleng, true);
 					yylval->str = ident;
@@ -1075,6 +1077,9 @@ other			.
 
 					SET_YYLLOC();
 
+					/* Reject identifiers containing Unicode whitespace */
+					check_ident_for_unicode_whitespace(yytext, yyleng);
+
 					/* Is it a keyword? */
 					kwnum = ScanKeywordLookup(yytext,
 											  yyextra->keywordlist);

diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
@@ -16,6 +16,7 @@
 
 #include <ctype.h>
 
+#include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
 #include "parser/scansup.h"
 
@@ -104,6 +105,64 @@ truncate_identifier(char *ident, int len, bool warn)
 	}
 }
 
+/*
+ * check_ident_for_unicode_whitespace() --- reject identifiers containing a
+ * character that has the Unicode White_Space property.
+ *
+ * The flex identifier rules accept every high-bit byte (\200-\377), so the
+ * bytes of a multi-byte whitespace character such as NO-BREAK SPACE (U+00A0)
+ * are absorbed into the identifier.  That permits visually deceptive queries:
+ *
+ *   SELECT password is<NBSP>null FROM users;
+ *
+ * reads like "password IS NULL" but parses as password aliased to "is null",
+ * leaking the password value.
+ *
+ * ident/len: the token text handed over by the scanner, length in bytes.
+ * Returns normally if the token is acceptable; otherwise reports an ERROR
+ * with ERRCODE_SYNTAX_ERROR that names the offending code point.
+ *
+ * Only UTF-8 databases are checked; in any other encoding this returns
+ * immediately, so a single-byte NBSP (e.g. 0xA0 in LATIN1) is not caught.
+ * Invisible characters lacking the White_Space property (e.g. U+200B ZERO
+ * WIDTH SPACE) are not rejected either.
+ */
+void
+check_ident_for_unicode_whitespace(const char *ident, int len)
+{
+	int			pos;
+	int			nbytes;
+
+	/* Code points are decoded below with UTF-8-specific logic */
+	if (GetDatabaseEncoding() != PG_UTF8)
+		return;
+
+	for (pos = 0; pos < len; pos += nbytes)
+	{
+		pg_wchar	codepoint;
+
+		/* ASCII bytes are single-byte characters and never need decoding */
+		nbytes = 1;
+		if (!IS_HIGHBIT_SET((unsigned char) ident[pos]))
+			continue;
+
+		/* Stop rather than decode a sequence cut off by the token end */
+		nbytes = pg_mblen(&ident[pos]);
+		if (pos + nbytes > len)
+			break;
+
+		/* Decode the character and test its White_Space property */
+		codepoint = utf8_to_unicode((const unsigned char *) &ident[pos]);
+		if (pg_u_prop_white_space(codepoint))
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("identifier contains Unicode whitespace character U+%04X",
+							(unsigned int) codepoint),
+					 errdetail("Unicode whitespace characters are not allowed in identifiers because they are visually indistinguishable from regular spaces."),
+					 errhint("Remove or replace the Unicode whitespace character.")));
+	}
+}
+
 /*
  * scanner_isspace() --- return true if flex scanner considers char whitespace
  *

diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
@@ -22,6 +22,8 @@ extern char *downcase_identifier(const char *ident, int len,
 
 extern void truncate_identifier(char *ident, int len, bool warn);
 
+extern void check_ident_for_unicode_whitespace(const char *ident, int len);
+
 extern bool scanner_isspace(char ch);
 
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
@@ -105,3 +105,26 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+-- Test that Unicode whitespace in unquoted identifiers is rejected.
+-- This prevents "Trojan Source" attacks where visually identical queries
+-- parse with different semantics.
+-- Normal identifiers with non-Latin letters should still work:
+SELECT 1 AS тест;
+ тест 
+------
+    1
+(1 row)
+
+-- U+00A0 NO-BREAK SPACE via Unicode escape in a string (should work as data):
+SELECT U&'\00A0' = ' ' AS nbsp_is_not_regular_space;
+ nbsp_is_not_regular_space 
+---------------------------
+ f
+(1 row)
+
+-- The following line contains U+00A0 (NBSP) between "is" and "null".
+-- It should produce an error about Unicode whitespace in identifiers.
+SELECT 1 is null;
+ERROR:  identifier contains Unicode whitespace character U+00A0
+DETAIL:  Unicode whitespace characters are not allowed in identifiers because they are visually indistinguishable from regular spaces.
+HINT:  Remove or replace the Unicode whitespace character.
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
@@ -36,3 +36,17 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+-- Test that Unicode whitespace in unquoted identifiers is rejected.
+-- This prevents "Trojan Source" attacks where visually identical queries
+-- parse with different semantics.
+
+-- Normal identifiers with non-Latin letters should still work:
+SELECT 1 AS тест;
+
+-- U+00A0 NO-BREAK SPACE via Unicode escape in a string (should work as data):
+SELECT U&'\00A0' = ' ' AS nbsp_is_not_regular_space;
+
+-- The following line contains U+00A0 (NBSP) between "is" and "null".
+-- It should produce an error about Unicode whitespace in identifiers.
+SELECT 1 is null;