1- # This software is Copyright (c) 2012-2020 magnum, and it is hereby
1+ # This software is Copyright (c) 2012-2024 magnum, and it is hereby
22# released to the general public under the following terms:
33# Redistribution and use in source and binary forms, with or without
44# modification, are permitted.
55#
66# Generic implementation of "dumb" exhaustive search of Unicode BMP.
7- # Default is to try *all* allocated characters in the BMP of Unicode v13
8- # (there's 55,387 of them). Even if a fast format can exhaust two characters
7+ # Default is to try *all* allocated characters in the BMP of Unicode v16
8+ # (there are 55,537 of them). Even if a fast format can exhaust two characters
99# in 15 minutes, three characters would take 1.5 years...
1010#
1111# Note that these modes will handle --max-len differently than normal: They
@@ -22,7 +22,7 @@ int maxlength; // Maximum password length to try
2222int last; // Last character position, zero-based
2323int lastid; // Character index in the last position
2424int id[0x7f]; // Current character indices for other positions
25- int charset[0x10000 ], c0; // Characters
25+ int charset[0xd900 ], c0; // Characters
2626
2727void init()
2828{
@@ -43,7 +43,7 @@ void init()
4343
4444/*
4545 * This defines the character set. This is auto-generated from UnicodeData.txt
46- * and we skip control characters.
46+ * of Unicode 16.0.0 and we skip control characters.
4747 */
4848 i = 0;
4949// 0000..007F; Basic Latin
@@ -119,9 +119,6 @@ void init()
119119 charset[i++] = c++;
120120// 0600..06FF; Arabic
121121 c = 0x600; // from ARABIC NUMBER SIGN
122- while (c <= 0x61c) // ..to ARABIC LETTER MARK
123- charset[i++] = c++;
124- c = 0x61e; // from ARABIC TRIPLE DOT PUNCTUATION MARK
125122 while (c <= 0x6ff) // ..to ARABIC LETTER HEH WITH INVERTED V
126123 charset[i++] = c++;
127124// 0700..074F; Syriac
@@ -163,14 +160,17 @@ void init()
163160 c = 0x860; // from SYRIAC LETTER MALAYALAM NGA
164161 while (c <= 0x86a) // ..to SYRIAC LETTER MALAYALAM SSA
165162 charset[i++] = c++;
166- // 08A0..08FF ; Arabic Extended-A
167- c = 0x8a0 ; // from ARABIC LETTER BEH WITH SMALL V BELOW
168- while (c <= 0x8b4 ) // ..to ARABIC LETTER KAF WITH DOT BELOW
163+ // 0870..089F ; Arabic Extended-B
164+ c = 0x870 ; // from ARABIC LETTER ALEF WITH ATTACHED FATHA
165+ while (c <= 0x88e ) // ..to ARABIC VERTICAL TAIL
169166 charset[i++] = c++;
170- c = 0x8b6; // from ARABIC LETTER BEH WITH SMALL MEEM ABOVE
171- while (c <= 0x8c7) // ..to ARABIC LETTER LAM WITH SMALL ARABIC LETTER TAH ABOVE
167+ charset[i++] = 0x890; // ARABIC POUND MARK ABOVE
168+ charset[i++] = 0x891; // ARABIC PIASTRE MARK ABOVE
169+ c = 0x897; // from ARABIC PEPET
170+ while (c <= 0x89f) // ..to ARABIC HALF MADDA OVER MADDA
172171 charset[i++] = c++;
173- c = 0x8d3; // from ARABIC SMALL LOW WAW
172+ // 08A0..08FF; Arabic Extended-A
173+ c = 0x8a0; // from ARABIC LETTER BEH WITH SMALL V BELOW
174174 while (c <= 0x8ff) // ..to ARABIC MARK SIDEWAYS NOON GHUNNA
175175 charset[i++] = c++;
176176// 0900..097F; Devanagari
@@ -360,7 +360,7 @@ void init()
360360 c = 0xc2a; // from TELUGU LETTER PA
361361 while (c <= 0xc39) // ..to TELUGU LETTER HA
362362 charset[i++] = c++;
363- c = 0xc3d ; // from TELUGU SIGN AVAGRAHA
363+ c = 0xc3c ; // from TELUGU SIGN NUKTA
364364 while (c <= 0xc44) // ..to TELUGU VOWEL SIGN VOCALIC RR
365365 charset[i++] = c++;
366366 charset[i++] = 0xc46; // TELUGU VOWEL SIGN E
@@ -406,14 +406,16 @@ void init()
406406 charset[i++] = c++;
407407 charset[i++] = 0xcd5; // KANNADA LENGTH MARK
408408 charset[i++] = 0xcd6; // KANNADA AI LENGTH MARK
409+ charset[i++] = 0xcdd; // KANNADA LETTER NAKAARA POLLU
410+ charset[i++] = 0xcde; // KANNADA LETTER FA
409411 c = 0xce0; // from KANNADA LETTER VOCALIC RR
410412 while (c <= 0xce3) // ..to KANNADA VOWEL SIGN VOCALIC LL
411413 charset[i++] = c++;
412414 c = 0xce6; // from KANNADA DIGIT ZERO
413415 while (c <= 0xcef) // ..to KANNADA DIGIT NINE
414416 charset[i++] = c++;
415417 charset[i++] = 0xcf1; // KANNADA SIGN JIHVAMULIYA
416- charset[i++] = 0xcf2 ; // KANNADA SIGN UPADHMANIYA
418+ charset[i++] = 0xcf3 ; // KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
417419// 0D00..0D7F; Malayalam
418420 c = 0xd00; // from MALAYALAM SIGN COMBINING ANUSVARA ABOVE
419421 while (c <= 0xd0c) // ..to MALAYALAM LETTER VOCALIC L
@@ -483,7 +485,7 @@ void init()
483485 while (c <= 0xec4) // ..to LAO VOWEL SIGN AI
484486 charset[i++] = c++;
485487 c = 0xec8; // from LAO TONE MAI EK
486- while (c <= 0xecd ) // ..to LAO NIGGAHITA
488+ while (c <= 0xece ) // ..to LAO YAMAKKAN
487489 charset[i++] = c++;
488490 c = 0xed0; // from LAO DIGIT ZERO
489491 while (c <= 0xed9) // ..to LAO DIGIT NINE
@@ -596,11 +598,9 @@ void init()
596598 charset[i++] = c++;
597599// 1700..171F; Tagalog
598600 c = 0x1700; // from TAGALOG LETTER A
599- while (c <= 0x170c) // ..to TAGALOG LETTER YA
600- charset[i++] = c++;
601- c = 0x170e; // from TAGALOG LETTER LA
602- while (c <= 0x1714) // ..to TAGALOG SIGN VIRAMA
601+ while (c <= 0x1715) // ..to TAGALOG SIGN PAMUDPOD
603602 charset[i++] = c++;
603+ charset[i++] = 0x171f; // TAGALOG LETTER ARCHAIC RA
604604// 1720..173F; Hanunoo
605605 c = 0x1720; // from HANUNOO LETTER A
606606 while (c <= 0x1736) // ..to PHILIPPINE DOUBLE PUNCTUATION
@@ -629,9 +629,6 @@ void init()
629629 charset[i++] = c++;
630630// 1800..18AF; Mongolian
631631 c = 0x1800; // from MONGOLIAN BIRGA
632- while (c <= 0x180e) // ..to MONGOLIAN VOWEL SEPARATOR
633- charset[i++] = c++;
634- c = 0x1810; // from MONGOLIAN DIGIT ZERO
635632 while (c <= 0x1819) // ..to MONGOLIAN DIGIT NINE
636633 charset[i++] = c++;
637634 c = 0x1820; // from MONGOLIAN LETTER A
@@ -704,14 +701,14 @@ void init()
704701 charset[i++] = c++;
705702// 1AB0..1AFF; Combining Diacritical Marks Extended
706703 c = 0x1ab0; // from COMBINING DOUBLED CIRCUMFLEX ACCENT
707- while (c <= 0x1ac0 ) // ..to COMBINING LATIN SMALL LETTER TURNED W BELOW
704+ while (c <= 0x1ace ) // ..to COMBINING LATIN SMALL LETTER INSULAR T
708705 charset[i++] = c++;
709706// 1B00..1B7F; Balinese
710707 c = 0x1b00; // from BALINESE SIGN ULU RICEM
711- while (c <= 0x1b4b ) // ..to BALINESE LETTER ASYURA SASAK
708+ while (c <= 0x1b4c ) // ..to BALINESE LETTER ARCHAIC JNYA
712709 charset[i++] = c++;
713- c = 0x1b50 ; // from BALINESE DIGIT ZERO
714- while (c <= 0x1b7c ) // ..to BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING
710+ c = 0x1b4e ; // from BALINESE INVERTED CARIK SIKI
711+ while (c <= 0x1b7f ) // ..to BALINESE PANTI BAWAK
715712 charset[i++] = c++;
716713// 1B80..1BBF; Sundanese
717714 c = 0x1b80; // from SUNDANESE SIGN PANYECEK
@@ -739,7 +736,7 @@ void init()
739736 charset[i++] = c++;
740737// 1C80..1C8F; Cyrillic Extended-C
741738 c = 0x1c80; // from CYRILLIC SMALL LETTER ROUNDED VE
742- while (c <= 0x1c88 ) // ..to CYRILLIC SMALL LETTER UNBLENDED UK
739+ while (c <= 0x1c8a ) // ..to CYRILLIC SMALL LETTER TJE
743740 charset[i++] = c++;
744741// 1C90..1CBF; Georgian Extended
745742 c = 0x1c90; // from GEORGIAN MTAVRULI CAPITAL LETTER AN
@@ -765,9 +762,6 @@ void init()
765762 charset[i++] = c++;
766763// 1DC0..1DFF; Combining Diacritical Marks Supplement
767764 c = 0x1dc0; // from COMBINING DOTTED GRAVE ACCENT
768- while (c <= 0x1df9) // ..to COMBINING WIDE INVERTED BRIDGE BELOW
769- charset[i++] = c++;
770- c = 0x1dfb; // from COMBINING DELETION MARK
771765 while (c <= 0x1dff) // ..to COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
772766 charset[i++] = c++;
773767// 1E00..1EFF; Latin Extended Additional
@@ -831,7 +825,7 @@ void init()
831825 charset[i++] = c++;
832826// 20A0..20CF; Currency Symbols
833827 c = 0x20a0; // from EURO-CURRENCY SIGN
834- while (c <= 0x20bf ) // ..to BITCOIN SIGN
828+ while (c <= 0x20c0 ) // ..to SOM SIGN
835829 charset[i++] = c++;
836830// 20D0..20FF; Combining Diacritical Marks for Symbols
837831 c = 0x20d0; // from COMBINING LEFT HARPOON ABOVE
@@ -859,7 +853,7 @@ void init()
859853 charset[i++] = c++;
860854// 2400..243F; Control Pictures
861855 c = 0x2400; // from SYMBOL FOR NULL
862- while (c <= 0x2426 ) // ..to SYMBOL FOR SUBSTITUTE FORM TWO
856+ while (c <= 0x2429 ) // ..to SYMBOL FOR DELETE MEDIUM SHADE FORM
863857 charset[i++] = c++;
864858// 2440..245F; Optical Character Recognition
865859 c = 0x2440; // from OCR HOOK
@@ -925,10 +919,7 @@ void init()
925919 charset[i++] = c++;
926920// 2C00..2C5F; Glagolitic
927921 c = 0x2c00; // from GLAGOLITIC CAPITAL LETTER AZU
928- while (c <= 0x2c2e) // ..to GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
929- charset[i++] = c++;
930- c = 0x2c30; // from GLAGOLITIC SMALL LETTER AZU
931- while (c <= 0x2c5e) // ..to GLAGOLITIC SMALL LETTER LATINATE MYSLITE
922+ while (c <= 0x2c5f) // ..to GLAGOLITIC SMALL LETTER CAUDATE CHRIVI
932923 charset[i++] = c++;
933924// 2C60..2C7F; Latin Extended-C
934925 c = 0x2c60; // from LATIN CAPITAL LETTER L WITH DOUBLE BAR
@@ -989,7 +980,7 @@ void init()
989980 charset[i++] = c++;
990981// 2E00..2E7F; Supplemental Punctuation
991982 c = 0x2e00; // from RIGHT ANGLE SUBSTITUTION MARKER
992- while (c <= 0x2e52 ) // ..to TIRONIAN SIGN CAPITAL ET
983+ while (c <= 0x2e5d ) // ..to OBLIQUE HYPHEN
993984 charset[i++] = c++;
994985// 2E80..2EFF; CJK Radicals Supplement
995986 c = 0x2e80; // from CJK RADICAL REPEAT
@@ -1004,7 +995,7 @@ void init()
1004995 charset[i++] = c++;
1005996// 2FF0..2FFF; Ideographic Description Characters
1006997 c = 0x2ff0; // from IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT
1007- while (c <= 0x2ffb ) // ..to IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
998+ while (c <= 0x2fff ) // ..to IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION
1008999 charset[i++] = c++;
10091000// 3000..303F; CJK Symbols and Punctuation
10101001 c = 0x3000; // from IDEOGRAPHIC SPACE
@@ -1039,8 +1030,9 @@ void init()
10391030 charset[i++] = c++;
10401031// 31C0..31EF; CJK Strokes
10411032 c = 0x31c0; // from CJK STROKE T
1042- while (c <= 0x31e3 ) // ..to CJK STROKE Q
1033+ while (c <= 0x31e5 ) // ..to CJK STROKE SZP
10431034 charset[i++] = c++;
1035+ charset[i++] = 0x31ef; // IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION
10441036// 31F0..31FF; Katakana Phonetic Extensions
10451037 c = 0x31f0; // from KATAKANA LETTER SMALL KU
10461038 while (c <= 0x31ff) // ..to KATAKANA LETTER SMALL RO
@@ -1066,7 +1058,7 @@ void init()
10661058 charset[i++] = c++;
10671059// 4E00..9FFF; CJK Unified Ideographs
10681060 c = 0x4e00; // from <CJK Ideograph, First>
1069- while (c <= 0x9ffc ) // ..to <CJK Ideograph, Last>
1061+ while (c <= 0x9fff ) // ..to <CJK Ideograph, Last>
10701062 charset[i++] = c++;
10711063// A000..A48F; Yi Syllables
10721064 c = 0xa000; // from YI SYLLABLE IT
@@ -1098,12 +1090,14 @@ void init()
10981090 charset[i++] = c++;
10991091// A720..A7FF; Latin Extended-D
11001092 c = 0xa720; // from MODIFIER LETTER STRESS AND HIGH TONE
1101- while (c <= 0xa7bf ) // ..to LATIN SMALL LETTER GLOTTAL U
1093+ while (c <= 0xa7cd ) // ..to LATIN SMALL LETTER S WITH DIAGONAL STROKE
11021094 charset[i++] = c++;
1103- c = 0xa7c2; // from LATIN CAPITAL LETTER ANGLICANA W
1104- while (c <= 0xa7ca) // ..to LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
1095+ charset[i++] = 0xa7d0; // LATIN CAPITAL LETTER CLOSED INSULAR G
1096+ charset[i++] = 0xa7d1; // LATIN SMALL LETTER CLOSED INSULAR G
1097+ c = 0xa7d5; // from LATIN SMALL LETTER DOUBLE WYNN
1098+ while (c <= 0xa7dc) // ..to LATIN CAPITAL LETTER LAMBDA WITH STROKE
11051099 charset[i++] = c++;
1106- c = 0xa7f5 ; // from LATIN CAPITAL LETTER REVERSED HALF H
1100+ c = 0xa7f2 ; // from MODIFIER LETTER CAPITAL C
11071101 while (c <= 0xa7ff) // ..to LATIN EPIGRAPHIC LETTER ARCHAIC M
11081102 charset[i++] = c++;
11091103// A800..A82F; Syloti Nagri
@@ -1258,19 +1252,16 @@ void init()
12581252 charset[i++] = c++;
12591253// FB50..FDFF; Arabic Presentation Forms-A
12601254 c = 0xfb50; // from ARABIC LETTER ALEF WASLA ISOLATED FORM
1261- while (c <= 0xfbc1 ) // ..to ARABIC SYMBOL SMALL TAH BELOW
1255+ while (c <= 0xfbc2 ) // ..to ARABIC SYMBOL WASLA ABOVE
12621256 charset[i++] = c++;
12631257 c = 0xfbd3; // from ARABIC LETTER NG ISOLATED FORM
1264- while (c <= 0xfd3f) // ..to ORNATE RIGHT PARENTHESIS
1265- charset[i++] = c++;
1266- c = 0xfd50; // from ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM
12671258 while (c <= 0xfd8f) // ..to ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM
12681259 charset[i++] = c++;
12691260 c = 0xfd92; // from ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM
12701261 while (c <= 0xfdc7) // ..to ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
12711262 charset[i++] = c++;
12721263 c = 0xfdf0; // from ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM
1273- while (c <= 0xfdfd ) // ..to ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
1264+ while (c <= 0xfdff ) // ..to ARABIC LIGATURE AZZA WA JALL
12741265 charset[i++] = c++;
12751266// FE00..FE0F; Variation Selectors
12761267 c = 0xfe00; // from VARIATION SELECTOR-1
0 commit comments