Merged
7 changes: 5 additions & 2 deletions .github/workflows/python-package.yml
@@ -29,11 +29,14 @@ jobs:
run: |
python -m pip install --upgrade uv
uv pip install --system --upgrade wheel setuptools
uv pip install --system ".[dev]"
if [ "${{ matrix.python-version }}" == "3.9" ]; then
uv pip install --system ".[dev]"
else
uv pip install --system ".[test]"
fi

- name: Type check with mypy (only on oldest supported Python version)
run: |
if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi

- name: Test with pytest
35 changes: 4 additions & 31 deletions README.md
@@ -118,7 +118,6 @@ Other options can be specified on the command line:
| `-g`, `--keep_composite_glyphs` | Do not replace composite glyphs using Unicode COMBINING codes with their accented/umlaut counterparts. |
| `-e`, `--replace_html_escapes` | HTML escape codes replaced by their meaning, such as `á` -> `á`. |
| `-c`, `--convert_numbers` | English-style decimal points and thousands separators in numbers changed to Icelandic style. |
| `-k N`, `--handle_kludgy_ordinals N` | Kludgy ordinal handling defined. 0: Returns the original mixed word form, 1. Kludgy ordinal returned as pure word forms, 2: Kludgy ordinals returned as pure numbers. |

Type `tokenize -h` or `tokenize --help` to get a short help message.

@@ -453,31 +452,6 @@ functions:

The default value for the `replace_html_escapes` option is `False`.

* `handle_kludgy_ordinals=[value]`

This options controls the way Tokenizer handles 'kludgy' ordinals, such as
*1sti*, *4ðu*, or *2ja*. By default, such ordinals are returned unmodified
('passed through') as word tokens (`TOK.WORD`).
However, this can be modified as follows:

* `tokenizer.KLUDGY_ORDINALS_MODIFY`: Kludgy ordinals are corrected
to become 'proper' word tokens, i.e. *1sti* becomes *fyrsti* and
*2ja* becomes *tveggja*.

* `tokenizer.KLUDGY_ORDINALS_TRANSLATE`: Kludgy ordinals that represent
proper ordinal numbers are translated to ordinal tokens (`TOK.ORDINAL`),
with their original text and their ordinal value. *1sti* thus
becomes a `TOK.ORDINAL` token with a value of 1, and *3ja* becomes
a `TOK.ORDINAL` with a value of 3.

* `tokenizer.KLUDGY_ORDINALS_PASS_THROUGH` is the default value of
the option. It causes kludgy ordinals to be returned unmodified as
word tokens.

Note that versions of Tokenizer prior to 1.4 behaved as if
`handle_kludgy_ordinals` were set to
`tokenizer.KLUDGY_ORDINALS_TRANSLATE`.

## Dash and Hyphen Handling

Tokenizer distinguishes between three dash types and handles them contextually:
@@ -578,9 +552,8 @@ with the following exceptions:
can be disabled; see the `replace_composite_glyphs` option described
above.)

* If the appropriate options are specified (see above), it converts
kludgy ordinals (*3ja*) to proper ones (*þriðja*), and English-style
thousand and decimal separators to Icelandic ones
* If the `convert_numbers` option is specified (see above), English-style
thousand and decimal separators are converted to Icelandic ones
(*10,345.67* becomes *10.345,67*).
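The separator swap described above can be sketched as a standalone string transformation (illustrative only; the real tokenizer performs this during number parsing, not as a whole-string pass):

```python
def convert_number_style(s: str) -> str:
    """Swap English-style separators ("10,345.67") for
    Icelandic-style ones ("10.345,67")."""
    # Route through a placeholder so the two replacements don't collide
    return s.replace(",", "\0").replace(".", ",").replace("\0", ".")

print(convert_number_style("10,345.67"))  # -> 10.345,67
```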

* If the `replace_html_escapes` option is set, Tokenizer replaces
@@ -812,8 +785,8 @@ can be found in the file `test/toktest_normal_gold_expected.txt`.
`TOK.SERIALNUMBER` token kinds; abbreviations can now have multiple
meanings.
* Version 1.4.0: Added the `**options` parameter to the
`tokenize()` function, giving control over the handling of numbers,
telephone numbers, and 'kludgy' ordinals.
`tokenize()` function, giving control over the handling of numbers
and telephone numbers.
* Version 1.3.0: Added `TOK.DOMAIN` and `TOK.HASHTAG` token types;
improved handling of capitalized month name *Ágúst*, which is
now recognized when following an ordinal number; improved recognition
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -53,6 +53,9 @@ Issues = "https://github.com/mideind/Tokenizer/issues"
Changelog = "https://github.com/mideind/Tokenizer#changelog"

[project.optional-dependencies]
test = [
"pytest>=7.0",
]
dev = [
"pytest>=7.0",
"pytest-cov>=4.0",
6 changes: 0 additions & 6 deletions src/tokenizer/__init__.py
@@ -36,9 +36,6 @@
TP_WORD,
EN_DASH,
EM_DASH,
KLUDGY_ORDINALS_PASS_THROUGH,
KLUDGY_ORDINALS_MODIFY,
KLUDGY_ORDINALS_TRANSLATE,
BIN_Tuple,
BIN_TupleList,
)
@@ -80,9 +77,6 @@
"EM_DASH",
"EN_DASH",
"generate_raw_tokens",
"KLUDGY_ORDINALS_MODIFY",
"KLUDGY_ORDINALS_PASS_THROUGH",
"KLUDGY_ORDINALS_TRANSLATE",
"mark_paragraphs",
"normalized_text_from_tokens",
"normalized_text",
69 changes: 20 additions & 49 deletions src/tokenizer/definitions.py
@@ -605,55 +605,26 @@ class PersonNameTuple(NamedTuple):
)


# If the handle_kludgy_ordinals option is set to
# KLUDGY_ORDINALS_PASS_THROUGH, we do not convert
# kludgy ordinals but pass them through as word tokens.
KLUDGY_ORDINALS_PASS_THROUGH = 0
# If the handle_kludgy_ordinals option is set to
# KLUDGY_ORDINALS_MODIFY, we convert '1sti' to 'fyrsti', etc.,
# and return the modified word as a token.
KLUDGY_ORDINALS_MODIFY = 1
# If the handle_kludgy_ordinals option is set to
# KLUDGY_ORDINALS_TRANSLATE, we convert '1sti' to TOK.Ordinal('1sti', 1), etc.,
# but otherwise pass the original word through as a word token ('2ja').
KLUDGY_ORDINALS_TRANSLATE = 2

# Incorrectly written ('kludgy') ordinals
ORDINAL_ERRORS: Mapping[str, str] = {
"1sti": "fyrsti",
"1sta": "fyrsta",
"1stu": "fyrstu",
"3ji": "þriðji",
# "3ja": "þriðja", # þriggja
"3ju": "þriðju",
"4ði": "fjórði",
"4ða": "fjórða",
"4ðu": "fjórðu",
"5ti": "fimmti",
"5ta": "fimmta",
"5tu": "fimmtu",
"2svar": "tvisvar",
"3svar": "þrisvar",
"2ja": "tveggja",
"3ja": "þriggja",
"4ra": "fjögurra",
}

# Translations of kludgy ordinal words into numbers
ORDINAL_NUMBERS: Mapping[str, int] = {
"1sti": 1,
"1sta": 1,
"1stu": 1,
"3ji": 3,
"3ja": 3,
"3ju": 3,
"4ði": 4,
"4ða": 4,
"4ðu": 4,
"5ti": 5,
"5ta": 5,
"5tu": 5,
}
# Incorrectly written ('kludgy') ordinals: these are passed through unchanged
# as word tokens, but they need to be recognized so they are not parsed as numbers
KLUDGY_ORDINALS: tuple[str, ...] = (
"1sti",
"1sta",
"1stu",
"2svar",
"3svar",
"2ja",
"3ja",
"3ji",
"3ju",
"4ði",
"4ða",
"4ðu",
"4ra",
"5ti",
"5ta",
"5tu",
)
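Collapsing the two mappings into one flat tuple lets the recognition test become a single call, since `str.startswith` accepts a tuple of prefixes. A minimal sketch using a subset of the entries above:

```python
# Subset of the KLUDGY_ORDINALS tuple defined above
KLUDGY = ("1sti", "2ja", "3ja", "4ra", "5tu")

def looks_kludgy(token: str) -> bool:
    # True if the token begins with any known kludgy ordinal;
    # str.startswith tries each prefix in the tuple
    return token.startswith(KLUDGY)

print(looks_kludgy("3ja"), looks_kludgy("300"))  # -> True False
```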

# Handling of Roman numerals

16 changes: 0 additions & 16 deletions src/tokenizer/main.py
@@ -149,19 +149,6 @@
),
)

parser.add_argument(
"-k",
"--handle_kludgy_ordinals",
type=int,
default=0,
help=(
"Kludgy ordinal handling defined.\n"
"\t0: Returns the original word form.\n"
"\t1: Ordinals returned as pure words.\n"
"\t2: Ordinals returned as numbers."
),
)

parser.add_argument(
"-v",
"--version",
@@ -263,9 +250,6 @@ def val(t: Tok, quote_word: bool = False) -> Any:
if args.one_sent_per_line:
options["one_sent_per_line"] = True

if args.handle_kludgy_ordinals:
options["handle_kludgy_ordinals"] = args.handle_kludgy_ordinals

if args.original:
options["original"] = args.original

50 changes: 11 additions & 39 deletions src/tokenizer/tokenizer.py
@@ -1730,43 +1730,24 @@ def _is_letter(self, char: str) -> bool:
class NumberParser:
"""Parses a sequence of digits off the front of a raw token"""

def __init__(
self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool
) -> None:
def __init__(self, rt: Tok, convert_numbers: bool) -> None:
self.rt = rt
self.handle_kludgy_ordinals = handle_kludgy_ordinals
self.convert_numbers = convert_numbers

def parse(self) -> Iterable[Tok]:
"""Parse the raw token, yielding result tokens"""
# Handle kludgy ordinals: '3ji', '5ti', etc.
# Yield them unchanged as word tokens (pass-through behavior)
rt = self.rt
handle_kludgy_ordinals = self.handle_kludgy_ordinals
convert_numbers = self.convert_numbers
for key, val in ORDINAL_ERRORS.items():
rtxt = rt.txt
if rtxt.startswith(key):
# This is a kludgy ordinal
key_tok, rt = rt.split(len(key))
if handle_kludgy_ordinals == KLUDGY_ORDINALS_MODIFY:
# Convert ordinals to corresponding word tokens:
# '1sti' -> 'fyrsti', '3ji' -> 'þriðji', etc.
key_tok.substitute_longer((0, len(key)), val)
yield TOK.Word(key_tok)
elif (
handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE
and key in ORDINAL_NUMBERS
):
# Convert word-form ordinals into ordinal tokens,
# i.e. '1sti' -> TOK.Ordinal('1sti', 1),
# but leave other kludgy constructs ('2ja')
# as word tokens
yield TOK.Ordinal(key_tok, ORDINAL_NUMBERS[key])
else:
# No special handling of kludgy ordinals:
# yield them unchanged as word tokens
rtxt = rt.txt
if rtxt.startswith(KLUDGY_ORDINALS):
# This is a kludgy ordinal: find which one matched and yield as word token
for key in KLUDGY_ORDINALS:
if rtxt.startswith(key):
key_tok, rt = rt.split(len(key))
yield TOK.Word(key_tok)
break # This skips the for loop 'else'
break
else:
# Not a kludgy ordinal: eat tokens starting with a digit
t, rt = parse_digits(rt, convert_numbers)
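The simplified dispatch can be summarized as: check the prefix tuple first, split off the matching ordinal as a word token, otherwise fall through to digit parsing. A standalone sketch over plain strings (the hypothetical `split_front` stands in for `Tok.split`, and the string pair stands in for real tokens):

```python
KLUDGY = ("1sti", "2ja", "3ja", "4ði", "5ti")

def split_front(text: str, n: int) -> tuple[str, str]:
    # Stand-in for Tok.split: peel n characters off the front
    return text[:n], text[n:]

def parse_number(text: str) -> tuple[str, str]:
    """Return (kind, token) for the leading token of `text`."""
    if text.startswith(KLUDGY):
        # Kludgy ordinal: find which entry matched, pass it through as a word
        for key in KLUDGY:
            if text.startswith(key):
                tok, _rest = split_front(text, len(key))
                return ("WORD", tok)
    # Not a kludgy ordinal: eat leading digits as a number
    i = 0
    while i < len(text) and text[i].isdigit():
        i += 1
    return ("NUMBER", text[:i])

print(parse_number("3ja"))   # -> ('WORD', '3ja')
print(parse_number("1234"))  # -> ('NUMBER', '1234')
```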
@@ -1898,7 +1879,6 @@ def parse(self, rt: Tok) -> Iterable[Tok]:

def parse_mixed(
rt: Tok,
handle_kludgy_ordinals: int,
convert_numbers: bool,
replace_composite_glyphs: bool = True,
) -> Iterable[Tok]:
@@ -1994,7 +1974,7 @@ def parse_mixed(
rtxt[0] in DIGITS_PREFIX
or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX)
):
np = NumberParser(rt, handle_kludgy_ordinals, convert_numbers)
np = NumberParser(rt, convert_numbers)
yield from np.parse()
rt = np.rt
ate = True
@@ -2072,12 +2052,6 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok
replace_html_escapes: bool = options.get("replace_html_escapes", False)
one_sent_per_line: bool = options.get("one_sent_per_line", False)

# The default behavior for kludgy ordinals is to pass them
# through as word tokens
handle_kludgy_ordinals: int = options.get(
"handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH
)

# This code proceeds roughly as follows:
# 1) The text is split into raw tokens on whitespace boundaries.
# 2) (By far the most common case:) Raw tokens that are purely
@@ -2178,9 +2152,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok
yield TOK.Punctuation(punct, normalized="‚")

# More complex case of mixed punctuation, letters and numbers
yield from parse_mixed(
rt, handle_kludgy_ordinals, convert_numbers, replace_composite_glyphs
)
yield from parse_mixed(rt, convert_numbers, replace_composite_glyphs)

# Yield a sentinel token at the end that will be cut off by the final generator
yield TOK.End_Sentinel()
9 changes: 0 additions & 9 deletions test/test_cli.py
@@ -200,13 +200,4 @@ def test_cli(capsys: CaptureFixture[str], monkeypatch: MonkeyPatch) -> None:
== "Hann fékk 7,5 í meðaleinkunn en bara 3,3 í íþróttum , og hlaut 2.000,5 USD fyrir ."
)

# Handle kludgy ordinals
# --handle_kludgy_ordinals flag
t = "Hann var 1sti maðurinn til að heimsækja tunglið."
r = run_cli(c, m, ["-", "-", "--handle_kludgy_ordinals", "1"], t)
assert r == "Hann var fyrsti maðurinn til að heimsækja tunglið ."
# TODO: Broken functionality, needs to be fixed
# r = run_cli(c, m, ["-", "-", "--handle_kludgy_ordinals", "2"], t)
# assert r == "Hann var 1. maðurinn til að heimsækja tunglið ."

# TODO: Add more tests for the CLI to achieve 100% coverage
20 changes: 0 additions & 20 deletions test/test_index_calculation.py
@@ -638,26 +638,6 @@ def test_composite_phrases() -> None:
assert byte_indexes == [0, 25, 26]


def test_lengthening_substitutions() -> None:
s = "Þetta er 3ji báturinn!"
# 0123456789012345678901
# ^ ^ ^ ^ ^
# x x
# ! lengthening happens here (3ji->þriðji)
toks = tokenizer.parse_tokens(
s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
)
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
assert char_indexes == [0, 5, 8, 12, 21]
assert byte_indexes == [0, 6, 9, 13, 23]
toks = tokenizer.parse_tokens(
s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
)
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
assert char_indexes == [0, 5, 8, 12, 21, 22]
assert byte_indexes == [0, 6, 9, 13, 23, 24]


def test_converted_measurements() -> None:
s = "Stillið ofninn á 12° C til að baka kökuna."
# 012345678901234567890123456789012345678901