From 24e77d79da474c2473794bdb5b4e668357a79558 Mon Sep 17 00:00:00 2001 From: Roberto Date: Mon, 29 Jun 2026 20:51:06 -0300 Subject: [PATCH 1/7] feat(resolver): asset classifier mapping identifiers to Wealthuman taxonomy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add resolve_asset(): turns any Brazilian asset identifier (ticker/CNPJ/ISIN/ name) into a classification mapped to the consolidation macro taxonomy (RF/RV/Multimercado/Alternativos/Estruturados) plus an orthogonal exposure axis (Brasil/Internacional), subclasse, underlying_nature, debenture Lei-12.431 facts, source, confidence, and an audit cascade. Deterministic, cacheable, no PII. Core is offline (curated ETF/global-fund seed + structural rules); external providers (Mais Retorno / CVM-B3 / web search) are an injectable fallback chain. Exposed over REST (/resolver/resolve) and as the resolve_asset MCP tool. Passes the Wealthuman spec test set: IFRA11→RF/Indexada à Inflação, ARBOR/WHG→RV+Internacional, DEB PETROBRAS IPCA+→RF incentivada+isento, COE→Estruturados (never ETF), "Crédito Estruturado"→RF (name-trap), IVVB11→RV+Internacional, FIIs→RV. Geography is modeled as the exposure axis, not a macro class (macro_class is pure asset class). Hardened after cross-host adversarial review: heuristic Lei-12.431 isento kept below the cascade short-circuit so a provider confirms by ISIN; bare-token collisions (IE/LC/LF, substring LCI) removed; API length caps; as_of stamped in America/Sao_Paulo. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 16 + docs/MCP_SURFACE.md | 9 +- pyproject.toml | 3 + src/findata/api/app.py | 2 + src/findata/api/mcp_app.py | 29 ++ src/findata/api/routers/resolver.py | 35 ++ src/findata/resolver/__init__.py | 35 ++ src/findata/resolver/engine.py | 565 ++++++++++++++++++++++++++++ src/findata/resolver/models.py | 124 ++++++ src/findata/resolver/normalize.py | 125 ++++++ src/findata/resolver/seed.py | 169 +++++++++ src/findata/resolver/taxonomy.py | 85 +++++ tests/test_mcp_surface.py | 4 +- tests/test_resolver.py | 232 ++++++++++++ 14 files changed, 1427 insertions(+), 6 deletions(-) create mode 100644 src/findata/api/routers/resolver.py create mode 100644 src/findata/resolver/__init__.py create mode 100644 src/findata/resolver/engine.py create mode 100644 src/findata/resolver/models.py create mode 100644 src/findata/resolver/normalize.py create mode 100644 src/findata/resolver/seed.py create mode 100644 src/findata/resolver/taxonomy.py create mode 100644 tests/test_resolver.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f12ff64..a2f2a67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,22 @@ adheres to [Semantic Versioning](https://semver.org/). ### Added +- **Asset-classification resolver** — `findata.resolver.resolve_asset()`, + `GET /resolver/resolve`, and the `resolve_asset` MCP tool. Turns any + Brazilian asset identifier (ticker/CNPJ/ISIN/name) into a classification + mapped to the consolidation macro taxonomy (Renda Fixa, Renda Variável, + Multimercado, Alternativos, Estruturados) plus an orthogonal `exposure` + axis (Brasil/Internacional), `subclasse`, `underlying_nature`, debenture + Lei-12.431 facts, `source`, `confidence`, and the `cascade` walked. + Deterministic and offline at its core (a curated ETF/global-fund seed + + structural rules), with an injectable external-provider chain (Mais + Retorno / CVM-B3 / restricted web search) for low-confidence fallback. 
+ Classifies ETFs/funds by underlying (IFRA11 debêntures → RF; IVVB11 ações + → RV + Internacional), defends the COE-never-ETF and "Crédito Estruturado" + name-traps, and keeps the Lei-12.431 isento flag below the cascade + short-circuit when only inferred by heuristic. Hardened after adversarial + review: bare-token collisions (`IE`/`LC`/`LF`/substring `LCI`) removed, + API length caps, `as_of` stamped in America/Sao_Paulo. - **ANBIMA Títulos Públicos (TPF) secondary market** — `get_tpf()`, `GET /anbima/tpf`, and `findata anbima tpf`. Daily reference rates for outstanding federal government bonds (LTN, LFT, NTN-B, NTN-C, NTN-F) from diff --git a/docs/MCP_SURFACE.md b/docs/MCP_SURFACE.md index db9d067..fbbb90a 100644 --- a/docs/MCP_SURFACE.md +++ b/docs/MCP_SURFACE.md @@ -49,14 +49,15 @@ safe. **The 95 REST routes that back the CLI and HTTP consumers never change.** | | 1:1 (old) | curated (new) | |---|---:|---:| -| MCP tools | 95 | **24** (25 with code mode) | -| `tools/list` size | ~85k chars (~21k tok) | **~29k chars (~7k tok)** | -| REST operations | 95 | **95 (unchanged)** | +| MCP tools | 95 | **25** (26 with code mode) | +| `tools/list` size | ~85k chars (~21k tok) | **~30k chars (~7k tok)** | +| REST operations | 95 | **96** | -## The 24 curated tools +## The 25 curated tools ``` registry_lookup ← start here: CNPJ / ticker / code / name → entities +resolve_asset ← classify an asset into the macro taxonomy + exposure bcb_series bcb_ptax bcb_focus (BCB: 12 → 3) cvm_company cvm_financials cvm_fund cvm_structured_fund (CVM: 22 → 4) diff --git a/pyproject.toml b/pyproject.toml index edbe9aa..870d722 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,6 +137,9 @@ max-statements = 50 # Curated MCP layer: FastAPI Query() defaults (B008), wide consolidated tools # (PLR0913), and intentional flat dataset-dispatch switches (C901/PLR0912/PLR0911). 
"src/findata/api/mcp_app.py" = ["B008", "PLR0913", "C901", "PLR0912", "PLR0911"] +# Resolver engine: the classification cascade is an intentional flat +# rule-by-rule switch (one branch per instrument shape) — auditable by design. +"src/findata/resolver/engine.py" = ["C901", "PLR0912", "PLR0911"] # CLI commands are naturally wide (many typer.Option flags). "src/findata/cli.py" = ["PLR0913"] # Banner uses rich + sys.stdout directly — not a print-statement debug. diff --git a/src/findata/api/app.py b/src/findata/api/app.py index 741683b..700cbdf 100644 --- a/src/findata/api/app.py +++ b/src/findata/api/app.py @@ -26,6 +26,7 @@ openfinance, receita, registry, + resolver, susep, tesouro, yahoo, @@ -139,6 +140,7 @@ async def _value_error_handler(_: Request, exc: ValueError) -> JSONResponse: app.include_router(aneel.router) app.include_router(susep.router) app.include_router(registry.router) +app.include_router(resolver.router) app.include_router(yahoo.router) diff --git a/src/findata/api/mcp_app.py b/src/findata/api/mcp_app.py index a026e5f..b82115e 100644 --- a/src/findata/api/mcp_app.py +++ b/src/findata/api/mcp_app.py @@ -39,6 +39,7 @@ from findata.api._b3_common import MAX_TICKERS, resolve_quotes from findata.registry import lookup +from findata.resolver import resolve_asset from findata.sources.anbima import indices as anbima_src from findata.sources.aneel import leiloes from findata.sources.b3 import cotahist, indices @@ -97,6 +98,34 @@ async def registry_lookup( return await lookup(q, limit=limit) +@router.get( + "/resolver/resolve", + operation_id="resolve_asset", + response_model=None, + summary="Classify a Brazilian asset into the macro taxonomy (RF/RV/Multi/Intl/Alt/Estrut)", +) +async def resolve_asset_tool( + name: str | None = Query( + None, max_length=256, description="Asset name/label, e.g. 'FI ITAUINFRA CI'" + ), + ticker: str | None = Query(None, max_length=16, description="B3 ticker, e.g. 
IFRA11, PETR4"), + cnpj: str | None = Query(None, max_length=32, description="Fund CNPJ (masked or not)"), + isin: str | None = Query(None, max_length=16, description="ISIN, e.g. BR..."), +) -> Any: + """Turn any asset identifier into a classification already mapped to the + consolidation macro taxonomy: Renda Fixa, Renda Variável, Multimercado, + Alternativos, Estruturados — plus an orthogonal ``exposure`` axis (Brasil/Internacional). + + Returns ``macro_class`` + ``subclasse`` + ``underlying_nature`` (splits + ETF-de-ações from ETF-de-debêntures), debenture/Lei-12.431 facts, ``source``, + ``confidence``, and the ``cascade`` walked — deterministic and cacheable. + Pass any subset of identifiers; a bare ticker/CNPJ given as ``name`` is + auto-detected. Use this (not ``registry_lookup``) when you need the asset's + macro class, not its registry entity. + """ + return await resolve_asset(name=name, ticker=ticker, cnpj=cnpj, isin=isin) + + # ── BCB: Banco Central ──────────────────────────────────────────── diff --git a/src/findata/api/routers/resolver.py b/src/findata/api/routers/resolver.py new file mode 100644 index 0000000..a47c8c3 --- /dev/null +++ b/src/findata/api/routers/resolver.py @@ -0,0 +1,35 @@ +"""Asset-classification resolver routes. + +Wraps :func:`findata.resolver.resolve_asset` over HTTP. The consolidator calls +this per asset (dozens per statement), so the handler is a thin, cacheable pass +through the deterministic core. No PII: only an asset identifier crosses the +boundary. 
+""" + +from __future__ import annotations + +from fastapi import APIRouter, Query + +from findata.resolver import AssetClassification, resolve_asset + +router = APIRouter(prefix="/resolver", tags=["Resolver"]) + + +@router.get("/resolve") +async def resolve( + name: str | None = Query( + None, max_length=256, description="Nome/label do ativo (ex.: 'FI ITAUINFRA CI')" + ), + ticker: str | None = Query(None, max_length=16, description="Ticker B3 (ex.: IFRA11, PETR4)"), + cnpj: str | None = Query(None, max_length=32, description="CNPJ do fundo (com ou sem máscara)"), + isin: str | None = Query(None, max_length=16, description="ISIN (ex.: BR...)"), +) -> AssetClassification: + """Classifica um ativo na taxonomia macro Wealthuman. + + Aceita qualquer identificador (``name``/``ticker``/``cnpj``/``isin``) e + devolve ``macro_class`` já mapeada (Renda Fixa, Renda Variável, Multimercado, + Alternativos, Estruturados) + ``exposure`` (Brasil/Internacional), subclasse, underlying, + debênture/Lei 12.431, ``source``, ``confidence`` e a cascata percorrida. + Determinístico e cacheável. + """ + return await resolve_asset(name=name, ticker=ticker, cnpj=cnpj, isin=isin) diff --git a/src/findata/resolver/__init__.py b/src/findata/resolver/__init__.py new file mode 100644 index 0000000..003c19b --- /dev/null +++ b/src/findata/resolver/__init__.py @@ -0,0 +1,35 @@ +"""Wealthuman asset-classification resolver. + +``resolve_asset(identifier)`` turns any Brazilian asset identifier (ticker, +CNPJ, ISIN, or bare name) into a classification already mapped to the Wealthuman +macro taxonomy (Renda Fixa, Renda Variável, Multimercado, Alternativos, +Estruturados) plus the ``exposure`` axis (Brasil/Internacional), subclasse, +underlying nature, debenture / Lei-12.431 facts, source, confidence, and an audit cascade. + +Deterministic, cacheable, auditable, no PII. See ``openfindata-mcp-spec.md``. 
+""" + +from __future__ import annotations + +from findata.resolver.engine import AssetProvider, classify, resolve_asset +from findata.resolver.models import ( + AssetClassification, + CvmInfo, + DebentureInfo, + IdentifierResolved, + TaxInfo, +) +from findata.resolver.normalize import NormalizedInput, normalize + +__all__ = [ + "AssetClassification", + "AssetProvider", + "CvmInfo", + "DebentureInfo", + "IdentifierResolved", + "NormalizedInput", + "TaxInfo", + "classify", + "normalize", + "resolve_asset", +] diff --git a/src/findata/resolver/engine.py b/src/findata/resolver/engine.py new file mode 100644 index 0000000..eabb757 --- /dev/null +++ b/src/findata/resolver/engine.py @@ -0,0 +1,565 @@ +"""The resolver engine: deterministic rule cascade + ``resolve_asset``. + +Classification is decided in this order, most-specific signal first: + +1. **Curated seed** (:mod:`findata.resolver.seed`) — only the non-derivable + cases (ETF underlying, global-mandate FIA). +2. **Structural rules** (this module) — name/ticker patterns that *are* + derivable: COE, debenture, CRA/CRI, bank paper, Tesouro, IE/global, + FII, FIA/Ações, Multimercado, FIDC/FIP, plain tickers. +3. **External providers** (optional, injected) — Mais Retorno MCP, CVM/B3, + restricted web search. Not bundled here (they are client-side / networked); + the resolver takes a chain of async callbacks so a deployment can wire them. + Each step that fires lowers ``confidence`` and is appended to ``cascade``. + +The seed + rules layers are pure and offline, so the spec's test set resolves +deterministically with no network. ``source`` is ``"openfindata"`` for every +core hit; an external provider that overrides a field updates ``source`` too. + +Key traps the ordering encodes (spec §Armadilhas): + * ``"Crédito Estruturado"`` is RF (credit), **not** Estruturados — checked + before any COE/Estruturados rule. + * **COE** is always Estruturados and **never** an ETF. 
+ * an ETF/fund is classified by its **underlying** (IFRA11 debêntures → RF; + IVVB11 ações → RV). + * geography is the orthogonal ``exposure`` axis, never a macro class: a + global-mandate FIA is RV + exposure=Internacional; IVVB11 is RV + + Internacional; a BDR is RV + Internacional. +""" + +from __future__ import annotations + +from datetime import datetime +from typing import Any, Protocol, cast +from zoneinfo import ZoneInfo + +from findata.resolver.models import ( + AssetClassification, + CvmInfo, + DebentureInfo, + Exposure, + IdentifierResolved, + TaxInfo, +) +from findata.resolver.normalize import NormalizedInput, normalize +from findata.resolver.seed import lookup_seed + +# Issuers with a well-known programme of Lei-12.431 infrastructure debentures. +# Used only as a *heuristic* signal (debenture + IPCA-linked + infra issuer → +# likely incentivada); the live ANBIMA/debentures.com.br step confirms by ISIN. +# Explicit list = auditable; not a claim that every issue from these is 12.431. +_INFRA_DEBENTURE_ISSUERS = frozenset( + { + "PETROBRAS", + "RUMO", + "ENGIE", + "TAESA", + "ISA", + "CTEEP", + "ECORODOVIAS", + "CPFL", + "ENEVA", + "AEGEA", + "EQUATORIAL", + "NEOENERGIA", + "SABESP", + "COPEL", + "ELETROBRAS", + "ENERGISA", + "OMEGA", + "AUREN", + "COMGAS", + "VIBRA", + "MOTIVA", + "SANEPAR", + "CEMIG", + } +) + +_GLOBAL_KEYWORDS = ( + "GLOBAL", + "GLOBAIS", + "WORLD", + "WORLDWIDE", + "INTERNACIONAL", + "INTERNATIONAL", + "EXTERIOR", +) + +_FUND_CONTEXT_TOKENS = ("FIA", "FIC", "FIM", "FUNDO", "FDO", "FUND", "MASTER", "FI") + +# ``as_of`` is stamped in Brazil time: the consolidation is a BR-market artifact, +# so a server in another timezone must not shift the audit date across midnight. +_BR_TZ = ZoneInfo("America/Sao_Paulo") + +# Above this confidence (and with a decided macro) the cascade short-circuits: +# no point spending an external round-trip to confirm a strong core hit. 
+_CONFIDENT_ENOUGH = 0.9 + + +class AssetProvider(Protocol): + """An external cascade step (Mais Retorno, CVM/B3, web search). + + Receives the normalized input and the best classification so far; returns an + enriched classification (new ``source``, possibly higher-detail fields) or + ``None`` to pass. Implementations live outside the library because they are + networked / client-side; the resolver only orchestrates them. + """ + + async def __call__( + self, norm: NormalizedInput, current: AssetClassification + ) -> AssetClassification | None: ... + + +# ── Small parsers ────────────────────────────────────────────────── + + +def parse_indexador(name_folded: str) -> str | None: + """Recover the index from a folded RF instrument name, or ``None``.""" + if "IPCA" in name_folded: + return "IPCA+" + if "CDI+" in name_folded or "CDI +" in name_folded: + return "CDI+" + if "%CDI" in name_folded or "% CDI" in name_folded or "DO CDI" in name_folded: + return "%CDI" + if "SELIC" in name_folded: + return "SELIC" + if "PREFIX" in name_folded or "PRE FIXAD" in name_folded: + return "PREFIXADO" + if "CDI" in name_folded: + return "%CDI" + return None + + +def _subclasse_from_indexador(indexador: str | None) -> str: + if indexador == "IPCA+": + return "Indexada à Inflação" + if indexador in {"%CDI", "CDI+", "SELIC"}: + return "Pós-fixada" + if indexador == "PREFIXADO": + return "Prefixada" + return "Crédito Privado" + + +def _infer_incentivada( + norm: NormalizedInput, indexador: str | None +) -> tuple[bool | None, str, str | None]: + """Decide Lei-12.431 incentivada for a debenture. Returns (flag, note, basis). + + ``basis`` is ``"explicit"`` for an in-name signal (high certainty), + ``"heuristic"`` for the IPCA+infra-issuer inference (must be confirmed by + ISIN, so the caller keeps confidence low and lets the cascade verify), or + ``None`` when there is no signal at all (we return ``None`` for the flag — + unknown, never assert False). 
+ """ + if norm.name_contains("INCENTIVAD", "12.431", "12431", "INFRAESTRUTURA", "FI-INFRA") or ( + norm.has_token("INFRA") and norm.has_token("DEB", "DEBENTURE", "DEBENTURES") + ): + return ( + True, + "Incentivada Lei 12.431 (sinal explícito de infraestrutura no nome).", + "explicit", + ) + issuer_hit = any(t in _INFRA_DEBENTURE_ISSUERS for t in norm.tokens) + if issuer_hit and indexador == "IPCA+": + return ( + True, + ( + "Incentivada Lei 12.431 SÓ por heurística (debênture IPCA+ de emissor com " + "programa de infra); confidence baixa de propósito — confirmar por ISIN em " + "ANBIMA/debentures.com.br antes de tratar como isento." + ), + "heuristic", + ) + return None, "", None + + +# ── The rule cascade ─────────────────────────────────────────────── + + +def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: + """Run the structural rules; return a partial payload dict (always non-empty). + + The first matching rule wins. Every branch sets at least ``kind`` and + ``macro_class``; the assembler fills defaults for the rest. + """ + n = norm + + # 0) Name-trap: "Crédito Estruturado" is structured *credit* → RF, NOT + # Estruturados. Must precede the COE rule. + if n.name_contains("CREDITO ESTRUTURADO"): + return { + "kind": "fundo", + "macro_class": "Renda Fixa", + "subclasse": "Crédito Privado", + "exposure": "Brasil", + "underlying_nature": "credito", + "confidence": 0.9, + "notes": "Name-trap: 'Crédito Estruturado' é crédito (RF), não COE/Estruturados.", + } + + # 1) COE / operações estruturadas → Estruturados, never an ETF. 
+ if n.has_token("COE") or n.name_contains( + "OPERACOES ESTRUTURADAS", + "OPERACAO ESTRUTURADA", + "CERTIFICADO DE OPERACOES", + "CERT DE OPERACOES", + "NOTA ESTRUTURADA", + "NOTAS ESTRUTURADAS", + ): + return { + "kind": "coe", + "macro_class": "Estruturados", + "subclasse": "COE", + "underlying_nature": "outro", + "estrutura": "COE", + "confidence": 0.95, + "notes": "COE (Certificado de Operações Estruturadas, CETIP) → Estruturados.", + } + + # 2) Debenture → RF; parse indexador + incentivada. + if n.has_token("DEB", "DEBENTURE", "DEBENTURES", "DEBENT"): + indexador = parse_indexador(n.name_folded) + incentivada, note, basis = _infer_incentivada(n, indexador) + deb: dict[str, Any] = {"indexador": indexador} + tax: dict[str, Any] = {} + if incentivada: + deb["incentivada_1243"] = True + tax["isento"] = True + # An *explicit* infra signal is high-confidence. The issuer+IPCA + # heuristic is deliberately kept below the cascade short-circuit + # threshold (_CONFIDENT_ENOUGH) so a wired provider re-checks the + # isento claim by ISIN instead of it being taken as fact. + if basis == "explicit": + confidence = 0.92 + elif basis == "heuristic": + confidence = 0.7 + else: + confidence = 0.88 + return { + "kind": "debenture", + "macro_class": "Renda Fixa", + "subclasse": _subclasse_from_indexador(indexador), + "exposure": "Brasil", + "underlying_nature": "credito", + "estrutura": "debenture", + "debenture": deb, + "tax": tax, + "confidence": confidence, + "notes": note or "Debênture → Renda Fixa.", + } + + # 3) Securitização (CRA/CRI) → RF. + if n.has_token("CRA", "CRI") or n.name_contains( + "CERT. 
RECEBIVEIS", "CERTIFICADO DE RECEBIVEIS" + ): + agro = n.has_token("CRA") or n.name_contains("AGRONEGOCIO") + return { + "kind": "cra" if agro else "cri", + "macro_class": "Renda Fixa", + "subclasse": "Crédito Privado", + "exposure": "Brasil", + "underlying_nature": "recebiveis", + "tax": {"isento": True}, # CRA/CRI: IR-exempt for PF + "confidence": 0.9, + "notes": "Securitização (recebíveis) → Renda Fixa, isento p/ PF.", + } + + # 4) Bank paper (CDB/RDB/LIG/Letra Financeira/Letra de Câmbio) → RF. + # NB: the bare 2-char tokens "LC"/"LF" are too collision-prone (they hit + # issuer names, share classes, internal codes), so they are matched only + # via their unambiguous phrases, never as bare tokens. + if n.has_token("CDB", "RDB", "LIG") or n.name_contains("LETRA FINANCEIRA", "LETRA DE CAMBIO"): + return { + "kind": "cdb", + "macro_class": "Renda Fixa", + "subclasse": _subclasse_from_indexador(parse_indexador(n.name_folded)), + "exposure": "Brasil", + "underlying_nature": "credito", + "confidence": 0.88, + "notes": "Emissão bancária → Renda Fixa.", + } + if n.has_token("LCI", "LCA") or n.name_contains( + "LETRA DE CREDITO IMOBILIARIO", "LETRA DE CREDITO DO AGRONEGOCIO" + ): + return { + "kind": "lci_lca", + "macro_class": "Renda Fixa", + "subclasse": _subclasse_from_indexador(parse_indexador(n.name_folded)), + "exposure": "Brasil", + "underlying_nature": "credito", + "tax": {"isento": True}, + "confidence": 0.9, + "notes": "LCI/LCA → Renda Fixa, isento p/ PF.", + } + + # 5) Tesouro / public bonds → RF. 
+ if n.has_token("TESOURO", "NTN", "LTN", "LFT", "NTNB", "NTNF") or n.name_contains( + "TESOURO DIRETO", "TESOURO SELIC", "TESOURO IPCA", "TESOURO PREFIXADO" + ): + return { + "kind": "tesouro", + "macro_class": "Renda Fixa", + "subclasse": _subclasse_from_indexador(parse_indexador(n.name_folded)), + "exposure": "Brasil", + "underlying_nature": "tesouro", + "confidence": 0.95, + "notes": "Título público federal → Renda Fixa.", + } + + # 6) Internacional EXPOSURE — IE structure, or global keyword. Geography is + # the `exposure` axis, NOT a macro class: the asset class still comes from + # the fund type (equities→RV, dívida externa→RF, else Multimercado). BOTH + # triggers require a fund context: a bare "IE"/"GLOBAL" token outside a + # fund name is too collision-prone (e.g. "COMPANHIA IE ENERGIA SA"). + # Runs before FIA/Ações so "FIC FIA IE" / "GLOBAL FIM" land here. + fund_context = n.has_token(*_FUND_CONTEXT_TOKENS) + if fund_context and ( + n.has_token("IE") + or n.name_contains(*_GLOBAL_KEYWORDS, "INVESTIMENTO NO EXTERIOR", "INV EXTERIOR") + ): + equities = n.has_token("FIA") or n.name_contains("ACOES", "EQUITY") + rf = n.name_contains("DIVIDA EXTERNA", "RENDA FIXA", "BOND", "CREDITO", "DEBT") + if equities: + macro, subclasse, underlying = "Renda Variável", "Ações Global", "acoes" + elif rf: + macro, subclasse, underlying = "Renda Fixa", "Dívida Externa", "credito" + else: + macro, subclasse, underlying = "Multimercado", "Multimercado Global", "multiativos" + return { + "kind": "fundo", + "macro_class": macro, + "subclasse": subclasse, + "exposure": "Internacional", + "underlying_nature": underlying, + "estrutura": "IE" if n.has_token("IE") else "FIC", + "confidence": 0.9, + "notes": f"Mandato internacional (IE / global): {macro}, exposição Internacional.", + } + + # 7) FII (by name; ticker-only 11s are caught at step 13). 
+ if n.has_token("FII") or n.name_contains( + "FUNDO IMOBILIARIO", + "FDO INV IMOB", + "FUNDO DE INVESTIMENTO IMOBILIARIO", + "INVESTIMENTO IMOBILIARIO", + ): + return { + "kind": "fii", + "macro_class": "Renda Variável", + "subclasse": "FII", + "exposure": "Brasil", + "underlying_nature": "imoveis", + "estrutura": "FII", + "confidence": 0.92, + "notes": "Fundo Imobiliário → Renda Variável (subclasse FII).", + } + + # 8) ETF by name, no curated hit → infer underlying from name keywords. + if n.has_token("ETF") or n.name_contains("ISHARES", "INDEX FUND"): + rf = n.name_contains("RENDA FIXA", "DEBENTURE", "BOND", "IMA-", "IRF-", "TESOURO", "INFRA") + if rf: + sovereign = n.name_contains("TESOURO", "IMA-", "IRF-", "LFT", "NTN", "LTN") + credit = n.name_contains("DEBENTURE", "INFRA") + return { + "kind": "etf", + "macro_class": "Renda Fixa", + "subclasse": "ETF de renda fixa", + "exposure": "Brasil", + "underlying_nature": "debentures" + if credit + else ("tesouro" if sovereign else "credito"), + "estrutura": "ETF", + "confidence": 0.78, + "notes": "ETF com underlying de renda fixa (inferido do nome).", + } + intl = n.name_contains(*_GLOBAL_KEYWORDS, "S&P", "SP500", "NASDAQ", "MSCI", "EUA", "US ") + return { + "kind": "etf", + "macro_class": "Renda Variável", + "subclasse": "ETF de ações internacional" if intl else "ETF de ações", + "exposure": "Internacional" if intl else "Brasil", + "underlying_nature": "acoes", + "estrutura": "ETF", + "confidence": 0.72, + "notes": "ETF sem ticker no seed; underlying assumido = ações. Confirmar.", + } + + # 9) FIDC → RF (direitos creditórios, natureza de crédito). 
+ if n.has_token("FIDC") or n.name_contains("DIREITOS CREDITORIOS"): + return { + "kind": "fundo", + "macro_class": "Renda Fixa", + "subclasse": "Crédito Estruturado", + "exposure": "Brasil", + "underlying_nature": "recebiveis", + "estrutura": "FIDC", + "confidence": 0.85, + "notes": "FIDC (direitos creditórios) → Renda Fixa (crédito).", + } + + # 10) FIP → Alternativos (private equity). + if n.has_token("FIP") or n.name_contains("PARTICIPACOES", "PRIVATE EQUITY"): + return { + "kind": "fundo", + "macro_class": "Alternativos", + "subclasse": "Private Equity", + "underlying_nature": "private_equity", + "estrutura": "FIP", + "confidence": 0.88, + "notes": "FIP (participações) → Alternativos.", + } + + # 11) Multimercado. + if n.has_token("FIM") or n.name_contains("MULTIMERCADO", "MULTIESTRATEGIA", "MACRO"): + return { + "kind": "fundo", + "macro_class": "Multimercado", + "subclasse": "Multimercado", + "underlying_nature": "multiativos", + "estrutura": "FIM", + "confidence": 0.85, + "notes": "Multimercado.", + } + + # 12) Ações / FIA (domestic equities). + if n.has_token("FIA") or n.name_contains("FUNDO DE ACOES", "ACOES", "EQUITY"): + return { + "kind": "fundo", + "macro_class": "Renda Variável", + "subclasse": "Ações", + "exposure": "Brasil", + "underlying_nature": "acoes", + "estrutura": "FIA", + "confidence": 0.85, + "notes": "Fundo de Ações → Renda Variável.", + } + + # 13) Ticker shapes (no name signal won above). + suffix = n.ticker_digits_suffix + if n.ticker: + # 11 not in any curated ETF/RF list → overwhelmingly a FII. + if suffix == "11": + return { + "kind": "fii", + "macro_class": "Renda Variável", + "subclasse": "FII", + "exposure": "Brasil", + "underlying_nature": "imoveis", + "estrutura": "FII", + "confidence": 0.72, + "notes": "Ticker terminado em 11 fora do seed de ETFs → FII (heurística).", + } + # BDR (34/35): recibo de ação estrangeira. 
RV por classe, mas o holder + # carrega risco cambial/exterior → Internacional por exposição (default; + # BDRs de empresa brasileira no exterior são exceção, não a regra). + if suffix in {"34", "35"}: + return { + "kind": "bdr", + "macro_class": "Renda Variável", + "subclasse": "BDR", + "exposure": "Internacional", + "underlying_nature": "acoes", + "confidence": 0.8, + "notes": "BDR (recibo de ação estrangeira) → RV, exposição Internacional.", + } + # 3-8: ordinary/preferred share — ação brasileira. + return { + "kind": "acao", + "macro_class": "Renda Variável", + "subclasse": "Ações", + "exposure": "Brasil", + "underlying_nature": "acoes", + "confidence": 0.85, + "notes": "Ação listada na B3 → Renda Variável.", + } + + # 14) Nothing matched — honest "I don't know" for HITL review. + return { + "kind": "outro", + "macro_class": "Indefinido", + "confidence": 0.2, + "notes": "Sem sinal estrutural suficiente; requer revisão (human-in-the-loop).", + } + + +# ── Assembly ─────────────────────────────────────────────────────── + + +def _resolve_exposure(payload: dict[str, Any]) -> Exposure | None: + """The geography axis, taken from the rule/seed payload. ``None`` when the + rule could not decide (e.g. 
a COE whose underlying may be either).""" + explicit = payload.get("exposure") + return cast(Exposure, explicit) if explicit is not None else None + + +def _assemble(norm: NormalizedInput, payload: dict[str, Any], step: str) -> AssetClassification: + """Turn a rule/seed payload dict into the typed output contract.""" + deb = payload.get("debenture") + tax = payload.get("tax") or {} + return AssetClassification( + identifier_resolved=IdentifierResolved( + cnpj=norm.cnpj, ticker=norm.ticker, isin=norm.isin, name=norm.name_raw + ), + kind=payload["kind"], + cvm=CvmInfo( + classe=payload.get("cvm_classe"), + anbima_categoria=payload.get("anbima_categoria"), + estrutura=payload.get("estrutura"), + ), + macro_class=payload["macro_class"], + subclasse=payload.get("subclasse"), + exposure=_resolve_exposure(payload), + underlying_nature=payload.get("underlying_nature"), + debenture=DebentureInfo(**deb) if deb else None, + tax=TaxInfo(**tax), + source=payload.get("source", "openfindata"), + confidence=payload.get("confidence", 0.5), + as_of=datetime.now(_BR_TZ).date().isoformat(), + cascade=[step], + notes=payload.get("notes"), + ) + + +def classify(norm: NormalizedInput) -> AssetClassification: + """Pure, offline classification: curated seed → structural rules. + + Always returns a record (``Indefinido`` when nothing matches). This is the + deterministic core that the spec test set exercises with no network. + """ + seed = lookup_seed(ticker=norm.ticker, cnpj=norm.cnpj, name_folded=norm.name_folded) + if seed is not None: + return _assemble(norm, seed.payload, step="openfindata:curated") + return _assemble(norm, _rule_payload(norm), step="openfindata:rules") + + +async def resolve_asset( + name: str | None = None, + *, + cnpj: str | None = None, + ticker: str | None = None, + isin: str | None = None, + providers: list[AssetProvider] | None = None, +) -> AssetClassification: + """Resolve an asset to its Wealthuman classification. 
+ + Runs the deterministic core (curated seed → structural rules), then walks the + optional external provider chain (Mais Retorno → CVM/B3 → restricted web + search) only while the result is still weak (``Indefinido`` or low + confidence). Each provider that fires is appended to ``cascade`` and may lower + confidence; the deepest one to set a field owns ``source``. + + No PII: callers pass only an asset identifier, never client data. + """ + norm = normalize(name=name, cnpj=cnpj, ticker=ticker, isin=isin) + result = classify(norm) + + for provider in providers or []: + # Stop early once we are confident — saves the network round-trips. + if result.macro_class != "Indefinido" and result.confidence >= _CONFIDENT_ENOUGH: + break + enriched = await provider(norm, result) + if enriched is not None: + enriched.cascade = [*result.cascade, *enriched.cascade] + result = enriched + return result diff --git a/src/findata/resolver/models.py b/src/findata/resolver/models.py new file mode 100644 index 0000000..386dfcf --- /dev/null +++ b/src/findata/resolver/models.py @@ -0,0 +1,124 @@ +"""Output contract for ``resolve_asset`` — the Wealthuman classification. + +The resolver's job is to turn *any* asset identifier (ticker, CNPJ, ISIN, or +bare name) into a classification **already mapped to the Wealthuman macro +taxonomy**, not the raw CVM/ANBIMA category. Every field that can drive a +human-in-the-loop decision (``source``, ``confidence``, ``as_of``, ``cascade``) +is explicit, so a consolidated statement can be audited line by line. + +Shapes mirror the spec in ``openfindata-mcp-spec.md`` §Output. Kept in lockstep +with the engine in :mod:`findata.resolver.engine`. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field + +# ── Controlled vocabularies ──────────────────────────────────────── + +# Veículo / instrumento. Mirrors the spec's ``kind`` enum. 
+Kind = Literal[ + "fundo", + "acao", + "fii", + "etf", + "bdr", + "debenture", + "cra", + "cri", + "cdb", + "lci_lca", + "tesouro", + "coe", + "outro", +] + +# Wealthuman macro taxonomy — PURE asset class. Geography is NOT a macro value: +# "Internacional" lives only on the orthogonal ``Exposure`` axis. So an offshore +# equity fund is RV + exposure=Internacional, offshore debt is RF + Internacional. +# ``Indefinido`` is the honest answer when no layer can decide (drives HITL review). +MacroClass = Literal[ + "Renda Fixa", + "Renda Variável", + "Multimercado", + "Alternativos", + "Estruturados", + "Indefinido", +] + +# Geography/strategy axis — *where the economic exposure sits*, orthogonal to +# the asset class. A B3-listed equity ETF on the S&P 500 (IVVB11) is RV by class +# but Internacional by exposure; a BDR is RV but the holder bears USD/foreign +# risk → Internacional. The B3 listing is only the asset's domicile, not its +# exposure. ``None`` when the resolver cannot decide. +Exposure = Literal["Brasil", "Internacional"] + +# Economic nature of the underlying. For ETFs/funds this is what splits an +# ETF-de-ações (RV) from an ETF-de-debêntures (RF) — see IFRA11 vs IVVB11. +UnderlyingNature = Literal[ + "acoes", + "debentures", + "credito", + "recebiveis", + "imoveis", + "multiativos", + "tesouro", + "cambio", + "private_equity", + "outro", +] + + +class IdentifierResolved(BaseModel): + """The identifiers the resolver could normalize/confirm from the input.""" + + cnpj: str | None = None + ticker: str | None = None + isin: str | None = None + name: str | None = None + + +class CvmInfo(BaseModel): + """Raw upstream classification, kept for audit alongside the mapped macro.""" + + classe: str | None = None + anbima_categoria: str | None = None + estrutura: str | None = None # FIA | FIM | FIC | FIDC | FIP | FII | IE | ETF | ... + + +class DebentureInfo(BaseModel): + """Debenture-specific facts. 
class AssetClassification(BaseModel):
    """The full resolver output. One asset in → one auditable record out.

    Required fields (no default): ``identifier_resolved``, ``kind``,
    ``macro_class``, ``source``, ``confidence`` and ``as_of``. Everything else
    defaults to ``None`` or to an empty sub-model / list, so a partially decided
    classification is still a valid record.
    """

    # Identifiers the resolver could normalize/confirm from the caller's input.
    identifier_resolved: IdentifierResolved
    # Vehicle / instrument type (closed vocabulary, see ``Kind``).
    kind: Kind
    # Raw upstream classification, kept next to the mapped macro for audit.
    cvm: CvmInfo = Field(default_factory=CvmInfo)
    # Mapped macro taxonomy value; "Indefinido" is a legal value (see ``MacroClass``).
    macro_class: MacroClass
    subclasse: str | None = None
    exposure: Exposure | None = None  # geography/strategy axis (Brasil vs Internacional)
    underlying_nature: UnderlyingNature | None = None
    # Debenture-specific facts; ``None`` when there are no debenture facts to report.
    debenture: DebentureInfo | None = None
    tax: TaxInfo = Field(default_factory=TaxInfo)
    source: str  # openfindata | maisretorno | cvm | b3 | web_search
    # Validated by pydantic to lie in the closed interval [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    # Plain string, not a ``date``: the format is a convention, not validated here.
    as_of: str  # YYYY-MM-DD
    # Audit trail: ordered list of resolution steps actually attempted.
    cascade: list[str] = Field(default_factory=list)
    # Free-text rationale, e.g. which trap was avoided or which signal decided.
    notes: str | None = None
# A B3 trading code: a 4-character root followed by 1-2 digits (PETR4, IVVB11,
# HGLG11). The root starts with a letter but may carry digits after it — B3SA3
# and the curated fixed-income ETFs B5P211 / IB5M11 / B5MB11 all do — so
# demanding four letters would silently discard them and make their seed entries
# unreachable. Fractional and subscription receipts (F, suffixes) are out of
# scope for classification.
_TICKER_RE = re.compile(r"^[A-Z][A-Z0-9]{3}\d{1,2}$")
# BDR: 4 letters + 34/35 (level I / level II). e.g. AAPL34, MSFT34.
_BDR_RE = re.compile(r"^[A-Z]{4}3[45]$")
# ISIN: 2-letter country + 9 alnum + 1 check digit. Brazil = BR....
_ISIN_RE = re.compile(r"^[A-Z]{2}[A-Z0-9]{9}\d$")
_CNPJ_LEN = 14


def fold(text: str) -> str:
    """ASCII-fold + uppercase, the canonical form for keyword matching.

    ``"Crédito Estruturado"`` → ``"CREDITO ESTRUTURADO"``. Mirrors how the
    registry stores tokens, so comparisons line up.

    Accents are removed by NFKD-decomposing and dropping the combining marks;
    surrounding whitespace is stripped. The function is idempotent.
    """
    nfkd = unicodedata.normalize("NFKD", text)
    ascii_only = "".join(c for c in nfkd if not unicodedata.combining(c))
    return ascii_only.upper().strip()


def _digits(text: str) -> str:
    """Return only the decimal digits of *text* (``"12.345/0001-90"`` → ``"12345000190"``)."""
    return re.sub(r"\D", "", text)


def _looks_like_ticker(text: str) -> bool:
    """True if the already-folded *text* has the shape of a B3 ticker or BDR code."""
    return bool(_TICKER_RE.match(text) or _BDR_RE.match(text))


def tokenize(name: str) -> list[str]:
    """Split a name into folded alphanumeric tokens.

    ``"DEB PETROBRAS IPCA+"`` → ``["DEB", "PETROBRAS", "IPCA"]``. Every run of
    characters outside ``A-Z0-9`` is a separator, so the ``+`` is dropped from
    the tokens; it is still present in the raw folded name, which the indexador
    parser reads, so ``IPCA+`` is still recoverable.
    """
    return [t for t in re.split(r"[^A-Z0-9]+", fold(name)) if t]


@dataclass(frozen=True)
class NormalizedInput:
    """Canonical, deterministic view of the caller's identifiers."""

    name_raw: str | None = None  # original, for echo-back
    name_folded: str = ""  # ASCII-folded + uppercased full string
    tokens: tuple[str, ...] = field(default_factory=tuple)
    cnpj: str | None = None  # 14 digits, or None
    ticker: str | None = None  # uppercased B3 ticker, or None
    isin: str | None = None

    def has_token(self, *candidates: str) -> bool:
        """True if any candidate appears as a whole token."""
        tset = set(self.tokens)
        return any(c in tset for c in candidates)

    def name_contains(self, *needles: str) -> bool:
        """True if any needle is a substring of the folded name (phrase match)."""
        return any(n in self.name_folded for n in needles)

    @property
    def ticker_digits_suffix(self) -> str | None:
        """The trailing digits of the ticker (``"11"`` for HGLG11), or None."""
        if not self.ticker:
            return None
        m = re.search(r"(\d{1,2})$", self.ticker)
        return m.group(1) if m else None


def normalize(
    *,
    name: str | None = None,
    cnpj: str | None = None,
    ticker: str | None = None,
    isin: str | None = None,
) -> NormalizedInput:
    """Build a :class:`NormalizedInput` from any subset of identifiers.

    A bare ``name`` that is itself a ticker/CNPJ/ISIN is promoted to the right
    field, so callers can pass a single opaque string and still get structured
    signals (the consolidator often only has the statement label). Promotion
    only happens when no typed identifier was supplied.

    Each typed identifier is validated after folding; a value that does not
    have the expected shape becomes ``None`` rather than raising, so a malformed
    identifier degrades to "not provided".
    """
    # Promote a bare identifier passed as `name` into its typed slot.
    if name and not (ticker or cnpj or isin):
        candidate = fold(name)
        if _looks_like_ticker(candidate):
            ticker = candidate
        elif _ISIN_RE.match(candidate):
            isin = candidate
        elif len(_digits(name)) == _CNPJ_LEN and not re.search(r"[A-Za-z]", name):
            # Punctuated CNPJ ("12.345.678/0001-90"): digits are extracted below.
            cnpj = name

    cnpj_norm = None
    if cnpj:
        d = _digits(cnpj)
        cnpj_norm = d if len(d) == _CNPJ_LEN else None

    ticker_norm = None
    if ticker:
        t = fold(ticker)
        ticker_norm = t if _looks_like_ticker(t) else None

    isin_norm = None
    if isin:
        i = fold(isin)
        isin_norm = i if _ISIN_RE.match(i) else None

    folded = fold(name) if name else ""
    return NormalizedInput(
        name_raw=name,
        name_folded=folded,
        tokens=tuple(tokenize(name)) if name else (),
        cnpj=cnpj_norm,
        ticker=ticker_norm,
        isin=isin_norm,
    )
@dataclass(frozen=True)
class SeedEntry:
    """One curated classification, matched by ticker / CNPJ / name substrings.

    An entry carries at least one matcher (``ticker``, ``cnpj`` or
    ``name_substrings``) plus the ``payload`` to apply when it matches. The
    dataclass is frozen, so attributes cannot be rebound; note that ``payload``
    is still a plain ``dict`` and therefore mutable in place — treat it as
    read-only and copy it before changing anything.
    """

    # Classification fields to apply on a hit (e.g. kind, macro_class, confidence).
    payload: dict[str, Any]
    # Exact-match keys; ``None`` means "this entry is not matched by that key".
    ticker: str | None = None
    cnpj: str | None = None
    # Name matcher: every element must be found in the folded name for a hit.
    name_substrings: tuple[str, ...] = field(default_factory=tuple)
def _build_etf_seed() -> list[SeedEntry]:
    """Expand the curated ETF tables into seed entries.

    Equity ETFs come first (in ``_EQUITY_ETFS`` order), followed by the
    fixed-income ETFs (in ``_FIXED_INCOME_ETFS`` order). Every entry is keyed by
    ticker and carries a freshly built payload dict.
    """

    def equity_payload(index_name: str, exposure: str) -> dict[str, Any]:
        # Asset class is always RV; only the exposure axis and label vary.
        offshore = exposure == "Internacional"
        return {
            "kind": "etf",
            "macro_class": "Renda Variável",
            "subclasse": "ETF de ações internacional" if offshore else "ETF de ações",
            "exposure": exposure,
            "underlying_nature": "acoes",
            "estrutura": "ETF",
            "notes": f"Curated: ETF de ações ({index_name}); classe RV, exposição {exposure}.",
            "confidence": 0.97,
        }

    def fixed_income_payload(
        underlying: str, subclasse: str, incentivada: bool
    ) -> dict[str, Any]:
        rationale = (
            "ETF de debêntures de infraestrutura (FI-Infra, Lei 12.431); "
            "underlying = debêntures incentivadas → RF, isento p/ PF."
            if incentivada
            else "ETF de renda fixa; classifica pelo underlying (→ RF)."
        )
        built: dict[str, Any] = {
            "kind": "etf",
            "macro_class": "Renda Fixa",
            "subclasse": subclasse,
            "exposure": "Brasil",
            "underlying_nature": underlying,
            "estrutura": "ETF",
            "notes": f"Curated: {rationale}",
            "confidence": 0.97,
        }
        if incentivada:
            # Only the incentivized-debenture ETF carries debenture and tax facts.
            built["debenture"] = {"incentivada_1243": True, "indexador": "IPCA+"}
            built["tax"] = {"isento": True}
        return built

    equity_entries = [
        SeedEntry(ticker=symbol, payload=equity_payload(*spec))
        for symbol, spec in _EQUITY_ETFS.items()
    ]
    fixed_income_entries = [
        SeedEntry(ticker=symbol, payload=fixed_income_payload(*spec))
        for symbol, spec in _FIXED_INCOME_ETFS.items()
    ]
    return equity_entries + fixed_income_entries
def _as_token_phrase(text: str) -> str:
    """Return folded *text* as single-spaced ``A-Z0-9`` tokens, padded with one space per side."""
    spaced = "".join(ch if ("A" <= ch <= "Z" or "0" <= ch <= "9") else " " for ch in text)
    return f" {' '.join(spaced.split())} "


def lookup_seed(*, ticker: str | None, cnpj: str | None, name_folded: str) -> SeedEntry | None:
    """Return the curated entry for this identifier, or ``None``.

    Lookup order is ticker, then CNPJ, then name. Ticker and CNPJ are exact.
    A name match requires *every* configured needle to appear in the folded
    name as a whole token (or whole-token phrase): ``("ARBOR", "FIA")`` matches
    "ARBOR FIC FIA" but not "ARBOR FIAGRO" or "ARBOR CONFIANCA FIM". Plain
    substring matching would let a short needle such as ``FIA`` fire inside
    unrelated words and sweep the wrong fund into a curated classification.
    """
    if ticker and ticker in _BY_TICKER:
        return _BY_TICKER[ticker]
    if cnpj and cnpj in _BY_CNPJ:
        return _BY_CNPJ[cnpj]
    if name_folded:
        # Folding again is a no-op for an already-folded name and keeps the
        # function safe for callers that pass a raw label.
        haystack = _as_token_phrase(fold(name_folded))
        for entry in _NAME_ENTRIES:
            needles = (_as_token_phrase(fold(sub)) for sub in entry.name_substrings)
            # A needle with no alphanumeric content constrains nothing (it was
            # vacuously true under substring matching too), so it is skipped.
            if all(needle in haystack for needle in needles if needle.strip()):
                return entry
    return None
# ── CVM legal class (CLASSE field of cad_fi) → macro (asset class) ──
# The CVM "classe" is the legal/regulatory bucket. macro is PURE asset class;
# geography (Internacional) is a separate axis — so "Fundo de Dívida Externa" is
# Renda Fixa here, with its Internacional exposure set by the exposure map below.
# Keys are looked up by exact match on the folded input, so each key must be
# written already ASCII-folded and uppercased.
CVM_CLASSE_TO_MACRO: dict[str, str] = {
    "FUNDO DE ACOES": "Renda Variável",
    "FUNDO DE RENDA FIXA": "Renda Fixa",
    "FUNDO MULTIMERCADO": "Multimercado",
    "FUNDO CAMBIAL": "Multimercado",  # pure FX — banker treats as Multi/Alt
    "FUNDO DE CURTO PRAZO": "Renda Fixa",
    "FUNDO REFERENCIADO": "Renda Fixa",
    "FUNDO DE DIVIDA EXTERNA": "Renda Fixa",  # asset class RF; exposure Internacional
    # FI-Infra (incentivized debentures) — RF by underlying.
    "FI-INFRA": "Renda Fixa",
    "FIC FI-INFRA": "Renda Fixa",
}
def _word_start_form(text: str) -> str:
    """Return folded *text* as single-spaced ``A-Z0-9`` words, prefixed with one space."""
    spaced = "".join(ch if ("A" <= ch <= "Z" or "0" <= ch <= "9") else " " for ch in text)
    return " " + " ".join(spaced.split())


def map_anbima_categoria(categoria: str | None) -> str | None:
    """Map a raw ANBIMA category to a Wealthuman macro by first-match needle.

    Needles from ``ANBIMA_SUBSTRING_TO_MACRO`` are tried in table order and must
    start at a word boundary of the folded category. Anchoring the start keeps
    ``ACOES`` from firing inside ``PARTICIPACOES`` or ``APLICACOES`` (which would
    mislabel a private-equity category as Renda Variável), while still accepting
    inflected forms such as ``MULTIMERCADOS`` for the needle ``MULTIMERCADO``.

    Returns ``None`` for an empty input or when no needle matches.
    """
    if not categoria:
        return None
    haystack = _word_start_form(fold(categoria))
    for needle, macro in ANBIMA_SUBSTRING_TO_MACRO:
        # Both sides carry a leading space, so a hit can only begin a word.
        if _word_start_form(needle) in haystack:
            return macro
    return None
def test_ifra11_is_renda_fixa_inflation_etf_of_debentures():
    # Spec case: the ETF is classified by what it holds, not by its wrapper.
    result = _resolve(ticker="IFRA11", name="FI ITAUINFRA CI")
    assert result.macro_class == "Renda Fixa"
    assert result.kind == "etf"
    assert result.subclasse == "Indexada à Inflação"
    assert result.underlying_nature == "debentures"
    debenture = result.debenture
    assert debenture and debenture.incentivada_1243 is True
    assert result.tax.isento is True
def test_ivvb11_sp500_etf_is_renda_variavel_exposure_internacional():
    # The asset class stays RV (spec). The S&P 500 exposure is reported on the
    # separate `exposure` axis: being listed on B3 is domicile, not risk location.
    result = _resolve(ticker="IVVB11")
    assert result.kind == "etf"
    assert result.underlying_nature == "acoes"
    assert result.macro_class == "Renda Variável"
    assert result.exposure == "Internacional"
def test_bare_ie_token_outside_fund_is_not_internacional():
    # "IE" must mean "Investimento no Exterior" only in a fund context.
    r = _resolve(name="COMPANHIA IE ENERGIA SA")
    # Geography is reported on the exposure axis. "Internacional" is not a
    # MacroClass value at all, so a macro_class check alone can never fail and
    # would not catch a bare-"IE" misfire; the exposure check is the real guard.
    assert r.exposure != "Internacional"
    assert r.macro_class != "Internacional"
def test_arbor_credito_is_not_swept_into_global_equity_seed():
    # ARBOR brand without the FIA structure must not hit the curated global seed.
    r = _resolve(name="ARBOR CREDITO PRIVADO FIC FIM")
    # The curated global-equity entry stamps exposure=Internacional and
    # subclasse="Ações Global"; neither may leak onto a credit fund. A bare
    # macro_class check is vacuous because "Internacional" is not a macro value.
    assert r.exposure != "Internacional"
    assert r.subclasse != "Ações Global"
    assert r.macro_class != "Internacional"
def _apply_fiscal_certainty(basis: str | None, deb: dict[str, Any], tax: dict[str, Any]) -> None:
    """Write the fiscal-certainty statuses into the debenture/tax sub-dicts.

    Both dicts are mutated in place; nothing is returned.

    * ``basis == "explicit"`` — a structurally certain infra signal: Lei 12.431
      is confirmed and the exemption is confirmed.
    * ``basis == "heuristic"`` — issuer+IPCA guess only: both are candidates.
    * anything else (including ``None``) — a plain debenture: Lei 12.431 is not
      applicable and ``tax`` is left untouched, so its status stays unknown.
    """
    # basis → (lei_12431_status, isento_status)
    certainty_by_basis = {
        "explicit": ("confirmed", "confirmed_exempt"),
        "heuristic": ("candidate", "candidate_exempt"),
    }
    statuses = certainty_by_basis.get(basis)
    if statuses is None:
        deb["lei_12431_status"] = "not_applicable"
        return
    deb["lei_12431_status"], tax["isento_status"] = statuses
debenture (or FI-Infra ETF underlying). The +# legacy ``incentivada_1243`` bool cannot tell a *structurally certain* infra +# signal apart from a *weak* issuer+IPCA heuristic — this status carries that +# certainty: "confirmed" (explicit infra signal), "candidate" (heuristic, needs +# ISIN confirmation), "not_applicable" (it is a debenture but not infra), +# "unknown" (no debenture context decided it). +Lei12431Status = Literal["confirmed", "candidate", "not_applicable", "unknown"] + +# Tax-exemption certainty axis for the PF holder. The legacy ``isento`` bool +# cannot distinguish a statutory exemption (CRA/CRI/LCI-LCA/explicit 12.431) from +# a merely *candidate* exemption resting on a heuristic — this status carries +# that certainty: "confirmed_exempt", "candidate_exempt", "confirmed_taxable", +# "unknown". +IsentoStatus = Literal["confirmed_exempt", "candidate_exempt", "confirmed_taxable", "unknown"] + class IdentifierResolved(BaseModel): """The identifiers the resolver could normalize/confirm from the input.""" @@ -93,6 +108,9 @@ class DebentureInfo(BaseModel): (or an FI-Infra ETF whose underlying *is* incentivada debentures).""" incentivada_1243: bool | None = None # Lei 12.431 (infra) — IR-exempt for PF + # Certainty axis the bool can't carry: "confirmed" vs heuristic "candidate" + # vs "not_applicable" (a debenture, just not infra) vs "unknown". + lei_12431_status: Lei12431Status = "unknown" indexador: str | None = None # IPCA+ | CDI+ | %CDI | PREFIXADO | SELIC vencimento: str | None = None # YYYY-MM when known @@ -101,6 +119,9 @@ class TaxInfo(BaseModel): """Tax treatment for the typical PF holder.""" isento: bool | None = None # True for Lei 12.431 / LCI-LCA / FII dividends etc. + # Certainty axis the bool can't carry: statutory "confirmed_exempt" vs + # heuristic "candidate_exempt" vs "confirmed_taxable" vs "unknown". 
+ isento_status: IsentoStatus = "unknown" class AssetClassification(BaseModel): diff --git a/src/findata/resolver/seed.py b/src/findata/resolver/seed.py index c83a60c..2a5a89d 100644 --- a/src/findata/resolver/seed.py +++ b/src/findata/resolver/seed.py @@ -114,8 +114,16 @@ def _build_etf_seed() -> list[SeedEntry]: "confidence": 0.97, } if incentivada: - payload["debenture"] = {"incentivada_1243": True, "indexador": "IPCA+"} - payload["tax"] = {"isento": True} + payload["debenture"] = { + "incentivada_1243": True, + "lei_12431_status": "confirmed", + "indexador": "IPCA+", + } + payload["tax"] = {"isento": True, "isento_status": "confirmed_exempt"} + else: + # Tesouro-backed RF ETFs: not infra debentures, no statutory + # exemption. isento_status stays the default "unknown". + payload["debenture"] = {"lei_12431_status": "not_applicable"} entries.append(SeedEntry(ticker=ticker, payload=payload)) return entries diff --git a/tests/test_resolver.py b/tests/test_resolver.py index fc42dab..5eb82e9 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -192,6 +192,40 @@ def test_unknown_is_indefinido_low_confidence(): assert r.confidence < 0.5 +# ── Fiscal-certainty axis (lei_12431_status / isento_status) ─────── + + +def test_deb_petrobras_heuristic_carries_candidate_certainty(): + r = _resolve(name="DEB PETROBRAS IPCA+") + assert r.debenture and r.debenture.incentivada_1243 is True + assert r.debenture.lei_12431_status == "candidate" + assert r.tax.isento is True + assert r.tax.isento_status == "candidate_exempt" + + +def test_ifra11_carries_confirmed_certainty(): + r = _resolve(ticker="IFRA11", name="FI ITAUINFRA CI") + assert r.debenture and r.debenture.lei_12431_status == "confirmed" + assert r.tax.isento_status == "confirmed_exempt" + + +def test_explicit_infra_debenture_is_confirmed(): + r = _resolve(name="DEB INFRA ENERGIA INCENTIVADA IPCA+") + assert r.debenture and r.debenture.lei_12431_status == "confirmed" + + +def 
def test_plain_non_infra_debenture_is_not_applicable():
    # A debenture with no infra signal: no incentivada flag, 12.431 not applicable,
    # and the tax status is left undecided.
    result = _resolve(name="DEB LOJAS RENNER CDI+")
    debenture = result.debenture
    assert debenture and debenture.incentivada_1243 is None
    assert debenture.lei_12431_status == "not_applicable"
    assert result.tax.isento_status == "unknown"
entries.append(SeedEntry(ticker=ticker, payload=payload)) return entries From f67cb822bde10caa41083dcedaabca56a0784a46 Mon Sep 17 00:00:00 2001 From: Roberto Date: Mon, 29 Jun 2026 21:14:20 -0300 Subject: [PATCH 4/7] feat(resolver): structured signals audit trail Add a machine-readable `signals` list to AssetClassification alongside the free-text `notes` and coarse `cascade`, so an auditor sees WHICH rule fired and WHAT concrete token/phrase/ticker matched. Each `_rule_payload` branch records its rule id + real matched evidence (via `_first_matching_token`/`_phrase` helpers); the curated-seed path synthesizes a `curated_seed` signal in `classify()` without mutating the frozen seed entry. Co-Authored-By: Claude Opus 4.8 --- src/findata/resolver/engine.py | 378 +++++++++++++++++++++------------ src/findata/resolver/models.py | 15 ++ tests/test_resolver.py | 41 ++++ 3 files changed, 299 insertions(+), 135 deletions(-) diff --git a/src/findata/resolver/engine.py b/src/findata/resolver/engine.py index 4234c33..aa03b0a 100644 --- a/src/findata/resolver/engine.py +++ b/src/findata/resolver/engine.py @@ -39,6 +39,7 @@ DebentureInfo, Exposure, IdentifierResolved, + Signal, TaxInfo, ) from findata.resolver.normalize import NormalizedInput, normalize @@ -191,6 +192,170 @@ def _apply_fiscal_certainty(basis: str | None, deb: dict[str, Any], tax: dict[st deb["lei_12431_status"] = "not_applicable" +# ── Signal helpers ───────────────────────────────────────────────── + + +def _first_matching_token(n: NormalizedInput, candidates: tuple[str, ...]) -> str | None: + """Return the first candidate that appears as a whole token, or ``None``.""" + tset = set(n.tokens) + for c in candidates: + if c in tset: + return c + return None + + +def _first_matching_phrase(n: NormalizedInput, candidates: tuple[str, ...]) -> str | None: + """Return the first candidate that is a substring of the folded name, or ``None``.""" + for c in candidates: + if c in n.name_folded: + return c + return None + + 
+def _signal(rule: str, evidence: str, detail: str | None = None) -> list[dict[str, Any]]: + """Build the single-entry ``signals`` list a rule branch records.""" + entry: dict[str, Any] = {"rule": rule, "evidence": evidence} + if detail is not None: + entry["detail"] = detail + return [entry] + + +def _debenture_payload(n: NormalizedInput, deb_evidence: str) -> dict[str, Any]: + """Classify a debenture → RF; parse indexador + Lei-12.431 incentivada.""" + indexador = parse_indexador(n.name_folded) + incentivada, note, basis = _infer_incentivada(n, indexador) + deb: dict[str, Any] = {"indexador": indexador} + tax: dict[str, Any] = {} + if incentivada: + deb["incentivada_1243"] = True + tax["isento"] = True + _apply_fiscal_certainty(basis, deb, tax) + # An *explicit* infra signal is high-confidence. The issuer+IPCA heuristic is + # deliberately kept below the cascade short-circuit threshold + # (_CONFIDENT_ENOUGH) so a wired provider re-checks the isento claim by ISIN + # instead of it being taken as fact. 
+ if basis == "explicit": + confidence = 0.92 + elif basis == "heuristic": + confidence = 0.7 + else: + confidence = 0.88 + return { + "kind": "debenture", + "macro_class": "Renda Fixa", + "subclasse": _subclasse_from_indexador(indexador), + "exposure": "Brasil", + "underlying_nature": "credito", + "estrutura": "debenture", + "debenture": deb, + "tax": tax, + "confidence": confidence, + "notes": note or "Debênture → Renda Fixa.", + "signals": _signal( + "debenture", deb_evidence, detail=f"basis={basis};indexador={indexador}" + ), + } + + +def _internacional_payload(n: NormalizedInput, intl_evidence: str) -> dict[str, Any]: + """Classify an internacional-mandate fund (IE / global keyword in a fund name).""" + equities = n.has_token("FIA") or n.name_contains("ACOES", "EQUITY") + rf = n.name_contains("DIVIDA EXTERNA", "RENDA FIXA", "BOND", "CREDITO", "DEBT") + if equities: + macro, subclasse, underlying = "Renda Variável", "Ações Global", "acoes" + elif rf: + macro, subclasse, underlying = "Renda Fixa", "Dívida Externa", "credito" + else: + macro, subclasse, underlying = "Multimercado", "Multimercado Global", "multiativos" + return { + "kind": "fundo", + "macro_class": macro, + "subclasse": subclasse, + "exposure": "Internacional", + "underlying_nature": underlying, + "estrutura": "IE" if n.has_token("IE") else "FIC", + "confidence": 0.9, + "notes": f"Mandato internacional (IE / global): {macro}, exposição Internacional.", + "signals": _signal("internacional", intl_evidence, detail=f"macro={macro}"), + } + + +def _etf_payload(n: NormalizedInput, etf_evidence: str) -> dict[str, Any]: + """Classify an ETF matched by name → infer underlying from name keywords.""" + rf = n.name_contains("RENDA FIXA", "DEBENTURE", "BOND", "IMA-", "IRF-", "TESOURO", "INFRA") + if rf: + sovereign = n.name_contains("TESOURO", "IMA-", "IRF-", "LFT", "NTN", "LTN") + credit = n.name_contains("DEBENTURE", "INFRA") + return { + "kind": "etf", + "macro_class": "Renda Fixa", + "subclasse": "ETF de 
renda fixa", + "exposure": "Brasil", + "underlying_nature": "debentures" + if credit + else ("tesouro" if sovereign else "credito"), + "estrutura": "ETF", + "confidence": 0.78, + "notes": "ETF com underlying de renda fixa (inferido do nome).", + "signals": _signal("etf_name", etf_evidence, detail="underlying=rf"), + } + intl = n.name_contains(*_GLOBAL_KEYWORDS, "S&P", "SP500", "NASDAQ", "MSCI", "EUA", "US ") + return { + "kind": "etf", + "macro_class": "Renda Variável", + "subclasse": "ETF de ações internacional" if intl else "ETF de ações", + "exposure": "Internacional" if intl else "Brasil", + "underlying_nature": "acoes", + "estrutura": "ETF", + "confidence": 0.72, + "notes": "ETF sem ticker no seed; underlying assumido = ações. Confirmar.", + "signals": _signal("etf_name", etf_evidence, detail="underlying=acoes"), + } + + +def _ticker_payload(n: NormalizedInput) -> dict[str, Any]: + """Classify a bare ticker by its digit suffix (no name signal won).""" + suffix = n.ticker_digits_suffix + # 11 not in any curated ETF/RF list → overwhelmingly a FII. + if suffix == "11": + return { + "kind": "fii", + "macro_class": "Renda Variável", + "subclasse": "FII", + "exposure": "Brasil", + "underlying_nature": "imoveis", + "estrutura": "FII", + "confidence": 0.72, + "notes": "Ticker terminado em 11 fora do seed de ETFs → FII (heurística).", + "signals": _signal("ticker_suffix_11", f"ticker={n.ticker}"), + } + # BDR (34/35): recibo de ação estrangeira. RV por classe, mas o holder + # carrega risco cambial/exterior → Internacional por exposição (default; + # BDRs de empresa brasileira no exterior são exceção, não a regra). 
+ if suffix in {"34", "35"}: + return { + "kind": "bdr", + "macro_class": "Renda Variável", + "subclasse": "BDR", + "exposure": "Internacional", + "underlying_nature": "acoes", + "confidence": 0.8, + "notes": "BDR (recibo de ação estrangeira) → RV, exposição Internacional.", + "signals": _signal("bdr", f"ticker={n.ticker}"), + } + # 3-8: ordinary/preferred share — ação brasileira. + return { + "kind": "acao", + "macro_class": "Renda Variável", + "subclasse": "Ações", + "exposure": "Brasil", + "underlying_nature": "acoes", + "confidence": 0.85, + "notes": "Ação listada na B3 → Renda Variável.", + "signals": _signal("acao", f"ticker={n.ticker}"), + } + + # ── The rule cascade ─────────────────────────────────────────────── @@ -213,17 +378,20 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "underlying_nature": "credito", "confidence": 0.9, "notes": "Name-trap: 'Crédito Estruturado' é crédito (RF), não COE/Estruturados.", + "signals": _signal("credito_estruturado_trap", "CREDITO ESTRUTURADO"), } # 1) COE / operações estruturadas → Estruturados, never an ETF. - if n.has_token("COE") or n.name_contains( + _coe_phrases = ( "OPERACOES ESTRUTURADAS", "OPERACAO ESTRUTURADA", "CERTIFICADO DE OPERACOES", "CERT DE OPERACOES", "NOTA ESTRUTURADA", "NOTAS ESTRUTURADAS", - ): + ) + if n.has_token("COE") or n.name_contains(*_coe_phrases): + coe_evidence = _first_matching_token(n, ("COE",)) or _first_matching_phrase(n, _coe_phrases) return { "kind": "coe", "macro_class": "Estruturados", @@ -232,45 +400,20 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "estrutura": "COE", "confidence": 0.95, "notes": "COE (Certificado de Operações Estruturadas, CETIP) → Estruturados.", + "signals": _signal("coe", coe_evidence or "COE"), } # 2) Debenture → RF; parse indexador + incentivada. 
- if n.has_token("DEB", "DEBENTURE", "DEBENTURES", "DEBENT"): - indexador = parse_indexador(n.name_folded) - incentivada, note, basis = _infer_incentivada(n, indexador) - deb: dict[str, Any] = {"indexador": indexador} - tax: dict[str, Any] = {} - if incentivada: - deb["incentivada_1243"] = True - tax["isento"] = True - _apply_fiscal_certainty(basis, deb, tax) - # An *explicit* infra signal is high-confidence. The issuer+IPCA - # heuristic is deliberately kept below the cascade short-circuit - # threshold (_CONFIDENT_ENOUGH) so a wired provider re-checks the - # isento claim by ISIN instead of it being taken as fact. - if basis == "explicit": - confidence = 0.92 - elif basis == "heuristic": - confidence = 0.7 - else: - confidence = 0.88 - return { - "kind": "debenture", - "macro_class": "Renda Fixa", - "subclasse": _subclasse_from_indexador(indexador), - "exposure": "Brasil", - "underlying_nature": "credito", - "estrutura": "debenture", - "debenture": deb, - "tax": tax, - "confidence": confidence, - "notes": note or "Debênture → Renda Fixa.", - } + _deb_tokens = ("DEB", "DEBENTURE", "DEBENTURES", "DEBENT") + if n.has_token(*_deb_tokens): + return _debenture_payload(n, _first_matching_token(n, _deb_tokens) or "DEB") # 3) Securitização (CRA/CRI) → RF. - if n.has_token("CRA", "CRI") or n.name_contains( - "CERT. RECEBIVEIS", "CERTIFICADO DE RECEBIVEIS" - ): + _cra_phrases = ("CERT. 
RECEBIVEIS", "CERTIFICADO DE RECEBIVEIS") + if n.has_token("CRA", "CRI") or n.name_contains(*_cra_phrases): + cra_evidence = _first_matching_token(n, ("CRA", "CRI")) or _first_matching_phrase( + n, _cra_phrases + ) agro = n.has_token("CRA") or n.name_contains("AGRONEGOCIO") return { "kind": "cra" if agro else "cri", @@ -284,13 +427,18 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: }, # CRA/CRI: IR-exempt for PF "confidence": 0.9, "notes": "Securitização (recebíveis) → Renda Fixa, isento p/ PF.", + "signals": _signal("cra_cri", cra_evidence or "CRA/CRI"), } # 4) Bank paper (CDB/RDB/LIG/Letra Financeira/Letra de Câmbio) → RF. # NB: the bare 2-char tokens "LC"/"LF" are too collision-prone (they hit # issuer names, share classes, internal codes), so they are matched only # via their unambiguous phrases, never as bare tokens. - if n.has_token("CDB", "RDB", "LIG") or n.name_contains("LETRA FINANCEIRA", "LETRA DE CAMBIO"): + _bank_phrases = ("LETRA FINANCEIRA", "LETRA DE CAMBIO") + if n.has_token("CDB", "RDB", "LIG") or n.name_contains(*_bank_phrases): + bank_evidence = _first_matching_token(n, ("CDB", "RDB", "LIG")) or _first_matching_phrase( + n, _bank_phrases + ) return { "kind": "cdb", "macro_class": "Renda Fixa", @@ -299,10 +447,13 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "underlying_nature": "credito", "confidence": 0.88, "notes": "Emissão bancária → Renda Fixa.", + "signals": _signal("bank_paper", bank_evidence or "CDB"), } - if n.has_token("LCI", "LCA") or n.name_contains( - "LETRA DE CREDITO IMOBILIARIO", "LETRA DE CREDITO DO AGRONEGOCIO" - ): + _lci_phrases = ("LETRA DE CREDITO IMOBILIARIO", "LETRA DE CREDITO DO AGRONEGOCIO") + if n.has_token("LCI", "LCA") or n.name_contains(*_lci_phrases): + lci_evidence = _first_matching_token(n, ("LCI", "LCA")) or _first_matching_phrase( + n, _lci_phrases + ) return { "kind": "lci_lca", "macro_class": "Renda Fixa", @@ -312,12 +463,16 @@ def _rule_payload(norm: NormalizedInput) -> 
dict[str, Any]: "tax": {"isento": True, "isento_status": "confirmed_exempt"}, "confidence": 0.9, "notes": "LCI/LCA → Renda Fixa, isento p/ PF.", + "signals": _signal("lci_lca", lci_evidence or "LCI/LCA"), } # 5) Tesouro / public bonds → RF. - if n.has_token("TESOURO", "NTN", "LTN", "LFT", "NTNB", "NTNF") or n.name_contains( - "TESOURO DIRETO", "TESOURO SELIC", "TESOURO IPCA", "TESOURO PREFIXADO" - ): + _tesouro_tokens = ("TESOURO", "NTN", "LTN", "LFT", "NTNB", "NTNF") + _tesouro_phrases = ("TESOURO DIRETO", "TESOURO SELIC", "TESOURO IPCA", "TESOURO PREFIXADO") + if n.has_token(*_tesouro_tokens) or n.name_contains(*_tesouro_phrases): + tesouro_evidence = _first_matching_token(n, _tesouro_tokens) or _first_matching_phrase( + n, _tesouro_phrases + ) return { "kind": "tesouro", "macro_class": "Renda Fixa", @@ -326,6 +481,7 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "underlying_nature": "tesouro", "confidence": 0.95, "notes": "Título público federal → Renda Fixa.", + "signals": _signal("tesouro", tesouro_evidence or "TESOURO"), } # 6) Internacional EXPOSURE — IE structure, or global keyword. Geography is @@ -335,36 +491,22 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: # fund name is too collision-prone (e.g. "COMPANHIA IE ENERGIA SA"). # Runs before FIA/Ações so "FIC FIA IE" / "GLOBAL FIM" land here. 
fund_context = n.has_token(*_FUND_CONTEXT_TOKENS) - if fund_context and ( - n.has_token("IE") - or n.name_contains(*_GLOBAL_KEYWORDS, "INVESTIMENTO NO EXTERIOR", "INV EXTERIOR") - ): - equities = n.has_token("FIA") or n.name_contains("ACOES", "EQUITY") - rf = n.name_contains("DIVIDA EXTERNA", "RENDA FIXA", "BOND", "CREDITO", "DEBT") - if equities: - macro, subclasse, underlying = "Renda Variável", "Ações Global", "acoes" - elif rf: - macro, subclasse, underlying = "Renda Fixa", "Dívida Externa", "credito" - else: - macro, subclasse, underlying = "Multimercado", "Multimercado Global", "multiativos" - return { - "kind": "fundo", - "macro_class": macro, - "subclasse": subclasse, - "exposure": "Internacional", - "underlying_nature": underlying, - "estrutura": "IE" if n.has_token("IE") else "FIC", - "confidence": 0.9, - "notes": f"Mandato internacional (IE / global): {macro}, exposição Internacional.", - } + _intl_phrases = (*_GLOBAL_KEYWORDS, "INVESTIMENTO NO EXTERIOR", "INV EXTERIOR") + if fund_context and (n.has_token("IE") or n.name_contains(*_intl_phrases)): + intl_evidence = _first_matching_token(n, ("IE",)) or _first_matching_phrase( + n, _intl_phrases + ) + return _internacional_payload(n, intl_evidence or "IE") # 7) FII (by name; ticker-only 11s are caught at step 12). - if n.has_token("FII") or n.name_contains( + _fii_phrases = ( "FUNDO IMOBILIARIO", "FDO INV IMOB", "FUNDO DE INVESTIMENTO IMOBILIARIO", "INVESTIMENTO IMOBILIARIO", - ): + ) + if n.has_token("FII") or n.name_contains(*_fii_phrases): + fii_evidence = _first_matching_token(n, ("FII",)) or _first_matching_phrase(n, _fii_phrases) return { "kind": "fii", "macro_class": "Renda Variável", @@ -374,40 +516,18 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "estrutura": "FII", "confidence": 0.92, "notes": "Fundo Imobiliário → Renda Variável (subclasse FII).", + "signals": _signal("fii_name", fii_evidence or "FII"), } # 8) ETF by name, no curated hit → infer underlying from name keywords. 
- if n.has_token("ETF") or n.name_contains("ISHARES", "INDEX FUND"): - rf = n.name_contains("RENDA FIXA", "DEBENTURE", "BOND", "IMA-", "IRF-", "TESOURO", "INFRA") - if rf: - sovereign = n.name_contains("TESOURO", "IMA-", "IRF-", "LFT", "NTN", "LTN") - credit = n.name_contains("DEBENTURE", "INFRA") - return { - "kind": "etf", - "macro_class": "Renda Fixa", - "subclasse": "ETF de renda fixa", - "exposure": "Brasil", - "underlying_nature": "debentures" - if credit - else ("tesouro" if sovereign else "credito"), - "estrutura": "ETF", - "confidence": 0.78, - "notes": "ETF com underlying de renda fixa (inferido do nome).", - } - intl = n.name_contains(*_GLOBAL_KEYWORDS, "S&P", "SP500", "NASDAQ", "MSCI", "EUA", "US ") - return { - "kind": "etf", - "macro_class": "Renda Variável", - "subclasse": "ETF de ações internacional" if intl else "ETF de ações", - "exposure": "Internacional" if intl else "Brasil", - "underlying_nature": "acoes", - "estrutura": "ETF", - "confidence": 0.72, - "notes": "ETF sem ticker no seed; underlying assumido = ações. Confirmar.", - } + _etf_phrases = ("ISHARES", "INDEX FUND") + if n.has_token("ETF") or n.name_contains(*_etf_phrases): + etf_evidence = _first_matching_token(n, ("ETF",)) or _first_matching_phrase(n, _etf_phrases) + return _etf_payload(n, etf_evidence or "ETF") # 9) FIDC → RF (direitos creditórios, natureza de crédito). if n.has_token("FIDC") or n.name_contains("DIREITOS CREDITORIOS"): + fidc_evidence = _first_matching_token(n, ("FIDC",)) or "DIREITOS CREDITORIOS" return { "kind": "fundo", "macro_class": "Renda Fixa", @@ -417,10 +537,13 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "estrutura": "FIDC", "confidence": 0.85, "notes": "FIDC (direitos creditórios) → Renda Fixa (crédito).", + "signals": _signal("fidc", fidc_evidence), } # 10) FIP → Alternativos (private equity). 
- if n.has_token("FIP") or n.name_contains("PARTICIPACOES", "PRIVATE EQUITY"): + _fip_phrases = ("PARTICIPACOES", "PRIVATE EQUITY") + if n.has_token("FIP") or n.name_contains(*_fip_phrases): + fip_evidence = _first_matching_token(n, ("FIP",)) or _first_matching_phrase(n, _fip_phrases) return { "kind": "fundo", "macro_class": "Alternativos", @@ -429,10 +552,13 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "estrutura": "FIP", "confidence": 0.88, "notes": "FIP (participações) → Alternativos.", + "signals": _signal("fip", fip_evidence or "FIP"), } # 11) Multimercado. - if n.has_token("FIM") or n.name_contains("MULTIMERCADO", "MULTIESTRATEGIA", "MACRO"): + _mm_phrases = ("MULTIMERCADO", "MULTIESTRATEGIA", "MACRO") + if n.has_token("FIM") or n.name_contains(*_mm_phrases): + mm_evidence = _first_matching_token(n, ("FIM",)) or _first_matching_phrase(n, _mm_phrases) return { "kind": "fundo", "macro_class": "Multimercado", @@ -441,10 +567,13 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "estrutura": "FIM", "confidence": 0.85, "notes": "Multimercado.", + "signals": _signal("multimercado", mm_evidence or "FIM"), } # 12) Ações / FIA (domestic equities). - if n.has_token("FIA") or n.name_contains("FUNDO DE ACOES", "ACOES", "EQUITY"): + _fia_phrases = ("FUNDO DE ACOES", "ACOES", "EQUITY") + if n.has_token("FIA") or n.name_contains(*_fia_phrases): + fia_evidence = _first_matching_token(n, ("FIA",)) or _first_matching_phrase(n, _fia_phrases) return { "kind": "fundo", "macro_class": "Renda Variável", @@ -454,46 +583,12 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "estrutura": "FIA", "confidence": 0.85, "notes": "Fundo de Ações → Renda Variável.", + "signals": _signal("fia", fia_evidence or "FIA"), } # 13) Ticker shapes (no name signal won above). - suffix = n.ticker_digits_suffix if n.ticker: - # 11 not in any curated ETF/RF list → overwhelmingly a FII. 
- if suffix == "11": - return { - "kind": "fii", - "macro_class": "Renda Variável", - "subclasse": "FII", - "exposure": "Brasil", - "underlying_nature": "imoveis", - "estrutura": "FII", - "confidence": 0.72, - "notes": "Ticker terminado em 11 fora do seed de ETFs → FII (heurística).", - } - # BDR (34/35): recibo de ação estrangeira. RV por classe, mas o holder - # carrega risco cambial/exterior → Internacional por exposição (default; - # BDRs de empresa brasileira no exterior são exceção, não a regra). - if suffix in {"34", "35"}: - return { - "kind": "bdr", - "macro_class": "Renda Variável", - "subclasse": "BDR", - "exposure": "Internacional", - "underlying_nature": "acoes", - "confidence": 0.8, - "notes": "BDR (recibo de ação estrangeira) → RV, exposição Internacional.", - } - # 3-8: ordinary/preferred share — ação brasileira. - return { - "kind": "acao", - "macro_class": "Renda Variável", - "subclasse": "Ações", - "exposure": "Brasil", - "underlying_nature": "acoes", - "confidence": 0.85, - "notes": "Ação listada na B3 → Renda Variável.", - } + return _ticker_payload(n) # 14) Nothing matched — honest "I don't know" for HITL review. 
return { @@ -501,6 +596,7 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "macro_class": "Indefinido", "confidence": 0.2, "notes": "Sem sinal estrutural suficiente; requer revisão (human-in-the-loop).", + "signals": _signal("fallback", "no_structural_signal"), } @@ -538,6 +634,7 @@ def _assemble(norm: NormalizedInput, payload: dict[str, Any], step: str) -> Asse confidence=payload.get("confidence", 0.5), as_of=datetime.now(_BR_TZ).date().isoformat(), cascade=[step], + signals=[Signal(**s) for s in payload.get("signals", [])], notes=payload.get("notes"), ) @@ -550,7 +647,18 @@ def classify(norm: NormalizedInput) -> AssetClassification: """ seed = lookup_seed(ticker=norm.ticker, cnpj=norm.cnpj, name_folded=norm.name_folded) if seed is not None: - return _assemble(norm, seed.payload, step="openfindata:curated") + # Synthesize the curated_seed signal here (the frozen seed entry must not + # be mutated): describe HOW the entry matched — by ticker, then CNPJ, then + # name substrings. A copy keeps the seed payload immutable. + if seed.ticker and norm.ticker == seed.ticker: + evidence = f"ticker={norm.ticker}" + elif seed.cnpj and norm.cnpj == seed.cnpj: + evidence = f"cnpj={norm.cnpj}" + else: + evidence = f"name:{'+'.join(seed.name_substrings)}" + payload = {**seed.payload} + payload.setdefault("signals", [{"rule": "curated_seed", "evidence": evidence}]) + return _assemble(norm, payload, step="openfindata:curated") return _assemble(norm, _rule_payload(norm), step="openfindata:rules") diff --git a/src/findata/resolver/models.py b/src/findata/resolver/models.py index 4940910..2d00ee7 100644 --- a/src/findata/resolver/models.py +++ b/src/findata/resolver/models.py @@ -86,6 +86,19 @@ IsentoStatus = Literal["confirmed_exempt", "candidate_exempt", "confirmed_taxable", "unknown"] +class Signal(BaseModel): + """One structured audit entry: which rule fired and what evidence matched. 
+ + Unlike the free-text ``notes``, a ``Signal`` is machine-readable so an + auditor can see WHICH rule decided and WHAT concrete token/phrase/ticker + triggered it (e.g. rule="debenture", evidence="DEB"). + """ + + rule: str # the rule id that fired (e.g. "debenture", "curated_seed") + evidence: str # the concrete token/phrase/ticker that matched (e.g. "DEB") + detail: str | None = None # optional extra (e.g. "basis=heuristic", "indexador=IPCA+") + + class IdentifierResolved(BaseModel): """The identifiers the resolver could normalize/confirm from the input.""" @@ -141,5 +154,7 @@ class AssetClassification(BaseModel): as_of: str # YYYY-MM-DD # Audit trail: ordered list of resolution steps actually attempted. cascade: list[str] = Field(default_factory=list) + # Structured audit trail: which rule fired and what evidence matched. + signals: list[Signal] = Field(default_factory=list) # Free-text rationale, e.g. which trap was avoided or which signal decided. notes: str | None = None diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 5eb82e9..4b863a0 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -264,3 +264,44 @@ async def fake_provider(norm, current): # Weak result → provider is consulted. asyncio.run(resolve_asset(name="????", providers=[fake_provider])) assert calls["n"] == 1 + + +# ── Structured signals trail ─────────────────────────────────────── + + +def test_curated_seed_emits_curated_signal(): + r = _resolve(ticker="IFRA11") + assert r.signals + assert r.signals[0].rule == "curated_seed" + assert "IFRA11" in r.signals[0].evidence + + +def test_debenture_signal_records_evidence_and_detail(): + r = _resolve(name="DEB PETROBRAS IPCA+") + deb = [s for s in r.signals if s.rule == "debenture"] + assert deb + assert deb[0].evidence == "DEB" + assert deb[0].detail is not None + assert "basis=" in deb[0].detail + assert "IPCA+" in deb[0].detail + + +def test_coe_signal_fires(): + r = _resolve(name="INVEST. 
ESTRUTURADOS COE BTG") + assert any(s.rule == "coe" for s in r.signals) + + +def test_credito_estruturado_trap_signal_carries_phrase(): + r = _resolve(name="AMW CREDITO ESTRUTURADO FIC FIM CP") + trap = [s for s in r.signals if s.rule == "credito_estruturado_trap"] + assert trap + assert "CREDITO ESTRUTURADO" in trap[0].evidence + + +@pytest.mark.parametrize( + "ident", + ["PETR4", "HGLG11", "Tesouro IPCA+ 2035", "KAPITALO ZETA FIC FIM"], +) +def test_every_result_carries_at_least_one_signal(ident): + r = classify(normalize(name=ident)) + assert len(r.signals) >= 1 From 70361eb799d2636d07daba8aefec242a4035fad5 Mon Sep 17 00:00:00 2001 From: Roberto Date: Mon, 29 Jun 2026 21:23:39 -0300 Subject: [PATCH 5/7] fix(resolver): address review-bot findings + doc drift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-host CI reviewers (Gemini, CodeRabbit) on PR #33: - Gate the FIP/Multimercado/FIA keyword paths ("PARTICIPACOES", "MACRO", "ACOES"/"EQUITY") on a fund context so company/trade names ("Randon Participações SA", "Macro Atacadista") are not misclassified as funds. - Derive the Tesouro subclasse from the bond code when the name lacks the index word: NTN-B → Indexada à Inflação, LFT → Pós-fixada, LTN/NTN-F → Prefixada; unmapped public bonds → "Título Público", never "Crédito Privado". - Drop the redundant second fold() in lookup_seed (input is pre-folded). - Fix doc drift: macro_class no longer lists "Internacional" (it is the exposure axis) in the REST router, MCP tool summary/docstring, package docstring, and MCP_SURFACE.md; add a language to the fenced block. 6 new regression tests. 272 → 278 passed, 15 deselected. ruff + mypy clean. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 33 ++++++++++++++---------- docs/MCP_SURFACE.md | 4 +-- src/findata/api/mcp_app.py | 25 ++++++++++-------- src/findata/api/routers/resolver.py | 7 ++--- src/findata/resolver/__init__.py | 10 +++++--- src/findata/resolver/engine.py | 40 ++++++++++++++++++++++------- src/findata/resolver/seed.py | 7 +++-- tests/test_resolver.py | 38 +++++++++++++++++++++++++++ 8 files changed, 118 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2f2a67..e24b8cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,19 +21,26 @@ adheres to [Semantic Versioning](https://semver.org/). - **Asset-classification resolver** — `findata.resolver.resolve_asset()`, `GET /resolver/resolve`, and the `resolve_asset` MCP tool. Turns any Brazilian asset identifier (ticker/CNPJ/ISIN/name) into a classification - mapped to the consolidation macro taxonomy (Renda Fixa, Renda Variável, - Multimercado, Alternativos, Estruturados) plus an orthogonal `exposure` - axis (Brasil/Internacional), `subclasse`, `underlying_nature`, debenture - Lei-12.431 facts, `source`, `confidence`, and the `cascade` walked. - Deterministic and offline at its core (a curated ETF/global-fund seed + - structural rules), with an injectable external-provider chain (Mais - Retorno / CVM-B3 / restricted web search) for low-confidence fallback. - Classifies ETFs/funds by underlying (IFRA11 debêntures → RF; IVVB11 ações - → RV + Internacional), defends the COE-never-ETF and "Crédito Estruturado" - name-traps, and keeps the Lei-12.431 isento flag below the cascade - short-circuit when only inferred by heuristic. Hardened after adversarial - review: bare-token collisions (`IE`/`LC`/`LF`/substring `LCI`) removed, - API length caps, `as_of` stamped in America/Sao_Paulo. + mapped to the consolidation taxonomy. 
`macro_class` is the asset class + (Renda Fixa, Renda Variável, Multimercado, Alternativos, Estruturados); + geography is the orthogonal `exposure` axis (Brasil/Internacional), so + IVVB11 and BDRs are RV + Internacional and a global-mandate FIA is RV + + Internacional. Also returns `subclasse`, `underlying_nature`, debenture + Lei-12.431 facts with a **certainty status** (`lei_12431_status`: + confirmed/candidate/not_applicable; `isento_status`), `source`, + `confidence`, the `cascade` walked, and a structured `signals` trail + (which rule fired on what evidence). Deterministic and offline at its core + (a curated ETF/global-fund seed + structural rules), with an injectable + external-provider chain (Mais Retorno / CVM-B3 / restricted web search) + for low-confidence fallback. Classifies ETFs/funds by underlying (IFRA11 + debêntures → RF; IVVB11 ações → RV), defends the COE-never-ETF and "Crédito + Estruturado" name-traps, and keeps a heuristic Lei-12.431 isento as a + `candidate` below the cascade short-circuit so a provider can confirm it by + ISIN. Hardened after cross-host adversarial review and CI review bots: + bare-token collisions (`IE`/`LC`/`LF`/`MACRO`/`ACOES`/`PARTICIPACOES`, + substring `LCI`) gated on fund context, public-bond subclasse derived from + the bond code (NTN-B → inflation), API length caps, `as_of` stamped in + America/Sao_Paulo. - **ANBIMA Títulos Públicos (TPF) secondary market** — `get_tpf()`, `GET /anbima/tpf`, and `findata anbima tpf`. Daily reference rates for outstanding federal government bonds (LTN, LFT, NTN-B, NTN-C, NTN-F) from diff --git a/docs/MCP_SURFACE.md b/docs/MCP_SURFACE.md index fbbb90a..31c9961 100644 --- a/docs/MCP_SURFACE.md +++ b/docs/MCP_SURFACE.md @@ -55,9 +55,9 @@ safe. 
**The 95 REST routes that back the CLI and HTTP consumers never change.** ## The 25 curated tools -``` +```text registry_lookup ← start here: CNPJ / ticker / code / name → entities -resolve_asset ← classify an asset into the macro taxonomy + exposure +resolve_asset ← classify an asset: macro asset class + exposure bcb_series bcb_ptax bcb_focus (BCB: 12 → 3) cvm_company cvm_financials cvm_fund cvm_structured_fund (CVM: 22 → 4) diff --git a/src/findata/api/mcp_app.py b/src/findata/api/mcp_app.py index b82115e..abef0e4 100644 --- a/src/findata/api/mcp_app.py +++ b/src/findata/api/mcp_app.py @@ -102,7 +102,7 @@ async def registry_lookup( "/resolver/resolve", operation_id="resolve_asset", response_model=None, - summary="Classify a Brazilian asset into the macro taxonomy (RF/RV/Multi/Intl/Alt/Estrut)", + summary="Classify a Brazilian asset: asset class + Brasil/Internacional exposure", ) async def resolve_asset_tool( name: str | None = Query( @@ -112,16 +112,19 @@ async def resolve_asset_tool( cnpj: str | None = Query(None, max_length=32, description="Fund CNPJ (masked or not)"), isin: str | None = Query(None, max_length=16, description="ISIN, e.g. BR..."), ) -> Any: - """Turn any asset identifier into a classification already mapped to the - consolidation macro taxonomy: Renda Fixa, Renda Variável, Multimercado, - Internacional, Alternativos, Estruturados. - - Returns ``macro_class`` + ``subclasse`` + ``underlying_nature`` (splits - ETF-de-ações from ETF-de-debêntures), debenture/Lei-12.431 facts, ``source``, - ``confidence``, and the ``cascade`` walked — deterministic and cacheable. - Pass any subset of identifiers; a bare ticker/CNPJ given as ``name`` is - auto-detected. Use this (not ``registry_lookup``) when you need the asset's - macro class, not its registry entity. + """Turn any asset identifier into a classification mapped to the + consolidation taxonomy. 
``macro_class`` is the asset class only — Renda Fixa, + Renda Variável, Multimercado, Alternativos, Estruturados; geography is the + separate ``exposure`` axis (Brasil/Internacional), so e.g. IVVB11 is RV + + Internacional. + + Returns ``macro_class`` + ``exposure`` + ``subclasse`` + ``underlying_nature`` + (splits ETF-de-ações from ETF-de-debêntures), debenture/Lei-12.431 facts (with + a confirmed/candidate certainty status), ``source``, ``confidence``, the + ``cascade`` walked, and structured ``signals`` (which rule fired on what + evidence) — deterministic and cacheable. Pass any subset of identifiers; a + bare ticker/CNPJ given as ``name`` is auto-detected. Use this (not + ``registry_lookup``) when you need the asset's class, not its registry entity. """ return await resolve_asset(name=name, ticker=ticker, cnpj=cnpj, isin=isin) diff --git a/src/findata/api/routers/resolver.py b/src/findata/api/routers/resolver.py index a47c8c3..ea14447 100644 --- a/src/findata/api/routers/resolver.py +++ b/src/findata/api/routers/resolver.py @@ -27,9 +27,10 @@ async def resolve( """Classifica um ativo na taxonomia macro Wealthuman. Aceita qualquer identificador (``name``/``ticker``/``cnpj``/``isin``) e - devolve ``macro_class`` já mapeada (Renda Fixa, Renda Variável, Multimercado, - Internacional, Alternativos, Estruturados) + subclasse, underlying, - debênture/Lei 12.431, ``source``, ``confidence`` e a cascata percorrida. + devolve ``macro_class`` (classe de ativo: Renda Fixa, Renda Variável, + Multimercado, Alternativos, Estruturados) + ``exposure`` (eixo ortogonal de + geografia: Brasil/Internacional) + subclasse, underlying, debênture/Lei + 12.431, ``source``, ``confidence``, ``signals`` e a cascata percorrida. Determinístico e cacheável. 
""" return await resolve_asset(name=name, ticker=ticker, cnpj=cnpj, isin=isin) diff --git a/src/findata/resolver/__init__.py b/src/findata/resolver/__init__.py index 003c19b..a75afa9 100644 --- a/src/findata/resolver/__init__.py +++ b/src/findata/resolver/__init__.py @@ -1,10 +1,12 @@ """Wealthuman asset-classification resolver. ``resolve_asset(identifier)`` turns any Brazilian asset identifier (ticker, -CNPJ, ISIN, or bare name) into a classification already mapped to the Wealthuman -macro taxonomy (Renda Fixa, Renda Variável, Multimercado, Internacional, -Alternativos, Estruturados) plus subclasse, underlying nature, debenture / -Lei-12.431 facts, source, confidence, and an audit cascade. +CNPJ, ISIN, or bare name) into a classification mapped to the Wealthuman +taxonomy: ``macro_class`` is the asset class (Renda Fixa, Renda Variável, +Multimercado, Alternativos, Estruturados); geography is the orthogonal +``exposure`` axis (Brasil/Internacional). Plus subclasse, underlying nature, +debenture / Lei-12.431 facts (with a certainty status), source, confidence, an +audit cascade, and structured signals. Deterministic, cacheable, auditable, no PII. See ``openfindata-mcp-spec.md``. """ diff --git a/src/findata/resolver/engine.py b/src/findata/resolver/engine.py index aa03b0a..6cf0646 100644 --- a/src/findata/resolver/engine.py +++ b/src/findata/resolver/engine.py @@ -132,14 +132,30 @@ def parse_indexador(name_folded: str) -> str | None: return None -def _subclasse_from_indexador(indexador: str | None) -> str: +def _subclasse_from_indexador(indexador: str | None, default: str = "Crédito Privado") -> str: if indexador == "IPCA+": return "Indexada à Inflação" if indexador in {"%CDI", "CDI+", "SELIC"}: return "Pós-fixada" if indexador == "PREFIXADO": return "Prefixada" - return "Crédito Privado" + return default + + +# Public-bond type → indexador, for names that carry the bond code but not the +# index word (e.g. "NTN-B 2035" has no "IPCA"). 
Folded substrings, so "NTN-B" +# and "NTNB" both hit. NTN-C (IGP-M) is left to the generic path. +def _tesouro_indexador(n: NormalizedInput) -> str | None: + explicit = parse_indexador(n.name_folded) + if explicit is not None: + return explicit + if n.name_contains("NTN-B", "NTNB"): + return "IPCA+" + if n.name_contains("LFT"): + return "SELIC" + if n.name_contains("NTN-F", "NTNF", "LTN"): + return "PREFIXADO" + return None def _infer_incentivada( @@ -476,7 +492,9 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: return { "kind": "tesouro", "macro_class": "Renda Fixa", - "subclasse": _subclasse_from_indexador(parse_indexador(n.name_folded)), + # Public bonds carry the index in their type code, not always a word; + # default to "Título Público", never the credit-private subclasse. + "subclasse": _subclasse_from_indexador(_tesouro_indexador(n), default="Título Público"), "exposure": "Brasil", "underlying_nature": "tesouro", "confidence": 0.95, @@ -540,9 +558,11 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "signals": _signal("fidc", fidc_evidence), } - # 10) FIP → Alternativos (private equity). + # 10) FIP → Alternativos (private equity). The FIP token is unambiguous; the + # "PARTICIPACOES"/"PRIVATE EQUITY" phrases need a fund context so a holding + # company ("XYZ Participações SA") is not classified as a fund. _fip_phrases = ("PARTICIPACOES", "PRIVATE EQUITY") - if n.has_token("FIP") or n.name_contains(*_fip_phrases): + if n.has_token("FIP") or (fund_context and n.name_contains(*_fip_phrases)): fip_evidence = _first_matching_token(n, ("FIP",)) or _first_matching_phrase(n, _fip_phrases) return { "kind": "fundo", @@ -555,9 +575,10 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "signals": _signal("fip", fip_evidence or "FIP"), } - # 11) Multimercado. + # 11) Multimercado. FIM token is unambiguous; the phrases (esp. "MACRO", + # common in trade names like "Macro Atacadista") need a fund context. 
_mm_phrases = ("MULTIMERCADO", "MULTIESTRATEGIA", "MACRO") - if n.has_token("FIM") or n.name_contains(*_mm_phrases): + if n.has_token("FIM") or (fund_context and n.name_contains(*_mm_phrases)): mm_evidence = _first_matching_token(n, ("FIM",)) or _first_matching_phrase(n, _mm_phrases) return { "kind": "fundo", @@ -570,9 +591,10 @@ def _rule_payload(norm: NormalizedInput) -> dict[str, Any]: "signals": _signal("multimercado", mm_evidence or "FIM"), } - # 12) Ações / FIA (domestic equities). + # 12) Ações / FIA (domestic equities). FIA token is unambiguous; the bare + # "ACOES"/"EQUITY" keywords need a fund context (avoid company names). _fia_phrases = ("FUNDO DE ACOES", "ACOES", "EQUITY") - if n.has_token("FIA") or n.name_contains(*_fia_phrases): + if n.has_token("FIA") or (fund_context and n.name_contains(*_fia_phrases)): fia_evidence = _first_matching_token(n, ("FIA",)) or _first_matching_phrase(n, _fia_phrases) return { "kind": "fundo", diff --git a/src/findata/resolver/seed.py b/src/findata/resolver/seed.py index e67418e..05f9758 100644 --- a/src/findata/resolver/seed.py +++ b/src/findata/resolver/seed.py @@ -28,8 +28,6 @@ from dataclasses import dataclass, field from typing import Any -from findata.resolver.normalize import fold - @dataclass(frozen=True) class SeedEntry: @@ -168,8 +166,9 @@ def lookup_seed(*, ticker: str | None, cnpj: str | None, name_folded: str) -> Se if cnpj and cnpj in _BY_CNPJ: return _BY_CNPJ[cnpj] if name_folded: - folded = fold(name_folded) + # ``name_folded`` is already ASCII-folded/uppercased by the caller + # (normalize()), so match directly — no second fold. 
for entry in _NAME_ENTRIES: - if all(sub in folded for sub in entry.name_substrings): + if all(sub in name_folded for sub in entry.name_substrings): return entry return None diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 4b863a0..b2b304f 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -192,6 +192,44 @@ def test_unknown_is_indefinido_low_confidence(): assert r.confidence < 0.5 +# ── Review-bot regressions (token collisions + Tesouro subclasse) ── + + +def test_ntnb_bond_code_maps_to_inflation_subclasse(): + # "NTN-B" carries no "IPCA" word, but it is an inflation-linked public bond. + r = _resolve(name="NTN-B 2035") + assert r.kind == "tesouro" + assert r.subclasse == "Indexada à Inflação" + + +def test_lft_and_ltn_bond_codes_map_to_right_subclasse(): + assert _resolve(name="LFT 2029").subclasse == "Pós-fixada" + assert _resolve(name="LTN 2028").subclasse == "Prefixada" + + +def test_tesouro_without_indexador_is_titulo_publico_not_credito_privado(): + # NTN-C (IGP-M) isn't mapped → generic public-bond subclasse, never credit. + r = _resolve(name="NTN-C 2031") + assert r.kind == "tesouro" + assert r.subclasse == "Título Público" + + +def test_holding_company_participacoes_is_not_fip(): + # "Participações" in a company name (no fund context) must not be a FIP. 
+ r = _resolve(name="RANDON PARTICIPACOES SA") + assert r.macro_class != "Alternativos" + + +def test_macro_trade_name_is_not_multimercado(): + r = _resolve(name="MACRO ATACADISTA DISTRIBUIDORA SA") + assert r.macro_class != "Multimercado" + + +def test_bare_acoes_keyword_without_fund_context_is_not_fia(): + r = _resolve(name="EMPRESA BRASILEIRA ACOES ON SA") + assert not (r.kind == "fundo" and r.subclasse == "Ações") + + # ── Fiscal-certainty axis (lei_12431_status / isento_status) ─────── From c80207619f2f69163a75f1e2854d983126af025e Mon Sep 17 00:00:00 2001 From: Roberto Date: Mon, 29 Jun 2026 21:24:43 -0300 Subject: [PATCH 6/7] docs(resolver): client-facing contract for resolve_asset (Wealthuman deliverable) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add docs/RESOLVER.md: the MCP/REST/Python contract the consolidator calls — input, full output schema with the two-axis model (asset class + exposure), fiscal-certainty status, cascade, verified test-set table, and the pre-prod pending list. Co-Authored-By: Claude Opus 4.8 --- docs/RESOLVER.md | 132 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 docs/RESOLVER.md diff --git a/docs/RESOLVER.md b/docs/RESOLVER.md new file mode 100644 index 0000000..26c482b --- /dev/null +++ b/docs/RESOLVER.md @@ -0,0 +1,132 @@ +# `resolve_asset` — classificador de ativos (taxonomia Wealthuman) + +> Entrega para o demandante (Wealthuman / consolidação de extratos). Define o +> contrato que o consolidador chama por ativo (dezenas por extrato). Implementado +> em [`src/findata/resolver/`](../src/findata/resolver/), exposto por REST, MCP e +> biblioteca Python. + +## Problema + +A consolidação classifica cada ativo na taxonomia macro do banker. O agente +antigo buscava ANBIMA/debentures.com.br no brave: lento e errava (chutava RV pelo +"11" de um ETF de debênture, perdia mandato global sem "IE", confundia "Crédito +Estruturado" com COE). 
`resolve_asset` devolve a classificação **determinística, +cacheável e auditável**, já na taxonomia do cliente. + +## Como chamar + +Três superfícies, mesmo núcleo: + +| Superfície | Chamada | +|---|---| +| REST | `GET /resolver/resolve?ticker=IFRA11&name=FI%20ITAUINFRA` | +| MCP | tool `resolve_asset` (args `name`/`ticker`/`cnpj`/`isin`) | +| Python | `await findata.resolver.resolve_asset(ticker="IFRA11")` | + +**Input** — qualquer subconjunto de identificadores; o resolver normaliza e +promove um identificador "pelado" passado em `name` (o extrato às vezes só tem o +label): + +```json +{ "name": "FI ITAUINFRA CI", "ticker": "IFRA11", "cnpj": null, "isin": null } +``` + +Sem PII: o resolver recebe **só** identificador de ativo, nunca dado de cliente. +Limites de tamanho no boundary (`name` 256, `ticker` 16, `cnpj` 32, `isin` 16). + +## Contrato de saída + +```jsonc +{ + "identifier_resolved": { "cnpj": null, "ticker": "IFRA11", "isin": null, "name": "FI ITAUINFRA CI" }, + "kind": "etf", // fundo|acao|fii|etf|bdr|debenture|cra|cri|cdb|lci_lca|tesouro|coe|outro + "cvm": { "classe": null, "anbima_categoria": null, "estrutura": "ETF" }, + "macro_class": "Renda Fixa", // CLASSE DE ATIVO (ver eixo 1 abaixo) + "subclasse": "Indexada à Inflação", + "exposure": "Brasil", // GEOGRAFIA (ver eixo 2) — Brasil|Internacional|null + "underlying_nature": "debentures", // acoes|debentures|credito|recebiveis|imoveis|multiativos|tesouro|cambio|private_equity|outro + "debenture": { // só quando há debênture + "incentivada_1243": true, + "lei_12431_status": "confirmed", // confirmed|candidate|not_applicable|unknown + "indexador": "IPCA+", + "vencimento": null + }, + "tax": { "isento": true, "isento_status": "confirmed_exempt" }, + "source": "openfindata", // openfindata|maisretorno|cvm|b3|web_search + "confidence": 0.97, // 0..1; baixa => human-in-the-loop + "as_of": "2026-06-29", // carimbado em America/Sao_Paulo + "cascade": ["openfindata:curated"],// trilha de fontes percorrida + 
"signals": [ // trilha estruturada: que regra disparou e com qual evidência + { "rule": "curated_seed", "evidence": "ticker=IFRA11", "detail": null } + ], + "notes": "Curated: ETF de debêntures de infraestrutura (FI-Infra, Lei 12.431)…" +} +``` + +### Dois eixos ortogonais (decisão de modelo) + +1. **`macro_class` = classe de ativo**: `Renda Fixa`, `Renda Variável`, + `Multimercado`, `Alternativos`, `Estruturados` (+ `Indefinido` quando o + resolver não decide). Geografia **não** é valor de macro. +2. **`exposure` = geografia/estratégia**: `Brasil` | `Internacional` | `null`. É + onde a exposição econômica está, independente da classe. A B3 é o domicílio do + ativo, não a exposição. Logo: + - **IVVB11** (ETF de S&P 500 listado na B3) → `RV` + `exposure=Internacional` + - **BDR** → `RV` + `exposure=Internacional` (risco cambial/exterior) + - **FIA de mandato global** (ARBOR, WHG) → `RV` + `exposure=Internacional` + +### Eixo de certeza fiscal + +Os bools `incentivada_1243`/`isento` respondem "sim/não". Os status carregam a +**certeza** que o bool não carrega: + +- `lei_12431_status`: `confirmed` (sinal explícito de infra / FI-Infra), + `candidate` (heurística emissor+IPCA, **confirmar por ISIN** antes de tratar + como isento), `not_applicable` (é debênture, mas não infra), `unknown`. +- `isento_status`: `confirmed_exempt` (estatutário: CRA/CRI, LCI/LCA, 12.431 + confirmada), `candidate_exempt` (heurística), `confirmed_taxable`, `unknown`. + +Quando `confidence < ~0.9` ou status `candidate`, é gancho de revisão humana. + +## Cascata de fontes (fallback) + +1. **openfindata** (primário, offline): seed curado + regras estruturais. Resolve + o test set sem rede. +2. **Mais Retorno MCP** (dados BR de fundo/CNPJ/classe CVM). +3. **outro provider** (CVM dados abertos / B3). +4. **web_search restrito** a `maisretorno.com`, `b3.com.br`, + `yahoofinance.com.br`, `debentures.com.br`. 
+ +Cada degrau preenche o que o anterior não trouxe e **baixa a confidence**; +`source` reflete a origem final; `cascade` loga o caminho. Os degraus 2 a 4 são +um ponto de extensão injetável (`AssetProvider`), consultado só quando o +resultado do núcleo está fraco. No estado atual deste PR, **só o degrau 1 está +ligado** (os externos são stubs a conectar no deploy). + +## Test set (passa 100%, offline) + +| Identificador | macro_class | exposure | nota | +|---|---|---|---| +| IFRA11 / FI ITAUINFRA | Renda Fixa | Brasil | ETF de debêntures de infra; "Indexada à Inflação"; isento confirmado | +| ARBOR FIC FIA | Renda Variável | Internacional | mandato global sem "IE" | +| WHG GLOBAL FIC FIA IE | Renda Variável | Internacional | estrutura IE | +| DEB PETROBRAS IPCA+ | Renda Fixa | Brasil | debênture; incentivada **candidate** (confirmar ISIN) | +| COE | Estruturados | (n/a) | `kind=coe`, **nunca** ETF | +| "Crédito Estruturado" (Warren/AMW) | Renda Fixa | Brasil | name-trap: é crédito, não Estruturados | +| IVVB11 | Renda Variável | Internacional | ETF de ações S&P 500 | +| HGLG11 / MXRF11 | Renda Variável | Brasil | subclasse FII | + +## Não-funcionais + +- **Determinístico + cacheável**: mesmo identificador → mesma classificação + (exceto `as_of`); CNPJ/ticker mudam de classe raramente, cachear agressivo. +- **Latência baixa**: núcleo é offline, sem I/O. +- **Auditável**: sempre `source` + `as_of` + `cascade` + `signals`. +- **Sem PII**: só identificador de ativo cruza o boundary. + +## Pendências antes de produção + +- Conectar os providers externos reais (Mais Retorno MCP, web search restrito). +- Confirmação ISIN-level da incentivada (12.431) via ANBIMA/debentures.com.br no + degrau de cascata — hoje fica `candidate`. +- Ampliar o seed curado de ETFs conforme novos ETFs forem listados na B3. 
From 63b5b3f8e36746314eed2104b5466a0899281963 Mon Sep 17 00:00:00 2001 From: Roberto Date: Mon, 29 Jun 2026 22:12:48 -0300 Subject: [PATCH 7/7] fix(resolver): address remaining CodeRabbit review threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - models: strict ConfigDict(extra="forbid") base for the contract models, so a typo in an internally-built DebentureInfo/TaxInfo payload fails loudly instead of silently dropping a field. - engine: per-provider try/except in the cascade — a raising provider logs provider_error:<ExceptionName> on the cascade and continues, never nuking the deterministic core result. - engine: tighten the ticker fallback — only suffixes 3-8 map to Ações; other suffixes (subscription rights / odd codes) defer to Indefinido/HITL. Bare ...11 stays FII (spec test set: HGLG11/MXRF11). - seed: token-aware name-marker matching, so ("ARBOR","FIA") matches "ARBOR FIC FIA" but not "ARBOR FIAGRO" (FIA not a token of FIAGRO). - normalize: fold candidate args inside has_token/name_contains so natural-string callers cannot silently miss rules. - tests: regressions for each of the above. Co-Authored-By: Claude Opus 4.8 --- src/findata/resolver/engine.py | 35 ++++++++++++++++++++-------- src/findata/resolver/models.py | 27 ++++++++++++++++------ src/findata/resolver/normalize.py | 15 ++++++++---- src/findata/resolver/seed.py | 13 +++++++++-- tests/test_resolver.py | 38 +++++++++++++++++++++++++++++++ 5 files changed, 106 insertions(+), 22 deletions(-) diff --git a/src/findata/resolver/engine.py b/src/findata/resolver/engine.py index 6cf0646..81159d7 100644 --- a/src/findata/resolver/engine.py +++ b/src/findata/resolver/engine.py @@ -360,15 +360,25 @@ def _ticker_payload(n: NormalizedInput) -> dict[str, Any]: "signals": _signal("bdr", f"ticker={n.ticker}"), } # 3-8: ordinary/preferred share — ação brasileira. 
+ if suffix in {"3", "4", "5", "6", "7", "8"}: + return { + "kind": "acao", + "macro_class": "Renda Variável", + "subclasse": "Ações", + "exposure": "Brasil", + "underlying_nature": "acoes", + "confidence": 0.85, + "notes": "Ação listada na B3 → Renda Variável.", + "signals": _signal("acao", f"ticker={n.ticker}"), + } + # Other suffixes (1/2/9/10/12/13… subscription rights, receipts, odd codes) + # carry no reliable structural signal → defer to HITL/provider cascade. return { - "kind": "acao", - "macro_class": "Renda Variável", - "subclasse": "Ações", - "exposure": "Brasil", - "underlying_nature": "acoes", - "confidence": 0.85, - "notes": "Ação listada na B3 → Renda Variável.", - "signals": _signal("acao", f"ticker={n.ticker}"), + "kind": "outro", + "macro_class": "Indefinido", + "confidence": 0.2, + "notes": "Ticker com sufixo sem sinal estrutural suficiente; requer revisão (HITL).", + "signals": _signal("ticker_suffix_unknown", f"ticker={n.ticker}"), } @@ -709,7 +719,14 @@ async def resolve_asset( # Stop early once we are confident — saves the network round-trips. if result.macro_class != "Indefinido" and result.confidence >= _CONFIDENT_ENOUGH: break - enriched = await provider(norm, result) + # Providers are best-effort enrichment: a flaky network/provider must not + # nuke the deterministic core result. Isolate the failure, log it on the + # cascade, and keep the last good classification. 
+ try: + enriched = await provider(norm, result) + except Exception as exc: # any provider failure is non-fatal + result.cascade.append(f"provider_error:{type(exc).__name__}") + continue if enriched is not None: enriched.cascade = [*result.cascade, *enriched.cascade] result = enriched diff --git a/src/findata/resolver/models.py b/src/findata/resolver/models.py index 2d00ee7..2b6f3b0 100644 --- a/src/findata/resolver/models.py +++ b/src/findata/resolver/models.py @@ -14,7 +14,20 @@ from typing import Literal -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field + + +class _StrictModel(BaseModel): + """Base for the resolver contract models. + + ``extra="forbid"`` so a typo in an internally-built payload (the engine + constructs ``DebentureInfo(**deb)`` / ``TaxInfo(**tax)`` from dicts) raises a + validation error instead of silently dropping the field and emitting a + partially empty classification. + """ + + model_config = ConfigDict(extra="forbid") + # ── Controlled vocabularies ──────────────────────────────────────── @@ -86,7 +99,7 @@ IsentoStatus = Literal["confirmed_exempt", "candidate_exempt", "confirmed_taxable", "unknown"] -class Signal(BaseModel): +class Signal(_StrictModel): """One structured audit entry: which rule fired and what evidence matched. Unlike the free-text ``notes``, a ``Signal`` is machine-readable so an @@ -99,7 +112,7 @@ class Signal(BaseModel): detail: str | None = None # optional extra (e.g. 
"basis=heuristic", "indexador=IPCA+") -class IdentifierResolved(BaseModel): +class IdentifierResolved(_StrictModel): """The identifiers the resolver could normalize/confirm from the input.""" cnpj: str | None = None @@ -108,7 +121,7 @@ class IdentifierResolved(BaseModel): name: str | None = None -class CvmInfo(BaseModel): +class CvmInfo(_StrictModel): """Raw upstream classification, kept for audit alongside the mapped macro.""" classe: str | None = None @@ -116,7 +129,7 @@ class CvmInfo(BaseModel): estrutura: str | None = None # FIA | FIM | FIC | FIDC | FIP | FII | IE | ETF | ... -class DebentureInfo(BaseModel): +class DebentureInfo(_StrictModel): """Debenture-specific facts. Only populated when ``kind == 'debenture'`` (or an FI-Infra ETF whose underlying *is* incentivada debentures).""" @@ -128,7 +141,7 @@ class DebentureInfo(BaseModel): vencimento: str | None = None # YYYY-MM when known -class TaxInfo(BaseModel): +class TaxInfo(_StrictModel): """Tax treatment for the typical PF holder.""" isento: bool | None = None # True for Lei 12.431 / LCI-LCA / FII dividends etc. @@ -137,7 +150,7 @@ class TaxInfo(BaseModel): isento_status: IsentoStatus = "unknown" -class AssetClassification(BaseModel): +class AssetClassification(_StrictModel): """The full resolver output. One asset in → one auditable record out.""" identifier_resolved: IdentifierResolved diff --git a/src/findata/resolver/normalize.py b/src/findata/resolver/normalize.py index a6188c7..d08662a 100644 --- a/src/findata/resolver/normalize.py +++ b/src/findata/resolver/normalize.py @@ -59,13 +59,20 @@ class NormalizedInput: isin: str | None = None def has_token(self, *candidates: str) -> bool: - """True if any candidate appears as a whole token.""" + """True if any candidate appears as a whole token. + + Candidates are folded internally, so callers may pass natural strings + ("Ações") or pre-folded markers ("ACOES") interchangeably. 
+ """ tset = set(self.tokens) - return any(c in tset for c in candidates) + return any(fold(c) in tset for c in candidates) def name_contains(self, *needles: str) -> bool: - """True if any needle is a substring of the folded name (phrase match).""" - return any(n in self.name_folded for n in needles) + """True if any needle is a substring of the folded name (phrase match). + + Needles are folded internally (see :meth:`has_token`). + """ + return any(fold(n) in self.name_folded for n in needles) @property def ticker_digits_suffix(self) -> str | None: diff --git a/src/findata/resolver/seed.py b/src/findata/resolver/seed.py index 05f9758..e6a074c 100644 --- a/src/findata/resolver/seed.py +++ b/src/findata/resolver/seed.py @@ -28,6 +28,8 @@ from dataclasses import dataclass, field from typing import Any +from findata.resolver.normalize import tokenize + @dataclass(frozen=True) class SeedEntry: @@ -167,8 +169,15 @@ def lookup_seed(*, ticker: str | None, cnpj: str | None, name_folded: str) -> Se return _BY_CNPJ[cnpj] if name_folded: # ``name_folded`` is already ASCII-folded/uppercased by the caller - # (normalize()), so match directly — no second fold. + # (normalize()), so no second fold. Single-word markers must match a + # whole token — so ("ARBOR", "FIA") matches "ARBOR FIC FIA" but NOT + # "ARBOR FIAGRO" (FIA ⊄ FIAGRO as a token). Multi-word / non-alnum + # markers fall back to substring. 
+ tokens = set(tokenize(name_folded)) for entry in _NAME_ENTRIES: - if all(sub in name_folded for sub in entry.name_substrings): + if all( + (sub in tokens) if sub.isalnum() else (sub in name_folded) + for sub in entry.name_substrings + ): return entry return None diff --git a/tests/test_resolver.py b/tests/test_resolver.py index b2b304f..2c37076 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -343,3 +343,41 @@ def test_credito_estruturado_trap_signal_carries_phrase(): def test_every_result_carries_at_least_one_signal(ident): r = classify(normalize(name=ident)) assert len(r.signals) >= 1 + + +# ── Review-bot regressions (PR #33 threads) ──────────────────────── + + +def test_unknown_ticker_suffix_is_indefinido_not_acao(): + # CodeRabbit: bare ...13 (subscription receipt / odd code) must not be forced + # into RV/Ações; it has no structural signal → defer to HITL. + r = classify(normalize(ticker="XPTO13")) + assert r.macro_class == "Indefinido" + assert r.kind == "outro" + + +def test_arbor_fiagro_does_not_match_global_equity_seed(): + # CodeRabbit: the ("ARBOR","FIA") seed must match by token, so "ARBOR FIAGRO" + # (a real-estate/agro vehicle, FIA ⊄ FIAGRO) is NOT swept into RV/Internacional. + r = classify(normalize(name="ARBOR FIAGRO FII")) + assert not (r.macro_class == "Renda Variável" and r.exposure == "Internacional") + + +def test_provider_failure_does_not_abort_resolution(): + # CodeRabbit: a raising provider must not nuke the deterministic core result. + async def boom(norm, current): + raise RuntimeError("network down") + + r = asyncio.run(resolve_asset(name="????", providers=[boom])) + assert r is not None + assert any(c.startswith("provider_error:") for c in r.cascade) + + +def test_contract_models_forbid_unknown_keys(): + # CodeRabbit: a typo in an internally-built payload must fail loudly. 
+ from pydantic import ValidationError + + from findata.resolver.models import DebentureInfo + + with pytest.raises(ValidationError): + DebentureInfo(incentivado_1243=True) # typo: incentivADO