From 7e180d290450a6826caaa9b5478c3df94dedb48b Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Tue, 24 Mar 2026 00:44:55 +0100
Subject: [PATCH 01/27] fix(tee_registry): randomize TEE selection to improve
 load distribution

Previously get_llm_tee() always returned tees[0], the first TEE in the
registry list. This caused all clients to hit the same TEE, providing no
load distribution and no resilience when that TEE starts failing.

Fix: use random.choice(tees) so each LLM() construction independently
picks from all currently active TEEs. Successive retries or re-initializations
will therefore naturally spread across the healthy pool.

Closes #200
---
 src/opengradient/client/tee_registry.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/src/opengradient/client/tee_registry.py b/src/opengradient/client/tee_registry.py
index 9ad3cfd7..b791a383 100644
--- a/src/opengradient/client/tee_registry.py
+++ b/src/opengradient/client/tee_registry.py
@@ -1,6 +1,7 @@
 """TEE Registry client for fetching verified TEE endpoints and TLS certificates."""
 
 import logging
+import random
 import ssl
 from dataclasses import dataclass
 from typing import List, NamedTuple, Optional
@@ -109,17 +110,32 @@ def get_active_tees_by_type(self, tee_type: int) -> List[TEEEndpoint]:
 
     def get_llm_tee(self) -> Optional[TEEEndpoint]:
         """
-        Return the first active LLM proxy TEE from the registry.
+        Return a randomly selected active LLM proxy TEE from the registry.
+
+        Randomizing the selection distributes load across all healthy TEEs and
+        avoids repeatedly routing to the same TEE when it starts failing
+        (addresses issue #200 — improve TEE selection/retry logic).
 
         Returns:
-            TEEEndpoint for an active LLM proxy TEE, or None if none are available.
+            TEEEndpoint for a randomly chosen active LLM proxy TEE, or None if
+            none are available.
         """
         tees = self.get_active_tees_by_type(TEE_TYPE_LLM_PROXY)
         if not tees:
             logger.warning("No active LLM proxy TEEs found in registry")
             return None
 
-        return tees[0]
+        # Randomly select from all active TEEs to distribute load and improve
+        # resilience — if one TEE is failing, successive LLM() constructions
+        # will eventually land on a healthy one.
+        selected = random.choice(tees)
+        logger.debug(
+            "Selected TEE %s (endpoint=%s) from %d active LLM proxy TEE(s)",
+            selected.tee_id,
+            selected.endpoint,
+            len(tees),
+        )
+        return selected
 
 
 def build_ssl_context_from_der(der_cert: bytes) -> ssl.SSLContext:

From cddb41354a063be79df701c3b7aaae6bdb99fdbe Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Tue, 24 Mar 2026 00:46:40 +0100
Subject: [PATCH 02/27] fix(llm): surface HTTP 402 as actionable error with OPG
 approval hint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously any HTTP error from the TEE (including 402 Payment Required)
was silently wrapped into a generic RuntimeError("TEE LLM chat failed: ...").
This caused the confusing traceback seen in issue #188 — the real cause
(insufficient OPG allowance) was buried inside the exception message.

Fix:
- Add `import httpx` and intercept httpx.HTTPStatusError before the
  generic `except Exception` handler in both _chat_request() and
  completion().
- When status == 402, raise a RuntimeError with a clear, actionable hint
  telling the user to call llm.ensure_opg_approval(opg_amount=<amount>).
- Also handle 402 in _parse_sse_response() for the streaming path.
- All other HTTP errors continue to surface as before.

Closes #188
---
 src/opengradient/client/llm.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/src/opengradient/client/llm.py b/src/opengradient/client/llm.py
index eecfa148..d66bf009 100644
--- a/src/opengradient/client/llm.py
+++ b/src/opengradient/client/llm.py
@@ -1,11 +1,12 @@
 """LLM chat and completion via TEE-verified execution with x402 payments."""
 
-import json
+import json as _json
 import logging
 import ssl
 from dataclasses import dataclass
 from typing import AsyncGenerator, Dict, List, Optional, Union
 
+import httpx
 from eth_account import Account
 from eth_account.account import LocalAccount
 from x402 import x402Client
@@ -31,6 +32,12 @@
 _COMPLETION_ENDPOINT = "/v1/completions"
 _REQUEST_TIMEOUT = 60
 
+_402_HINT = (
+    "Payment required (HTTP 402): your wallet may have insufficient OPG token allowance. "
+    "Call llm.ensure_opg_approval(opg_amount=<amount>) to approve Permit2 spending "
+    "before making requests. Minimum amount is 0.05 OPG."
+)
+
 
 @dataclass
 class _ChatParams:
@@ -267,7 +274,7 @@ async def completion(
             )
             response.raise_for_status()
             raw_body = await response.aread()
-            result = json.loads(raw_body.decode())
+            result = _json.loads(raw_body.decode())
             return TextGenerationOutput(
                 transaction_hash="external",
                 completion_output=result.get("completion"),
@@ -277,6 +284,10 @@ async def completion(
             )
         except RuntimeError:
             raise
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 402:
+                raise RuntimeError(_402_HINT) from e
+            raise RuntimeError(f"TEE LLM completion failed: {e}") from e
         except Exception as e:
             raise RuntimeError(f"TEE LLM completion failed: {e}") from e
 
@@ -354,7 +365,7 @@ async def _chat_request(self, params: _ChatParams, messages: List[Dict]) -> Text
             )
             response.raise_for_status()
             raw_body = await response.aread()
-            result = json.loads(raw_body.decode())
+            result = _json.loads(raw_body.decode())
 
             choices = result.get("choices")
             if not choices:
@@ -377,6 +388,12 @@ async def _chat_request(self, params: _ChatParams, messages: List[Dict]) -> Text
             )
         except RuntimeError:
             raise
+        except httpx.HTTPStatusError as e:
+            # Provide an actionable error message for the very common 402 case
+            # (issue #188 — users see a cryptic RuntimeError instead of guidance).
+            if e.response.status_code == 402:
+                raise RuntimeError(_402_HINT) from e
+            raise RuntimeError(f"TEE LLM chat failed: {e}") from e
         except Exception as e:
             raise RuntimeError(f"TEE LLM chat failed: {e}") from e
 
@@ -425,6 +442,8 @@ async def _parse_sse_response(self, response) -> AsyncGenerator[StreamChunk, Non
         status_code = getattr(response, "status_code", None)
         if status_code is not None and status_code >= 400:
             body = await response.aread()
+            if status_code == 402:
+                raise RuntimeError(_402_HINT)
             raise RuntimeError(f"TEE LLM streaming request failed with status {status_code}: {body.decode('utf-8', errors='replace')}")
 
         buffer = b""
@@ -452,8 +471,8 @@ async def _parse_sse_response(self, response) -> AsyncGenerator[StreamChunk, Non
                     return
 
                 try:
-                    data = json.loads(data_str)
-                except json.JSONDecodeError:
+                    data = _json.loads(data_str)
+                except _json.JSONDecodeError:
                     continue
 
                 chunk = StreamChunk.from_sse_data(data)

From d345fdde234b883382b735c778ebccf99601477a Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Tue, 24 Mar 2026 00:47:43 +0100
Subject: [PATCH 03/27] fix(_conversions): return int for zero-decimal
 fixed-point values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously convert_to_float32() always returned np.float32 regardless of
the decimals field, forcing callers to manually cast integer results
(issue #103 — add proper type conversions from Solidity contract to Python).

Fix:
- Add convert_fixed_point_to_python(value, decimals) that returns int when
  decimals == 0 and np.float32 otherwise.  np.array() automatically picks
  the correct dtype (int64 vs float32) based on the element types.
- Update both convert_to_model_output() and convert_array_to_model_output()
  to call the new function.
- Keep convert_to_float32() as a deprecated backward-compatible alias so
  any external code that imports it directly continues to work.

Partially closes #103
---
 src/opengradient/client/_conversions.py | 49 +++++++++++++++++++------
 1 file changed, 37 insertions(+), 12 deletions(-)

diff --git a/src/opengradient/client/_conversions.py b/src/opengradient/client/_conversions.py
index 495f663b..1b9e7535 100644
--- a/src/opengradient/client/_conversions.py
+++ b/src/opengradient/client/_conversions.py
@@ -3,7 +3,7 @@
 import json
 import logging
 from decimal import Decimal
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import numpy as np
 from web3.datastructures import AttributeDict
@@ -36,11 +36,34 @@ def convert_to_fixed_point(number: float) -> Tuple[int, int]:
     return value, decimals
 
 
+def convert_fixed_point_to_python(value: int, decimals: int) -> Union[int, np.float32]:
+    """
+    Converts a fixed-point representation back to a native Python/NumPy type.
+
+    Returns int when decimals == 0 (preserving integer semantics for
+    tensors that were originally integers — fixes issue #103 where callers
+    expecting int results received np.float32 and had to cast manually).
+    Returns np.float32 for all other cases.
+
+    Args:
+        value:    The integer significand stored on-chain.
+        decimals: The scale factor exponent (value / 10**decimals).
+
+    Returns:
+        int if decimals == 0, np.float32 otherwise.
+    """
+    if decimals == 0:
+        return int(value)
+    return np.float32(Decimal(value) / (10 ** Decimal(decimals)))
+
+
 def convert_to_float32(value: int, decimals: int) -> np.float32:
     """
-    Converts fixed point back into floating point
+    Deprecated: use convert_fixed_point_to_python() instead.
 
-    Returns an np.float32 type
+    Kept for backwards compatibility — always returns np.float32 regardless
+    of the decimals value.  New callers should use convert_fixed_point_to_python
+    which correctly returns int when decimals == 0.
     """
     return np.float32(Decimal(value) / (10 ** Decimal(decimals)))
 
@@ -61,11 +84,11 @@ def convert_to_model_input(inputs: Dict[str, np.ndarray]) -> Tuple[List[Tuple[st
     for tensor_name, tensor_data in inputs.items():
         # Convert to NP array if list or single object
         if isinstance(tensor_data, list):
-            logging.debug(f"\tConverting {tensor_data} to np array")
+            logging.debug(f"	Converting {tensor_data} to np array")
             tensor_data = np.array(tensor_data)
 
         if isinstance(tensor_data, (str, int, float)):
-            logging.debug(f"\tConverting single entry {tensor_data} to a list")
+            logging.debug(f"	Converting single entry {tensor_data} to a list")
             tensor_data = np.array([tensor_data])
 
         # Check if type is np array
@@ -84,7 +107,7 @@ def convert_to_model_input(inputs: Dict[str, np.ndarray]) -> Tuple[List[Tuple[st
             converted_tensor_data = np.array([convert_to_fixed_point(i) for i in flat_data], dtype=data_type)
 
             input = (tensor_name, converted_tensor_data.tolist(), shape)
-            logging.debug("\tFloating tensor input: %s", input)
+            logging.debug("	Floating tensor input: %s", input)
 
             number_tensors.append(input)
         elif issubclass(tensor_data.dtype.type, np.integer):
@@ -93,13 +116,13 @@ def convert_to_model_input(inputs: Dict[str, np.ndarray]) -> Tuple[List[Tuple[st
             converted_tensor_data = np.array([convert_to_fixed_point(int(i)) for i in flat_data], dtype=data_type)
 
             input = (tensor_name, converted_tensor_data.tolist(), shape)
-            logging.debug("\tInteger tensor input: %s", input)
+            logging.debug("	Integer tensor input: %s", input)
 
             number_tensors.append(input)
         elif issubclass(tensor_data.dtype.type, np.str_):
             # TODO (Kyle): Add shape into here as well
             input = (tensor_name, [s for s in flat_data])
-            logging.debug("\tString tensor input: %s", input)
+            logging.debug("	String tensor input: %s", input)
 
             string_tensors.append(input)
         else:
@@ -131,10 +154,11 @@ def convert_to_model_output(event_data: AttributeDict) -> Dict[str, np.ndarray]:
                 name = tensor.get("name")
                 shape = tensor.get("shape")
                 values = []
-                # Convert from fixed point back into np.float32
+                # Use convert_fixed_point_to_python so integer tensors (decimals==0)
+                # come back as int instead of np.float32 (fixes issue #103).
                 for v in tensor.get("values", []):
                     if isinstance(v, (AttributeDict, dict)):
-                        values.append(convert_to_float32(value=int(v.get("value")), decimals=int(v.get("decimals"))))
+                        values.append(convert_fixed_point_to_python(value=int(v.get("value")), decimals=int(v.get("decimals"))))
                     else:
                         logging.warning(f"Unexpected number type: {type(v)}")
                 output_dict[name] = np.array(values).reshape(shape)
@@ -183,10 +207,11 @@ def convert_array_to_model_output(array_data: List) -> ModelOutput:
         values = tensor[1]
         shape = tensor[2]
 
-        # Convert from fixed point into np.float32
+        # Use convert_fixed_point_to_python so integer tensors (decimals==0)
+        # come back as int instead of np.float32 (fixes issue #103).
         converted_values = []
         for value in values:
-            converted_values.append(convert_to_float32(value=value[0], decimals=value[1]))
+            converted_values.append(convert_fixed_point_to_python(value=value[0], decimals=value[1]))
 
         number_data[name] = np.array(converted_values).reshape(shape)
 

From 72a6279468c24b795ad7f76061d17d66fc536642 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Tue, 24 Mar 2026 00:49:18 +0100
Subject: [PATCH 04/27] fix(alpha): guard against None from inference node +
 use estimate_gas in run_workflow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug 4 — None crash in infer():
  _get_inference_result_from_node() can legitimately return None when the
  inference node has no result yet.  Previously this None was passed
  directly to convert_to_model_output(), which calls event_data.get(),
  causing an AttributeError: 'NoneType' object has no attribute 'get'.

  Fix: after calling _get_inference_result_from_node(), check for None
  and raise a clear RuntimeError with a human-readable message and the
  inference_id so callers know what to retry.

  Also guard the precompile log fetch: if parsed_logs is empty, raise
  RuntimeError instead of crashing with an IndexError on parsed_logs[0].

Bug 5 — hardcoded 30M gas in run_workflow():
  run_workflow() built the transaction with gas=30000000 (30 million)
  unconditionally.  This is wasteful (users overpay for gas) and can
  fail on networks with a lower block gas limit.

  Fix: call estimate_gas() first (consistent with infer()), then multiply
  by 3 for headroom.  Fall back to 30M only if estimation itself fails.

Also fixes new_workflow() deploy to use INFERENCE_TX_TIMEOUT constant
instead of the previous hardcoded 60 seconds.
---
 src/opengradient/client/alpha.py | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/opengradient/client/alpha.py b/src/opengradient/client/alpha.py
index a4e633ec..967f2774 100644
--- a/src/opengradient/client/alpha.py
+++ b/src/opengradient/client/alpha.py
@@ -119,9 +119,19 @@ def execute_transaction():
             model_output = convert_to_model_output(parsed_logs[0]["args"])
             if len(model_output) == 0:
                 # check inference directly from node
-                parsed_logs = precompile_contract.events.ModelInferenceEvent().process_receipt(tx_receipt, errors=DISCARD)
-                inference_id = parsed_logs[0]["args"]["inferenceID"]
+                precompile_logs = precompile_contract.events.ModelInferenceEvent().process_receipt(tx_receipt, errors=DISCARD)
+                if not precompile_logs:
+                    raise RuntimeError(
+                        "ModelInferenceEvent not found in transaction logs. "
+                        "Cannot fall back to node-side inference result."
+                    )
+                inference_id = precompile_logs[0]["args"]["inferenceID"]
                 inference_result = self._get_inference_result_from_node(inference_id, inference_mode)
+                if inference_result is None:
+                    raise RuntimeError(
+                        f"Inference node returned no result for inference ID {inference_id!r}. "
+                        "The result may not be available yet — retry after a short delay."
+                    )
                 model_output = convert_to_model_output(inference_result)
 
             return InferenceResult(tx_hash.hex(), model_output)
@@ -315,7 +325,7 @@ def deploy_transaction():
             signed_txn = self._wallet_account.sign_transaction(transaction)
             tx_hash = self._blockchain.eth.send_raw_transaction(signed_txn.raw_transaction)
 
-            tx_receipt = self._blockchain.eth.wait_for_transaction_receipt(tx_hash, timeout=60)
+            tx_receipt = self._blockchain.eth.wait_for_transaction_receipt(tx_hash, timeout=INFERENCE_TX_TIMEOUT)
 
             if tx_receipt["status"] == 0:
                 raise Exception(f"Contract deployment failed, transaction hash: {tx_hash.hex()}")
@@ -419,11 +429,20 @@ def run_workflow(self, contract_address: str) -> ModelOutput:
         nonce = self._blockchain.eth.get_transaction_count(self._wallet_account.address, "pending")
 
         run_function = contract.functions.run()
+
+        # Estimate gas instead of using a hardcoded 30M limit, which is wasteful
+        # and may exceed the block gas limit on some networks.
+        try:
+            estimated_gas = run_function.estimate_gas({"from": self._wallet_account.address})
+            gas_limit = int(estimated_gas * 3)
+        except Exception:
+            gas_limit = 30000000  # Conservative fallback if estimation fails
+
         transaction = run_function.build_transaction(
             {
                 "from": self._wallet_account.address,
                 "nonce": nonce,
-                "gas": 30000000,
+                "gas": gas_limit,
                 "gasPrice": self._blockchain.eth.gas_price,
                 "chainId": self._blockchain.eth.chain_id,
             }

From 7c786b12ba0d3a8272719a814220652839e17c3f Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 00:41:45 +0100
Subject: [PATCH 05/27] fix: make convert_fixed_point_to_python type-stable
 (always np.float32), replace literal tabs with \t

---
 src/opengradient/client/_conversions.py | 32 ++++++++++++-------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/src/opengradient/client/_conversions.py b/src/opengradient/client/_conversions.py
index 1b9e7535..895408d6 100644
--- a/src/opengradient/client/_conversions.py
+++ b/src/opengradient/client/_conversions.py
@@ -36,24 +36,22 @@ def convert_to_fixed_point(number: float) -> Tuple[int, int]:
     return value, decimals
 
 
-def convert_fixed_point_to_python(value: int, decimals: int) -> Union[int, np.float32]:
+def convert_fixed_point_to_python(value: int, decimals: int) -> np.float32:
     """
-    Converts a fixed-point representation back to a native Python/NumPy type.
+    Converts a fixed-point representation back to a NumPy float32.
 
-    Returns int when decimals == 0 (preserving integer semantics for
-    tensors that were originally integers — fixes issue #103 where callers
-    expecting int results received np.float32 and had to cast manually).
-    Returns np.float32 for all other cases.
+    This function is intentionally type-stable and always returns np.float32,
+    regardless of the value of `decimals`. Callers that require integer
+    semantics should perform an explicit cast (e.g., int(...)) based on
+    their own dtype metadata or application logic.
 
     Args:
         value:    The integer significand stored on-chain.
         decimals: The scale factor exponent (value / 10**decimals).
 
     Returns:
-        int if decimals == 0, np.float32 otherwise.
+        np.float32 corresponding to `value / 10**decimals`.
     """
-    if decimals == 0:
-        return int(value)
     return np.float32(Decimal(value) / (10 ** Decimal(decimals)))
 
 
@@ -61,9 +59,9 @@ def convert_to_float32(value: int, decimals: int) -> np.float32:
     """
     Deprecated: use convert_fixed_point_to_python() instead.
 
-    Kept for backwards compatibility — always returns np.float32 regardless
-    of the decimals value.  New callers should use convert_fixed_point_to_python
-    which correctly returns int when decimals == 0.
+    Kept for backwards compatibility. New callers should use
+    convert_fixed_point_to_python which is type-stable and always
+    returns np.float32.
     """
     return np.float32(Decimal(value) / (10 ** Decimal(decimals)))
 
@@ -84,11 +82,11 @@ def convert_to_model_input(inputs: Dict[str, np.ndarray]) -> Tuple[List[Tuple[st
     for tensor_name, tensor_data in inputs.items():
         # Convert to NP array if list or single object
         if isinstance(tensor_data, list):
-            logging.debug(f"	Converting {tensor_data} to np array")
+            logging.debug(f"\tConverting {tensor_data} to np array")
             tensor_data = np.array(tensor_data)
 
         if isinstance(tensor_data, (str, int, float)):
-            logging.debug(f"	Converting single entry {tensor_data} to a list")
+            logging.debug(f"\tConverting single entry {tensor_data} to a list")
             tensor_data = np.array([tensor_data])
 
         # Check if type is np array
@@ -107,7 +105,7 @@ def convert_to_model_input(inputs: Dict[str, np.ndarray]) -> Tuple[List[Tuple[st
             converted_tensor_data = np.array([convert_to_fixed_point(i) for i in flat_data], dtype=data_type)
 
             input = (tensor_name, converted_tensor_data.tolist(), shape)
-            logging.debug("	Floating tensor input: %s", input)
+            logging.debug("\tFloating tensor input: %s", input)
 
             number_tensors.append(input)
         elif issubclass(tensor_data.dtype.type, np.integer):
@@ -116,13 +114,13 @@ def convert_to_model_input(inputs: Dict[str, np.ndarray]) -> Tuple[List[Tuple[st
             converted_tensor_data = np.array([convert_to_fixed_point(int(i)) for i in flat_data], dtype=data_type)
 
             input = (tensor_name, converted_tensor_data.tolist(), shape)
-            logging.debug("	Integer tensor input: %s", input)
+            logging.debug("\tInteger tensor input: %s", input)
 
             number_tensors.append(input)
         elif issubclass(tensor_data.dtype.type, np.str_):
             # TODO (Kyle): Add shape into here as well
             input = (tensor_name, [s for s in flat_data])
-            logging.debug("	String tensor input: %s", input)
+            logging.debug("\tString tensor input: %s", input)
 
             string_tensors.append(input)
         else:

From deff27c98c1e6f92238658416440a4effcbba18f Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 00:41:46 +0100
Subject: [PATCH 06/27] fix: separate ContractLogicError from generic Exception
 in gas estimation fallback

---
 src/opengradient/client/alpha.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/opengradient/client/alpha.py b/src/opengradient/client/alpha.py
index 967f2774..094efaa3 100644
--- a/src/opengradient/client/alpha.py
+++ b/src/opengradient/client/alpha.py
@@ -435,8 +435,18 @@ def run_workflow(self, contract_address: str) -> ModelOutput:
         try:
             estimated_gas = run_function.estimate_gas({"from": self._wallet_account.address})
             gas_limit = int(estimated_gas * 3)
+        except ContractLogicError as exc:
+            # Estimation failed due to a contract revert — simulate the call to
+            # surface the revert reason and avoid sending a transaction that will fail.
+            try:
+                run_function.call({"from": self._wallet_account.address})
+            except ContractLogicError as call_exc:
+                # Re-raise the detailed revert reason from the simulated call.
+                raise call_exc
+            # If the simulated call somehow doesn't raise, re-raise the original error.
+            raise exc
         except Exception:
-            gas_limit = 30000000  # Conservative fallback if estimation fails
+            gas_limit = 30000000  # Conservative fallback for transient/RPC estimation errors
 
         transaction = run_function.build_transaction(
             {

From 5dcecd14e4dd24a15322a0db0f22e3e6d5319706 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 00:41:46 +0100
Subject: [PATCH 07/27] test: fix TestGetLlmTee to patch random.choice for
 deterministic CI

---
 tests/tee_registry_test.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/tests/tee_registry_test.py b/tests/tee_registry_test.py
index 189ad26c..a577d43b 100644
--- a/tests/tee_registry_test.py
+++ b/tests/tee_registry_test.py
@@ -1,6 +1,7 @@
 import os
 import ssl
 import sys
+import random
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -148,19 +149,36 @@ def test_validator_type_label(self, mock_contract):
 
 
 class TestGetLlmTee:
-    def test_returns_first_active_tee(self, mock_contract):
+    def test_returns_active_tee_from_pool(self, mock_contract):
+        """get_llm_tee uses random.choice; patch it to make the test deterministic."""
         registry, contract = mock_contract
 
-        contract.functions.getActiveTEEs.return_value.call.return_value = [
+        tee_infos = [
             _make_tee_info(endpoint="https://tee-1.example.com"),
             _make_tee_info(endpoint="https://tee-2.example.com"),
         ]
+        contract.functions.getActiveTEEs.return_value.call.return_value = tee_infos
 
-        result = registry.get_llm_tee()
+        with patch("src.opengradient.client.tee_registry.random.choice", side_effect=lambda seq: seq[0]):
+            result = registry.get_llm_tee()
 
         assert result is not None
         assert result.endpoint == "https://tee-1.example.com"
 
+    def test_returns_any_active_tee(self, mock_contract):
+        """Without patching random.choice, result must still be one of the active TEEs."""
+        registry, contract = mock_contract
+
+        endpoints = {"https://tee-1.example.com", "https://tee-2.example.com"}
+        contract.functions.getActiveTEEs.return_value.call.return_value = [
+            _make_tee_info(endpoint=ep) for ep in sorted(endpoints)
+        ]
+
+        result = registry.get_llm_tee()
+
+        assert result is not None
+        assert result.endpoint in endpoints
+
     def test_returns_none_when_no_tees(self, mock_contract):
         registry, contract = mock_contract
 

From 6cfbb68d565eb1dc9d6032fd005afc07b22c23f1 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 00:50:46 +0100
Subject: [PATCH 08/27] =?UTF-8?q?fix:=20merge=20main=20into=20llm.py=20?=
 =?UTF-8?q?=E2=80=94=20restore=20=5Fconnect=5Ftee,=20=5Frefresh=5Ftee,=20?=
 =?UTF-8?q?=5Fcall=5Fwith=5Ftee=5Fretry;=20keep=20402=20hint,=20json=20ali?=
 =?UTF-8?q?as,=20OPG=200.05?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/opengradient/client/llm.py | 125 +++++++++++++++++++++++++--------
 1 file changed, 96 insertions(+), 29 deletions(-)

diff --git a/src/opengradient/client/llm.py b/src/opengradient/client/llm.py
index d66bf009..8896eac0 100644
--- a/src/opengradient/client/llm.py
+++ b/src/opengradient/client/llm.py
@@ -4,9 +4,9 @@
 import logging
 import ssl
 from dataclasses import dataclass
-from typing import AsyncGenerator, Dict, List, Optional, Union
-
+from typing import AsyncGenerator, Awaitable, Callable, Dict, List, Optional, TypeVar, Union
 import httpx
+
 from eth_account import Account
 from eth_account.account import LocalAccount
 from x402 import x402Client
@@ -20,6 +20,7 @@
 from .tee_registry import TEERegistry, build_ssl_context_from_der
 
 logger = logging.getLogger(__name__)
+T = TypeVar("T")
 
 DEFAULT_RPC_URL = "https://ogevmdevnet.opengradient.ai"
 DEFAULT_TEE_REGISTRY_ADDRESS = "0x4e72238852f3c918f4E4e57AeC9280dDB0c80248"
@@ -101,32 +102,44 @@ def __init__(
         llm_server_url: Optional[str] = None,
     ):
         self._wallet_account: LocalAccount = Account.from_key(private_key)
+        self._rpc_url = rpc_url
+        self._tee_registry_address = tee_registry_address
+        self._llm_server_url = llm_server_url
+
+        # x402 payment stack (created once, reused across TEE refreshes)
+        signer = EthAccountSigner(self._wallet_account)
+        self._x402_client = x402Client()
+        register_exact_evm_client(self._x402_client, signer, networks=[BASE_TESTNET_NETWORK])
+        register_upto_evm_client(self._x402_client, signer, networks=[BASE_TESTNET_NETWORK])
 
+        self._connect_tee()
+
+    # ── TEE resolution and connection ───────────────────────────────────────────
+
+    def _connect_tee(self) -> None:
+        """Resolve TEE from registry and create a secure HTTP client for it."""
         endpoint, tls_cert_der, tee_id, tee_payment_address = self._resolve_tee(
-            llm_server_url,
-            rpc_url,
-            tee_registry_address,
+            self._llm_server_url,
+            self._rpc_url,
+            self._tee_registry_address,
         )
-
         self._tee_id = tee_id
         self._tee_endpoint = endpoint
         self._tee_payment_address = tee_payment_address
 
         ssl_ctx = build_ssl_context_from_der(tls_cert_der) if tls_cert_der else None
-        # When connecting directly via llm_server_url, skip cert verification —
-        # self-hosted TEE servers commonly use self-signed certificates.
-        verify_ssl = llm_server_url is None
-        self._tls_verify: Union[ssl.SSLContext, bool] = ssl_ctx if ssl_ctx else verify_ssl
-
-        # x402 client and signer
-        signer = EthAccountSigner(self._wallet_account)
-        self._x402_client = x402Client()
-        register_exact_evm_client(self._x402_client, signer, networks=[BASE_TESTNET_NETWORK])
-        register_upto_evm_client(self._x402_client, signer, networks=[BASE_TESTNET_NETWORK])
-        # httpx.AsyncClient subclass - construction is sync, connections open lazily
+        self._tls_verify: Union[ssl.SSLContext, bool] = ssl_ctx if ssl_ctx else (self._llm_server_url is None)
         self._http_client = x402HttpxClient(self._x402_client, verify=self._tls_verify)
 
-    # ── TEE resolution ──────────────────────────────────────────────────
+    async def _refresh_tee(self) -> None:
+        """Re-resolve TEE from the registry and rebuild the HTTP client."""
+        old_http_client = self._http_client
+        self._connect_tee()
+        try:
+            await old_http_client.aclose()
+        except Exception:
+            logger.debug("Failed to close previous HTTP client during TEE refresh.", exc_info=True)
+
 
     @staticmethod
     def _resolve_tee(
@@ -195,6 +208,29 @@ def _tee_metadata(self) -> Dict:
             tee_payment_address=self._tee_payment_address,
         )
 
+    async def _call_with_tee_retry(
+        self,
+        operation_name: str,
+        call: Callable[[], Awaitable[T]],
+    ) -> T:
+        """Execute *call*; on connection failure, pick a new TEE and retry once.
+
+        Only retries when the request never reached the server (no HTTP response).
+        Server-side errors (4xx/5xx) are not retried.
+        """
+        try:
+            return await call()
+        except httpx.HTTPStatusError:
+            raise
+        except Exception as exc:
+            logger.warning(
+                "Connection failure during %s; refreshing TEE and retrying once: %s",
+                operation_name,
+                exc,
+            )
+            await self._refresh_tee()
+            return await call()
+
     # ── Public API ──────────────────────────────────────────────────────
 
     def ensure_opg_approval(self, opg_amount: float) -> Permit2ApprovalResult:
@@ -255,7 +291,6 @@ async def completion(
             RuntimeError: If the inference fails.
         """
         model_id = model.split("/")[1]
-        headers = self._headers(x402_settlement_mode)
         payload: Dict = {
             "model": model_id,
             "prompt": prompt,
@@ -265,11 +300,11 @@ async def completion(
         if stop_sequence:
             payload["stop"] = stop_sequence
 
-        try:
+        async def _request() -> TextGenerationOutput:
             response = await self._http_client.post(
                 self._tee_endpoint + _COMPLETION_ENDPOINT,
                 json=payload,
-                headers=headers,
+                headers=self._headers(x402_settlement_mode),
                 timeout=_REQUEST_TIMEOUT,
             )
             response.raise_for_status()
@@ -282,12 +317,15 @@ async def completion(
                 tee_timestamp=result.get("tee_timestamp"),
                 **self._tee_metadata(),
             )
-        except RuntimeError:
-            raise
+
+        try:
+            return await self._call_with_tee_retry("completion", _request)
         except httpx.HTTPStatusError as e:
             if e.response.status_code == 402:
                 raise RuntimeError(_402_HINT) from e
             raise RuntimeError(f"TEE LLM completion failed: {e}") from e
+        except RuntimeError:
+            raise
         except Exception as e:
             raise RuntimeError(f"TEE LLM completion failed: {e}") from e
 
@@ -353,14 +391,13 @@ async def chat(
 
     async def _chat_request(self, params: _ChatParams, messages: List[Dict]) -> TextGenerationOutput:
         """Non-streaming chat request."""
-        headers = self._headers(params.x402_settlement_mode)
         payload = self._chat_payload(params, messages)
 
-        try:
+        async def _request() -> TextGenerationOutput:
             response = await self._http_client.post(
                 self._tee_endpoint + _CHAT_ENDPOINT,
                 json=payload,
-                headers=headers,
+                headers=self._headers(params.x402_settlement_mode),
                 timeout=_REQUEST_TIMEOUT,
             )
             response.raise_for_status()
@@ -386,14 +423,17 @@ async def _chat_request(self, params: _ChatParams, messages: List[Dict]) -> Text
                 tee_timestamp=result.get("tee_timestamp"),
                 **self._tee_metadata(),
             )
-        except RuntimeError:
-            raise
+
+        try:
+            return await self._call_with_tee_retry("chat", _request)
         except httpx.HTTPStatusError as e:
             # Provide an actionable error message for the very common 402 case
             # (issue #188 — users see a cryptic RuntimeError instead of guidance).
             if e.response.status_code == 402:
                 raise RuntimeError(_402_HINT) from e
             raise RuntimeError(f"TEE LLM chat failed: {e}") from e
+        except RuntimeError:
+            raise
         except Exception as e:
             raise RuntimeError(f"TEE LLM chat failed: {e}") from e
 
@@ -427,6 +467,33 @@ async def _chat_stream(self, params: _ChatParams, messages: List[Dict]) -> Async
         headers = self._headers(params.x402_settlement_mode)
         payload = self._chat_payload(params, messages, stream=True)
 
+        chunks_yielded = False
+        try:
+            async with self._http_client.stream(
+                "POST",
+                self._tee_endpoint + _CHAT_ENDPOINT,
+                json=payload,
+                headers=headers,
+                timeout=_REQUEST_TIMEOUT,
+            ) as response:
+                async for chunk in self._parse_sse_response(response):
+                    chunks_yielded = True
+                    yield chunk
+            return
+        except httpx.HTTPStatusError:
+            raise
+        except Exception as exc:
+            if chunks_yielded:
+                raise
+            logger.warning(
+                "Connection failure during stream setup; refreshing TEE and retrying once: %s",
+                exc,
+            )
+
+        # Only reached if the first attempt failed before yielding any chunks.
+        # Re-resolve the TEE endpoint from the registry and retry once.
+        await self._refresh_tee()
+        headers = self._headers(params.x402_settlement_mode)
         async with self._http_client.stream(
             "POST",
             self._tee_endpoint + _CHAT_ENDPOINT,
@@ -441,9 +508,9 @@ async def _parse_sse_response(self, response) -> AsyncGenerator[StreamChunk, Non
         """Parse an SSE response stream into StreamChunk objects."""
         status_code = getattr(response, "status_code", None)
         if status_code is not None and status_code >= 400:
-            body = await response.aread()
             if status_code == 402:
                 raise RuntimeError(_402_HINT)
+            body = await response.aread()
             raise RuntimeError(f"TEE LLM streaming request failed with status {status_code}: {body.decode('utf-8', errors='replace')}")
 
         buffer = b""

From 7b2157b5ccad506fe64c97ec206465fe9628cc86 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:01:05 +0100
Subject: [PATCH 09/27] test: add HTTP 402 hint coverage for completion, chat,
 and streaming

---
 tests/llm_test.py | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/tests/llm_test.py b/tests/llm_test.py
index bb845a75..fb98bafb 100644
--- a/tests/llm_test.py
+++ b/tests/llm_test.py
@@ -47,7 +47,11 @@ async def post(self, url: str, *, json=None, headers=None, timeout=None) -> "_Fa
         self._post_calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
         resp = _FakeResponse(self._response_status, self._response_body)
         if self._response_status >= 400:
-            resp.raise_for_status = MagicMock(side_effect=httpx.HTTPStatusError("error", request=MagicMock(), response=MagicMock()))
+            mock_response = MagicMock()
+            mock_response.status_code = self._response_status
+            resp.raise_for_status = MagicMock(
+                side_effect=httpx.HTTPStatusError("error", request=MagicMock(), response=mock_response)
+            )
         return resp
 
     @asynccontextmanager
@@ -209,6 +213,14 @@ async def test_http_error_raises_opengradient_error(self, fake_http):
         with pytest.raises(RuntimeError, match="TEE LLM completion failed"):
             await llm.completion(model=TEE_LLM.GPT_5, prompt="Hi")
 
+    async def test_http_402_raises_hint(self, fake_http):
+        """HTTP 402 from the TEE must surface the actionable _402_HINT message."""
+        fake_http.set_response(402, {})
+        llm = _make_llm()
+
+        with pytest.raises(RuntimeError, match="Payment required \(HTTP 402\)"):
+            await llm.completion(model=TEE_LLM.GPT_5, prompt="Hi")
+
 
 # ── Chat (non-streaming) tests ───────────────────────────────────────
 
@@ -361,6 +373,14 @@ async def test_http_error_raises_opengradient_error(self, fake_http):
         with pytest.raises(RuntimeError, match="TEE LLM chat failed"):
             await llm.chat(model=TEE_LLM.GPT_5, messages=[{"role": "user", "content": "Hi"}])
 
+    async def test_http_402_raises_hint(self, fake_http):
+        """HTTP 402 from the TEE must surface the actionable _402_HINT message."""
+        fake_http.set_response(402, {})
+        llm = _make_llm()
+
+        with pytest.raises(RuntimeError, match="Payment required \(HTTP 402\)"):
+            await llm.chat(model=TEE_LLM.GPT_5, messages=[{"role": "user", "content": "Hi"}])
+
 
 # ── Streaming tests ──────────────────────────────────────────────────
 
@@ -440,6 +460,20 @@ async def test_stream_error_raises(self, fake_http):
         with pytest.raises(RuntimeError, match="streaming request failed"):
             _ = [chunk async for chunk in gen]
 
+    async def test_stream_402_raises_hint(self, fake_http):
+        """HTTP 402 during streaming must surface the actionable _402_HINT message."""
+        fake_http.set_stream_response(402, [b"Payment Required"])
+        llm = _make_llm()
+
+        gen = await llm.chat(
+            model=TEE_LLM.GPT_5,
+            messages=[{"role": "user", "content": "Hi"}],
+            stream=True,
+        )
+
+        with pytest.raises(RuntimeError, match="Payment required \(HTTP 402\)"):
+            _ = [chunk async for chunk in gen]
+
     async def test_tools_with_stream_falls_back_to_single_chunk(self, fake_http):
         """When tools + stream=True, LLM falls back to non-streaming and yields one chunk."""
         tools = [{"type": "function", "function": {"name": "f"}}]

From 23f06db080bd4fdb9a86530b65036f7e5e7dceb5 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:05:56 +0100
Subject: [PATCH 10/27] =?UTF-8?q?fix:=20merge=20main=20into=20llm=5Ftest.p?=
 =?UTF-8?q?y=20=E2=80=94=20restore=20retry/SSL=20tests,=20add=20402=20hint?=
 =?UTF-8?q?=20coverage,=20fix=20mock=20status=5Fcode?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/llm_test.py | 259 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 255 insertions(+), 4 deletions(-)

diff --git a/tests/llm_test.py b/tests/llm_test.py
index fb98bafb..90c46621 100644
--- a/tests/llm_test.py
+++ b/tests/llm_test.py
@@ -5,9 +5,10 @@
 """
 
 import json
+import ssl
 from contextlib import asynccontextmanager
 from typing import List
-from unittest.mock import MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import httpx
 import pytest
@@ -31,6 +32,8 @@ def __init__(self, *_args, **_kwargs):
         self._response_body: bytes = b"{}"
         self._post_calls: List[dict] = []
         self._stream_response = None
+        self._error_on_next: BaseException | None = None
+        self._stream_error_on_next: BaseException | None = None
 
     def set_response(self, status_code: int, body: dict) -> None:
         self._response_status = status_code
@@ -43,8 +46,19 @@ def set_stream_response(self, status_code: int, chunks: List[bytes]) -> None:
     def post_calls(self) -> List[dict]:
         return self._post_calls
 
+    def fail_next_post(self, exc: BaseException) -> None:
+        """Make the next post() call raise *exc*, then revert to normal."""
+        self._error_on_next = exc
+
+    def fail_next_stream(self, exc: BaseException) -> None:
+        """Make the next stream() call raise *exc*, then revert to normal."""
+        self._stream_error_on_next = exc
+
     async def post(self, url: str, *, json=None, headers=None, timeout=None) -> "_FakeResponse":
         self._post_calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
+        if self._error_on_next is not None:
+            exc, self._error_on_next = self._error_on_next, None
+            raise exc
         resp = _FakeResponse(self._response_status, self._response_body)
         if self._response_status >= 400:
             mock_response = MagicMock()
@@ -57,6 +71,9 @@ async def post(self, url: str, *, json=None, headers=None, timeout=None) -> "_Fa
     @asynccontextmanager
     async def stream(self, method: str, url: str, *, json=None, headers=None, timeout=None):
         self._post_calls.append({"method": method, "url": url, "json": json, "headers": headers, "timeout": timeout})
+        if self._stream_error_on_next is not None:
+            exc, self._stream_error_on_next = self._stream_error_on_next, None
+            raise exc
         yield self._stream_response
 
     async def aclose(self):
@@ -218,7 +235,7 @@ async def test_http_402_raises_hint(self, fake_http):
         fake_http.set_response(402, {})
         llm = _make_llm()
 
-        with pytest.raises(RuntimeError, match="Payment required \(HTTP 402\)"):
+        with pytest.raises(RuntimeError, match=r"Payment required \(HTTP 402\)"):
             await llm.completion(model=TEE_LLM.GPT_5, prompt="Hi")
 
 
@@ -378,7 +395,7 @@ async def test_http_402_raises_hint(self, fake_http):
         fake_http.set_response(402, {})
         llm = _make_llm()
 
-        with pytest.raises(RuntimeError, match="Payment required \(HTTP 402\)"):
+        with pytest.raises(RuntimeError, match=r"Payment required \(HTTP 402\)"):
             await llm.chat(model=TEE_LLM.GPT_5, messages=[{"role": "user", "content": "Hi"}])
 
 
@@ -471,7 +488,7 @@ async def test_stream_402_raises_hint(self, fake_http):
             stream=True,
         )
 
-        with pytest.raises(RuntimeError, match="Payment required \(HTTP 402\)"):
+        with pytest.raises(RuntimeError, match=r"Payment required \(HTTP 402\)"):
             _ = [chunk async for chunk in gen]
 
     async def test_tools_with_stream_falls_back_to_single_chunk(self, fake_http):
@@ -569,3 +586,237 @@ def test_registry_success(self):
             assert cert == b"cert-bytes"
             assert tee_id == "tee-42"
             assert pay_addr == "0xPay"
+
+
+# ── TEE retry tests (non-streaming) ──────────────────────────────────
+
+
+@pytest.mark.asyncio
+class TestTeeRetryCompletion:
+    async def test_retries_on_connection_error_and_succeeds(self, fake_http):
+        """First call hits connection error → refresh TEE → second call succeeds."""
+        fake_http.set_response(200, {"completion": "retried ok", "tee_signature": "s", "tee_timestamp": "t"})
+        fake_http.fail_next_post(ConnectionError("connection refused"))
+        llm = _make_llm()
+
+        result = await llm.completion(model=TEE_LLM.GPT_5, prompt="Hi")
+
+        assert result.completion_output == "retried ok"
+        assert len(fake_http.post_calls) == 2
+
+    async def test_http_status_error_not_retried(self, fake_http):
+        """A server-side error (HTTP 500) should not trigger a TEE retry."""
+        fake_http.set_response(500, {"error": "boom"})
+        llm = _make_llm()
+
+        with pytest.raises(RuntimeError, match="TEE LLM completion failed"):
+            await llm.completion(model=TEE_LLM.GPT_5, prompt="Hi")
+        assert len(fake_http.post_calls) == 1
+
+    async def test_second_failure_propagates(self, fake_http):
+        """If the retry also fails, the error should propagate."""
+        call_count = 0
+
+        async def always_fail(*args, **kwargs):
+            nonlocal call_count
+            call_count += 1
+            raise ConnectionError("still broken")
+
+        fake_http.post = always_fail
+        llm = _make_llm()
+
+        with pytest.raises(RuntimeError, match="TEE LLM completion failed"):
+            await llm.completion(model=TEE_LLM.GPT_5, prompt="Hi")
+        assert call_count == 2
+
+
+@pytest.mark.asyncio
+class TestTeeRetryChat:
+    async def test_retries_on_connection_error_and_succeeds(self, fake_http):
+        fake_http.set_response(
+            200,
+            {"choices": [{"message": {"role": "assistant", "content": "retry ok"}, "finish_reason": "stop"}]},
+        )
+        fake_http.fail_next_post(OSError("network unreachable"))
+        llm = _make_llm()
+
+        result = await llm.chat(model=TEE_LLM.GPT_5, messages=[{"role": "user", "content": "Hi"}])
+
+        assert result.chat_output["content"] == "retry ok"
+        assert len(fake_http.post_calls) == 2
+
+    async def test_http_status_error_not_retried(self, fake_http):
+        fake_http.set_response(500, {"error": "internal"})
+        llm = _make_llm()
+
+        with pytest.raises(RuntimeError, match="TEE LLM chat failed"):
+            await llm.chat(model=TEE_LLM.GPT_5, messages=[{"role": "user", "content": "Hi"}])
+        assert len(fake_http.post_calls) == 1
+
+
+# ── TEE retry tests (streaming) ──────────────────────────────────────
+
+
+@pytest.mark.asyncio
+class TestTeeRetryStreaming:
+    async def test_retries_stream_on_connection_error_before_chunks(self, fake_http):
+        """Connection failure during stream setup (no chunks yielded) → retry succeeds."""
+        fake_http.set_stream_response(
+            200,
+            [
+                b'data: {"model":"m","choices":[{"index":0,"delta":{"content":"ok"},"finish_reason":"stop"}]}\n\n',
+                b"data: [DONE]\n\n",
+            ],
+        )
+        fake_http.fail_next_stream(ConnectionError("reset by peer"))
+        llm = _make_llm()
+
+        gen = await llm.chat(
+            model=TEE_LLM.GPT_5,
+            messages=[{"role": "user", "content": "Hi"}],
+            stream=True,
+        )
+        chunks = [c async for c in gen]
+
+        assert len(chunks) == 1
+        assert chunks[0].choices[0].delta.content == "ok"
+        assert len(fake_http.post_calls) == 2
+
+    async def test_no_retry_after_chunks_yielded(self, fake_http):
+        """Failure AFTER chunks were yielded must raise, not retry."""
+
+        class _FailMidStream:
+            def __init__(self):
+                self.status_code = 200
+
+            async def aiter_raw(self):
+                yield b'data: {"model":"m","choices":[{"index":0,"delta":{"content":"partial"},"finish_reason":null}]}\n\n'
+                raise ConnectionError("mid-stream disconnect")
+
+            async def aread(self) -> bytes:
+                return b""
+
+        fake_http._stream_response = _FailMidStream()
+        llm = _make_llm()
+
+        gen = await llm.chat(
+            model=TEE_LLM.GPT_5,
+            messages=[{"role": "user", "content": "Hi"}],
+            stream=True,
+        )
+
+        with pytest.raises(ConnectionError):
+            _ = [c async for c in gen]
+
+        assert len(fake_http.post_calls) == 1
+
+
+# ── _refresh_tee tests ─────────────────────────────────────
+
+
+@pytest.mark.asyncio
+class TestRefreshTeeAndReset:
+    async def test_replaces_http_client(self):
+        """After refresh, the http client should be a new instance."""
+        clients_created = []
+
+        def make_client(*args, **kwargs):
+            c = FakeHTTPClient()
+            clients_created.append(c)
+            return c
+
+        with (
+            patch(_PATCHES["x402_httpx"], side_effect=make_client),
+            patch(_PATCHES["x402_client"]),
+            patch(_PATCHES["signer"]),
+            patch(_PATCHES["register_exact"]),
+            patch(_PATCHES["register_upto"]),
+        ):
+            llm = _make_llm()
+            old_client = llm._http_client
+
+            await llm._refresh_tee()
+
+            assert llm._http_client is not old_client
+            assert len(clients_created) == 2  # init + refresh
+
+    async def test_closes_old_client(self, fake_http):
+        llm = _make_llm()
+        old_client = llm._http_client
+        old_client.aclose = AsyncMock()
+
+        await llm._refresh_tee()
+
+        old_client.aclose.assert_awaited_once()
+
+    async def test_close_failure_is_swallowed(self, fake_http):
+        llm = _make_llm()
+        old_client = llm._http_client
+        old_client.aclose = AsyncMock(side_effect=OSError("already closed"))
+
+        # Should not raise
+        await llm._refresh_tee()
+
+
+# ── TEE cert rotation (crash + re-register) tests ────────────────────
+
+
+@pytest.mark.asyncio
+class TestTeeCertRotation:
+    """Simulate a TEE crashing and a new one registering at the same IP
+    with a different ephemeral TLS certificate.  The old cert is now
+    invalid, so the first request fails with SSLCertVerificationError.
+    The retry should re-resolve from the registry (getting the new cert)
+    and succeed."""
+
+    async def test_ssl_verification_failure_triggers_tee_refresh_completion(self, fake_http):
+        fake_http.set_response(200, {"completion": "ok after refresh", "tee_signature": "s", "tee_timestamp": "t"})
+        fake_http.fail_next_post(ssl.SSLCertVerificationError("certificate verify failed"))
+        llm = _make_llm()
+
+        with patch.object(llm, "_connect_tee", wraps=llm._connect_tee) as spy:
+            result = await llm.completion(model=TEE_LLM.GPT_5, prompt="Hi")
+
+        # _connect_tee was called once during the retry (refresh)
+        spy.assert_called_once()
+        assert result.completion_output == "ok after refresh"
+        assert len(fake_http.post_calls) == 2
+
+    async def test_ssl_verification_failure_triggers_tee_refresh_chat(self, fake_http):
+        fake_http.set_response(
+            200,
+            {"choices": [{"message": {"role": "assistant", "content": "ok after refresh"}, "finish_reason": "stop"}]},
+        )
+        fake_http.fail_next_post(ssl.SSLCertVerificationError("certificate verify failed"))
+        llm = _make_llm()
+
+        with patch.object(llm, "_connect_tee", wraps=llm._connect_tee) as spy:
+            result = await llm.chat(model=TEE_LLM.GPT_5, messages=[{"role": "user", "content": "Hi"}])
+
+        spy.assert_called_once()
+        assert result.chat_output["content"] == "ok after refresh"
+        assert len(fake_http.post_calls) == 2
+
+    async def test_ssl_verification_failure_triggers_tee_refresh_streaming(self, fake_http):
+        fake_http.set_stream_response(
+            200,
+            [
+                b'data: {"model":"m","choices":[{"index":0,"delta":{"content":"ok"},"finish_reason":"stop"}]}\n\n',
+                b"data: [DONE]\n\n",
+            ],
+        )
+        fake_http.fail_next_stream(ssl.SSLCertVerificationError("certificate verify failed"))
+        llm = _make_llm()
+
+        with patch.object(llm, "_connect_tee", wraps=llm._connect_tee) as spy:
+            gen = await llm.chat(
+                model=TEE_LLM.GPT_5,
+                messages=[{"role": "user", "content": "Hi"}],
+                stream=True,
+            )
+            chunks = [c async for c in gen]
+
+        spy.assert_called_once()
+        assert len(chunks) == 1
+        assert chunks[0].choices[0].delta.content == "ok"
+        assert len(fake_http.post_calls) == 2

From 488c7ce4ff5d7adb2e754ec5ddacca341476a1b3 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:06:25 +0100
Subject: [PATCH 11/27] fix: remove stale Union import from _conversions.py
 (convert_fixed_point_to_python now always returns np.float32)

---
 src/opengradient/client/_conversions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/opengradient/client/_conversions.py b/src/opengradient/client/_conversions.py
index 895408d6..8c04426f 100644
--- a/src/opengradient/client/_conversions.py
+++ b/src/opengradient/client/_conversions.py
@@ -3,7 +3,7 @@
 import json
 import logging
 from decimal import Decimal
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Tuple
 
 import numpy as np
 from web3.datastructures import AttributeDict

From f5f5c70f109cabebef2f18e8d20aada5d65570ee Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:07 +0100
Subject: [PATCH 12/27] chore: sync .github/workflows/release.yml from upstream
 main

---
 .github/workflows/release.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 488c713d..37a12bed 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -44,6 +44,11 @@ jobs:
       - name: Run tests
         run: make test
 
+      - name: Run LLM integration tests
+        env:
+          PRIVATE_KEY: ${{ secrets.PRIVATE_KEY }}
+        run: make llm_integrationtest
+
   release:
     needs: [check, test]
     runs-on: ubuntu-latest

From 7708dec63e1aba3734ec8ad21b2caa0235a6e032 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:08 +0100
Subject: [PATCH 13/27] chore: sync README.md from upstream main

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b93fcefb..bf990bc2 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ OpenGradient enables developers to build AI applications with verifiable executi
 pip install opengradient
 ```
 
-**Note**: Windows users should temporarily enable WSL during installation (fix in progress).
+**Note**: > **Windows users:** See the [Windows Installation Guide](./WINDOWS_INSTALL.md) for step-by-step setup instructions.
 
 ## Network Architecture
 

From dece88ba0a73f56fb5ef210c10a1f584bd8412d5 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:09 +0100
Subject: [PATCH 14/27] chore: sync WINDOWS_INSTALL.md from upstream main

---
 WINDOWS_INSTALL.md | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 WINDOWS_INSTALL.md

diff --git a/WINDOWS_INSTALL.md b/WINDOWS_INSTALL.md
new file mode 100644
index 00000000..ac57d32f
--- /dev/null
+++ b/WINDOWS_INSTALL.md
@@ -0,0 +1,36 @@
+# Windows Installation Guide
+
+The `opengradient` package requires a C compiler
+to build its native dependencies. Windows does not
+have one by default.
+
+## Step 1 — Enable WSL
+
+Open PowerShell as Administrator and run:
+
+    wsl --install
+
+Restart your PC when prompted.
+
+## Step 2 — Install Python and uv inside WSL
+
+Open the Ubuntu app and run:
+
+    sudo apt update && sudo apt install -y python3 curl
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    source $HOME/.local/bin/env
+
+## Step 3 — Install SDK
+
+    uv add opengradient
+
+## Step 4 — Verify
+
+    uv run python3 -c "import opengradient; print('Ready!')"
+
+## Common Errors
+
+- Visual C++ 14.0 required → Use WSL instead
+- wsl: command not found → Update Windows 10 to Build 19041+
+- WSL stuck → Enable Virtualization in BIOS
+- uv: command not found → Run: source $HOME/.local/bin/env

From 7fbc75828d954691a683064a6a9f7642298c2bf0 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:10 +0100
Subject: [PATCH 15/27] chore: sync docs/opengradient/client/llm.md from
 upstream main

---
 docs/opengradient/client/llm.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/opengradient/client/llm.md b/docs/opengradient/client/llm.md
index 65aed9f0..40615b73 100644
--- a/docs/opengradient/client/llm.md
+++ b/docs/opengradient/client/llm.md
@@ -200,8 +200,8 @@ a transaction. Otherwise, sends an ERC-20 approve transaction.
 
 **Arguments**
 
-* **`opg_amount`**: Minimum number of OPG tokens required (e.g. ``0.05``
-        for 0.05 OPG). Must be at least 0.05 OPG.
+* **`opg_amount`**: Minimum number of OPG tokens required (e.g. ``0.1``
+        for 0.1 OPG). Must be at least 0.1 OPG.
 
 **Returns**
 
@@ -211,5 +211,5 @@ Permit2ApprovalResult: Contains ``allowance_before``,
 
 **Raises**
 
-* **`ValueError`**: If the OPG amount is less than 0.05.
+* **`ValueError`**: If the OPG amount is less than 0.1.
 * **`RuntimeError`**: If the approval transaction fails.
\ No newline at end of file

From fc972b800b7a8a733d7197b2617bfeeb325e6750 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:12 +0100
Subject: [PATCH 16/27] chore: sync docs/opengradient/index.md from upstream
 main

---
 docs/opengradient/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/opengradient/index.md b/docs/opengradient/index.md
index 45b84a2d..32abc92f 100644
--- a/docs/opengradient/index.md
+++ b/docs/opengradient/index.md
@@ -6,7 +6,7 @@ opengradient
 
 # Package opengradient
 
-**Version: 0.9.0**
+**Version: 0.9.2**
 
 OpenGradient Python SDK for decentralized AI inference with end-to-end verification.
 

From e0927b1697c8891bccc777dd3a70a399c47e0f9e Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:13 +0100
Subject: [PATCH 17/27] chore: sync integrationtest/llm/test_llm_chat.py from
 upstream main

---
 integrationtest/llm/test_llm_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrationtest/llm/test_llm_chat.py b/integrationtest/llm/test_llm_chat.py
index 8cc9f371..632436ac 100644
--- a/integrationtest/llm/test_llm_chat.py
+++ b/integrationtest/llm/test_llm_chat.py
@@ -23,7 +23,7 @@
 ]
 
 # Amount of OPG tokens to fund the test account with
-OPG_FUND_AMOUNT = 0.05
+OPG_FUND_AMOUNT = 0.1
 # Amount of ETH to fund the test account with (for gas)
 ETH_FUND_AMOUNT = 0.0001
 

From de224f5747dd10ba5376fcdb8fe3cbca640f75fe Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:14 +0100
Subject: [PATCH 18/27] chore: sync pyproject.toml from upstream main

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1f515651..40190299 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "opengradient"
-version = "0.9.1"
+version = "0.9.3"
 description = "Python SDK for OpenGradient decentralized model management & inference services"
 authors = [{name = "OpenGradient", email = "adam@vannalabs.ai"}]
 readme = "README.md"
@@ -27,7 +27,7 @@ dependencies = [
     "langchain>=0.3.7",
     "openai>=1.58.1",
     "pydantic>=2.9.2",
-    "og-x402==0.0.1.dev2"
+    "og-x402==0.0.1.dev4"
 ]
 
 [project.optional-dependencies]

From 1769a0eb3441b6d7e531d4f0b20c2256a88705ce Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:15 +0100
Subject: [PATCH 19/27] chore: sync src/opengradient/client/_utils.py from
 upstream main

---
 src/opengradient/client/_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/opengradient/client/_utils.py b/src/opengradient/client/_utils.py
index 5e2938ad..d9d8436d 100644
--- a/src/opengradient/client/_utils.py
+++ b/src/opengradient/client/_utils.py
@@ -49,6 +49,9 @@ def run_with_retry(
     """
     effective_retries = max_retries if max_retries is not None else DEFAULT_MAX_RETRY
 
+    if effective_retries < 1:
+        raise ValueError(f"max_retries must be at least 1, got {effective_retries}")
+
     for attempt in range(effective_retries):
         try:
             return txn_function()
@@ -62,3 +65,5 @@ def run_with_retry(
                 continue
 
             raise
+
+    raise RuntimeError(f"run_with_retry exhausted {effective_retries} attempts without returning or raising")

From 1f7aa523a9c0808f0df3da40dbdc6e63a31e28d8 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:16 +0100
Subject: [PATCH 20/27] chore: sync src/opengradient/client/opg_token.py from
 upstream main

---
 src/opengradient/client/opg_token.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/opengradient/client/opg_token.py b/src/opengradient/client/opg_token.py
index d86d9de9..af80783a 100644
--- a/src/opengradient/client/opg_token.py
+++ b/src/opengradient/client/opg_token.py
@@ -82,8 +82,8 @@ def ensure_opg_approval(wallet_account: LocalAccount, opg_amount: float) -> Perm
 
     allowance_before = token.functions.allowance(owner, spender).call()
 
-    # Only approve if the allowance is less than 10% of the requested amount
-    if allowance_before >= amount_base * 0.1:
+    # Only approve if the allowance is less than 50% of the requested amount
+    if allowance_before >= amount_base * 0.5:
         return Permit2ApprovalResult(
             allowance_before=allowance_before,
             allowance_after=allowance_before,
@@ -124,7 +124,6 @@ def ensure_opg_approval(wallet_account: LocalAccount, opg_amount: float) -> Perm
                 )
             time.sleep(ALLOWANCE_POLL_INTERVAL)
 
-
         return Permit2ApprovalResult(
             allowance_before=allowance_before,
             allowance_after=allowance_after,

From 73739190b8823cb6f2234bf189157a8644a105e3 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:17 +0100
Subject: [PATCH 21/27] chore: sync tests/opg_token_test.py from upstream main

---
 tests/opg_token_test.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/opg_token_test.py b/tests/opg_token_test.py
index f2b1255f..0e44a072 100644
--- a/tests/opg_token_test.py
+++ b/tests/opg_token_test.py
@@ -77,6 +77,7 @@ def test_zero_amount_with_zero_allowance_skips(self, mock_wallet, mock_web3):
 
         assert result.tx_hash is None
 
+
 class TestEnsureOpgApprovalSendsTx:
     """Cases where allowance is insufficient and a transaction is sent."""
 
@@ -181,6 +182,7 @@ def test_waits_for_allowance_update_after_receipt(self, mock_wallet, mock_web3,
         assert result.allowance_after == amount_base
         assert result.tx_hash == "0xconfirmed"
 
+
 class TestEnsureOpgApprovalErrors:
     """Error handling paths."""
 

From 34509115467ae64f19d77288ebb3d0f13d87957b Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:18 +0100
Subject: [PATCH 22/27] chore: sync stresstest/llm.py from upstream main

---
 stresstest/llm.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/stresstest/llm.py b/stresstest/llm.py
index 7c2811db..f9652c19 100644
--- a/stresstest/llm.py
+++ b/stresstest/llm.py
@@ -1,4 +1,5 @@
 import argparse
+import asyncio
 import statistics
 
 from utils import stress_test_wrapper
@@ -7,16 +8,18 @@
 
 # Number of requests to run serially
 NUM_REQUESTS = 100
-MODEL = "anthropic/claude-haiku-4-5"
+MODEL = og.TEE_LLM.GEMINI_2_5_FLASH
 
 
-def main(private_key: str):
+async def main(private_key: str):
     llm = og.LLM(private_key=private_key)
+    llm.ensure_opg_approval(opg_amount=0.1)
 
-    def run_prompt(prompt: str):
-        llm.completion(MODEL, prompt, max_tokens=50)
+    async def run_prompt(prompt: str):
+        messages = [{"role": "user", "content": prompt}]
+        await llm.chat(MODEL, messages=messages, max_tokens=50, x402_settlement_mode=og.x402SettlementMode.INDIVIDUAL_FULL)
 
-    latencies, failures = stress_test_wrapper(run_prompt, num_requests=NUM_REQUESTS, is_llm=True)
+    latencies, failures = await stress_test_wrapper(run_prompt, num_requests=NUM_REQUESTS)
 
     # Calculate and print statistics
     total_requests = NUM_REQUESTS
@@ -55,4 +58,4 @@ def run_prompt(prompt: str):
     parser.add_argument("private_key", help="Private key for inference")
     args = parser.parse_args()
 
-    main(args.private_key)
+    asyncio.run(main(args.private_key))

From 74fa27393aae8cc2f21c5f1bcfb5fa6c0704f3dc Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:19 +0100
Subject: [PATCH 23/27] chore: sync stresstest/utils.py from upstream main

---
 stresstest/utils.py | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/stresstest/utils.py b/stresstest/utils.py
index b7e22512..8ac7b064 100644
--- a/stresstest/utils.py
+++ b/stresstest/utils.py
@@ -1,17 +1,7 @@
 import random
 import time
 import uuid
-from typing import Callable, List, Tuple
-
-
-def generate_unique_input(request_id: int) -> dict:
-    """Generate a unique input for testing."""
-    num_input1 = [random.uniform(0, 10) for _ in range(3)]
-    num_input2 = random.randint(1, 20)
-    str_input1 = [random.choice(["hello", "world", "ONNX", "test"]) for _ in range(2)]
-    str_input2 = f"Request {request_id}: {str(uuid.uuid4())[:8]}"
-
-    return {"num_input1": num_input1, "num_input2": num_input2, "str_input1": str_input1, "str_input2": str_input2}
+from typing import Callable, Coroutine, List, Tuple
 
 
 def generate_unique_prompt(request_id: int) -> str:
@@ -26,14 +16,13 @@ def generate_unique_prompt(request_id: int) -> str:
     return f"Request {request_id}: Tell me a {adjective} fact about {topic}. Keep it short. Unique ID: {unique_id}"
 
 
-def stress_test_wrapper(infer_function: Callable, num_requests: int, is_llm: bool = False) -> Tuple[List[float], int]:
+async def stress_test_wrapper(infer_function: Callable[..., Coroutine], num_requests: int) -> Tuple[List[float], int]:
     """
-    Wrapper function to stress test the inference.
+    Async wrapper function to stress test the inference.
 
     Args:
-    infer_function (Callable): The inference function to test.
+    infer_function (Callable): An async inference function to test.
     num_requests (int): Number of requests to send.
-    is_llm (bool): Whether the test is for an LLM model. Default is False.
 
     Returns:
     Tuple[List[float], int]: List of latencies for each request and the number of failures.
@@ -42,15 +31,12 @@ def stress_test_wrapper(infer_function: Callable, num_requests: int, is_llm: boo
     failures = 0
 
     for i in range(num_requests):
-        if is_llm:
-            input_data = generate_unique_prompt(i)
-        else:
-            input_data = generate_unique_input(i)
+        input_data = generate_unique_prompt(i)
 
         start_time = time.time()
 
         try:
-            _ = infer_function(input_data)
+            _ = await infer_function(input_data)
             end_time = time.time()
             latency = end_time - start_time
             latencies.append(latency)

From 67861905dddd6d264a88765d50882e914120020e Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:21 +0100
Subject: [PATCH 24/27] chore: sync uv.lock from upstream main

---
 uv.lock | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/uv.lock b/uv.lock
index 94346b2c..6d0327a6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1835,15 +1835,15 @@ wheels = [
 
 [[package]]
 name = "og-x402"
-version = "0.0.1.dev2"
+version = "0.0.1.dev4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "pydantic" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/1e/75/40c43cd44aa394e68acc98f8d5b8376f3a5e3b9eddf55b1c0c34616c340b/og_x402-0.0.1.dev2.tar.gz", hash = "sha256:bf5d4484ece5a371358a336fcc79fe5678be611044c55ade45c4be9d19d7691b", size = 899662, upload-time = "2026-03-17T06:35:36.587Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b1/5b/46a55d93d9da5535ff2bb28d48d5766c9108d9e16546cb9c7a65cde0fb11/og_x402-0.0.1.dev4.tar.gz", hash = "sha256:2d8a71b2f4222284e65d45e2d122faafe3bdb33c4fae77903f9665d29e517a97", size = 900109, upload-time = "2026-03-23T15:10:37.144Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4c/79/8c7543c2e647508e04ad0983e9a3a7b861f388ec591ccdc42c69a3128d42/og_x402-0.0.1.dev2-py3-none-any.whl", hash = "sha256:65e7d3bbb3c7f51e51dad974f6c405a230693816f72d874cf0d6d705a8eec271", size = 952331, upload-time = "2026-03-17T06:35:34.695Z" },
+    { url = "https://files.pythonhosted.org/packages/45/da/5e0be4b8415a6c557a94991367c6124998df3ba014bceb76b595ef48c8c7/og_x402-0.0.1.dev4-py3-none-any.whl", hash = "sha256:c329ceb4fe7cc4195fa5bf9c769f5c571b61c8333b33fd0fe204a2ab377d8366", size = 952662, upload-time = "2026-03-23T15:10:35.21Z" },
 ]
 
 [[package]]
@@ -1867,7 +1867,7 @@ wheels = [
 
 [[package]]
 name = "opengradient"
-version = "0.9.0"
+version = "0.9.3"
 source = { editable = "." }
 dependencies = [
     { name = "click" },
@@ -1907,7 +1907,7 @@ requires-dist = [
     { name = "langgraph", marker = "extra == 'dev'" },
     { name = "mypy", marker = "extra == 'dev'" },
     { name = "numpy", specifier = ">=1.26.4" },
-    { name = "og-x402", specifier = "==0.0.1.dev2" },
+    { name = "og-x402", specifier = "==0.0.1.dev4" },
     { name = "openai", specifier = ">=1.58.1" },
     { name = "pdoc3", marker = "extra == 'dev'", specifier = "==0.10.0" },
     { name = "pydantic", specifier = ">=2.9.2" },

From 67542a910bf4a8e6ad431c6bc73167c1426f085d Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:08:35 +0100
Subject: [PATCH 25/27] chore: remove stresstest/infer.py (deleted in upstream
 main)

---
 stresstest/infer.py | 58 ---------------------------------------------
 1 file changed, 58 deletions(-)
 delete mode 100644 stresstest/infer.py

diff --git a/stresstest/infer.py b/stresstest/infer.py
deleted file mode 100644
index 6a72cefa..00000000
--- a/stresstest/infer.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import argparse
-import statistics
-
-from utils import stress_test_wrapper
-
-import opengradient as og
-
-# Number of requests to run serially
-NUM_REQUESTS = 10_000
-MODEL = "QmbUqS93oc4JTLMHwpVxsE39mhNxy6hpf6Py3r9oANr8aZ"
-
-
-def main(private_key: str):
-    alpha = og.Alpha(private_key=private_key)
-
-    def run_inference(input_data: dict):
-        alpha.infer(MODEL, og.InferenceMode.VANILLA, input_data)
-
-    latencies, failures = stress_test_wrapper(run_inference, num_requests=NUM_REQUESTS)
-
-    # Calculate and print statistics
-    total_requests = NUM_REQUESTS
-    success_rate = (len(latencies) / total_requests) * 100 if total_requests > 0 else 0
-
-    if latencies:
-        avg_latency = statistics.mean(latencies)
-        median_latency = statistics.median(latencies)
-        min_latency = min(latencies)
-        max_latency = max(latencies)
-        p95_latency = statistics.quantiles(latencies, n=20)[18]  # 95th percentile
-    else:
-        avg_latency = median_latency = min_latency = max_latency = p95_latency = 0
-
-    print("\nOpenGradient Inference Stress Test Results:")
-    print(f"Using model '{MODEL}'")
-    print("=" * 20 + "\n")
-    print(f"Total Requests: {total_requests}")
-    print(f"Successful Requests: {len(latencies)}")
-    print(f"Failed Requests: {failures}")
-    print(f"Success Rate: {success_rate:.2f}%\n")
-    print(f"Average Latency: {avg_latency:.4f} seconds")
-    print(f"Median Latency: {median_latency:.4f} seconds")
-    print(f"Min Latency: {min_latency:.4f} seconds")
-    print(f"Max Latency: {max_latency:.4f} seconds")
-    print(f"95th Percentile Latency: {p95_latency:.4f} seconds")
-
-    if failures > 0:
-        print("\n🛑 WARNING: TEST FAILED")
-    else:
-        print("\n✅ NO FAILURES")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run inference stress test")
-    parser.add_argument("private_key", help="Private key for inference")
-    args = parser.parse_args()
-
-    main(args.private_key)

From bd147fb4f2189ebd483505b453aad59a7e8d17b0 Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:14:07 +0100
Subject: [PATCH 26/27] Merge main into fix branch: rebased llm.py with all PR
 changes

---
 src/opengradient/client/llm.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/opengradient/client/llm.py b/src/opengradient/client/llm.py
index 8896eac0..af383861 100644
--- a/src/opengradient/client/llm.py
+++ b/src/opengradient/client/llm.py
@@ -320,12 +320,12 @@ async def _request() -> TextGenerationOutput:
 
         try:
             return await self._call_with_tee_retry("completion", _request)
+        except RuntimeError:
+            raise
         except httpx.HTTPStatusError as e:
             if e.response.status_code == 402:
                 raise RuntimeError(_402_HINT) from e
             raise RuntimeError(f"TEE LLM completion failed: {e}") from e
-        except RuntimeError:
-            raise
         except Exception as e:
             raise RuntimeError(f"TEE LLM completion failed: {e}") from e
 
@@ -426,14 +426,14 @@ async def _request() -> TextGenerationOutput:
 
         try:
             return await self._call_with_tee_retry("chat", _request)
+        except RuntimeError:
+            raise
         except httpx.HTTPStatusError as e:
             # Provide an actionable error message for the very common 402 case
             # (issue #188 — users see a cryptic RuntimeError instead of guidance).
             if e.response.status_code == 402:
                 raise RuntimeError(_402_HINT) from e
             raise RuntimeError(f"TEE LLM chat failed: {e}") from e
-        except RuntimeError:
-            raise
         except Exception as e:
             raise RuntimeError(f"TEE LLM chat failed: {e}") from e
 
@@ -507,9 +507,9 @@ async def _chat_stream(self, params: _ChatParams, messages: List[Dict]) -> Async
     async def _parse_sse_response(self, response) -> AsyncGenerator[StreamChunk, None]:
         """Parse an SSE response stream into StreamChunk objects."""
         status_code = getattr(response, "status_code", None)
+        if status_code is not None and status_code == 402:
+            raise RuntimeError(_402_HINT)
         if status_code is not None and status_code >= 400:
-            if status_code == 402:
-                raise RuntimeError(_402_HINT)
             body = await response.aread()
             raise RuntimeError(f"TEE LLM streaming request failed with status {status_code}: {body.decode('utf-8', errors='replace')}")
 

From 258e08c9de4005f0d813f704e46a3b44cec7158f Mon Sep 17 00:00:00 2001
From: AMATH <116212274+amathxbt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 01:14:15 +0100
Subject: [PATCH 27/27] Merge main into fix branch: rebased llm_test.py with
 all PR test additions