6 files changed: +48 −15 lines changed

@@ -676,6 +676,27 @@ training_client.load_state(sft_checkpoint_path)
 - Multi-step training pipelines (e.g., starting DPO training from an SFT checkpoint)
 - Starting fresh training from pretrained weights with a new optimizer
 
+### ServiceClient methods for loading checkpoints
+
+The `ServiceClient` also provides methods to create a new `TrainingClient` directly from a saved checkpoint:
+
+- `create_training_client_from_state(path)`: Creates a `TrainingClient` with weights loaded from the checkpoint (no optimizer state). Use this when starting a new training phase from saved weights.
+- `create_training_client_from_state_with_optimizer(path)`: Creates a `TrainingClient` with both weights and optimizer state loaded. Use this when resuming interrupted training.
+
+```python
+# Resume training with optimizer state
+training_client = service_client.create_training_client_from_state_with_optimizer(
+    "tinker://run-id/weights/checkpoint-001"
+)
+
+# Start fresh training from a checkpoint (weights only)
+training_client = service_client.create_training_client_from_state(
+    "tinker://run-id/weights/checkpoint-001"
+)
+```
+
+Async versions are also available: `create_training_client_from_state_async()` and `create_training_client_from_state_with_optimizer_async()`.
+
 
 ---
 
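For the async variants mentioned in the added docs, a minimal usage sketch. Assumptions: credentials are picked up from the environment so `ServiceClient()` can be constructed without arguments (the scripts below pass `base_url` explicitly), and the checkpoint URI is a placeholder.

```python
import asyncio

import tinker


async def resume_run() -> None:
    # Assumes credentials come from the environment; pass base_url if needed.
    service_client = tinker.ServiceClient()
    # Resume an interrupted run: restores weights *and* optimizer state.
    training_client = (
        await service_client.create_training_client_from_state_with_optimizer_async(
            "tinker://run-id/weights/checkpoint-001"  # placeholder URI
        )
    )
    # ... continue training with training_client ...


asyncio.run(resume_run())
```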
@@ -13,7 +13,7 @@ dependencies = [
     "numpy",
     "rich",
     "termcolor",
-    "tinker>=0.6.0",
+    "tinker>=0.6.1",
     "torch",
     "transformers",
     "blobfile",
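The floor moves from 0.6.0 to 0.6.1, presumably the release that introduces the `_with_optimizer` variants used below (an inference from this diff, not confirmed). A sketch of a runtime guard mirroring the constraint, using the standard library plus the common third-party `packaging` distribution:

```python
from importlib.metadata import version

from packaging.version import Version  # third-party 'packaging' package

# Fail fast if an older tinker is installed (mirrors the pyproject floor).
if Version(version("tinker")) < Version("0.6.1"):
    raise RuntimeError(
        "tinker>=0.6.1 is required for the *_with_optimizer checkpoint methods"
    )
```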
@@ -83,7 +83,7 @@ def main(config: Config):
 
     resume_info = checkpoint_utils.get_last_checkpoint(config.log_path)
     if resume_info:
-        training_client = service_client.create_training_client_from_state(
+        training_client = service_client.create_training_client_from_state_with_optimizer(
             resume_info["state_path"]
         )
         start_batch = resume_info["batch"]
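This hunk and the next apply the same resume pattern. A condensed sketch of that pattern, under the assumption (suggested by the surrounding code) that `checkpoint_utils.get_last_checkpoint` returns a dict with `state_path` and `batch` keys, or `None` when no checkpoint exists; `service_client` and `config` are the scripts' own objects:

```python
resume_info = checkpoint_utils.get_last_checkpoint(config.log_path)
if resume_info:
    # Restore weights *and* optimizer state (step count, Adam moments) so the
    # run continues exactly where it was interrupted.
    training_client = service_client.create_training_client_from_state_with_optimizer(
        resume_info["state_path"]
    )
    start_batch = resume_info["batch"]  # skip batches already processed
else:
    # No checkpoint found: start fresh (hypothetical sync counterpart of the
    # async LoRA constructor used later in this diff).
    training_client = service_client.create_lora_training_client(config.model_name)
    start_batch = 0
```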
@@ -63,7 +63,7 @@ def main(config: Config):
     # Check for resuming
     resume_info = checkpoint_utils.get_last_checkpoint(config.log_path)
     if resume_info:
-        training_client = service_client.create_training_client_from_state(
+        training_client = service_client.create_training_client_from_state_with_optimizer(
             resume_info["state_path"]
         )
         start_batch = resume_info["batch"]
@@ -1058,14 +1058,20 @@ async def main(
         start_batch = 0
 
     service_client = tinker.ServiceClient(base_url=cfg.base_url)
-    load_state_path: str | None = (
-        resume_info["state_path"] if resume_info else cfg.load_checkpoint_path
-    )
-    if load_state_path:
+    if resume_info:
+        # Resuming interrupted training - load optimizer state for proper continuation
+        training_client = (
+            await service_client.create_training_client_from_state_with_optimizer_async(
+                resume_info["state_path"]
+            )
+        )
+        logger.info(f"Resumed training from {resume_info['state_path']}")
+    elif cfg.load_checkpoint_path:
+        # Starting fresh from a checkpoint - load weights only (fresh optimizer)
         training_client = await service_client.create_training_client_from_state_async(
-            load_state_path
+            cfg.load_checkpoint_path
         )
-        logger.info(f"Loaded state from {load_state_path}")
+        logger.info(f"Loaded weights from {cfg.load_checkpoint_path}")
     else:
         training_client = await service_client.create_lora_training_client_async(
             cfg.model_name, rank=cfg.lora_rank
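De-diffed for readability, the resulting control flow is a three-way branch (reproduced from the hunk above; `cfg`, `resume_info`, `service_client`, and `logger` come from the surrounding script):

```python
if resume_info:
    # 1) Interrupted run: weights + optimizer state (step count, Adam moments).
    training_client = (
        await service_client.create_training_client_from_state_with_optimizer_async(
            resume_info["state_path"]
        )
    )
elif cfg.load_checkpoint_path:
    # 2) New phase seeded from saved weights: optimizer starts fresh.
    training_client = await service_client.create_training_client_from_state_async(
        cfg.load_checkpoint_path
    )
else:
    # 3) Nothing to load: fresh LoRA client on the base model.
    training_client = await service_client.create_lora_training_client_async(
        cfg.model_name, rank=cfg.lora_rank
    )
```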
@@ -189,19 +189,25 @@ async def main(config: Config):
     trace_init(output_file=os.path.join(config.log_path, "trace_events.jsonl"))
 
     service_client = tinker.ServiceClient(base_url=config.base_url)
-    load_state_path: str | None = (
-        resume_info["state_path"] if resume_info else config.load_checkpoint_path
-    )
 
     user_metadata: dict[str, str] = {}
     if wandb_link := ml_logger.get_logger_url():
         user_metadata["wandb_link"] = wandb_link
 
-    if load_state_path:
+    if resume_info:
+        # Resuming interrupted training - load optimizer state for proper continuation
+        training_client = (
+            await service_client.create_training_client_from_state_with_optimizer_async(
+                resume_info["state_path"], user_metadata
+            )
+        )
+        logger.info(f"Resumed training from {resume_info['state_path']}")
+    elif config.load_checkpoint_path:
+        # Starting fresh from a checkpoint - load weights only (fresh optimizer)
         training_client = await service_client.create_training_client_from_state_async(
-            load_state_path, user_metadata
+            config.load_checkpoint_path, user_metadata
         )
-        logger.info(f"Loaded weights from {load_state_path}")
+        logger.info(f"Loaded weights from {config.load_checkpoint_path}")
     else:
         training_client = await service_client.create_lora_training_client_async(
             base_model=config.model_name,
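This final hunk applies the same branch but additionally threads a `user_metadata` dict through both loading paths. A minimal sketch of that piece, assuming `ml_logger.get_logger_url()` returns a tracking URL or `None` (as the walrus pattern implies):

```python
user_metadata: dict[str, str] = {}
if wandb_link := ml_logger.get_logger_url():
    # Attach the W&B run URL so it can be surfaced alongside the training run.
    user_metadata["wandb_link"] = wandb_link

# Passed as the second argument to either loading method, e.g.:
training_client = await service_client.create_training_client_from_state_async(
    config.load_checkpoint_path, user_metadata
)
```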