Skip to content

Commit a642464

Browse files
HosseinKaviani-H and Hossein Kavianihamedani
authored
Metric logger (#554)
Co-authored-by: Hossein Kavianihamedani <[email protected]>
1 parent 51bfe78 commit a642464

File tree

1 file changed

+9
-3
lines changed

1 file changed

+9
-3
lines changed

apps/sft/main.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,10 @@
2828
from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset
2929
from forge.data.tokenizer import HuggingFaceModelTokenizer
3030
from forge.observability import get_or_create_metric_logger, record_metric, Reduce
31+
from forge.observability.metric_actors import GlobalLoggingActor
3132
from forge.util.config import parse
3233

33-
from monarch.actor import current_rank, current_size, endpoint
34+
from monarch.actor import current_rank, current_size, endpoint, get_or_spawn_controller
3435
from omegaconf import DictConfig, OmegaConf
3536
from torch import nn
3637
from torchdata.stateful_dataloader import StatefulDataLoader
@@ -110,8 +111,12 @@ def _init_dist(self):
110111
logger.info("env: {}".format(env))
111112

112113
async def setup_metric_logger(self):
113-
"""Initialization happens in the main process. Here we just retrieve it"""
114-
mlogger = await get_or_create_metric_logger()
114+
"""Retrieve the already-initialized metric logger from main process"""
115+
116+
# The global logger was already created in the main process.
117+
# Use get_or_spawn_controller from monarch to get a reference to it.
118+
# Get a reference to the existing global logger (don't create a new one).
119+
mlogger = await get_or_spawn_controller("global_logger", GlobalLoggingActor)
115120
return mlogger
116121

117122
def record_batch_metrics(self, data_metrics: list):
@@ -123,6 +128,7 @@ def record_batch_metrics(self, data_metrics: list):
123128
@endpoint
124129
async def setup(self):
125130
self.train_dataloader = self.setup_data()
131+
126132
self.mlogger = await self.setup_metric_logger()
127133

128134
# self.train_dataloader = self.setup_data(

0 commit comments

Comments
 (0)