Skip to content

Commit 02af787

Browse files
authored
fix chinese comments & add TODO (#15)
1 parent acbd595 commit 02af787

File tree

1 file changed

+10
-8
lines changed

1 file changed

+10
-8
lines changed

recipe/transfer_queue/ray_trainer.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ def __init__(
361361

362362
def _initialize_data_system(self):
363363
num_n_samples = self.config.actor_rollout_ref.rollout.n
364-
# 1. 初始化TransferQueueStorage
364+
# 1. initialize TransferQueueStorage
365365
total_storage_size = self.config.data.train_batch_size * self.config.trainer.num_global_batch * num_n_samples
366366
self.data_system_storage_units = {}
367367
storage_placement_group = get_placement_group(self.config.trainer.num_data_storage_units, num_cpus_per_actor=1)
@@ -373,8 +373,9 @@ def _initialize_data_system(self):
373373
self.data_system_storage_units[storage_unit_rank] = storage_node
374374
logging.info(f"TransferQueueStorageSimpleUnit #{storage_unit_rank} has been created.")
375375

376-
# 2. 初始化TransferQueueController
377-
# 这里支持多controller实例以实现负载均衡,支持大规模扩展。不同controller可分配至不同RL计算任务
376+
# 2. initialize TransferQueueController
377+
# we support initializing multiple controller instances for large-scale scenarios. Please allocate exactly
378+
# one controller for a single WorkerGroup.
378379
self.data_system_controllers = {}
379380
controller_placement_group = get_placement_group(self.config.trainer.num_data_controllers, num_cpus_per_actor=1)
380381
for controller_rank in range(self.config.trainer.num_data_controllers):
@@ -388,8 +389,7 @@ def _initialize_data_system(self):
388389
)
389390
logging.info(f"TransferQueueController #{controller_rank} has been created.")
390391

391-
# 3. 将Controller注册至各个Storage
392-
# 每个Storage Unit拿到所有Controller的handler,通过Ray拿到对应的IP+端口,之后建立ZMQ Socket进行消息传输
392+
# 3. register controller & storage
393393
self.data_system_controller_infos = process_zmq_server_info(self.data_system_controllers)
394394
self.data_system_storage_unit_infos = process_zmq_server_info(self.data_system_storage_units)
395395

@@ -400,11 +400,11 @@ def _initialize_data_system(self):
400400
]
401401
)
402402

403-
# 4. 创建Client
403+
# 4. create client
404+
# each client should be allocated to exactly one controller
404405
self.data_system_client = AsyncTransferQueueClient(
405406
client_id="Trainer",
406407
controller_infos=self.data_system_controller_infos[0],
407-
# TODO: 主控Client感知所有controller,WorkerGroup和Worker的Client感知一个controller
408408
storage_infos=self.data_system_storage_unit_infos,
409409
)
410410

@@ -1472,7 +1472,9 @@ def fit(self):
14721472
log_rollout_meta.reorder(balanced_idx)
14731473
self._log_rollout_data(log_rollout_meta, reward_extra_infos_dict, timing_raw, rollout_data_dir)
14741474

1475-
# validate
1475+
# TODO: clear meta after iteration
1476+
1477+
# TODO: validate
14761478
if (
14771479
self.val_reward_fn is not None
14781480
and self.config.trainer.test_freq > 0

0 commit comments

Comments
 (0)