Skip to content

Commit

Permalink
fix horovod pytorch estimator (intel-analytics#3187)
Browse files Browse the repository at this point in the history
* fix horovod pytorch estimator

* fix style

* fix style

* add syncing weights tests
  • Loading branch information
yangw1234 committed Sep 23, 2021
1 parent f33dbc9 commit 24d032c
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@ def __init__(self, ray_ctx, worker_cls=None, worker_param=None, workers_per_node
local_envs["HOROVOD_CROSS_SIZE"] = str(alloc_info.cross_size)

ray.get([worker.set_gloo_iface.remote() for worker in self.remote_workers])
self.run(lambda: print("horovod worker initialized"))

def run(self, func):
ray.get([self.remote_workers[i].run.remote(self.per_worker_envs[i], func)
for i in range(self.num_nodes)])
return ray.get([self.remote_workers[i].run.remote(self.per_worker_envs[i], func)
for i in range(self.num_nodes)])
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_train(self):
config={
"lr": 1e-2, # used in optimizer_creator
"hidden_size": 1 # used in model_creator
}, backend="horovod")
}, backend="horovod", workers_per_node=2)
stats1 = estimator.fit(train_data_creator, batch_size=4, epochs=5)
train_loss1 = stats1[-1]["train_loss"]
validation_loss1 = estimator.evaluate(validation_data_creator)["val_loss"]
Expand All @@ -46,11 +46,47 @@ def test_train(self):
train_loss2 = stats2[-1]["train_loss"]
validation_loss2 = estimator.evaluate(validation_data_creator)["val_loss"]

# Verify syncing weights, i.e. the two workers have the same weights after training
import ray
import numpy as np
remote_workers = estimator.estimator.remote_workers
state_dicts = ray.get([worker.state_dict.remote() for worker in remote_workers])
weights = [state["models"] for state in state_dicts]
worker1_weights = weights[0][0]
worker2_weights = weights[1][0]
for layer in list(worker1_weights.keys()):
assert np.allclose(worker1_weights[layer].numpy(),
worker2_weights[layer].numpy())

assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
assert validation_loss2 <= validation_loss1, (validation_loss2,
validation_loss1)
estimator.shutdown()

def test_horovod_initialized_correctly(self):
estimator = Estimator.from_torch(
model=model_creator,
optimizer=optimizer_creator,
loss=nn.MSELoss(),
scheduler_creator=scheduler_creator,
config={
"lr": 1e-2, # used in optimizer_creator
"hidden_size": 1 # used in model_creator
}, backend="horovod", workers_per_node=2)

def get_size():
import horovod.torch as hvd
return hvd.size()
results = estimator.estimator.horovod_runner.run(get_size)
assert results == [2, 2]

def get_rank():
import horovod.torch as hvd
return hvd.rank()
results = estimator.estimator.horovod_runner.run(get_rank)
results = sorted(results)
assert results == [0, 1]

def test_save_and_restore(self):
estimator1 = Estimator.from_torch(
model=model_creator,
Expand Down

0 comments on commit 24d032c

Please sign in to comment.