fix horovod pytorch estimator (intel-analytics#3187)

* fix horovod pytorch estimator * fix style * fix style * add syncing weights tests
yangw1234 · Sep 23, 2021 · 24d032c · 24d032c
1 parent f33dbc9
commit 24d032c
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 3 deletions.
diff --git a/python/orca/src/bigdl/orca/learn/horovod/horovod_ray_runner.py b/python/orca/src/bigdl/orca/learn/horovod/horovod_ray_runner.py
@@ -120,7 +120,8 @@ def __init__(self, ray_ctx, worker_cls=None, worker_param=None, workers_per_node
             local_envs["HOROVOD_CROSS_SIZE"] = str(alloc_info.cross_size)
 
         ray.get([worker.set_gloo_iface.remote() for worker in self.remote_workers])
+        self.run(lambda: print("horovod worker initialized"))
 
     def run(self, func):
-        ray.get([self.remote_workers[i].run.remote(self.per_worker_envs[i], func)
-                 for i in range(self.num_nodes)])
+        return ray.get([self.remote_workers[i].run.remote(self.per_worker_envs[i], func)
+                       for i in range(self.num_nodes)])
diff --git a/python/orca/test/bigdl/orca/learn/ray/pytorch/test_estimator_horovod_backend.py b/python/orca/test/bigdl/orca/learn/ray/pytorch/test_estimator_horovod_backend.py
@@ -37,7 +37,7 @@ def test_train(self):
             config={
                 "lr": 1e-2,  # used in optimizer_creator
                 "hidden_size": 1  # used in model_creator
-            }, backend="horovod")
+            }, backend="horovod", workers_per_node=2)
         stats1 = estimator.fit(train_data_creator, batch_size=4, epochs=5)
         train_loss1 = stats1[-1]["train_loss"]
         validation_loss1 = estimator.evaluate(validation_data_creator)["val_loss"]
@@ -46,11 +46,47 @@ def test_train(self):
         train_loss2 = stats2[-1]["train_loss"]
         validation_loss2 = estimator.evaluate(validation_data_creator)["val_loss"]
 
+        # Verify syncing weights, i.e. the two workers have the same weights after training
+        import ray
+        import numpy as np
+        remote_workers = estimator.estimator.remote_workers
+        state_dicts = ray.get([worker.state_dict.remote() for worker in remote_workers])
+        weights = [state["models"] for state in state_dicts]
+        worker1_weights = weights[0][0]
+        worker2_weights = weights[1][0]
+        for layer in list(worker1_weights.keys()):
+            assert np.allclose(worker1_weights[layer].numpy(),
+                               worker2_weights[layer].numpy())
+
         assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
         assert validation_loss2 <= validation_loss1, (validation_loss2,
                                                       validation_loss1)
         estimator.shutdown()
 
+    def test_horovod_initialized_correctly(self):
+        estimator = Estimator.from_torch(
+            model=model_creator,
+            optimizer=optimizer_creator,
+            loss=nn.MSELoss(),
+            scheduler_creator=scheduler_creator,
+            config={
+                "lr": 1e-2,  # used in optimizer_creator
+                "hidden_size": 1  # used in model_creator
+            }, backend="horovod", workers_per_node=2)
+
+        def get_size():
+            import horovod.torch as hvd
+            return hvd.size()
+        results = estimator.estimator.horovod_runner.run(get_size)
+        assert results == [2, 2]
+
+        def get_rank():
+            import horovod.torch as hvd
+            return hvd.rank()
+        results = estimator.estimator.horovod_runner.run(get_rank)
+        results = sorted(results)
+        assert results == [0, 1]
+
     def test_save_and_restore(self):
         estimator1 = Estimator.from_torch(
             model=model_creator,