Commit

precision tests passed
Edenzzzz committed Jul 18, 2024
1 parent 77f4eaf commit ca16753
Showing 11 changed files with 585 additions and 316 deletions.
4 changes: 0 additions & 4 deletions colossalai/lazy/pretrained.py
@@ -62,7 +62,6 @@ def new_from_pretrained(
     config = kwargs.pop("config", None)
     cache_dir = kwargs.pop("cache_dir", None)
     force_download = kwargs.pop("force_download", False)
-    resume_download = kwargs.pop("resume_download", False)
     proxies = kwargs.pop("proxies", None)
     local_files_only = kwargs.pop("local_files_only", False)
     use_auth_token = kwargs.pop("use_auth_token", None)
@@ -116,7 +115,6 @@ def new_from_pretrained(
         cache_dir=cache_dir,
         return_unused_kwargs=True,
         force_download=force_download,
-        resume_download=resume_download,
         proxies=proxies,
         local_files_only=local_files_only,
         use_auth_token=use_auth_token,
@@ -195,7 +193,6 @@ def new_from_pretrained(
         "cache_dir": cache_dir,
         "force_download": force_download,
         "proxies": proxies,
-        "resume_download": resume_download,
         "local_files_only": local_files_only,
         "use_auth_token": use_auth_token,
         "user_agent": user_agent,
@@ -312,7 +309,6 @@ def new_from_pretrained(
         pretrained_model_name_or_path,
         cache_dir=cache_dir,
         force_download=force_download,
-        resume_download=resume_download,
         proxies=proxies,
         local_files_only=local_files_only,
         use_auth_token=use_auth_token,
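
Context for the four deletions above: they drop every use of the resume_download kwarg from new_from_pretrained. The commit message does not state a motivation, but this matches huggingface_hub's deprecation of resume_download (resuming partial downloads became the default, making the flag a no-op). A minimal sketch of the resulting kwarg-handling pattern — hypothetical names, not the actual ColossalAI function:

def new_from_pretrained_sketch(pretrained_model_name_or_path, *model_args, **kwargs):
    # Pop download-related options out of **kwargs so the remainder can be
    # forwarded untouched. Note that "resume_download" is no longer popped
    # or forwarded anywhere.
    download_kwargs = {
        "cache_dir": kwargs.pop("cache_dir", None),
        "force_download": kwargs.pop("force_download", False),
        "proxies": kwargs.pop("proxies", None),
        "local_files_only": kwargs.pop("local_files_only", False),
        "use_auth_token": kwargs.pop("use_auth_token", None),
    }
    # Sketch only: the real function resolves the config and weights using
    # download_kwargs and returns a model instance.
    return download_kwargs, kwargs
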
4 changes: 0 additions & 4 deletions colossalai/shardformer/layer/_operation.py
@@ -812,11 +812,7 @@ def backward(ctx, *grad_output):
     process_group = ctx.process_group
     scatter_dim = ctx.gather_dim
     gather_dim = ctx.scatter_dim
-    if torch.distributed.get_rank() == 0:
-        print(f"shape before A2A: {grad_output[0].shape}")
     return_grad = _AllToAll.apply(*grad_output, process_group, scatter_dim, gather_dim)
-    if torch.distributed.get_rank() == 0:
-        print(f"shape after A2A: {return_grad.shape}")
     return (return_grad, None, None, None)
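
A note on the hunk above: the backward pass of an all-to-all is another all-to-all with the scatter and gather dimensions swapped, which is why backward reads scatter_dim from ctx.gather_dim and vice versa; the four deleted lines were temporary rank-0 debug prints. A self-contained sketch of that pattern — assuming torch.distributed is initialized and the scatter dimension is divisible by the world size; this is not ColossalAI's actual _AllToAll:

import torch
import torch.distributed as dist

def _all_to_all(x: torch.Tensor, group, scatter_dim: int, gather_dim: int) -> torch.Tensor:
    # Split along scatter_dim, exchange one chunk with every rank, then
    # reassemble the received chunks along gather_dim. Assumes equal chunk
    # shapes across ranks.
    world_size = dist.get_world_size(group)
    inputs = [t.contiguous() for t in x.chunk(world_size, dim=scatter_dim)]
    outputs = [torch.empty_like(t) for t in inputs]
    dist.all_to_all(outputs, inputs, group=group)
    return torch.cat(outputs, dim=gather_dim)

class AllToAllDemo(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, group, scatter_dim, gather_dim):
        ctx.group, ctx.scatter_dim, ctx.gather_dim = group, scatter_dim, gather_dim
        return _all_to_all(x, group, scatter_dim, gather_dim)

    @staticmethod
    def backward(ctx, grad_output):
        # Inverse communication: the same all-to-all with the two
        # dimensions swapped, matching the backward in the diff above.
        grad_input = _all_to_all(grad_output, ctx.group, ctx.gather_dim, ctx.scatter_dim)
        return grad_input, None, None, None
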
[Diffs for the remaining 9 changed files are not shown here.]
