From 33fea37a80e9983752b7547f948297a53c8d9078 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 9 Jan 2024 10:49:15 +0800 Subject: [PATCH] polihs --- .github/workflows/build_on_pr.yml | 23 ++++++++++++-- tests/kit/model_zoo/__init__.py | 30 ++++++++++++++++++- tests/kit/model_zoo/registry.py | 17 ++++++----- .../test_plugin/test_3d_plugin.py | 1 + .../test_plugin/test_gemini_plugin.py | 4 +-- .../test_plugin/test_low_level_zero_plugin.py | 9 ++++-- .../test_plugin/test_torch_ddp_plugin.py | 9 ++++-- .../test_plugin/test_torch_fsdp_plugin.py | 9 ++++-- .../test_gemini_checkpoint_io.py | 14 ++++----- .../test_gemini_torch_compability.py | 2 +- ...st_hybrid_parallel_plugin_checkpoint_io.py | 2 +- .../test_plugins_huggingface_compatibility.py | 2 +- tests/test_lazy/test_models.py | 4 +-- 13 files changed, 95 insertions(+), 31 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 8eb358c4f42c..4f8a85509076 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -142,7 +142,7 @@ jobs: container: image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny - timeout-minutes: 60 + timeout-minutes: 90 defaults: run: shell: bash @@ -174,6 +174,7 @@ jobs: run: | cd TensorNVMe cp -p -r ./build /github/home/tensornvme_cache/ + cp -p -r ./cmake-build /github/home/tensornvme_cache/ - name: Checkout Colossal-AI uses: actions/checkout@v2 @@ -208,9 +209,25 @@ jobs: - name: Execute Unit Testing run: | - CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/ + CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \ + -m "not largedist" \ + --testmon \ + --testmon-forceselect \ + --testmon-cov=. \ + --durations=0 \ + --ignore tests/test_analyzer \ + --ignore tests/test_auto_parallel \ + --ignore tests/test_fx \ + --ignore tests/test_autochunk \ + --ignore tests/test_gptq \ + --ignore tests/test_infer_ops \ + --ignore tests/test_legacy \ + --ignore tests/test_moe \ + --ignore tests/test_smoothquant \ + --ignore tests/test_checkpoint_io \ + --ignore tests/test_shardformer \ + tests/ env: - DATA: /data/scratch/cifar-10 NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py index 04f63b0675f3..5f6789ff3357 100644 --- a/tests/kit/model_zoo/__init__.py +++ b/tests/kit/model_zoo/__init__.py @@ -1,5 +1,33 @@ +import os from . import custom, diffusers, timm, torchaudio, torchvision, transformers from .executor import run_fwd, run_fwd_bwd from .registry import model_zoo -__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd"] +# We pick a subset of models for fast testing in order to reduce the total testing time +COMMON_MODELS = [ + 'custom_hanging_param_model', + 'custom_nested_model', + 'custom_repeated_computed_layers', + 'custom_simple_net', + 'diffusers_clip_text_model', + 'diffusers_auto_encoder_kl', + 'diffusers_unet2d_model', + 'timm_densenet', + 'timm_resnet', + 'timm_swin_transformer', + 'torchaudio_wav2vec2_base', + 'torchaudio_conformer', + 'transformers_bert_for_masked_lm', + 'transformers_bloom_for_causal_lm', + 'transformers_falcon_for_causal_lm', + 'transformers_chatglm_for_conditional_generation', + 'transformers_llama_for_casual_lm', + 'transformers_vit_for_masked_image_modeling', + 'transformers_mistral_for_casual_lm' +] + +IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1' + + +__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST'] + diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py index bb522778bb5d..44a0adc6a3af 100644 --- a/tests/kit/model_zoo/registry.py +++ b/tests/kit/model_zoo/registry.py @@ -1,6 +1,6 @@ #!/usr/bin/env python from dataclasses import dataclass -from typing import Callable +from typing import Callable, List, Union __all__ = ["ModelZooRegistry", "ModelAttribute", "model_zoo"] @@ -61,7 +61,7 @@ def register( """ self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute) - def get_sub_registry(self, keyword: str): + def get_sub_registry(self, keyword: Union[str, List[str]]): """ Get a sub registry with models that contain the keyword. @@ -70,12 +70,15 @@ def get_sub_registry(self, keyword: str): """ new_dict = dict() + if isinstance(keyword, str): + keyword_list = [keyword] + else: + keyword_list = keyword + assert isinstance(keyword_list, (list, tuple)) + for k, v in self.items(): - if keyword == "transformers_gpt": - if keyword in k and not "gptj" in k: # ensure GPT2 does not retrieve GPTJ models - new_dict[k] = v - else: - if keyword in k: + for kw in keyword_list: + if kw in k: new_dict[k] = v assert len(new_dict) > 0, f"No model found with keyword {keyword}" diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py index ad878fb0c86a..eca5b568843b 100644 --- a/tests/test_booster/test_plugin/test_3d_plugin.py +++ b/tests/test_booster/test_plugin/test_3d_plugin.py @@ -68,6 +68,7 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True): for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.get_sub_registry( "transformers_llama_for_casual_lm" ).items(): + print(name) err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) torch.cuda.empty_cache() diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index d4205e1f9d73..3462d5dde52b 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -13,7 +13,7 @@ from colossalai.nn.optimizer import HybridAdam from colossalai.tensor.colo_parameter import ColoParameter from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]: @@ -66,7 +66,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t # @parameterize('init_method', ['lazy', 'none', 'colo']) -@parameterize("subset", ["torchvision", "transformers", "diffusers"]) +@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"]) @parameterize("init_method", ["none"]) @parameterize("zero_size", [2]) @parameterize("tp_size", [2]) diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py index 3eaaf882c9ba..bcdcc1470e6c 100644 --- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -11,7 +11,7 @@ # from colossalai.nn.optimizer import HybridAdam from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS # These models are not compatible with AMP _AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"] @@ -62,7 +62,12 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True): ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS skipped_models = [] - for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items(): + if IS_FAST_TEST: + registry = model_zoo.get_sub_registry(COMMON_MODELS) + else: + registry = model_zoo + + for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items(): # FIXME(ver217): fix these models if name in ignore_models: skipped_models.append(name) diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py index 1a7ca6f2a30c..fa32feb2ff85 100644 --- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py +++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py @@ -11,7 +11,7 @@ from colossalai.booster.plugin import TorchDDPPlugin from colossalai.interface import OptimizerWrapper from colossalai.testing import rerun_if_address_is_in_use, spawn -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS def run_fn(model_fn, data_gen_fn, output_transform_fn): @@ -40,7 +40,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn): def check_torch_ddp_plugin(): - for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items(): + if IS_FAST_TEST: + registry = model_zoo.get_sub_registry(COMMON_MODELS) + else: + registry = model_zoo + + for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items(): if name == "dlrm_interactionarch": continue run_fn(model_fn, data_gen_fn, output_transform_fn) diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py index 8bcbffdd06fe..8a14d7cf872d 100644 --- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py +++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py @@ -12,7 +12,7 @@ from colossalai.interface import OptimizerWrapper from colossalai.testing import rerun_if_address_is_in_use, spawn -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS # test basic fsdp function @@ -42,7 +42,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn): def check_torch_fsdp_plugin(): - for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items(): + if IS_FAST_TEST: + registry = model_zoo.get_sub_registry(COMMON_MODELS) + else: + registry = model_zoo + + for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items(): if any( element in name for element in [ diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py index 8343c5f07e30..49fd85ffba0a 100644 --- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -7,6 +7,7 @@ from utils import shared_tempdir import colossalai +from colossalai.testing import skip_if_not_enough_gpus from colossalai.booster import Booster from colossalai.booster.plugin import GeminiPlugin from colossalai.lazy import LazyInitContext @@ -68,7 +69,7 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b @clear_cache_before_run() @parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS) @parameterize("shard", [True, False]) -@parameterize("model_name", ["transformers_gpt"]) +@parameterize("model_name", ["transformers_llama_for_casual_lm"]) @parameterize("size_per_shard", [32]) @parameterize("tp_size", [1, 2]) @parameterize("zero_size", [2]) @@ -156,13 +157,12 @@ def run_dist(rank, world_size, port): @pytest.mark.dist -@pytest.mark.parametrize("world_size", [4]) @rerun_if_address_is_in_use() -def test_gemini_ckpIO(world_size): - spawn(run_dist, world_size) +def test_gemini_ckpIO(): + spawn(run_dist, 4) @pytest.mark.largedist -@pytest.mark.parametrize("world_size", [8]) +@skip_if_not_enough_gpus(min_gpus=8) @rerun_if_address_is_in_use() -def test_gemini_ckpIO_3d(world_size): - spawn(run_dist, world_size) \ No newline at end of file +def test_gemini_ckpIO_3d(): + spawn(run_dist, 8) \ No newline at end of file diff --git a/tests/test_checkpoint_io/test_gemini_torch_compability.py b/tests/test_checkpoint_io/test_gemini_torch_compability.py index bb7a60035e02..44a000113629 100644 --- a/tests/test_checkpoint_io/test_gemini_torch_compability.py +++ b/tests/test_checkpoint_io/test_gemini_torch_compability.py @@ -20,7 +20,7 @@ @clear_cache_before_run() @parameterize("shard", [False, True]) -@parameterize("model_name", ["transformers_gpt"]) +@parameterize("model_name", ["transformers_llama_for_casual_lm"]) def exam_torch_load_from_gemini(shard: bool, model_name: str): (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values())) criterion = lambda x: x.mean() diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py index c0bc2d2f5d0a..db3c56da874d 100644 --- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py @@ -40,7 +40,7 @@ @clear_cache_before_run() @parameterize("shard", [True, False]) -@parameterize("model_name", ["transformers_gpt"]) +@parameterize("model_name", ["transformers_llama_for_casual_lm"]) @parameterize("size_per_shard", [32]) @parameterize("test_config", TEST_CONFIGS) def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict): diff --git a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py index a6f67e0d7729..0353ff115840 100644 --- a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py +++ b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py @@ -18,7 +18,7 @@ @clear_cache_before_run() -@parameterize("model_name", ["transformers_gpt"]) +@parameterize("model_name", ["transformers_llama_for_casual_lm"]) @parameterize("plugin_type", ["ddp", "zero", "gemini"]) def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32): (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next( diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py index a1b5763d4cd8..ee50e5b61009 100644 --- a/tests/test_lazy/test_models.py +++ b/tests/test_lazy/test_models.py @@ -1,11 +1,11 @@ import pytest from lazy_init_utils import SUPPORT_LAZY, check_lazy_init -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS @pytest.mark.skipif(not SUPPORT_LAZY, reason="requires torch >= 1.12.0") -@pytest.mark.parametrize("subset", ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"]) +@pytest.mark.parametrize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"]) @pytest.mark.parametrize("default_device", ["cpu", "cuda"]) def test_torchvision_models_lazy_init(subset, default_device): sub_model_zoo = model_zoo.get_sub_registry(subset)