Skip to content

Commit

Permalink
[workflow] fixed build CI (#5240)
Browse files Browse the repository at this point in the history
* [workflow] fixed build CI

* polish

* polish

* polish

* polish

* polish
  • Loading branch information
FrankLeeeee authored and ver217 committed Feb 7, 2024
1 parent 5a44f08 commit c7eaf92
Show file tree
Hide file tree
Showing 14 changed files with 101 additions and 156 deletions.
136 changes: 16 additions & 120 deletions .github/workflows/build_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,57 +22,6 @@ on:
delete:

jobs:
prepare_cache:
name: Prepare testmon cache
if: |
github.event_name == 'create' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
fi
env:
MAIN_BRANCH: ${{ github.event.master_branch }}

prepare_cache_for_pr:
name: Prepare testmon cache for PR
if: |
github.event_name == 'pull_request' &&
(github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
cancel-in-progress: true
steps:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
fi
env:
PR_NUMBER: ${{ github.event.number }}

detect:
name: Detect file change
if: |
Expand Down Expand Up @@ -140,7 +89,7 @@ jobs:
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 60
defaults:
Expand Down Expand Up @@ -174,6 +123,7 @@ jobs:
run: |
cd TensorNVMe
cp -p -r ./build /github/home/tensornvme_cache/
cp -p -r ./cmake-build /github/home/tensornvme_cache/
- name: Checkout Colossal-AI
uses: actions/checkout@v2
Expand All @@ -198,31 +148,27 @@ jobs:
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
- name: Restore Testmon Cache
run: |
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
fi
env:
PR_NUMBER: ${{ github.event.number }}

- name: Execute Unit Testing
run: |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
-m "not largedist" \
--durations=0 \
--ignore tests/test_analyzer \
--ignore tests/test_auto_parallel \
--ignore tests/test_fx \
--ignore tests/test_autochunk \
--ignore tests/test_gptq \
--ignore tests/test_infer_ops \
--ignore tests/test_legacy \
--ignore tests/test_moe \
--ignore tests/test_smoothquant \
--ignore tests/test_checkpoint_io \
tests/
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
LLAMA_PATH: /data/scratch/llama-tiny

- name: Store Testmon Cache
run: |
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
env:
PR_NUMBER: ${{ github.event.number }}

- name: Collate artifact
env:
PR_NUMBER: ${{ github.event.number }}
Expand Down Expand Up @@ -260,53 +206,3 @@ jobs:
name: report
path: report/

store_cache:
name: Store testmon cache for PR
if: |
github.event_name == 'pull_request' &&
github.event.action == 'closed' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Store testmon cache if possible
if: github.event.pull_request.merged == true
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
fi
env:
PR_NUMBER: ${{ github.event.pull_request.number }}

- name: Remove testmon cache
run: |
rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}

remove_cache:
name: Remove testmon cache
if: |
github.event_name == 'delete' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Remove testmon cache
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
rm -rf "/github/home/testmon_cache/${BASE}"
15 changes: 9 additions & 6 deletions .github/workflows/build_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,20 @@ jobs:
build:
name: Build and Test Colossal-AI
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, 8-gpu]
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 40
timeout-minutes: 90
steps:
- name: Check GPU Availability # ensure all GPUs have enough memory
id: check-avai
run: |
avai=true
for i in $(seq 0 7);
for i in $(seq 0 3);
do
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -gt "10000" ] && avai=false
[ "$gpu_used" -gt "2000" ] && avai=false
done
echo "GPU is available: $avai"
Expand Down Expand Up @@ -60,9 +60,12 @@ jobs:
- name: Unit Testing
if: steps.check-avai.outputs.avai == 'true'
run: |
PYTHONPATH=$PWD pytest --durations=0 tests
PYTHONPATH=$PWD pytest \
-m "not largedist" \
--durations=0 \
tests/
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/doc_test_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
name: Test the changed Doc
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm
timeout-minutes: 60
steps:
Expand Down
32 changes: 30 additions & 2 deletions tests/kit/model_zoo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,33 @@
from . import custom, diffusers, timm, torchaudio, torchrec, torchvision, transformers
import os
from . import custom, diffusers, timm, torchaudio, torchvision, transformers
from .executor import run_fwd, run_fwd_bwd
from .registry import model_zoo

__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd"]
# We pick a subset of models for fast testing in order to reduce the total testing time
COMMON_MODELS = [
'custom_hanging_param_model',
'custom_nested_model',
'custom_repeated_computed_layers',
'custom_simple_net',
'diffusers_clip_text_model',
'diffusers_auto_encoder_kl',
'diffusers_unet2d_model',
'timm_densenet',
'timm_resnet',
'timm_swin_transformer',
'torchaudio_wav2vec2_base',
'torchaudio_conformer',
'transformers_bert_for_masked_lm',
'transformers_bloom_for_causal_lm',
'transformers_falcon_for_causal_lm',
'transformers_chatglm_for_conditional_generation',
'transformers_llama_for_casual_lm',
'transformers_vit_for_masked_image_modeling',
'transformers_mistral_for_casual_lm'
]

IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1'


__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST']

17 changes: 10 additions & 7 deletions tests/kit/model_zoo/registry.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
from dataclasses import dataclass
from typing import Callable
from typing import Callable, List, Union

__all__ = ["ModelZooRegistry", "ModelAttribute", "model_zoo"]

Expand Down Expand Up @@ -61,7 +61,7 @@ def register(
"""
self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute)

def get_sub_registry(self, keyword: str):
def get_sub_registry(self, keyword: Union[str, List[str]]):
"""
Get a sub registry with models that contain the keyword.
Expand All @@ -70,12 +70,15 @@ def get_sub_registry(self, keyword: str):
"""
new_dict = dict()

if isinstance(keyword, str):
keyword_list = [keyword]
else:
keyword_list = keyword
assert isinstance(keyword_list, (list, tuple))

for k, v in self.items():
if keyword == "transformers_gpt":
if keyword in k and not "gptj" in k: # ensure GPT2 does not retrieve GPTJ models
new_dict[k] = v
else:
if keyword in k:
for kw in keyword_list:
if kw in k:
new_dict[k] = v

assert len(new_dict) > 0, f"No model found with keyword {keyword}"
Expand Down
4 changes: 2 additions & 2 deletions tests/test_booster/test_plugin/test_gemini_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from colossalai.nn.optimizer import HybridAdam
from colossalai.tensor.colo_parameter import ColoParameter
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from tests.kit.model_zoo import model_zoo
from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST


def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
Expand Down Expand Up @@ -66,7 +66,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t
# @parameterize('init_method', ['lazy', 'none', 'colo'])


@parameterize("subset", ["torchvision", "transformers", "diffusers"])
@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"])
@parameterize("init_method", ["none"])
@parameterize("zero_size", [2])
@parameterize("tp_size", [2])
Expand Down
9 changes: 7 additions & 2 deletions tests/test_booster/test_plugin/test_low_level_zero_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from tests.kit.model_zoo import model_zoo
from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS

# These models are not compatible with AMP
_AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]
Expand Down Expand Up @@ -62,7 +62,12 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS
skipped_models = []

for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
if IS_FAST_TEST:
registry = model_zoo.get_sub_registry(COMMON_MODELS)
else:
registry = model_zoo

for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
# FIXME(ver217): fix these models
if name in ignore_models:
skipped_models.append(name)
Expand Down
9 changes: 7 additions & 2 deletions tests/test_booster/test_plugin/test_torch_ddp_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.interface import OptimizerWrapper
from colossalai.testing import rerun_if_address_is_in_use, spawn
from tests.kit.model_zoo import model_zoo
from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS


def run_fn(model_fn, data_gen_fn, output_transform_fn):
Expand Down Expand Up @@ -40,7 +40,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):


def check_torch_ddp_plugin():
for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
if IS_FAST_TEST:
registry = model_zoo.get_sub_registry(COMMON_MODELS)
else:
registry = model_zoo

for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
if name == "dlrm_interactionarch":
continue
run_fn(model_fn, data_gen_fn, output_transform_fn)
Expand Down
9 changes: 7 additions & 2 deletions tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from colossalai.interface import OptimizerWrapper
from colossalai.testing import rerun_if_address_is_in_use, spawn
from tests.kit.model_zoo import model_zoo
from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS


# test basic fsdp function
Expand Down Expand Up @@ -42,7 +42,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):


def check_torch_fsdp_plugin():
for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
if IS_FAST_TEST:
registry = model_zoo.get_sub_registry(COMMON_MODELS)
else:
registry = model_zoo

for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
if any(
element in name
for element in [
Expand Down
Loading

0 comments on commit c7eaf92

Please sign in to comment.