[ci] fixed booster test (hpcaitech#5251)
* [ci] fixed booster test

* [ci] fixed booster test

* [ci] fixed booster test
FrankLeeeee authored and flybird11111 committed Jan 22, 2024
1 parent 929c32e commit 03c6112
Showing 5 changed files with 12 additions and 14 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/build_on_pr.yml
@@ -90,7 +90,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:
       run:
@@ -165,7 +165,6 @@ jobs:
           --ignore tests/test_checkpoint_io \
           tests/
         env:
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny

@@ -205,4 +204,3 @@ jobs:
         with:
           name: report
           path: report/
-
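Note: this workflow now mounts `/dev/shm` into the container instead of the cifar-10 dataset and drops the `NCCL_SHM_DISABLE: 1` override, which suggests NCCL's shared-memory transport is re-enabled; the usual motivation for a `/dev/shm` volume is lifting Docker's small default shm size. A minimal Python sketch (an illustration, not part of the commit) for checking how much shared memory a job actually sees inside the container:

    import os

    # Report the shared-memory capacity visible inside the container.
    # NCCL's SHM transport, re-enabled here by dropping NCCL_SHM_DISABLE,
    # needs headroom under /dev/shm.
    st = os.statvfs("/dev/shm")
    total_gb = st.f_blocks * st.f_frsize / 1024**3
    free_gb = st.f_bavail * st.f_frsize / 1024**3
    print(f"/dev/shm: {free_gb:.2f} GiB free of {total_gb:.2f} GiB")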
9 changes: 5 additions & 4 deletions .github/workflows/build_on_schedule.yml
@@ -13,15 +13,16 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 90
     steps:
       - name: Check GPU Availability # ensure all GPUs have enough memory
         id: check-avai
         run: |
           avai=true
-          for i in $(seq 0 3);
+          ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+          endIndex=$(($ngpu-1))
+          for i in $(seq 0 $endIndex);
           do
             gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
             [ "$gpu_used" -gt "2000" ] && avai=false
           done
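The availability check no longer hard-codes GPU indices 0-3; it derives the loop bound from however many GPUs `nvidia-smi` reports. A hedged Python equivalent of the same logic (assuming `nvidia-smi` is on the PATH):

    import subprocess

    # Query every visible GPU instead of hard-coding indices 0-3, and
    # treat the runner as unavailable if any GPU already has more than
    # 2000 MiB of memory in use -- the same rule as the bash check above.
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
        text=True,
    )
    used = [int(line) for line in out.splitlines() if line.strip()]
    avai = all(mib <= 2000 for mib in used)
    print(f"{len(used)} GPUs visible, available: {avai}")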
@@ -74,7 +75,7 @@ jobs:
         if: ${{ failure() }}
         run: |
           url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
-          msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
+          msg="Scheduled Build and Test failed, please visit $url for details"
           echo $msg
           python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
         env:
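On failure, the workflow posts to Lark through `.github/workflows/scripts/send_message_to_lark.py` with the `-m`/`-u` flags shown above. A hedged sketch of what such a script can look like; the payload shape follows Lark's incoming-webhook convention, and the repository's actual script may differ:

    import argparse
    import json
    import urllib.request

    # Post a plain-text message to a Lark incoming-webhook URL.
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--message", required=True)
    parser.add_argument("-u", "--url", required=True)
    args = parser.parse_args()

    # Assumed Lark webhook payload format: {"msg_type": "text", "content": {"text": ...}}
    payload = json.dumps({"msg_type": "text", "content": {"text": args.message}}).encode()
    req = urllib.request.Request(
        args.url, data=payload, headers={"Content-Type": "application/json"}
    )
    urllib.request.urlopen(req)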
1 change: 0 additions & 1 deletion tests/kit/model_zoo/transformers/chatglm2.py
@@ -2,7 +2,6 @@

 from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
 from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel
-
 from ..registry import ModelAttribute, model_zoo

 # ================================
4 changes: 2 additions & 2 deletions tests/test_booster/test_plugin/test_3d_plugin.py
@@ -10,10 +10,11 @@
 from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo


+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -69,7 +70,6 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
         "transformers_llama_for_casual_lm"
     ).items():
         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
-        torch.cuda.empty_cache()

         if err is None:
             passed_models.append(name)
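Here and in the Gemini plugin test below, the per-iteration `torch.cuda.empty_cache()` call is replaced by a `clear_cache_before_run()` decorator on `run_fn`, so the allocator is reset before each invocation rather than manually after it. A hedged sketch of what a decorator in this style can do (an illustration only; `colossalai.testing` ships its own implementation):

    import functools

    import torch

    # Empty the CUDA allocator cache before invoking the wrapped
    # function, so each test starts from a clean allocator state.
    def clear_cache_before_run():
        def decorator(fn):
            @functools.wraps(fn)
            def wrapper(*args, **kwargs):
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                return fn(*args, **kwargs)
            return wrapper
        return decorator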
8 changes: 4 additions & 4 deletions tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -12,10 +12,11 @@
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo


+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -116,7 +117,7 @@ def check_gemini_plugin(
             "transformers_falcon_for_sequence_classification",
             "transformers_falcon_for_token_classification",
             "transformers_falcon_for_question_answering",
-            "transformers_gptj_lm", # lead to OOM when running in ci
+            "transformers_gptj_lm",  # lead to OOM when running in ci
             "transformers_gptj_for_question_answering",
             "transformers_gptj_for_sequence_classification",
         ]:
@@ -145,7 +146,6 @@ def check_gemini_plugin(
             tp_size = 1

         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size)
-        torch.cuda.empty_cache()
         if err is None:
             passed_models.append(name)
         else:
