From 2a493b68c8d51c8a66d249f54811dee350d45059 Mon Sep 17 00:00:00 2001
From: Xu Kai
Date: Thu, 16 Nov 2023 10:32:45 +0800
Subject: [PATCH] update example

---
 colossalai/inference/hybridengine/engine.py | 11 +-
 .../ray_serve/Colossal_Inference_rayserve.py | 0
 .../inference/serving/ray_serve/README.md | 0
 .../serving/ray_serve/send_request.py | 0
 .../serving/ray_serve/send_requests.py | 0
 .../legacy}/inference/serving/test_ci.sh | 0
 .../torch_serve/Colossal_Inference_Handler.py | 0
 .../inference/serving/torch_serve/README.md | 0
 .../serving/torch_serve/config.properties | 0
 .../serving/torch_serve/docker/Dockerfile | 0
 .../serving/torch_serve/model-config.yaml | 0
 .../serving/torch_serve/sample_text.txt | 0
 examples/inference/bench_bloom.py | 84 -------------
 examples/inference/bench_chatglm2.py | 118 -----------------
 examples/inference/bench_llama.py | 119 ------------------
 examples/inference/benchmark.py | 68 ++++++----
 examples/inference/gptq_bloom.py | 105 ----------------
 examples/inference/gptq_llama.py | 87 -------------
 examples/inference/hybrid_llama.py | 96 ++++++++++++++
 examples/inference/run.sh | 21 ++--
 20 files changed, 161 insertions(+), 548 deletions(-)
 rename {examples => colossalai/legacy}/inference/serving/ray_serve/Colossal_Inference_rayserve.py (100%)
 rename {examples => colossalai/legacy}/inference/serving/ray_serve/README.md (100%)
 rename {examples => colossalai/legacy}/inference/serving/ray_serve/send_request.py (100%)
 rename {examples => colossalai/legacy}/inference/serving/ray_serve/send_requests.py (100%)
 rename {examples => colossalai/legacy}/inference/serving/test_ci.sh (100%)
 rename {examples => colossalai/legacy}/inference/serving/torch_serve/Colossal_Inference_Handler.py (100%)
 rename {examples => colossalai/legacy}/inference/serving/torch_serve/README.md (100%)
 rename {examples => colossalai/legacy}/inference/serving/torch_serve/config.properties (100%)
 rename {examples => colossalai/legacy}/inference/serving/torch_serve/docker/Dockerfile (100%)
 rename {examples => colossalai/legacy}/inference/serving/torch_serve/model-config.yaml (100%)
 rename {examples => colossalai/legacy}/inference/serving/torch_serve/sample_text.txt (100%)
 delete mode 100644 examples/inference/bench_bloom.py
 delete mode 100644 examples/inference/bench_chatglm2.py
 delete mode 100644 examples/inference/bench_llama.py
 delete mode 100644 examples/inference/gptq_bloom.py
 delete mode 100644 examples/inference/gptq_llama.py
 create mode 100644 examples/inference/hybrid_llama.py

diff --git a/colossalai/inference/hybridengine/engine.py b/colossalai/inference/hybridengine/engine.py
index 9248d45ff1c3..bd5a2c2e6cbd 100644
--- a/colossalai/inference/hybridengine/engine.py
+++ b/colossalai/inference/hybridengine/engine.py
@@ -14,7 +14,14 @@

 PP_AXIS, TP_AXIS = 0, 1

-_supported_models = ["LlamaForCausalLM", "BloomForCausalLM", "LlamaGPTQForCausalLM", "SmoothLlamaForCausalLM", "ChatGLMForConditionalGeneration"]
+_supported_models = [
+    "LlamaForCausalLM",
+    "BloomForCausalLM",
+    "LlamaGPTQForCausalLM",
+    "SmoothLlamaForCausalLM",
+    "ChatGLMForConditionalGeneration",
+]
+

 class CaiInferEngine:
     """
@@ -161,7 +168,7 @@ def _shardformer(self, model, model_policy, stage_manager, tp_group):
             enable_flash_attention=False,
             enable_jit_fused=False,
             enable_sequence_parallelism=False,
-            quant=self.quant,
+            extra_kwargs={"quant": self.quant},
         )
         shardformer = ShardFormer(shard_config=shardconfig)
         shard_model, _ = shardformer.optimize(model, model_policy)
diff --git 
a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py b/colossalai/legacy/inference/serving/ray_serve/Colossal_Inference_rayserve.py similarity index 100% rename from examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py rename to colossalai/legacy/inference/serving/ray_serve/Colossal_Inference_rayserve.py diff --git a/examples/inference/serving/ray_serve/README.md b/colossalai/legacy/inference/serving/ray_serve/README.md similarity index 100% rename from examples/inference/serving/ray_serve/README.md rename to colossalai/legacy/inference/serving/ray_serve/README.md diff --git a/examples/inference/serving/ray_serve/send_request.py b/colossalai/legacy/inference/serving/ray_serve/send_request.py similarity index 100% rename from examples/inference/serving/ray_serve/send_request.py rename to colossalai/legacy/inference/serving/ray_serve/send_request.py diff --git a/examples/inference/serving/ray_serve/send_requests.py b/colossalai/legacy/inference/serving/ray_serve/send_requests.py similarity index 100% rename from examples/inference/serving/ray_serve/send_requests.py rename to colossalai/legacy/inference/serving/ray_serve/send_requests.py diff --git a/examples/inference/serving/test_ci.sh b/colossalai/legacy/inference/serving/test_ci.sh similarity index 100% rename from examples/inference/serving/test_ci.sh rename to colossalai/legacy/inference/serving/test_ci.sh diff --git a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py b/colossalai/legacy/inference/serving/torch_serve/Colossal_Inference_Handler.py similarity index 100% rename from examples/inference/serving/torch_serve/Colossal_Inference_Handler.py rename to colossalai/legacy/inference/serving/torch_serve/Colossal_Inference_Handler.py diff --git a/examples/inference/serving/torch_serve/README.md b/colossalai/legacy/inference/serving/torch_serve/README.md similarity index 100% rename from examples/inference/serving/torch_serve/README.md rename to colossalai/legacy/inference/serving/torch_serve/README.md diff --git a/examples/inference/serving/torch_serve/config.properties b/colossalai/legacy/inference/serving/torch_serve/config.properties similarity index 100% rename from examples/inference/serving/torch_serve/config.properties rename to colossalai/legacy/inference/serving/torch_serve/config.properties diff --git a/examples/inference/serving/torch_serve/docker/Dockerfile b/colossalai/legacy/inference/serving/torch_serve/docker/Dockerfile similarity index 100% rename from examples/inference/serving/torch_serve/docker/Dockerfile rename to colossalai/legacy/inference/serving/torch_serve/docker/Dockerfile diff --git a/examples/inference/serving/torch_serve/model-config.yaml b/colossalai/legacy/inference/serving/torch_serve/model-config.yaml similarity index 100% rename from examples/inference/serving/torch_serve/model-config.yaml rename to colossalai/legacy/inference/serving/torch_serve/model-config.yaml diff --git a/examples/inference/serving/torch_serve/sample_text.txt b/colossalai/legacy/inference/serving/torch_serve/sample_text.txt similarity index 100% rename from examples/inference/serving/torch_serve/sample_text.txt rename to colossalai/legacy/inference/serving/torch_serve/sample_text.txt diff --git a/examples/inference/bench_bloom.py b/examples/inference/bench_bloom.py deleted file mode 100644 index 5c7af6ed5aef..000000000000 --- a/examples/inference/bench_bloom.py +++ /dev/null @@ -1,84 +0,0 @@ -import argparse -import os -import time - -import torch -from _utils import 
print_perf_stats -from transformers import BloomForCausalLM, BloomTokenizerFast - -import colossalai -from colossalai.inference.tensor_parallel.engine import TPInferEngine -from colossalai.logging import disable_existing_loggers -from colossalai.shardformer import ShardConfig -from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn - -os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" - - -def bench_bloom(args): - model_path = args.path - max_batch_size = args.batch_size - max_input_len = args.input_len - max_output_len = args.output_len - - tokenizer = BloomTokenizerFast.from_pretrained(model_path) - tokenizer.pad_token = tokenizer.eos_token - model = BloomForCausalLM.from_pretrained(model_path, pad_token_id=tokenizer.eos_token_id) - model = model.half() - - # init TPInferEngine and shard the original model - # To benchmark torch original, comment out the line of optimizing model - shard_config = ShardConfig( - enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True} - ) - infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len) - - # prepare data for generation - generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False) - input_tokens = { - "input_ids": torch.randint(10, 1000, (max_batch_size, max_input_len)), - "attention_mask": torch.ones((max_batch_size, max_input_len)), - } - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) - print(f" input_tokens[{t}].shape: {input_tokens[t].shape}") - - iters = 10 - times = [] - for i in range(iters): - torch.cuda.synchronize() - start = time.time() - outputs = infer_engine.generate(input_tokens, **generate_kwargs) - torch.cuda.synchronize() - end = time.time() - out_len = outputs.shape[1] - print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s") - times.append((end - start) / (out_len - max_input_len)) - - print_perf_stats(times, model.config, max_batch_size) - - -def check_bloom(rank, world_size, port, args): - disable_existing_loggers() - colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") - bench_bloom(args) - - -@rerun_if_address_is_in_use() -@clear_cache_before_run() -def test_bloom(args): - spawn(check_bloom, args.tp_size, args=args) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--path", type=str, help="Model path", required=True) - parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size") - parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size") - parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length") - parser.add_argument("--output_len", type=int, default=128, help="Maximum output length") - - args = parser.parse_args() - - test_bloom(args) diff --git a/examples/inference/bench_chatglm2.py b/examples/inference/bench_chatglm2.py deleted file mode 100644 index 3892d98ba743..000000000000 --- a/examples/inference/bench_chatglm2.py +++ /dev/null @@ -1,118 +0,0 @@ -import argparse -import os -import time - -import torch -from _utils import print_perf_stats -from transformers import AutoTokenizer - -import colossalai -from colossalai.inference.tensor_parallel.engine import TPInferEngine -from colossalai.logging import disable_existing_loggers -from colossalai.shardformer import ShardConfig -from 
colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration -from colossalai.testing import rerun_if_address_is_in_use, spawn - -os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" - - -def run_chatglm2_test(args): - chatglm2_model_path = args.path - max_batch_size = args.batch_size - max_input_len = args.input_len - max_output_len = args.output_len - args.test_mode - - print("max_batch_size : " + str(max_batch_size)) - - tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True) - model = ChatGLMForConditionalGeneration.from_pretrained(chatglm2_model_path, pad_token_id=tokenizer.eos_token_id) - model = model.half() - model.config - - shard_config = ShardConfig( - enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True} - ) - infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len) - - generate_kwargs = dict(max_new_tokens=1, do_sample=False) - input_tokens = { - "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"), - "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"), - } - - iters = 10 - prefill_times = [] - - warmup = 3 - - for i in range(iters): - torch.cuda.synchronize() - start = time.time() - outputs = infer_engine.generate(input_tokens, **generate_kwargs) - torch.cuda.synchronize() - end = time.time() - out_len = outputs.shape[1] - print("generation time {} s".format(str(end - start))) - print(out_len - max_input_len) - prefill_times.append((end - start) / (out_len - max_input_len)) - - prefill_times = prefill_times[warmup:] - prefill_time_avg = sum(prefill_times) / len(prefill_times) - generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False) - - times = [] - decoder_times = [] - for i in range(iters): - torch.cuda.synchronize() - start = time.time() - outputs = infer_engine.generate(input_tokens, **generate_kwargs) - torch.cuda.synchronize() - end = time.time() - out_len = outputs.shape[1] - print("generation time {} s".format(str(end - start))) - print(out_len - max_input_len) - times.append((end - start) / (out_len - max_input_len)) - if args.test_mode == "decoder_test": - decoder_times.append((end - start - prefill_time_avg) / (out_len - max_input_len - 1)) - - times = times[warmup:] - latency = sum(times) / len(times) - print("total process latency is : " + str(latency) + " s") - print("total throughput is : " + str(1 / latency * max_batch_size)) - - if args.test_mode == "decoder_test": - decoder_times = decoder_times[warmup:] - latency = sum(decoder_times) / len(decoder_times) - - print("decoder process latency is : " + str(latency) + " s") - print("decoder throughput is : " + str(1 / latency * max_batch_size)) - - print_perf_stats(times, model.config, max_batch_size) - - -def check_chatglm2(rank, world_size, port, args): - disable_existing_loggers() - colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") - run_chatglm2_test(args) - - -@rerun_if_address_is_in_use() -def test_chatglm2(args): - spawn(check_chatglm2, args.tp_size, args=args) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--path", type=str, help="Model path", required=True) - parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size") - parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size") - 
parser.add_argument("--input_len", type=int, default=256, help="Maximum input length") - parser.add_argument("--output_len", type=int, default=128, help="Maximum output length") - parser.add_argument( - "--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"] - ) - - args = parser.parse_args() - - test_chatglm2(args) diff --git a/examples/inference/bench_llama.py b/examples/inference/bench_llama.py deleted file mode 100644 index 4db32c71af30..000000000000 --- a/examples/inference/bench_llama.py +++ /dev/null @@ -1,119 +0,0 @@ -import argparse -import os -import time - -import torch -from _utils import print_perf_stats -from transformers import LlamaForCausalLM, LlamaTokenizer - -import colossalai -from colossalai.inference.tensor_parallel.engine import TPInferEngine -from colossalai.logging import disable_existing_loggers -from colossalai.shardformer import ShardConfig -from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn - -os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" - - -def run_llama_test(args): - llama_model_path = args.path - max_batch_size = args.batch_size - max_input_len = args.input_len - max_output_len = args.output_len - args.test_mode - - print("max_batch_size : " + str(max_batch_size)) - - tokenizer = LlamaTokenizer.from_pretrained(llama_model_path) - tokenizer.pad_token_id = tokenizer.unk_token_id - model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id) - model = model.half() - model.config - - shard_config = ShardConfig( - enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True} - ) - infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len) - - generate_kwargs = dict(max_new_tokens=1, do_sample=False) - input_tokens = { - "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"), - "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"), - } - - iters = 10 - prefill_times = [] - - warmup = 3 - - for i in range(iters): - torch.cuda.synchronize() - start = time.time() - outputs = infer_engine.generate(input_tokens, **generate_kwargs) - torch.cuda.synchronize() - end = time.time() - out_len = outputs.shape[1] - print("generation time {} s".format(str(end - start))) - print(out_len - max_input_len) - prefill_times.append((end - start) / (out_len - max_input_len)) - - prefill_times = prefill_times[warmup:] - prefill_time_avg = sum(prefill_times) / len(prefill_times) - generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False) - - times = [] - decoder_times = [] - for i in range(iters): - torch.cuda.synchronize() - start = time.time() - outputs = infer_engine.generate(input_tokens, **generate_kwargs) - torch.cuda.synchronize() - end = time.time() - out_len = outputs.shape[1] - print("generation time {} s".format(str(end - start))) - print(out_len - max_input_len) - times.append((end - start) / (out_len - max_input_len)) - if args.test_mode == "decoder_test": - decoder_times.append((end - start - prefill_time_avg) / (out_len - max_input_len - 1)) - - times = times[warmup:] - latency = sum(times) / len(times) - print("total process latency is : " + str(latency) + " s") - print("total throughput is : " + str(1 / latency * max_batch_size)) - - if args.test_mode == "decoder_test": - decoder_times = decoder_times[warmup:] - latency = sum(decoder_times) / len(decoder_times) - - print("decoder process latency is : " + 
str(latency) + " s") - print("decoder throughput is : " + str(1 / latency * max_batch_size)) - - print_perf_stats(times, model.config, max_batch_size) - - -def check_llama(rank, world_size, port, args): - disable_existing_loggers() - colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") - run_llama_test(args) - - -@rerun_if_address_is_in_use() -@clear_cache_before_run() -def test_llama(args): - spawn(check_llama, args.tp_size, args=args) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--path", type=str, help="Model path", required=True) - parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size") - parser.add_argument("-b", "--batch_size", type=int, default=32, help="Maximum batch size") - parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length") - parser.add_argument("--output_len", type=int, default=128, help="Maximum output length") - parser.add_argument( - "--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"] - ) - - args = parser.parse_args() - - test_llama(args) diff --git a/examples/inference/benchmark.py b/examples/inference/benchmark.py index 8392d0a1e579..8c52c97dac1d 100644 --- a/examples/inference/benchmark.py +++ b/examples/inference/benchmark.py @@ -1,4 +1,5 @@ import argparse +import os import time import torch @@ -6,14 +7,12 @@ import transformers import colossalai -from colossalai.inference import PPInferEngine -from colossalai.inference.pipeline.policies import LlamaModelInferPolicy +from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy +from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn GIGABYTE = 1024**3 MEGABYTE = 1024 * 1024 -colossalai.launch_from_torch(config={}) - def data_gen(batch_size: int = 4, seq_len: int = 512): input_ids = torch.randint(10, 30000, (1, seq_len), dtype=torch.int32) @@ -28,6 +27,9 @@ def data_gen(batch_size: int = 4, seq_len: int = 512): def print_details_info(timestamps, model_config, args, whole_end2end): + log_file_name = f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.output_len}_bsz{args.batch_size}_mbsz{args.mb_size}.log" + os.makedirs(os.path.dirname(log_file_name), exist_ok=True) + if dist.get_rank() == 0: prefill = [] encoder = [] @@ -39,13 +41,14 @@ def print_details_info(timestamps, model_config, args, whole_end2end): ) end2end.append(timestamp[-1] - timestamp[0]) print(whole_end2end) + with open( - f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.new_length}_bsz{args.batch_size}_mbsz{args.mb_size}.log", + log_file_name, "w+", ) as f: mb_avg_end2end = sum(end2end) / len(end2end) - mb_avg_latency = mb_avg_end2end / (args.new_length * args.mb_size) - whole_avg_latency = whole_end2end / (args.new_length * args.batch_size) + mb_avg_latency = mb_avg_end2end / (args.output_len * args.mb_size) + whole_avg_latency = whole_end2end / (args.output_len * args.batch_size) num_layers = getattr(model_config, "num_layers", model_config.num_hidden_layers) num_parameters = num_layers * model_config.hidden_size * model_config.hidden_size * 12 / args.pp_size if args.dtype in ["fp16", "bf16"]: @@ -54,7 +57,7 @@ def print_details_info(timestamps, model_config, args, whole_end2end): num_bytes = 4 f.write( - f"llama-{args.model}{args.dtype}_pp{args.pp_size}, input_len:{args.seq_len}, output_len:{args.new_length}, 
bsz:{args.batch_size}, mbsz:{args.mb_size}\n" + f"llama-{args.model}{args.dtype}_pp{args.pp_size}, input_len:{args.seq_len}, output_len:{args.output_len}, bsz:{args.batch_size}, mbsz:{args.mb_size}\n" ) f.write("Average prefill time: {0:8.2f} ms\n".format(sum(prefill) / len(prefill) * 1000)) f.write("Average encode time: {0:8.2f} ms\n".format(sum(encoder) / len(encoder) * 1000)) @@ -76,7 +79,7 @@ def print_details_info(timestamps, model_config, args, whole_end2end): memory_reserved = torch.cuda.memory_reserved() max_memory_reserved = torch.cuda.max_memory_reserved() with open( - f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.new_length}_bsz{args.batch_size}_mbsz{args.mb_size}.log", + log_file_name, "a", ) as f: f.write( @@ -90,18 +93,7 @@ def print_details_info(timestamps, model_config, args, whole_end2end): ) -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model", default="toy", help="the size of model") - parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size") - parser.add_argument("-s", "--seq_len", type=int, default=8, help="sequence length") - parser.add_argument("--new_length", type=int, default=4, help="new tokens length") - parser.add_argument("--mb_size", type=int, default=1, help="micro_batch_size") - parser.add_argument("--pp_size", type=int, default=2, help="pipeline size") - parser.add_argument("--log_path", type=str, default="./log", help="where to store the benchmark log") - parser.add_argument("--dtype", type=str, default="fp16", help="data type") - args = parser.parse_args() - +def benchmark_inference(args): if args.model == "toy": model = transformers.LlamaForCausalLM(transformers.LlamaConfig(num_hidden_layers=8)) elif args.model == "7b": @@ -111,24 +103,50 @@ def print_details_info(timestamps, model_config, args, whole_end2end): else: raise NotImplementedError - engine = PPInferEngine( + engine = CaiInferEngine( pp_size=args.pp_size, + tp_size=args.tp_size, dtype=args.dtype, micro_batch_size=args.mb_size, - new_length=args.new_length, model=model, model_policy=LlamaModelInferPolicy(), verbose=True, max_batch_size=args.mb_size, max_input_len=args.seq_len, - max_output_len=args.seq_len + args.new_length + 256, + max_output_len=args.output_len, ) data = data_gen(args.batch_size, args.seq_len) torch.cuda.synchronize() whole_end2end = time.time() - output, timestamps = engine.inference([data]) + output, timestamps = engine.inference(data) torch.cuda.synchronize() whole_end2end = time.time() - whole_end2end print_details_info(timestamps, model.config, args, whole_end2end) + + +def hybrid_inference(rank, world_size, port, args): + colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") + benchmark_inference(args) + + +@rerun_if_address_is_in_use() +@clear_cache_before_run() +def benchmark(args): + spawn(hybrid_inference, nprocs=args.tp_size * args.pp_size, args=args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="toy", help="the size of model") + parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size") + parser.add_argument("-s", "--seq_len", type=int, default=8, help="sequence length") + parser.add_argument("--mb_size", type=int, default=1, help="micro_batch_size") + parser.add_argument("--pp_size", type=int, default=2, help="pipeline size") + parser.add_argument("--tp_size", type=int, default=2, help="pipeline size") + 
parser.add_argument("--output_len", type=int, default=16, help="Output length") + parser.add_argument("--log_path", type=str, default="./log", help="where to store the benchmark log") + parser.add_argument("--dtype", type=str, default="fp16", help="data type") + args = parser.parse_args() + benchmark(args) diff --git a/examples/inference/gptq_bloom.py b/examples/inference/gptq_bloom.py deleted file mode 100644 index a6e07b98cf04..000000000000 --- a/examples/inference/gptq_bloom.py +++ /dev/null @@ -1,105 +0,0 @@ -import argparse -import os -import time - -import torch -from _utils import print_perf_stats -from auto_gptq import AutoGPTQForCausalLM -from transformers import BloomTokenizerFast - -import colossalai -from colossalai.inference.tensor_parallel.engine import TPInferEngine -from colossalai.logging import disable_existing_loggers -from colossalai.shardformer import ShardConfig -from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn - -os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" - - -def bench_bloom(args): - pretrained_model_dir = args.path - quantized_model_dir = args.quantized_path - max_batch_size = args.batch_size - max_input_len = args.input_len - max_output_len = args.output_len - - tokenizer = BloomTokenizerFast.from_pretrained(pretrained_model_dir) - tokenizer.pad_token = tokenizer.eos_token - - # load quantized model to the first GPU - model = AutoGPTQForCausalLM.from_quantized( - quantized_model_dir, device=torch.cuda.current_device(), inject_fused_attention=False - ) - - model = model.half() - - model_config = model.config - shard_config = ShardConfig( - enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True} - ) - infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len) - generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False) - - input_tokens = { - "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"), - "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"), - } - - # init TPInferEngine and shard the original model - # To benchmark torch original, comment out the line of optimizing model - shard_config = ShardConfig( - enable_tensor_parallelism=True if args.tp_size > 1 else False, - extra_kwargs={"inference_only": True, "quant": "gptq"}, - ) - infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len) - - # prepare data for generation - generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False) - input_tokens = { - "input_ids": torch.randint(10, 1000, (max_batch_size, max_input_len)), - "attention_mask": torch.ones((max_batch_size, max_input_len)), - } - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) - # print(f" input_tokens[{t}].shape: {input_tokens[t].shape}") - - iters = 10 - times = [] - for i in range(iters): - torch.cuda.synchronize() - start = time.time() - outputs = infer_engine.generate(input_tokens, **generate_kwargs) - torch.cuda.synchronize() - end = time.time() - out_len = outputs.shape[1] - print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s") - times.append((end - start) / (out_len - max_input_len)) - - print_perf_stats(times, model_config, max_batch_size) - - -def check_bloom(rank, world_size, port, args): - disable_existing_loggers() - colossalai.launch(config={}, rank=rank, world_size=world_size, 
host="localhost", port=port, backend="nccl") - bench_bloom(args) - - -@rerun_if_address_is_in_use() -@clear_cache_before_run() -def test_bloom(args): - spawn(check_bloom, args.tp_size, args=args) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--path", type=str, help="Model path", required=True) - parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True) - parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size") - parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size") - parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length") - parser.add_argument("--output_len", type=int, default=128, help="Maximum output length") - - args = parser.parse_args() - - test_bloom(args) diff --git a/examples/inference/gptq_llama.py b/examples/inference/gptq_llama.py deleted file mode 100644 index 61da7ca24f0c..000000000000 --- a/examples/inference/gptq_llama.py +++ /dev/null @@ -1,87 +0,0 @@ -import argparse -import os -import time - -import torch -from _utils import print_perf_stats -from auto_gptq import AutoGPTQForCausalLM -from transformers import LlamaTokenizer - -import colossalai -from colossalai.inference.tensor_parallel.engine import TPInferEngine -from colossalai.logging import disable_existing_loggers -from colossalai.shardformer import ShardConfig -from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn - -os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" - - -def run_llama_test(args): - pretrained_model_dir = args.path - quantized_model_dir = args.quantized_path - max_batch_size = args.batch_size - max_input_len = args.input_len - max_output_len = args.output_len - - tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) - tokenizer.pad_token_id = tokenizer.eos_token_id - - # load quantized model to the first GPU - model = AutoGPTQForCausalLM.from_quantized( - quantized_model_dir, device=torch.cuda.current_device(), inject_fused_attention=False - ) - - model_config = model.config - shard_config = ShardConfig( - enable_tensor_parallelism=True if args.tp_size > 1 else False, - extra_kwargs={"inference_only": True, "quant": "gptq"}, - ) - infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len) - - generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False) - - input_tokens = { - "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"), - "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"), - } - - iters = 10 - times = [] - - for i in range(iters): - torch.cuda.synchronize() - start = time.time() - outputs = infer_engine.generate(input_tokens, **generate_kwargs) - torch.cuda.synchronize() - end = time.time() - out_len = outputs.shape[1] - print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s") - times.append((end - start) / (out_len - max_input_len)) - - print_perf_stats(times, model_config, max_batch_size) - - -def check_llama(rank, world_size, port, args): - disable_existing_loggers() - colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") - run_llama_test(args) - - -@rerun_if_address_is_in_use() -@clear_cache_before_run() -def test_llama(args): - spawn(check_llama, args.tp_size, args=args) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - 
parser.add_argument("-p", "--path", type=str, help="Model path", required=True) - parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True) - parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size") - parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size") - parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length") - parser.add_argument("--output_len", type=int, default=128, help="Maximum output length") - - args = parser.parse_args() - - test_llama(args) diff --git a/examples/inference/hybrid_llama.py b/examples/inference/hybrid_llama.py new file mode 100644 index 000000000000..29262080fca9 --- /dev/null +++ b/examples/inference/hybrid_llama.py @@ -0,0 +1,96 @@ +import argparse +import time + +import pytest +import torch +import torch.distributed as dist +import transformers +from transformers import LlamaForCausalLM, LlamaTokenizer + +import colossalai +from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy +from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn + + +def pipeline_inference_test(args): + llama_model_path = args.path + max_input_len = args.max_input_len + max_output_len = args.max_output_len + max_batch_size = args.batch_size + micro_batch_size = args.micro_batch_size + tp_size = args.tp_size + pp_size = args.pp_size + rank = dist.get_rank() + + tokenizer = LlamaTokenizer.from_pretrained(llama_model_path) + tokenizer.pad_token_id = tokenizer.unk_token_id + model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id) + model = model.half() + + model = transformers.LlamaForCausalLM( + transformers.LlamaConfig( + vocab_size=20000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=4 + ) + ) + + engine = CaiInferEngine( + tp_size=tp_size, + pp_size=pp_size, + model=model, + model_policy=LlamaModelInferPolicy(), + max_output_len=max_output_len, + micro_batch_size=micro_batch_size, + ) + + input_tokens = { + "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"), + "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"), + } + + iters = 10 + warmup = 3 + times = [] + + for i in range(iters): + torch.cuda.synchronize() + start = time.time() + outputs = engine.inference(input_tokens) + torch.cuda.synchronize() + end = time.time() + if rank == 0: + out_len = len(outputs[0]) + print("generation time {} s".format(str(end - start))) + print(out_len) + times.append((end - start) / out_len) + if rank == 0: + times = times[warmup:] + latency = sum(times) / len(times) + print("total process latency is : " + str(latency) + " s") + print("total throughput is : " + str(1 / latency * max_batch_size)) + + +def check_tp_pipeline_inference(rank, world_size, port, args): + colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") + run_tp_pipeline_inference_test(args) + + +@pytest.mark.dist +@rerun_if_address_is_in_use() +@clear_cache_before_run() +def test_inference(args): + spawn(check_tp_pipeline_inference, nprocs=args.tp_size * args.pp_size, args=args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--path", type=str, help="Model path", required=True) + parser.add_argument("-tp", "--tp_size", type=int, default=2, help="Tensor parallel size") + parser.add_argument("-pp", "--pp_size", type=int, default=2, 
help="Tensor parallel size") + parser.add_argument("-b", "--batch_size", type=int, default=8, help="Maximum batch size") + parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length") + parser.add_argument("--max_output_len", type=int, default=16, help="Maximum output length") + parser.add_argument("--micro_batch_size", type=int, default=2, help="Micro batch size") + + args = parser.parse_args() + + test_inference(args) diff --git a/examples/inference/run.sh b/examples/inference/run.sh index e3c33bb88db1..cb37e6586e51 100644 --- a/examples/inference/run.sh +++ b/examples/inference/run.sh @@ -1,50 +1,55 @@ script_dir=$(cd "$(dirname "$0")" && pwd) cd "${script_dir}" + # 7b, fp16, 2 gpu, 1024, 128 for BATCH_SIZE in 2 4 8 16; do - CUDA_VISIBLE_DEVICES=0,1 colossalai run --nproc_per_node 2 --master_port 29800 ./benchmark.py \ + CUDA_VISIBLE_DEVICES=0,1,2,3 python ./benchmark.py \ --model="7b" \ --dtype="fp16" \ --batch_size=${BATCH_SIZE} \ --seq_len=1024 \ --new_length=128 \ --mb_size=$((${BATCH_SIZE}/2)) \ - --pp_size=2 + --pp_size=2 \ + --tp_size=2 done # 7b, fp16, 2 gpu, 512, 512 for BATCH_SIZE in 2 4 8 16 32; do - CUDA_VISIBLE_DEVICES=0,1 colossalai run --nproc_per_node 2 --master_port 29800 ./benchmark.py \ + CUDA_VISIBLE_DEVICES=0,1,2,3 python ./benchmark.py \ --model="7b" \ --dtype="fp16" \ --batch_size=${BATCH_SIZE} \ --seq_len=512 \ --new_length=512 \ --mb_size=$((${BATCH_SIZE}/2)) \ - --pp_size=2 + --pp_size=2 \ + --tp_size=2 done # 7b, fp16, 2 gpu, 1024, 128 for BATCH_SIZE in 2 4 8; do - CUDA_VISIBLE_DEVICES=0,1 colossalai run --nproc_per_node 2 --master_port 29800 ./benchmark.py \ + CUDA_VISIBLE_DEVICES=0,1,2,3 python ./benchmark.py \ --model="13b" \ --dtype="fp16" \ --batch_size=${BATCH_SIZE} \ --seq_len=1024 \ --new_length=128 \ --mb_size=$((${BATCH_SIZE}/2)) \ - --pp_size=2 + --pp_size=2 \ + --tp_size=2 done # 13b, fp16, 2 gpu, 512, 512 for BATCH_SIZE in 2 4 8 16; do - CUDA_VISIBLE_DEVICES=0,1 colossalai run --nproc_per_node 2 --master_port 29800 ./benchmark.py \ + CUDA_VISIBLE_DEVICES0,1,2,3 python ./benchmark.py \ --model="13b" \ --dtype="fp16" \ --batch_size=${BATCH_SIZE} \ --seq_len=512 \ --new_length=512 \ --mb_size=$((${BATCH_SIZE}/2)) \ - --pp_size=2 + --pp_size=2 \ + --tp_size=2 done