CLI example + Runpod launch script (#548)

Commit 1716748, 1 parent 268360e
Showing 2 changed files with 139 additions and 0 deletions.
examples/cli.py (new file, 54 lines)
import argparse
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer


def main():
    parser = argparse.ArgumentParser(description="CLI for model quantization and saving")
    parser.add_argument("--hf_model_path", type=str, required=True, help="Path to the Hugging Face model")
    parser.add_argument("--quant_name", type=str, required=True, help="Name of the quantized model")
    parser.add_argument("--local_save_path", type=str, required=True, help="Path to save the quantized model")

    # Quantization config arguments
    parser.add_argument("--zero_point", action="store_true", help="Enable zero point for quantization")
    parser.add_argument("--no-zero_point", action="store_false", dest="zero_point", help="Disable zero point for quantization")
    parser.add_argument("--q_group_size", type=int, default=128, help="Quantization group size")
    parser.add_argument("--w_bit", type=int, default=4, help="Weight bit width")
    parser.add_argument("--version", type=str, default="GEMM", help="Quantization version")

    # Model config arguments
    parser.add_argument("--low_cpu_mem_usage", action="store_true", help="Use low CPU memory")
    parser.add_argument("--no-low_cpu_mem_usage", action="store_false", dest="low_cpu_mem_usage", help="Don't use low CPU memory")
    parser.add_argument("--use_cache", action="store_true", help="Use cache")
    parser.add_argument("--no-use_cache", action="store_false", dest="use_cache", help="Don't use cache")

    parser.set_defaults(zero_point=True, low_cpu_mem_usage=True, use_cache=False)

    args = parser.parse_args()

    quant_config = {
        "zero_point": args.zero_point,
        "q_group_size": args.q_group_size,
        "w_bit": args.w_bit,
        "version": args.version,
    }

    model_config = {
        "low_cpu_mem_usage": args.low_cpu_mem_usage,
        "use_cache": args.use_cache,
    }

    print(f"Loading model from: {args.hf_model_path}")
    model = AutoAWQForCausalLM.from_pretrained(args.hf_model_path, **model_config)
    tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path, trust_remote_code=True)

    print(f"Quantizing model with config: {quant_config}")
    model.quantize(tokenizer, quant_config=quant_config)

    print(f"Saving quantized model to: {args.local_save_path}")
    model.save_quantized(args.local_save_path)
    tokenizer.save_pretrained(args.local_save_path)

    print(f"Quantized model '{args.quant_name}' saved successfully.")


if __name__ == "__main__":
    main()
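For reference, this is roughly how the launch script below ends up invoking the CLI; the model path and output directory here mirror that script's example values rather than anything baked into cli.py itself:

python examples/cli.py \
    --hf_model_path Qwen/Qwen2-0.5B-Instruct \
    --quant_name qwen2-0.5b-instruct-awq \
    --local_save_path /workspace/qwen2-0.5b-instruct-awq \
    --zero_point --q_group_size 128 --w_bit 4 --version GEMM \
    --low_cpu_mem_usage --no-use_cache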
Runpod launch script (new file, 85 lines)
import os
import time
import runpod

# Load environment variables
HF_TOKEN = os.environ.get('HF_TOKEN')
runpod.api_key = os.environ.get('RUNPOD_API_KEY')

# RunPod Parameters
# get more by running print(runpod.get_gpus())
template_name = f"AutoAWQ Pod {int(time.time())}"
docker_image = "runpod/pytorch:2.2.0-py3.10-cuda12.1.1-devel-ubuntu22.04"
gpu_ids = {
    "MI300X": "AMD Instinct MI300X OAM",  # 192 GB, $3.99/h
    "H100": "NVIDIA H100 80GB HBM3",      # 80 GB, $3.99/h
    "A100": "NVIDIA A100-SXM4-80GB",      # 80 GB, $1.94/h
    "A6000": "NVIDIA RTX A6000",          # 48 GB, $0.76/h
    "4090": "NVIDIA GeForce RTX 4090",    # 24 GB, $0.69/h
}
env_variables = {
    "HF_TOKEN": HF_TOKEN,
}
gpu_id = gpu_ids["4090"]
num_gpus = 1
system_memory_gb = 100
system_storage_gb = 20  # fp16 model is downloaded here
volume_storage_gb = 20  # quantized model is saved here

# Quantization Parameters
hf_model_path = "Qwen/Qwen2-0.5B-Instruct"
quant_name = "qwen2-0.5b-instruct-awq"
local_save_path = f"/workspace/{quant_name}"
hf_upload_path = f"casperhansen/{quant_name}"

cli_args = dict(
    hf_model_path=hf_model_path,
    quant_name=quant_name,
    local_save_path=local_save_path,
    zero_point=True,
    q_group_size=128,
    w_bit=4,
    version="GEMM",
    low_cpu_mem_usage=True,
    use_cache=False,
)
# True booleans become bare --flag switches, False booleans their --no-* variants,
# and everything else becomes "--flag value".
cli_args = " ".join(
    f"--{k}" if v is True else f"--no-{k}" if v is False else f"--{k} {v}"
    for k, v in cli_args.items()
)
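# With the values above, this renders to:
#   --hf_model_path Qwen/Qwen2-0.5B-Instruct --quant_name qwen2-0.5b-instruct-awq
#   --local_save_path /workspace/qwen2-0.5b-instruct-awq --zero_point
#   --q_group_size 128 --w_bit 4 --version GEMM --low_cpu_mem_usage --no-use_cache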

docker_command = (
    "bash -c '" +
    "cd /workspace && " +
    "git clone https://github.com/casper-hansen/AutoAWQ.git && " +
    "cd AutoAWQ && " +
    "pip install -e . && " +
    "huggingface-cli login --token $HF_TOKEN && " +
    f"python examples/cli.py {cli_args} && " +
    f"huggingface-cli upload {hf_upload_path} {local_save_path} ./ && " +
    "runpodctl stop pod $RUNPOD_POD_ID" +
    "'"
)

template = runpod.create_template(
    name=template_name,
    image_name=docker_image,
    docker_start_cmd=docker_command,
    container_disk_in_gb=system_storage_gb,
    volume_in_gb=volume_storage_gb,
    volume_mount_path="/workspace",
    ports="8888/http,22/tcp",
)

pod = runpod.create_pod(
    name=template_name,
    image_name=docker_image,
    template_id=template["id"],
    gpu_type_id=gpu_id,
    gpu_count=num_gpus,
    min_memory_in_gb=system_memory_gb,
    volume_in_gb=volume_storage_gb,
    container_disk_in_gb=system_storage_gb,
    env=env_variables,
    volume_mount_path="/workspace",
    cloud_type="SECURE",
)

print(pod)
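The pod stops itself via the runpodctl call at the end of docker_command. A minimal sketch of one way to wait for that from the launching machine, assuming the SDK exposes runpod.get_pod and a desiredStatus field in its response (both are assumptions about the runpod-python API, not taken from this diff):

import time
import runpod

def wait_until_stopped(pod_id: str, poll_seconds: int = 60) -> None:
    # Poll until the in-container "runpodctl stop pod" call shuts the pod down.
    while True:
        status = runpod.get_pod(pod_id).get("desiredStatus")  # assumed field name
        print(f"pod {pod_id}: {status}")
        if status != "RUNNING":
            break
        time.sleep(poll_seconds)

# wait_until_stopped(pod["id"])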