CLI example + Runpod launch script (#548)
casper-hansen committed Jul 22, 2024
1 parent 268360e commit 1716748
Showing 2 changed files with 139 additions and 0 deletions.
54 changes: 54 additions & 0 deletions examples/cli.py
@@ -0,0 +1,54 @@
import argparse
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

def main():
    parser = argparse.ArgumentParser(description="CLI for model quantization and saving")
    parser.add_argument("--hf_model_path", type=str, required=True, help="Path to the Hugging Face model")
    parser.add_argument("--quant_name", type=str, required=True, help="Name of the quantized model")
    parser.add_argument("--local_save_path", type=str, required=True, help="Path to save the quantized model")

    # Quantization config arguments
    parser.add_argument("--zero_point", action="store_true", help="Enable zero point for quantization")
    parser.add_argument("--no-zero_point", action="store_false", dest="zero_point", help="Disable zero point for quantization")
    parser.add_argument("--q_group_size", type=int, default=128, help="Quantization group size")
    parser.add_argument("--w_bit", type=int, default=4, help="Weight bit width")
    parser.add_argument("--version", type=str, default="GEMM", help="Quantization version")

    # Model config arguments
    parser.add_argument("--low_cpu_mem_usage", action="store_true", help="Use low CPU memory")
    parser.add_argument("--no-low_cpu_mem_usage", action="store_false", dest="low_cpu_mem_usage", help="Don't use low CPU memory")
    parser.add_argument("--use_cache", action="store_true", help="Use cache")
    parser.add_argument("--no-use_cache", action="store_false", dest="use_cache", help="Don't use cache")

    parser.set_defaults(zero_point=True, low_cpu_mem_usage=True, use_cache=False)

    args = parser.parse_args()

    quant_config = {
        "zero_point": args.zero_point,
        "q_group_size": args.q_group_size,
        "w_bit": args.w_bit,
        "version": args.version,
    }

    model_config = {
        "low_cpu_mem_usage": args.low_cpu_mem_usage,
        "use_cache": args.use_cache,
    }

    print(f"Loading model from: {args.hf_model_path}")
    model = AutoAWQForCausalLM.from_pretrained(args.hf_model_path, **model_config)
    tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path, trust_remote_code=True)

    print(f"Quantizing model with config: {quant_config}")
    model.quantize(tokenizer, quant_config=quant_config)

    print(f"Saving quantized model to: {args.local_save_path}")
    model.save_quantized(args.local_save_path)
    tokenizer.save_pretrained(args.local_save_path)

    print(f"Quantized model '{args.quant_name}' saved successfully.")

if __name__ == "__main__":
    main()
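For reference, a minimal invocation of the new CLI might look like the following; the model path, name, and output directory simply reuse the example values from the RunPod script below, so substitute your own:

    python examples/cli.py \
        --hf_model_path Qwen/Qwen2-0.5B-Instruct \
        --quant_name qwen2-0.5b-instruct-awq \
        --local_save_path ./qwen2-0.5b-instruct-awq \
        --q_group_size 128 \
        --w_bit 4 \
        --version GEMM

Since parser.set_defaults enables zero_point and low_cpu_mem_usage out of the box, those flags only need to be passed in their --no-* form when you want to turn them off.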
85 changes: 85 additions & 0 deletions scripts/runpod_quantize.py
@@ -0,0 +1,85 @@
import os
import time
import runpod

# Load environment variables
HF_TOKEN = os.environ.get('HF_TOKEN')
runpod.api_key = os.environ.get('RUNPOD_API_KEY')

# RunPod Parameters
# get more by running print(runpod.get_gpus())
template_name = f"AutoAWQ Pod {int(time.time())}"
docker_image = "runpod/pytorch:2.2.0-py3.10-cuda12.1.1-devel-ubuntu22.04"
gpu_ids = {
    "MI300X": "AMD Instinct MI300X OAM",  # 192 GB, $3.99/h
    "H100": "NVIDIA H100 80GB HBM3",      # 80 GB, $3.99/h
    "A100": "NVIDIA A100-SXM4-80GB",      # 80 GB, $1.94/h
    "A6000": "NVIDIA RTX A6000",          # 48 GB, $0.76/h
    "4090": "NVIDIA GeForce RTX 4090",    # 24 GB, $0.69/h
}
env_variables = {
    "HF_TOKEN": HF_TOKEN,
}
gpu_id = gpu_ids["4090"]
num_gpus = 1
system_memory_gb = 100
system_storage_gb = 20 # fp16 model is downloaded here
volume_storage_gb = 20 # quantized model is saved here

# Quantization Parameters
hf_model_path = "Qwen/Qwen2-0.5B-Instruct"
quant_name = "qwen2-0.5b-instruct-awq"
local_save_path = f"/workspace/{quant_name}"
hf_upload_path = f"casperhansen/{quant_name}"

cli_args = dict(
    hf_model_path=hf_model_path,
    quant_name=quant_name,
    local_save_path=local_save_path,
    zero_point=True,
    q_group_size=128,
    w_bit=4,
    version="GEMM",
    low_cpu_mem_usage=True,
    use_cache=False,
)
# Booleans map to the CLI's paired flags: True -> "--<name>", False -> "--no-<name>";
# all other values are passed as "--<name> <value>".
cli_args = " ".join(
    f"--{k}" if v is True else f"--no-{k}" if v is False else f"--{k} {v}"
    for k, v in cli_args.items()
)

docker_command = (
    "bash -c '" +
    "cd /workspace && " +
    "git clone https://github.com/casper-hansen/AutoAWQ.git && " +
    "cd AutoAWQ && " +
    "pip install -e . && " +
    "huggingface-cli login --token $HF_TOKEN && " +
    f"python examples/cli.py {cli_args} && " +
    f"huggingface-cli upload {hf_upload_path} {local_save_path} ./ && " +
    "runpodctl stop pod $RUNPOD_POD_ID" +
    "'"
)

template = runpod.create_template(
    name=template_name,
    image_name=docker_image,
    docker_start_cmd=docker_command,
    container_disk_in_gb=system_storage_gb,
    volume_in_gb=volume_storage_gb,
    volume_mount_path="/workspace",
    ports="8888/http,22/tcp",
)

pod = runpod.create_pod(
    name=template_name,
    image_name=docker_image,
    template_id=template["id"],
    gpu_type_id=gpu_id,
    gpu_count=num_gpus,
    min_memory_in_gb=system_memory_gb,
    volume_in_gb=volume_storage_gb,
    container_disk_in_gb=system_storage_gb,
    env=env_variables,
    volume_mount_path="/workspace",
    cloud_type="SECURE",
)

print(pod)
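The launch script reads HF_TOKEN and RUNPOD_API_KEY from the environment; assuming the RunPod SDK is installed from PyPI under the name runpod, a local run would look roughly like this (token values are placeholders):

    pip install runpod
    export HF_TOKEN=<your Hugging Face token>
    export RUNPOD_API_KEY=<your RunPod API key>
    python scripts/runpod_quantize.py

Once the pod starts, the Docker command above clones AutoAWQ, runs examples/cli.py with the assembled arguments, uploads the result to hf_upload_path, and finally stops the pod with runpodctl.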
