第三章：云 GPU 市场——租用策略与成本优化

GPU 是 AI 时代最稀缺的资源。2023 年 H100 的等待期长达 6-9 个月，很多 AI 创业公司因为拿不到 GPU 而倒闭。本章讲如何在 GPU 稀缺的时代，用最低成本获取 AI 推理算力。

一、主流云 GPU 价格对比（2024年）

AWS（EC2 GPU 实例）

# AWS GPU instance types (2024 prices; figures are presumably USD per hour)
def _aws_instance(gpu, on_demand, spot, best_for):
    """Bundle one instance type's GPU, on-demand price, spot price and use case."""
    return {
        "gpu": gpu,
        "on_demand": on_demand,
        "spot": spot,
        "best_for": best_for,
    }


aws_instances = {
    # Entry level (development / testing); spot is ~70% cheaper
    "g4dn.xlarge": _aws_instance(
        "T4 (16GB)", 0.526, 0.16, "小型模型（7B INT4）、开发调试"
    ),
    # Mainstream inference; spot is ~70% cheaper
    "g5.xlarge": _aws_instance(
        "A10G (24GB)", 1.006, 0.30, "7B FP16、13B INT8"
    ),
    "p3.2xlarge": _aws_instance(
        "V100 (16GB)", 3.06, 0.92, "遗留系统，不推荐新项目"
    ),
    # High-performance inference; spot is ~70% cheaper
    "p4d.24xlarge": _aws_instance(
        "A100 40GB × 8", 32.77, 10.0, "70B+ 模型，高吞吐量服务"
    ),
    "p5.48xlarge": _aws_instance(
        "H100 80GB × 8", 98.32, 30.0, "最高性能推理，顶级训练"
    ),
}

GCP（Vertex AI / Compute Engine）

# GCP price table, keyed by machine type (figures are presumably USD per hour).
gcp_instances = {}

gcp_instances["n1-standard-4 + T4"] = {
    "on_demand": 0.35,
    # "preemptible" is what GCP calls its Spot-style instances
    "preemptible": 0.11,
}

gcp_instances["a2-highgpu-1g (A100 40GB × 1)"] = {
    "on_demand": 3.67,
    "preemptible": 1.10,
}

gcp_instances["a3-highgpu-8g (H100 80GB × 8)"] = {
    "on_demand": 49.18,
    "preemptible": 14.75,
}

# GCP-only option: TPU (Google's in-house AI accelerator).
# This entry carries no "preemptible" price.
gcp_instances["tpu-v4-8"] = {
    "on_demand": 12.88,
    "advantage": "训练 transformer 更高效，推理同等成本下 throughput 更高",
}

专业 GPU 云（更便宜、更灵活）

# Specialised GPU clouds. Numeric entries are per-GPU prices (presumably USD
# per hour); the string entries summarise strengths, weaknesses and audience.
specialized_providers = {}

specialized_providers["CoreWeave"] = {
    "A100 80GB": 2.21,  # vs. AWS $4.10 (about 46% cheaper)
    "H100 80GB": 4.25,  # vs. AWS $12.3 (about 65% cheaper)
    "strength": "最接近 AWS 的可靠性，价格更低",
    "weakness": "需要合同，最低消费要求",
    "target": "有稳定需求的 AI 公司",
}

specialized_providers["Lambda Labs"] = {
    "A100 80GB": 1.99,
    "H100 80GB": 3.29,
    "strength": "开发者友好，按小时付费无承诺",
    "weakness": "有时 H100 售罄",
    "target": "AI 研究者、初创公司",
}

specialized_providers["RunPod"] = {
    "RTX 4090": 0.44,
    "A100 80GB": 1.64,
    "H100 SXM": 2.49,
    "strength": "价格最低，社区 GPU 资源",
    "weakness": "稳定性不如企业级",
    "target": "个人开发者、成本极度敏感的场景",
}

# Together.ai bills per token (hosted inference) rather than per GPU hour.
specialized_providers["Together.ai"] = {
    "model_type": "按 Token 计费（托管推理）",
    # $0.9 per 1M input tokens; the stored figure is presumably USD per
    # 1K tokens -- TODO confirm the unit.
    "llama3_70b_price": 0.0009,
    "strength": "零运维，直接用 API",
    "target": "不想管基础设施的 AI 应用开发者",
}

二、Spot Instance 策略——节省 60-90%

Spot Instance 的原理

正常（On-Demand）：随时可用，随时停止，固定价格
Spot（竞价实例）：
  - 利用云厂商的闲置算力
  - 价格 = 当前市场竞价（随需求波动）
  - 当有更高出价者或容量不足时，你的实例可能被"抢占"
  - 通常提前 2 分钟通知（AWS）

价格波动：
  A100 80GB On-Demand: $4.10/小时
  A100 80GB Spot:       $1.10-2.50/小时（随时段变化）

Spot 适用场景判断

def is_spot_suitable(workload_type: str) -> dict:
    """Look up whether a workload type is a good fit for Spot instances.

    Args:
        workload_type: One of "batch_inference", "model_training",
            "api_service_low_latency" or "stateful_service".

    Returns:
        The entry for that workload ("suitable", "reason", "example", plus
        "solution" for the conditional case), or an empty dict when the
        workload type is not listed.
    """
    # Good fit: the work survives an interruption
    batch_entry = {
        "suitable": True,
        "reason": "批量处理不在乎偶尔中断，可以断点续传",
        "example": "每晚处理 100 万条数据的分析任务",
    }
    training_entry = {
        "suitable": True,
        "reason": "训练有 checkpoint，被中断后可以恢复",
        "example": "微调 Llama-3-8B，每 1000 步保存一次",
    }

    # Workable, but only with extra design around interruptions
    low_latency_entry = {
        "suitable": "Conditionally",
        "reason": "实例被终止 = 服务中断",
        "solution": "多实例 + 健康检查 + 自动替换 + 负载均衡",
        "example": "在 K8s 上混合 On-Demand（20%）+ Spot（80%）",
    }

    # Poor fit: in-memory state is lost on interruption
    stateful_entry = {
        "suitable": False,
        "reason": "中断会丢失内存中的状态",
        "example": "有状态的 session 管理",
    }

    catalog = {
        "batch_inference": batch_entry,
        "model_training": training_entry,
        "api_service_low_latency": low_latency_entry,
        "stateful_service": stateful_entry,
    }

    if workload_type in catalog:
        return catalog[workload_type]
    return {}

Spot 实例中断处理

# AWS Spot 中断处理（Python）
import boto3
import requests
import threading
import time

def _imds_token_headers(metadata_base):
    """Return IMDSv2 auth headers, or {} when no session token can be obtained."""
    try:
        token_resp = requests.put(
            f"{metadata_base}/api/token",
            headers={"X-aws-ec2-metadata-token-ttl-seconds": "60"},
            timeout=1,
        )
    except requests.exceptions.RequestException:
        # Token endpoint unreachable (e.g. blocked by the hop limit inside a
        # container): fall back to a token-less, IMDSv1-style request.
        return {}
    if not token_resp.ok:
        return {}
    return {"X-aws-ec2-metadata-token": token_resp.text}


def check_spot_termination():
    """Poll EC2 instance metadata for a Spot interruption notice and react to it.

    Every 5 seconds the loop asks the instance metadata service for
    ``spot/termination-time``. HTTP 200 means an interruption is scheduled
    (AWS gives roughly two minutes of warning); the function then calls
    ``save_checkpoint()`` and ``deregister_from_load_balancer()`` once and
    returns. Any other status, or a failed metadata request, simply leads to
    the next poll.

    The function blocks until a notice arrives, so it is meant to run on a
    background thread. Errors raised by the two handlers are not caught here.
    """
    metadata_base = "http://169.254.169.254/latest"

    while True:
        # Instances that enforce IMDSv2 answer 401 to token-less requests,
        # which would otherwise look like "no notice" forever.
        headers = _imds_token_headers(metadata_base)
        try:
            resp = requests.get(
                f"{metadata_base}/meta-data/spot/termination-time",
                headers=headers,
                timeout=1,
            )
        except requests.exceptions.RequestException:
            # Covers connection errors and read timeouts alike, so a slow
            # metadata response cannot kill the monitor thread.
            resp = None

        if resp is not None and resp.status_code == 200:
            print(f"⚠️  Spot 中断通知！终止时间: {resp.text}")
            # Persist progress first, then take this node out of rotation.
            save_checkpoint()
            deregister_from_load_balancer()
            break

        time.sleep(5)  # poll interval

# Run the Spot-interruption monitor on a background thread.
# daemon=True keeps this thread from blocking interpreter exit.
monitor_thread = threading.Thread(target=check_spot_termination, daemon=True)
monitor_thread.start()

三、自建 GPU 集群 vs 租用——真实成本分析

# 24 个月的成本比较（运行 Llama-3-70B 推理服务，需要 2 × A100 80GB）

# Option 1: AWS On-Demand
aws_on_demand_24m = {
    # One month (24 h x 30 days) of a whole p4d.24xlarge. $32.77/h is already
    # the price of the full 8-GPU instance, so it must not be multiplied by 8.
    "p4d.24xlarge (8xA100)": 32.77 * 24 * 30,
    # One month pro-rated to the 2 GPUs actually needed (2 of 8 -> price / 4).
    # This is an approximation.
    "实际：只需 2 A100": 32.77 / 4 * 24 * 30,
    # 24-month total for the pro-rated 2-GPU share
    "total_24m": 32.77 / 4 * 24 * 30 * 24,          # ~$141,566
}

# Option 2: AWS Spot
aws_spot_24m = {
    "avg_spot_price": 10.0,  # average hourly spot price of a whole p4d.24xlarge
    # 2 of 8 GPUs -> price / 4; then 24 h x 30 days x 24 months
    "total_24m": 10.0 / 4 * 24 * 30 * 24,  # $43,200
    "risk": "中断风险，需要故障转移设计"
}

# Option 3: rent from Lambda Labs (2 x A100 80GB)
lambda_24m = {
    "price_per_hour": 1.99 * 2,  # two GPUs at $1.99/h each
    # 24 h x 30 days is one month (~$2,866); x 24 months gives the total
    "total_24m": 1.99 * 2 * 24 * 30 * 24,  # ~$68,774
}

# Option 4: buy 2 x RTX 4090 and self-host
self_build = dict(
    rtx_4090_x2=2 * 1600,  # GPU hardware: $3,200
    server_chassis=800,
    psu_cooling=500,
    # About $120/month of electricity. Presumably $0.12 per kWh and 700 W per
    # card, running 24 h x 30 days -- TODO confirm the assumptions.
    monthly_power=0.12 * 700 * 2 * 24 * 30 / 1000,
    total_hardware=4500,  # GPUs + chassis + PSU/cooling
    total_24m=24 * 120 + 4500,  # hardware plus 24 months of power: $7,380
    limitation="只能运行 34B INT4（不是 70B FP16）",
)

结论：

开发/测试：自建 RTX 4090 × 2（最便宜）
低流量生产（< 10 RPS）：RunPod 或 Lambda Labs（按需，灵活）
中等流量生产（10-100 RPS）：CoreWeave 或 AWS Spot + 故障转移
高流量生产（> 100 RPS）：AWS/GCP 长期预留实例（1-3 年，可以省 40%）

四、新兴 GPU 市场玩家

Vast.ai（P2P GPU 市场）

# Vast.ai: individuals rent out their own GPUs, forming a peer-to-peer market
vast_ai_advantages = dict(
    price="通常比 RunPod 低 20-40%",
    gpu_variety="RTX 4090、3090、A6000 等多种选择",
    setup="标准 Docker 容器，支持自定义镜像",
    # Typical hourly price ranges per GPU model
    typical_price={
        "RTX 4090": "$0.20-0.35/小时",
        "A100 80GB": "$1.20-1.80/小时",
    },
)

国内云 GPU（阿里云 / 腾讯云 / 百度智能云）

# GPU offerings from mainland-China clouds. Prices are display strings in CNY
# per hour; the Alibaba entries also carry an approximate USD figure.
cn_cloud_gpu = {}

cn_cloud_gpu["alibaba_cloud"] = {
    "ecs.gn7i-c8g1.2xlarge (A10)": "¥5.63/小时（~$0.78）",
    "ecs.gn7e-c16g1.4xlarge (A100 40GB)": "¥19.36/小时（~$2.68）",
    "advantage": "对中国大陆用户延迟低，合规性好",
}

cn_cloud_gpu["baidu_cloud"] = {
    "P4_VM (V100)": "¥12.40/小时",
    "advantage": "文心 API + 飞桨生态",
}

五、实战：最低成本搭建 LLM API 服务

# Goal: serve a Llama-3-8B inference API for under $200/month
# Plan: RunPod RTX 4090 + vLLM

# 1. Launch an instance on RunPod
# Choose: RTX 4090 (24GB), $0.44/hour
# System disk: 50GB, PyTorch image

# 2. Install vLLM
pip install vllm

# 3. Start the inference server (OpenAI-compatible API)
# NOTE(review): --host 0.0.0.0 listens on all interfaces and no API key is
# configured here -- confirm port 8000 is not publicly reachable without
# access control.
python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --host 0.0.0.0 \
    --port 8000 \
    --max-model-len 4096 \
    --dtype half \
    --gpu-memory-utilization 0.9

# 4. Smoke test (replace YOUR_RUNPOD_IP with the instance address)
curl http://YOUR_RUNPOD_IP:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 100
  }'

成本计算：

运行时间：每天 10 小时 × 30 天 = 300 小时
费用：300 × $0.44 = $132/月
可以处理约 1,000-3,000 请求/小时（取决于 prompt 长度）

关键认知

云 GPU 的三个层次，按成本从高到低：

AWS/GCP On-Demand（最贵，最可靠，适合生产环境）
CoreWeave/Lambda Labs（中等，企业级可靠性，性价比高）
RunPod/Vast.ai（最便宜，适合开发测试和成本极度敏感的场景）

省钱的最大杠杆不是选哪家云，而是选哪个模型：

Llama-3-70B vs Llama-3-8B：性能差距约 20%，成本差距约 8x
如果 8B 够用，没有理由用 70B

“在 AI 推理的商业模式里，‘省钱’不是吝啬，而是决定你能不能盈利的工程能力。”