#!/usr/bin/env bash
# gpu-up — provision a vast.ai GPU instance, bind its Ollama to OpenCode on Host B
# via reverse SSH tunnel. Mirrors DL-0012 (Host A) topology — no cookies, no proxies.
#
# Usage: gpu-up [--model <ollama-tag>] [--gpu <name>] [--max-price <USD/h>] [--disk <GB>] [--image <docker-image>] [--dry-run]
# Defaults: --model qwen3.6:35b-a3b --gpu RTX_4090 --max-price 0.50 --disk 80 --image pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel
#
# Requires:
#   - vastai authenticated:  vastai set api-key <YOUR_KEY>
#   - /root/.ssh/vast_provisioning_ed25519 (created by bootstrap; see /opt/gpu-tools/README.md)
#   - vasttun user on Host B with permitlisten 127.0.0.1:11440

set -euo pipefail

# === config ===
# Tunables: a non-empty value inherited from the environment is kept,
# otherwise the fallback after ":=" is assigned.
: "${MODEL:=qwen3.6:35b-a3b}"
: "${GPU_FILTER:=RTX_4090}"
: "${MAX_PRICE:=0.50}"
: "${DISK_GB:=80}"
: "${HOST_B_PUBLIC_IP:=YOUR_HOST_B_PUBLIC_IP}"
: "${HOST_B_SSH_PORT:=2222}"
: "${TUNNEL_PORT:=11440}"
: "${PROVISIONING_KEY:=/root/.ssh/vast_provisioning_ed25519}"
: "${EXA_KEY:=YOUR_EXA_API_KEY}"
# Base image for the rented box. It ships an ssh client (needed by vast's
# /.launch). pytorch/pytorch is on the large side but known to work; ollama
# itself is installed later by the onstart script.
: "${IMAGE:=pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel}"

# Fixed values: these are never taken from the environment.
DRY_RUN=0
STATE_FILE="/var/lib/specker/vast-current.json"
OPENCODE_CONFIG="/home/opencode/.config/opencode/opencode.json"
OPENCODE_BACKUP="${OPENCODE_CONFIG}.host-a-backup"

# === args ===

# Print this script's leading comment block (everything after the shebang up
# to the first line that is not a comment) with the "# " prefix removed.
# Reading up to the first non-comment line keeps the help text correct even
# when header lines are added or removed.
print_usage() {
  awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
}

# Parse command-line flags into the config globals.
# Globals written: MODEL, GPU_FILTER, MAX_PRICE, DISK_GB, IMAGE, DRY_RUN
# Arguments:       the script's positional parameters
# Exits:           0 after printing help; 1 on an unknown flag or when a
#                  value-taking flag is the last argument
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model | --gpu | --max-price | --disk | --image)
        # Without this guard "set -u" aborts with an opaque
        # "unbound variable" when the value is missing.
        [[ $# -ge 2 ]] || { echo "[!] $1 requires a value" >&2; exit 1; }
        case "$1" in
          --model)     MODEL=$2 ;;
          --gpu)       GPU_FILTER=$2 ;;
          --max-price) MAX_PRICE=$2 ;;
          --disk)      DISK_GB=$2 ;;
          --image)     IMAGE=$2 ;;
        esac
        shift 2
        ;;
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      -h | --help)
        print_usage
        exit 0
        ;;
      *)
        echo "[!] unknown arg: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# === preflight ===
# Everything here runs before any money is spent, so every failure that can
# be detected locally is detected now.
PATH="/root/.local/bin:$PATH"
command -v vastai >/dev/null || { echo "[!] vastai CLI not found in PATH" >&2; exit 1; }
command -v jq     >/dev/null || { echo "[!] jq not found" >&2; exit 1; }
[[ -f $PROVISIONING_KEY     ]] || { echo "[!] $PROVISIONING_KEY missing" >&2; exit 1; }
[[ -f ${PROVISIONING_KEY}.pub ]] || { echo "[!] ${PROVISIONING_KEY}.pub missing" >&2; exit 1; }

# Checks that only matter for a real run; --dry-run stops after the offer
# search and needs none of this.
if [[ $DRY_RUN -ne 1 ]]; then
  # ss, curl and systemctl are used after the instance has been rented. A
  # missing ss in particular would make the tunnel wait loop spin for its
  # full 20 minutes while the instance is billed.
  for tool in ss curl systemctl; do
    command -v "$tool" >/dev/null || { echo "[!] $tool not found" >&2; exit 1; }
  done
  # The built-in default is a placeholder; a tunnel to it can never connect.
  if [[ $HOST_B_PUBLIC_IP == "YOUR_HOST_B_PUBLIC_IP" ]]; then
    echo "[!] HOST_B_PUBLIC_IP is still the placeholder; export the real Host B address" >&2
    exit 1
  fi
fi

# auth probe: an authenticated CLI returns a user object with an .id field
if ! vastai show user --raw 2>/dev/null | jq -e .id >/dev/null 2>&1; then
  echo "[!] vastai not authenticated. Run: vastai set api-key <YOUR_KEY>" >&2
  exit 1
fi

# existing instance? The state file is the marker for "an instance is up".
if [[ -f $STATE_FILE ]]; then
  echo "[!] instance already active (state file exists). Run gpu-down first." >&2
  echo "    state: $(cat "$STATE_FILE")" >&2
  exit 1
fi

# === search offers ===
# Ask vast.ai for matching offers sorted by hourly price and take the first.
QUERY="gpu_name=$GPU_FILTER disk_space>=$DISK_GB rentable=true verified=true dph_total<=$MAX_PRICE"
echo "[*] searching: $QUERY"

# A failing CLI call (network, API, bad query) is reported as such instead of
# being disguised as an empty result; vastai's own stderr is left visible.
if ! OFFERS=$(vastai search offers "$QUERY" -o "dph_total" --raw); then
  echo "[!] offer search failed (see vastai output above)" >&2
  exit 1
fi

# Only a JSON array is a valid offer list. An error object would otherwise
# pass a bare "length" check and blow up on ".[0]" below.
OFFER_COUNT=$(jq 'if type == "array" then length else -1 end' <<<"$OFFERS" 2>/dev/null) || OFFER_COUNT=-1
if (( OFFER_COUNT < 0 )); then
  echo "[!] unexpected response from vastai search:" >&2
  printf '%s\n' "$OFFERS" >&2
  exit 1
fi
(( OFFER_COUNT >= 1 )) || { echo "[!] no offers" >&2; exit 1; }

OFFER_ID=$(jq -r '.[0].id'         <<<"$OFFERS")
PRICE=$(   jq -r '.[0].dph_total'  <<<"$OFFERS")
GPU_NAME=$(jq -r '.[0].gpu_name'   <<<"$OFFERS")
DISK_AV=$( jq -r '.[0].disk_space' <<<"$OFFERS")
if [[ -z $OFFER_ID || $OFFER_ID == "null" ]]; then
  echo "[!] first offer has no id; refusing to continue" >&2
  exit 1
fi
echo "[+] picked offer $OFFER_ID ($GPU_NAME, ${DISK_AV}GB disk, \$$PRICE/h)"
echo "[+] image: $IMAGE"

if [[ $DRY_RUN -eq 1 ]]; then
  echo "[dry-run] stopping here"
  exit 0
fi

# === build onstart-cmd ===
# Render the script that the rented instance runs at boot. The heredoc below
# is unquoted on purpose: ${PRIVKEY}, ${MODEL}, ${TUNNEL_PORT},
# ${HOST_B_SSH_PORT} and ${HOST_B_PUBLIC_IP} are substituted here, while
# anything written as \$ or \\ is emitted literally and evaluated remotely.
PRIVKEY=$(<"$PROVISIONING_KEY")
ONSTART_TMP=$(mktemp)
# The rendered file contains the private key, so remove it on every exit
# path, including failures before the explicit cleanup further down.
trap 'rm -f -- "$ONSTART_TMP"' EXIT
cat > "$ONSTART_TMP" <<EOSCRIPT
#!/usr/bin/env bash
set -e
exec > /var/log/onstart.log 2>&1
echo "[onstart] \$(date -Is) start"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq autossh curl ca-certificates
# Install ollama via official installer (binary install; we run serve ourselves since no systemd).
# The installer exit status is ignored, so verify the binary explicitly.
curl -fsSL https://ollama.com/install.sh | sh || true
command -v ollama >/dev/null || { echo "[onstart] \$(date -Is) ollama install failed"; exit 1; }
echo "[onstart] \$(date -Is) ollama binary installed"
# Provisioning key
mkdir -p /root/.ssh && chmod 700 /root/.ssh
cat > /root/.ssh/id_vasttun <<'KEYEOF'
${PRIVKEY}
KEYEOF
chmod 600 /root/.ssh/id_vasttun
# Start ollama serve
pkill -f "ollama serve" || true
nohup ollama serve > /var/log/ollama.log 2>&1 &
# Wait (at most about 60s) until the server answers, rather than a fixed sleep
for _ in \$(seq 1 30); do
  ollama list >/dev/null 2>&1 && break
  sleep 2
done
# Pull model (blocking)
ollama pull ${MODEL}
echo "[onstart] \$(date -Is) model ${MODEL} pulled"
# Reverse tunnel → Host B :${TUNNEL_PORT} ← Ollama :11434 on this box
nohup autossh -M 0 -N \\
  -o StrictHostKeyChecking=no \\
  -o ServerAliveInterval=60 -o ServerAliveCountMax=3 \\
  -o ExitOnForwardFailure=yes \\
  -R 127.0.0.1:${TUNNEL_PORT}:127.0.0.1:11434 \\
  -p ${HOST_B_SSH_PORT} -i /root/.ssh/id_vasttun \\
  vasttun@${HOST_B_PUBLIC_IP} > /var/log/autossh.log 2>&1 &
echo "[onstart] \$(date -Is) tunnel up"
EOSCRIPT

# === create instance ===
# Make sure the state file can be written before anything is rented.
STATE_DIR=$(dirname -- "$STATE_FILE")
mkdir -p -- "$STATE_DIR"
[[ -w $STATE_DIR ]] || { echo "[!] $STATE_DIR is not writable" >&2; exit 1; }

echo "[*] creating instance from offer $OFFER_ID ..."
# Only stdout is captured: it is parsed as JSON below, and mixing the CLI's
# stderr into it could make that parse fail. stderr goes to the terminal.
CREATE_RC=0
CREATE_OUT=$(vastai create instance "$OFFER_ID" \
  --image "$IMAGE" \
  --disk "$DISK_GB" \
  --onstart "$ONSTART_TMP" \
  --raw) || CREATE_RC=$?
# The onstart file embeds the private key; drop it whether or not the call worked.
rm -f -- "$ONSTART_TMP"
if (( CREATE_RC != 0 )); then
  echo "[!] create failed:" >&2
  printf '%s\n' "$CREATE_OUT" >&2
  exit 1
fi

INSTANCE_ID=$(jq -r '.new_contract // empty' <<<"$CREATE_OUT" 2>/dev/null) || INSTANCE_ID=""
# The id is written unquoted into the JSON state file, so it must be numeric.
if [[ ! $INSTANCE_ID =~ ^[0-9]+$ ]]; then
  echo "[!] no instance id in response: $CREATE_OUT" >&2
  echo "    an instance may still have been created; check: vastai show instances" >&2
  exit 1
fi

# Record the instance right away. From here on it is billed, and every later
# failure path points the operator at gpu-down.
# NOTE(review): assumes gpu-down locates the instance via this file; confirm.
# The same record is rewritten at the end of the script with a fresh timestamp.
if ! printf '{"instance_id": %s, "model": "%s", "gpu": "%s", "offer_price_dph": %s, "image": "%s", "started_at": "%s"}\n' \
  "$INSTANCE_ID" "$MODEL" "$GPU_NAME" "$PRICE" "$IMAGE" "$(date -Is)" > "$STATE_FILE"; then
  echo "[!] could not write $STATE_FILE" >&2
  echo "    instance $INSTANCE_ID is running; destroy it with: vastai destroy instance $INSTANCE_ID" >&2
  exit 1
fi

echo "[+] instance $INSTANCE_ID created."
echo "[*] cold-start budget: image pull (~5GB pytorch) ~1min + ollama install ~30s + model pull (~24GB) ~2min = ~3-5 min total"

# === wait for tunnel listen on Host B side (this very host) ===
# Poll the local listening sockets every 10 s, 120 times (20 minutes), until
# something listens on 127.0.0.1:TUNNEL_PORT.
echo "[*] waiting up to 20 min for reverse tunnel on 127.0.0.1:${TUNNEL_PORT} ..."
TUNNEL_UP=0
for (( attempt = 0; attempt < 120; attempt++ )); do
  # Capture first, match second: under "set -o pipefail" a direct
  # "ss | grep -q" can fail when grep exits early and ss receives SIGPIPE.
  LISTENERS=$(ss -tln 2>/dev/null || true)
  # -F: match the address literally (dots are not regex wildcards). The
  # trailing space keeps a longer port number from matching as a prefix.
  if grep -qF -- "127.0.0.1:${TUNNEL_PORT} " <<<"$LISTENERS"; then
    TUNNEL_UP=1
    # "attempt" sleeps of 10 s have elapsed so far (0 on the first probe).
    echo "[+] tunnel up after ~$(( attempt * 10 ))s"
    break
  fi
  sleep 10
done
if [[ $TUNNEL_UP -ne 1 ]]; then
  echo "[!] tunnel did not come up in 20 min. Check: vastai logs $INSTANCE_ID --tail 100" >&2
  echo "    (instance still running — destroy with: gpu-down)" >&2
  echo "    (or directly: vastai destroy instance $INSTANCE_ID)" >&2
  exit 1
fi

# === verify model present in remote Ollama ===
# Query /api/tags through the tunnel up to 20 times, 5 s apart. A miss is not
# fatal: the tunnel is already up, so the script warns and continues.
echo "[*] verifying model $MODEL is loaded via tunnel ..."
MODEL_OK=0
for (( attempt = 0; attempt < 20; attempt++ )); do
  TAGS=$(curl -sS --max-time 10 "http://127.0.0.1:${TUNNEL_PORT}/api/tags" 2>/dev/null || true)
  # Ollama lists models with an explicit tag, so a model requested without
  # one (e.g. "llama3") appears as "llama3:latest". Accept both spellings.
  if jq -e --arg m "$MODEL" \
       'any(.models[]?; .name == $m or .name == ($m + ":latest"))' \
       <<<"$TAGS" >/dev/null 2>&1; then
    MODEL_OK=1
    break
  fi
  sleep 5
done
[[ $MODEL_OK -eq 1 ]] || echo "[!] model not in /api/tags yet (pull may still be running). Tunnel is up, will keep going." >&2

# === switch opencode.json ===
# Keep a one-time copy of the existing config. -p preserves owner and mode,
# so the copy is no more readable than the original.
if [[ ! -f $OPENCODE_BACKUP ]]; then
  cp -p -- "$OPENCODE_CONFIG" "$OPENCODE_BACKUP"
fi

# Build the new config with jq so every value is JSON-escaped, write it to a
# temp file beside the target (mktemp creates it mode 0600), set owner and
# mode, then rename it into place. A half-written or wrongly-permissioned
# config is therefore never visible. The EXA key is passed through jq's
# environment rather than argv so it does not show up in process listings.
CONFIG_TMP=$(mktemp "${OPENCODE_CONFIG}.XXXXXX")
if ! EXA_KEY="$EXA_KEY" jq -n \
  --arg model "$MODEL" \
  --arg provider_name "vast.ai GPU (Host B :${TUNNEL_PORT} ← reverse-tunnel)" \
  --arg base_url "http://127.0.0.1:${TUNNEL_PORT}/v1" \
  '{
    "$schema": "https://opencode.ai/config.json",
    "model": ("vast/" + $model),
    "provider": {
      "vast": {
        "npm": "@ai-sdk/openai-compatible",
        "name": $provider_name,
        "options": {
          "baseURL": $base_url,
          "apiKey": "ollama",
          "timeout": 600000,
          "chunkTimeout": 600000
        },
        "models": {
          ($model): {
            "name": ($model + " (vast.ai GPU)"),
            "tool_call": true,
            "temperature": true,
            "reasoning": true
          }
        }
      }
    },
    "mcp": {
      "exa": {
        "type": "remote",
        "url": "https://mcp.exa.ai/mcp",
        "enabled": true,
        "headers": { "x-api-key": env.EXA_KEY }
      }
    }
  }' > "$CONFIG_TMP"; then
  rm -f -- "$CONFIG_TMP"
  echo "[!] failed to render $OPENCODE_CONFIG" >&2
  exit 1
fi
chown opencode:opencode "$CONFIG_TMP"
chmod 0640 "$CONFIG_TMP"
mv -f -- "$CONFIG_TMP" "$OPENCODE_CONFIG"
systemctl restart opencode-server.service
echo "[+] opencode-server restarted, pointing at vast.ai"

# === save state ===
# One-line JSON record of the running instance; instance id and price are
# written as bare JSON values, everything else as strings.
printf '{"instance_id": %s, "model": "%s", "gpu": "%s", "offer_price_dph": %s, "image": "%s", "started_at": "%s"}\n' \
  "$INSTANCE_ID" "$MODEL" "$GPU_NAME" "$PRICE" "$IMAGE" "$(date -Is)" \
  > "$STATE_FILE"

# Final banner: what was provisioned and how to monitor / tear it down.
cat <<EOF

===================================================================
  gpu-up complete
    instance:   $INSTANCE_ID
    gpu:        $GPU_NAME
    model:      $MODEL
    price:      \$$PRICE / hour
    state:      $STATE_FILE
    monitor:    vastai logs $INSTANCE_ID --tail 100
    teardown:   gpu-down
===================================================================
EOF
