#!/usr/bin/env bash
set -euo pipefail

# Optional: activate local virtualenv if present
if [ -d ".venv" ]; then
  # shellcheck disable=SC1091
  source ".venv/bin/activate"

  # Add TensorRT libs to library path if installed via pip
  TENSORRT_LIBS=".venv/lib/python3.12/site-packages/tensorrt_libs"
  if [ -d "$TENSORRT_LIBS" ]; then
    # Prepend without a trailing colon: an empty entry in LD_LIBRARY_PATH
    # makes the dynamic loader search the current directory.
    export LD_LIBRARY_PATH="${TENSORRT_LIBS}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  fi
fi
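
# The tensorrt_libs path above is pinned to Python 3.12; a version-agnostic
# variant (sketch, assuming a single python*/ directory in the venv):
#   TENSORRT_LIBS="$(ls -d .venv/lib/python*/site-packages/tensorrt_libs 2>/dev/null | head -n1 || true)"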

# =============================================================================
# Model Configuration
# =============================================================================
export FACE_MODEL_NAME="${FACE_MODEL_NAME:-buffalo_l}"
export FACE_DET_SIZE="${FACE_DET_SIZE:-1024}"
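
# det_size is the square input size the detector runs at (1024 -> 1024x1024);
# larger values find smaller faces at the cost of GPU time. 640 is insightface's
# usual default (assumption: the app passes this to FaceAnalysis.prepare).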

# TensorRT acceleration (2-3x faster inference)
# First startup is slow (~30-60s) while TensorRT builds optimized engines
# Engines are cached in ~/.cache/onnxruntime/ for subsequent runs
export USE_TENSORRT="${USE_TENSORRT:-true}"
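
# Optional: pin the engine cache location explicitly. Variable names follow
# ONNX Runtime's TensorRT EP docs (verify against the installed version); they
# have no effect if the app sets provider options in code:
#   export ORT_TENSORRT_ENGINE_CACHE_ENABLE=1
#   export ORT_TENSORRT_CACHE_PATH="${HOME}/.cache/onnxruntime"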

# =============================================================================
# Performance Tuning
# =============================================================================
CPU_CORES="$(nproc 2>/dev/null || echo 4)"

# GPU inference is the bottleneck - use 1 worker to avoid loading multiple
# copies of the model into GPU memory. Concurrency is handled via thread pool.
DEFAULT_WORKERS=1
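# To scale out anyway, override at launch - but only if the GPU has memory for
# several model copies (illustrative): UVICORN_WORKERS=2 bash <this-script>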

# Thread pool for blocking operations (GPU inference, image decode)
# 4 threads allows overlapping I/O with GPU work
export INFERENCE_THREADS="${INFERENCE_THREADS:-4}"
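# With the default of 4, one request can decode images on the CPU while another
# occupies the GPU; raising it mainly helps decode-heavy workloads, since
# inference still serializes on the single model copy (assumption about the app).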

# CPU threading for numpy/BLAS operations
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"
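# If numpy is built against OpenBLAS instead of MKL (typical for pip wheels),
# the equivalent knob is OPENBLAS_NUM_THREADS:
#   export OPENBLAS_NUM_THREADS="${OPENBLAS_NUM_THREADS:-4}"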

# =============================================================================
# HTTP Connection Pool
# =============================================================================
export HTTP_POOL_MAX_CONNECTIONS="${HTTP_POOL_MAX_CONNECTIONS:-100}"
export HTTP_POOL_MAX_KEEPALIVE="${HTTP_POOL_MAX_KEEPALIVE:-20}"
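# Assumption: the app maps these onto its HTTP client's pool limits, e.g. for
# httpx: httpx.Limits(max_connections=100, max_keepalive_connections=20).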

# =============================================================================
# Image Processing
# =============================================================================
# Max dimension for input images (reject larger)
export MAX_IMAGE_DIMENSION="${MAX_IMAGE_DIMENSION:-4096}"
# Downscale large images to this size before processing (det_size handles rest)
export TARGET_MAX_DIMENSION="${TARGET_MAX_DIMENSION:-2048}"
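# With the defaults: a 5000x3000 upload is rejected (5000 > 4096), while a
# 4000x3000 upload is accepted and downscaled to 2048x1536 before detection.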

# =============================================================================
# Server Configuration
# =============================================================================
WORKERS="${UVICORN_WORKERS:-$DEFAULT_WORKERS}"
PORT="${PORT:-18081}"

echo "Starting face service on port ${PORT} with ${WORKERS} workers"
echo " Model: ${FACE_MODEL_NAME}, det_size: ${FACE_DET_SIZE}, TensorRT: ${USE_TENSORRT}"
echo " Thread pool: ${INFERENCE_THREADS} workers"
echo " CPU cores: ${CPU_CORES}"

exec uvicorn app.main:app \
  --host 0.0.0.0 \
  --port "${PORT}" \
  --workers "${WORKERS}" \
  --loop uvloop \
  --http httptools