#!/usr/bin/env bash
set -euo pipefail

# Optional: activate local virtualenv if present
if [ -d ".venv" ]; then
  # shellcheck disable=SC1091
  source ".venv/bin/activate"

  # Add TensorRT libs to library path if installed via pip
  TENSORRT_LIBS=".venv/lib/python3.12/site-packages/tensorrt_libs"
  if [ -d "$TENSORRT_LIBS" ]; then
    export LD_LIBRARY_PATH="${TENSORRT_LIBS}:${LD_LIBRARY_PATH:-}"
  fi
fi

# =============================================================================
# Model Configuration
# =============================================================================
export FACE_MODEL_NAME="${FACE_MODEL_NAME:-buffalo_l}"
export FACE_DET_SIZE="${FACE_DET_SIZE:-1024}"

# TensorRT acceleration (2-3x faster inference).
# First startup is slow (~30-60s) while TensorRT builds optimized engines;
# engines are cached in ~/.cache/onnxruntime/ for subsequent runs.
export USE_TENSORRT="${USE_TENSORRT:-true}"

# =============================================================================
# Performance Tuning
# =============================================================================
# Suppress stderr so a missing nproc falls back silently to 4
CPU_CORES="$(nproc 2>/dev/null || echo 4)"

# GPU inference is the bottleneck - use 1 worker to avoid loading multiple
# copies of the model into GPU memory. Concurrency is handled via thread pool.
DEFAULT_WORKERS=1

# Thread pool for blocking operations (GPU inference, image decode).
# 4 threads allow overlapping I/O with GPU work.
export INFERENCE_THREADS="${INFERENCE_THREADS:-4}"

# CPU threading for numpy/BLAS operations
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"

# =============================================================================
# HTTP Connection Pool
# =============================================================================
export HTTP_POOL_MAX_CONNECTIONS="${HTTP_POOL_MAX_CONNECTIONS:-100}"
export HTTP_POOL_MAX_KEEPALIVE="${HTTP_POOL_MAX_KEEPALIVE:-20}"

# =============================================================================
# Image Processing
# =============================================================================
# Max dimension for input images (reject anything larger)
export MAX_IMAGE_DIMENSION="${MAX_IMAGE_DIMENSION:-4096}"

# Downscale large images to this size before processing (det_size handles the rest)
export TARGET_MAX_DIMENSION="${TARGET_MAX_DIMENSION:-2048}"

# =============================================================================
# Server Configuration
# =============================================================================
WORKERS="${UVICORN_WORKERS:-$DEFAULT_WORKERS}"
PORT="${PORT:-18081}"

echo "Starting face service on port ${PORT} with ${WORKERS} workers"
echo "  Model: ${FACE_MODEL_NAME}, det_size: ${FACE_DET_SIZE}, TensorRT: ${USE_TENSORRT}"
echo "  Thread pool: ${INFERENCE_THREADS} workers"
echo "  CPU cores: ${CPU_CORES}"

exec uvicorn app.main:app \
  --host 0.0.0.0 \
  --port "${PORT}" \
  --workers "${WORKERS}" \
  --loop uvloop \
  --http httptools
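
# -----------------------------------------------------------------------------
# Usage examples (nothing past the exec line above runs; these are notes only).
# The filename run.sh is an assumption for illustration:
#
#   ./run.sh                               # defaults: port 18081, 1 worker
#   PORT=8080 UVICORN_WORKERS=2 ./run.sh   # override port and worker count
#   USE_TENSORRT=false ./run.sh            # skip TensorRT engine builds
# -----------------------------------------------------------------------------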
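# -----------------------------------------------------------------------------
# Optional sketch: onnxruntime's TensorRT execution provider also honors
# env-var knobs for engine caching. Whether they take effect here depends on
# how app.main configures its InferenceSession, and the cache path below is
# an assumption; if wanted, move these above the exec line and uncomment:
#
#   export ORT_TENSORRT_ENGINE_CACHE_ENABLE=1
#   export ORT_TENSORRT_CACHE_PATH="${HOME}/.cache/onnxruntime"
# -----------------------------------------------------------------------------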