face/run_face_service.sh

75 lines
2.9 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail

# Optional: activate local virtualenv if present
if [[ -d ".venv" ]]; then
  # shellcheck disable=SC1091
  source ".venv/bin/activate"

  # Add TensorRT libs to the library path if installed via pip.
  # Resolve site-packages for whatever Python version the venv was built
  # with, instead of hard-coding python3.12 (which silently skipped the
  # export on any other interpreter version).
  for tensorrt_libs in .venv/lib/python3.*/site-packages/tensorrt_libs; do
    if [[ -d "$tensorrt_libs" ]]; then
      export LD_LIBRARY_PATH="${tensorrt_libs}:${LD_LIBRARY_PATH:-}"
      break
    fi
  done
fi
# =============================================================================
# Model Configuration
# =============================================================================
# Face model pack to load (default: buffalo_l).
export FACE_MODEL_NAME="${FACE_MODEL_NAME:-buffalo_l}"
# Detection input size in pixels; larger finds smaller faces but costs latency.
export FACE_DET_SIZE="${FACE_DET_SIZE:-1024}"

# TensorRT acceleration (2-3x faster inference)
# First startup is slow (~30-60s) while TensorRT builds optimized engines
# Engines are cached in ~/.cache/onnxruntime/ for subsequent runs
export USE_TENSORRT="${USE_TENSORRT:-true}"

# =============================================================================
# Performance Tuning
# =============================================================================
# Core count (informational, printed in the startup banner). Silence nproc's
# stderr so systems without coreutils fall back to 4 without leaking a
# "command not found" message.
CPU_CORES="$(nproc 2>/dev/null || echo 4)"

# GPU inference is the bottleneck - use 1 worker to avoid loading multiple
# copies of the model into GPU memory. Concurrency is handled via thread pool.
DEFAULT_WORKERS=1

# Thread pool for blocking operations (GPU inference, image decode)
# 4 threads allows overlapping I/O with GPU work
export INFERENCE_THREADS="${INFERENCE_THREADS:-4}"

# CPU threading for numpy/BLAS operations
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"
# =============================================================================
# HTTP Connection Pool
# =============================================================================
: "${HTTP_POOL_MAX_CONNECTIONS:=100}"
: "${HTTP_POOL_MAX_KEEPALIVE:=20}"
export HTTP_POOL_MAX_CONNECTIONS HTTP_POOL_MAX_KEEPALIVE

# =============================================================================
# Image Processing
# =============================================================================
# Hard limit: reject input images exceeding this on either dimension.
: "${MAX_IMAGE_DIMENSION:=4096}"
# Soft limit: downscale larger images to this before processing
# (det_size handles the rest).
: "${TARGET_MAX_DIMENSION:=2048}"
export MAX_IMAGE_DIMENSION TARGET_MAX_DIMENSION
# =============================================================================
# Server Configuration
# =============================================================================
WORKERS="${UVICORN_WORKERS:-$DEFAULT_WORKERS}"
PORT="${PORT:-18081}"

# Startup banner (same output as before, via printf).
printf 'Starting face service on port %s with %s workers\n' "${PORT}" "${WORKERS}"
printf ' Model: %s, det_size: %s, TensorRT: %s\n' "${FACE_MODEL_NAME}" "${FACE_DET_SIZE}" "${USE_TENSORRT}"
printf ' Thread pool: %s workers\n' "${INFERENCE_THREADS}"
printf ' CPU cores: %s\n' "${CPU_CORES}"

# exec replaces this shell so uvicorn receives signals (SIGTERM etc.) directly.
exec uvicorn app.main:app \
  --host 0.0.0.0 \
  --port "${PORT}" \
  --workers "${WORKERS}" \
  --loop uvloop \
  --http httptools