#!/usr/bin/env bash
set -euo pipefail

# Optional: activate local virtualenv if present
if [ -d ".venv" ]; then
  # shellcheck disable=SC1091
  source ".venv/bin/activate"

  # Add TensorRT libs to library path if installed via pip
  TENSORRT_LIBS=".venv/lib/python3.12/site-packages/tensorrt_libs"
  if [ -d "$TENSORRT_LIBS" ]; then
    # Prepend without a trailing colon: an empty entry in LD_LIBRARY_PATH
    # makes the dynamic loader search the current directory.
    export LD_LIBRARY_PATH="${TENSORRT_LIBS}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  fi
fi
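
# The tensorrt_libs path above is pinned to Python 3.12; a version-agnostic
# variant (sketch, assuming a single python*/ directory in the venv):
#   TENSORRT_LIBS="$(ls -d .venv/lib/python*/site-packages/tensorrt_libs 2>/dev/null | head -n1 || true)"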

# =============================================================================
# Model Configuration
# =============================================================================
export FACE_MODEL_NAME="${FACE_MODEL_NAME:-buffalo_l}"
export FACE_DET_SIZE="${FACE_DET_SIZE:-1024}"
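
# det_size is the square input size the detector runs at (1024 -> 1024x1024);
# larger values find smaller faces at the cost of GPU time. 640 is insightface's
# usual default (assumption: the app passes this to FaceAnalysis.prepare).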

# TensorRT acceleration (2-3x faster inference)
# First startup is slow (~30-60s) while TensorRT builds optimized engines
# Engines are cached in ~/.cache/onnxruntime/ for subsequent runs
export USE_TENSORRT="${USE_TENSORRT:-true}"
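
# Optional: pin the engine cache location explicitly. Variable names follow
# ONNX Runtime's TensorRT EP docs (verify against the installed version); they
# have no effect if the app sets provider options in code:
#   export ORT_TENSORRT_ENGINE_CACHE_ENABLE=1
#   export ORT_TENSORRT_CACHE_PATH="${HOME}/.cache/onnxruntime"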

# =============================================================================
# Performance Tuning
# =============================================================================
CPU_CORES="$(nproc 2>/dev/null || echo 4)"

# GPU inference is the bottleneck - use 1 worker to avoid loading multiple
# copies of the model into GPU memory. Concurrency is handled via thread pool.
DEFAULT_WORKERS=1
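# To scale out anyway, override at launch - but only if the GPU has memory for
# several model copies (illustrative): UVICORN_WORKERS=2 bash <this-script>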

# Thread pool for blocking operations (GPU inference, image decode)
# 4 threads allows overlapping I/O with GPU work
export INFERENCE_THREADS="${INFERENCE_THREADS:-4}"
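# With the default of 4, one request can decode images on the CPU while another
# occupies the GPU; raising it mainly helps decode-heavy workloads, since
# inference still serializes on the single model copy (assumption about the app).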

# CPU threading for numpy/BLAS operations
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"
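# If numpy is built against OpenBLAS instead of MKL (typical for pip wheels),
# the equivalent knob is OPENBLAS_NUM_THREADS:
#   export OPENBLAS_NUM_THREADS="${OPENBLAS_NUM_THREADS:-4}"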

# =============================================================================
# HTTP Connection Pool
# =============================================================================
export HTTP_POOL_MAX_CONNECTIONS="${HTTP_POOL_MAX_CONNECTIONS:-100}"
export HTTP_POOL_MAX_KEEPALIVE="${HTTP_POOL_MAX_KEEPALIVE:-20}"
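# Assumption: the app maps these onto its HTTP client's pool limits, e.g. for
# httpx: httpx.Limits(max_connections=100, max_keepalive_connections=20).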

# =============================================================================
# Image Processing
# =============================================================================
# Max dimension for input images (reject larger)
export MAX_IMAGE_DIMENSION="${MAX_IMAGE_DIMENSION:-4096}"
# Downscale large images to this size before processing (det_size handles rest)
export TARGET_MAX_DIMENSION="${TARGET_MAX_DIMENSION:-2048}"
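# With the defaults: a 5000x3000 upload is rejected (5000 > 4096), while a
# 4000x3000 upload is accepted and downscaled to 2048x1536 before detection.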

# =============================================================================
# Server Configuration
# =============================================================================
WORKERS="${UVICORN_WORKERS:-$DEFAULT_WORKERS}"
PORT="${PORT:-18081}"

echo "Starting face service on port ${PORT} with ${WORKERS} workers"
echo " Model: ${FACE_MODEL_NAME}, det_size: ${FACE_DET_SIZE}, TensorRT: ${USE_TENSORRT}"
echo " Thread pool: ${INFERENCE_THREADS} workers"
echo " CPU cores: ${CPU_CORES}"

exec uvicorn app.main:app \
  --host 0.0.0.0 \
  --port "${PORT}" \
  --workers "${WORKERS}" \
  --loop uvloop \
  --http httptools