Add TensorRT experiment configuration
parent d34b27ecc5
commit b8f167d336
.gitignore
@@ -25,6 +25,10 @@ ENV/
 models/
 *.onnx
 
+# TensorRT artifacts
+*.engine
+*.profile
+
 # OS files
 .DS_Store
 Thumbs.db
@@ -32,4 +36,3 @@ Thumbs.db
 # Test/temp files
 *.tmp
 *.bak
-
README.md (49 lines changed)
@@ -1,18 +1,23 @@
 ## Genealog Face Service
 
-FastAPI-based face embedding and matching microservice using InsightFace + ONNX Runtime GPU. This service is designed to be called from the `genealog-api` backend via HTTP.
+FastAPI-based face embedding microservice using InsightFace + ONNX Runtime GPU. This service generates face embeddings from images and is designed to be called from the `genealog-api` backend via HTTP.
 
 ### Endpoints
 
 - `GET /healthz` – basic health check and model info.
 - `POST /embed-avatar` – JSON body: `{ "image_url": "https://..." }`, returns the single best face embedding for an avatar image.
 - `POST /embed-image` – JSON body: `{ "image_url": "https://..." }`, returns all detected faces and embeddings.
-- `POST /test-avatar` – multipart form with fields:
-  - `tag`: string tag for logging / correlation
-  - `avatar`: avatar image file (face to match)
-  - `image`: target image file (search space)
 
-All embeddings are normalized float vectors suitable for cosine-similarity comparison.
+All embeddings are normalized float vectors suitable for cosine-similarity comparison. Face matching/comparison is handled by the calling service (`genealog-api`).
+
+### Features
+
+- **Async HTTP downloads** with retry logic (httpx + tenacity)
+- **Image validation**: size limit (20MB max), dimension limits (32px–4096px by default), decompression bomb protection
+- **Robust image decoding**: handles all color modes (RGB, RGBA, L, LA, PA, CMYK, I, F), EXIF orientation correction
+- **Face detection fallback**: if no face is detected in `/embed-avatar`, falls back to a center-crop embedding
+- **Embedding validation**: checks for NaN/Inf values before returning
+- **Modular structure**: clean separation of concerns (config, models, face processing, image handling, routes)
 
 `/embed-avatar` notes:
@@ -29,6 +34,14 @@ source .venv/bin/activate
 pip install -r requirements.txt
 ```
 
+**Dependencies:**
+- `fastapi`, `uvicorn` – web framework
+- `insightface` – face detection and recognition
+- `onnxruntime-gpu` – GPU-accelerated inference
+- `httpx` – async HTTP client for image downloads
+- `tenacity` – retry logic
+- `opencv-python-headless`, `numpy`, `Pillow` – image processing
+
 GPU support assumes:
 
 - WSL2 with GPU enabled.
@@ -60,6 +73,10 @@ You can override via environment variables:
 PORT=18081 \
 FACE_MODEL_NAME=buffalo_l \
 FACE_DET_SIZE=1024 \
+MAX_DOWNLOAD_SIZE=20971520 \
+MAX_IMAGE_DIMENSION=8192 \
+DOWNLOAD_TIMEOUT=15.0 \
+MAX_RETRIES=3 \
 UVICORN_WORKERS=20 \
 ./run_face_service.sh
 ```
@@ -72,6 +89,26 @@ nohup ./run_face_service.sh > face_service.log 2>&1 &
 
 Logs are written to `face_service.log` in the repo root.
 
+### Project Structure
+
+```
+genealog-face/
+├── app/
+│   ├── __init__.py
+│   ├── main.py       # FastAPI app, lifespan handler, health endpoint
+│   ├── config.py     # Environment variables and constants
+│   ├── models.py     # Pydantic request/response models
+│   ├── face.py       # FaceAnalysis loading, embedding logic
+│   ├── image.py      # Image download, decode, validation
+│   └── routes/
+│       ├── __init__.py
+│       └── embed.py  # /embed-avatar, /embed-image endpoints
+├── .gitignore
+├── requirements.txt
+├── run_face_service.sh
+└── README.md
+```
+
 ### Integration with genealog-api (Docker)
 
 The `genealog-api` service expects this face service to be reachable at:
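For illustration (not part of this commit), a minimal client sketch against the endpoints above. It assumes the service runs on the default port 18081 and that the `/embed-avatar` response exposes the vector under an `embedding` key — an assumption about the schema in `app/models.py`. Because embeddings are L2-normalized, cosine similarity reduces to a dot product:

```python
import httpx
import numpy as np

BASE = "http://localhost:18081"  # default port from run_face_service.sh

def embed_avatar(url: str) -> np.ndarray:
    resp = httpx.post(f"{BASE}/embed-avatar", json={"image_url": url})
    resp.raise_for_status()
    # "embedding" is an assumed field name; see EmbedAvatarResponse in app/models.py
    return np.asarray(resp.json()["embedding"], dtype=np.float32)

a = embed_avatar("https://example.com/avatar-1.jpg")
b = embed_avatar("https://example.com/avatar-2.jpg")
print(float(a @ b))  # normalized vectors: dot product == cosine similarity
```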
app/config.py
@@ -5,16 +5,25 @@ import os
 # Model configuration
 MODEL_NAME = os.getenv("FACE_MODEL_NAME", "buffalo_l")
 DET_SIZE = int(os.getenv("FACE_DET_SIZE", "1024"))
+USE_TENSORRT = os.getenv("USE_TENSORRT", "true").lower() in ("true", "1", "yes")
 
 # Image processing limits
 MAX_DOWNLOAD_SIZE = int(os.getenv("MAX_DOWNLOAD_SIZE", 20 * 1024 * 1024))  # 20MB
-MAX_IMAGE_DIMENSION = int(os.getenv("MAX_IMAGE_DIMENSION", 8192))  # 8192px
+MAX_IMAGE_DIMENSION = int(os.getenv("MAX_IMAGE_DIMENSION", 4096))  # 4096px (reduced from 8192)
 MIN_IMAGE_DIMENSION = int(os.getenv("MIN_IMAGE_DIMENSION", 32))  # 32px
+TARGET_MAX_DIMENSION = int(os.getenv("TARGET_MAX_DIMENSION", 2048))  # Downscale large images
 
 # HTTP client settings
 DOWNLOAD_TIMEOUT = float(os.getenv("DOWNLOAD_TIMEOUT", 15.0))  # 15 seconds
 MAX_RETRIES = int(os.getenv("MAX_RETRIES", 3))
 
+# HTTP connection pool settings
+HTTP_POOL_MAX_CONNECTIONS = int(os.getenv("HTTP_POOL_MAX_CONNECTIONS", 100))
+HTTP_POOL_MAX_KEEPALIVE = int(os.getenv("HTTP_POOL_MAX_KEEPALIVE", 20))
+
+# Thread pool for blocking operations (GPU inference, image decode)
+INFERENCE_THREADS = int(os.getenv("INFERENCE_THREADS", 4))
+
 # TODO [PROD]: Add URL allowlist for SSRF protection
 # ALLOWED_URL_PATTERNS = os.getenv("ALLOWED_URL_PATTERNS", "").split(",")
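A quick sketch (not in the commit) of how the `USE_TENSORRT` flag parses: any casing of `true`/`1`/`yes` enables it, everything else disables it:

```python
def parse_flag(value: str) -> bool:
    # Mirrors the USE_TENSORRT parsing in app/config.py
    return value.lower() in ("true", "1", "yes")

assert parse_flag("True") and parse_flag("1") and parse_flag("YES")
assert not parse_flag("false") and not parse_flag("0") and not parse_flag("")
```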
app/face.py (67 lines changed)
@@ -1,11 +1,13 @@
 """Face analysis and embedding logic using InsightFace."""
 
+import asyncio
 import logging
+from concurrent.futures import ThreadPoolExecutor
 
 import numpy as np
 from insightface.app import FaceAnalysis
 
-from app.config import DET_SIZE, MODEL_NAME
+from app.config import DET_SIZE, MODEL_NAME, USE_TENSORRT
 from app.models import BBox
 
 logger = logging.getLogger("face_service")
@@ -13,16 +15,65 @@ logger = logging.getLogger("face_service")
 face_app: FaceAnalysis | None = None
 
 
+def _check_tensorrt_available() -> bool:
+    """Check if the TensorRT libraries are actually installed."""
+    try:
+        import tensorrt  # noqa: F401
+        return True
+    except ImportError:
+        return False
+
+
+def _get_providers() -> list:
+    """Get ONNX Runtime execution providers based on configuration."""
+    import onnxruntime as ort
+
+    available_providers = ort.get_available_providers()
+    logger.info(f"Available ONNX providers: {available_providers}")
+
+    providers = []
+
+    # Try TensorRT first if enabled and the libraries are actually installed
+    if USE_TENSORRT and "TensorrtExecutionProvider" in available_providers:
+        if _check_tensorrt_available():
+            providers.append(
+                (
+                    "TensorrtExecutionProvider",
+                    {
+                        "trt_max_workspace_size": 2 * 1024 * 1024 * 1024,  # 2GB
+                        "trt_fp16_enable": True,  # FP16 for faster inference
+                        "trt_engine_cache_enable": True,  # Cache built TensorRT engines
+                    },
+                )
+            )
+        else:
+            logger.warning("TensorRT requested but not importable (e.g. libnvinfer.so.10 missing), skipping")
+
+    # CUDA fallback
+    if "CUDAExecutionProvider" in available_providers:
+        providers.append("CUDAExecutionProvider")
+
+    # CPU fallback (always available)
+    providers.append("CPUExecutionProvider")
+
+    logger.info(f"Using providers: {[p[0] if isinstance(p, tuple) else p for p in providers]}")
+    return providers
+
+
 def load_face_app() -> FaceAnalysis:
     """Load and initialize the FaceAnalysis model (singleton)."""
     global face_app
     if face_app is not None:
         return face_app
 
-    logger.info(f"Loading InsightFace model pack={MODEL_NAME}, det_size={DET_SIZE}")
+    providers = _get_providers()
+    logger.info(
+        f"Loading InsightFace model pack={MODEL_NAME}, det_size={DET_SIZE}, "
+        f"tensorrt={USE_TENSORRT}"
+    )
     fa = FaceAnalysis(
         name=MODEL_NAME,
-        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
+        providers=providers,
     )
     fa.prepare(ctx_id=0, det_size=(DET_SIZE, DET_SIZE))
     face_app = fa
@@ -30,6 +81,16 @@ def load_face_app() -> FaceAnalysis:
     return face_app
 
 
+async def get_faces_async(
+    fa: FaceAnalysis,
+    img: np.ndarray,
+    executor: ThreadPoolExecutor | None,
+) -> list:
+    """Run face detection/embedding in a thread pool so it does not block the event loop."""
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(executor, fa.get, img)
+
+
 def to_pixel_bbox(bbox, width: int, height: int) -> BBox:
     """Convert InsightFace bbox to pixel BBox with coordinate clamping."""
     x1, y1, x2, y2 = bbox
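As a reference for the provider-list shape (a sketch, not project code — the model path is a placeholder): ONNX Runtime accepts a mix of plain provider names and `(name, options)` tuples and tries them left to right, falling back when one fails to initialize:

```python
import onnxruntime as ort

# Same shape as the _get_providers() output above: an options tuple for
# TensorRT, plain strings for the CUDA/CPU fallbacks.
providers = [
    ("TensorrtExecutionProvider", {"trt_fp16_enable": True, "trt_engine_cache_enable": True}),
    "CUDAExecutionProvider",
    "CPUExecutionProvider",
]

# "det_10g.onnx" is a hypothetical path for illustration.
sess = ort.InferenceSession("det_10g.onnx", providers=providers)
print(sess.get_providers())  # reports which providers actually activated
```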
app/image.py (67 lines changed)
@@ -1,6 +1,8 @@
 """Image download, decoding, and validation utilities."""
 
+import asyncio
 import logging
+from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 
 import cv2
@@ -21,6 +23,7 @@ from app.config import (
     MAX_IMAGE_DIMENSION,
     MAX_RETRIES,
     MIN_IMAGE_DIMENSION,
+    TARGET_MAX_DIMENSION,
 )
 
 logger = logging.getLogger("face_service")
@@ -114,6 +117,9 @@ def _decode_image_bytes(data: bytes, source: str) -> np.ndarray:
         img = np.array(pil_image)
         img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
 
+    # Downscale large images for faster processing
+    img = _maybe_downscale(img)
+
     logger.info(
         "decode_image_bytes: source=%s shape=%s mode=%s",
         source,
@@ -123,6 +129,25 @@ def _decode_image_bytes(data: bytes, source: str) -> np.ndarray:
     return img
 
 
+def _maybe_downscale(img: np.ndarray, max_dim: int = TARGET_MAX_DIMENSION) -> np.ndarray:
+    """Downscale image if larger than max_dim while preserving aspect ratio."""
+    h, w = img.shape[:2]
+    if max(h, w) <= max_dim:
+        return img
+
+    scale = max_dim / max(h, w)
+    new_w = int(w * scale)
+    new_h = int(h * scale)
+
+    logger.info(
+        "downscaling image from %dx%d to %dx%d (scale=%.2f)",
+        w, h, new_w, new_h, scale,
+    )
+
+    # Use INTER_AREA for downscaling (best quality)
+    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
+
+
 @retry(
     retry=retry_if_exception_type((httpx.TimeoutException, httpx.NetworkError)),
     stop=stop_after_attempt(MAX_RETRIES),
@@ -158,18 +183,45 @@ async def _download_with_retry(client: httpx.AsyncClient, url: str) -> bytes:
     return content
 
 
-async def download_image(image_url: str) -> np.ndarray:
+async def download_image(
+    image_url: str,
+    client: httpx.AsyncClient | None = None,
+    executor: ThreadPoolExecutor | None = None,
+) -> np.ndarray:
     """
     Download and decode an image from URL.
 
     Features:
-    - Async HTTP with connection pooling
+    - Async HTTP with connection pooling (uses shared client if provided)
     - Retry with exponential backoff for transient failures
     - Size validation before and after download
-    - Comprehensive image decoding
+    - Async image decoding in thread pool
+
+    Args:
+        image_url: URL to download image from
+        client: Shared httpx client (falls back to the global client, then a temporary one)
+        executor: Thread pool for blocking decode (runs sync if None)
     """
+    # Use the shared client if none was passed in
+    if client is None:
+        from app.resources import http_client
+        client = http_client
+
+    # Fall back to a temporary client if still None (e.g., during tests)
+    if client is None:
+        async with httpx.AsyncClient(timeout=DOWNLOAD_TIMEOUT) as temp_client:
+            return await _download_and_decode(temp_client, image_url, executor)
+
+    return await _download_and_decode(client, image_url, executor)
+
+
+async def _download_and_decode(
+    client: httpx.AsyncClient,
+    image_url: str,
+    executor: ThreadPoolExecutor | None,
+) -> np.ndarray:
+    """Internal helper to download and decode an image."""
     try:
-        async with httpx.AsyncClient(timeout=DOWNLOAD_TIMEOUT) as client:
-            data = await _download_with_retry(client, image_url)
+        data = await _download_with_retry(client, image_url)
     except httpx.TimeoutException:
         logger.exception("Timeout downloading image")
@@ -186,7 +238,14 @@ async def download_image(image_url: str) -> np.ndarray:
         logger.exception("Failed to download image")
         raise HTTPException(status_code=400, detail=f"Failed to download image: {e}")
 
+    # Decode in thread pool to avoid blocking the event loop
     try:
-        img = _decode_image_bytes(data, image_url)
+        if executor is not None:
+            loop = asyncio.get_running_loop()
+            img = await loop.run_in_executor(
+                executor, _decode_image_bytes, data, image_url
+            )
+        else:
+            img = _decode_image_bytes(data, image_url)
     except (ImageDecodeError, ImageValidationError) as e:
         raise HTTPException(status_code=400, detail=str(e))
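A worked example of the downscale arithmetic (a sketch mirroring `_maybe_downscale`, under the default `TARGET_MAX_DIMENSION=2048`): an 8000×6000 input gets `scale = 2048 / 8000 = 0.256`, landing at 2048×1536 with the aspect ratio preserved:

```python
def downscale_dims(w: int, h: int, max_dim: int = 2048) -> tuple[int, int]:
    # Same arithmetic as _maybe_downscale, minus the cv2.resize call
    if max(w, h) <= max_dim:
        return w, h
    scale = max_dim / max(w, h)
    return int(w * scale), int(h * scale)

print(downscale_dims(8000, 6000))  # (2048, 1536)
print(downscale_dims(1920, 1080))  # (1920, 1080) - already under the limit
```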
app/main.py (46 lines changed)
@@ -1,11 +1,22 @@
 """FastAPI application entry point."""
 
 import logging
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
 
+import httpx
 from fastapi import FastAPI
 
-from app.config import DET_SIZE, MODEL_NAME
+import app.resources as resources
+from app.config import (
+    DET_SIZE,
+    DOWNLOAD_TIMEOUT,
+    HTTP_POOL_MAX_CONNECTIONS,
+    HTTP_POOL_MAX_KEEPALIVE,
+    INFERENCE_THREADS,
+    MODEL_NAME,
+    USE_TENSORRT,
+)
 from app.face import load_face_app
 from app.routes import embed
@@ -15,12 +26,42 @@ logger = logging.getLogger("face_service")
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """Application lifespan handler - load models on startup."""
+    """Application lifespan handler - load models and shared resources on startup."""
     logger.info("Starting face service...")
 
+    # Create HTTP client with connection pooling
+    limits = httpx.Limits(
+        max_connections=HTTP_POOL_MAX_CONNECTIONS,
+        max_keepalive_connections=HTTP_POOL_MAX_KEEPALIVE,
+    )
+    resources.http_client = httpx.AsyncClient(
+        timeout=DOWNLOAD_TIMEOUT,
+        limits=limits,
+        # http2=True requires 'h2' package - disable for now
+    )
+    logger.info(
+        f"HTTP client initialized (max_conn={HTTP_POOL_MAX_CONNECTIONS}, "
+        f"keepalive={HTTP_POOL_MAX_KEEPALIVE})"
+    )
+
+    # Create thread pool for blocking operations (GPU inference, image decode)
+    resources.inference_executor = ThreadPoolExecutor(
+        max_workers=INFERENCE_THREADS,
+        thread_name_prefix="inference",
+    )
+    logger.info(f"Thread pool initialized (workers={INFERENCE_THREADS})")
+
+    # Load face model (may take time if TensorRT engines need building)
     load_face_app()
 
     logger.info("Face service ready")
     yield
 
+    # Cleanup
     logger.info("Shutting down face service...")
+    await resources.http_client.aclose()
+    resources.inference_executor.shutdown(wait=True)
+    logger.info("Cleanup complete")
 
 
 app = FastAPI(
@@ -40,5 +81,6 @@ def healthz():
         "status": "ok",
         "model": MODEL_NAME,
         "det_size": DET_SIZE,
+        "tensorrt": USE_TENSORRT,
     }
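With the service running locally (default port 18081), the extended health check can be exercised like so; the keys match the handler above:

```python
import httpx

r = httpx.get("http://localhost:18081/healthz", timeout=5.0)
print(r.json())
# e.g. {'status': 'ok', 'model': 'buffalo_l', 'det_size': 1024, 'tensorrt': True}
```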
app/resources.py (new file)
@@ -0,0 +1,9 @@
+"""Shared application resources (HTTP client, thread pool, etc.)."""
+
+from concurrent.futures import ThreadPoolExecutor
+
+import httpx
+
+# Global shared resources (initialized in app lifespan)
+http_client: httpx.AsyncClient | None = None
+inference_executor: ThreadPoolExecutor | None = None
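One subtlety with these module-level globals, sketched below (illustrative code, not part of the commit): `from app.resources import http_client` copies the value (`None`) at import time and never observes the client the lifespan later assigns, whereas importing the module and reading the attribute at call time does. The `embed` routes below therefore go through the module object:

```python
# Anti-pattern: binds the *value* at import time, so this name stays None
# even after lifespan() assigns app.resources.http_client:
#   from app.resources import http_client

# Pattern used by the routes: bind the *module*, read the attribute lazily.
import app.resources as resources

async def fetch(url: str) -> bytes:
    # resources.http_client is looked up here, after lifespan initialization
    resp = await resources.http_client.get(url)
    resp.raise_for_status()
    return resp.content
```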
app/routes/embed.py
@@ -8,11 +8,13 @@ from fastapi import APIRouter, HTTPException
 
 from app.face import (
     fallback_avatar_embedding,
+    get_faces_async,
     load_face_app,
     to_pixel_bbox,
     validate_embedding,
 )
 from app.image import download_image
+import app.resources as resources  # read attributes at call time, after lifespan init
 from app.models import (
     BBox,
     EmbedAvatarResponse,
@@ -36,10 +38,10 @@ async def embed_avatar(req: EmbedRequest):
     """
     logger.info("embed_avatar: image_url=%s", req.image_url)
     fa = load_face_app()
-    img = await download_image(str(req.image_url))
+    img = await download_image(str(req.image_url), resources.http_client, resources.inference_executor)
     h, w = img.shape[:2]
 
-    faces = fa.get(img)
+    faces = await get_faces_async(fa, img, resources.inference_executor)
     if len(faces) == 0:
         logger.warning(
             "embed_avatar: no faces detected image_url=%s size=%dx%d, using fallback",
@@ -103,10 +105,10 @@ async def embed_image(req: EmbedRequest):
     Returns empty list if no faces detected.
     """
     fa = load_face_app()
-    img = await download_image(str(req.image_url))
+    img = await download_image(str(req.image_url), resources.http_client, resources.inference_executor)
     h, w = img.shape[:2]
 
-    faces = fa.get(img)
+    faces = await get_faces_async(fa, img, resources.inference_executor)
     if len(faces) == 0:
         logger.warning(
             "embed_image: no faces detected image_url=%s size=%dx%d",
run_face_service.sh
@@ -5,27 +5,66 @@ set -euo pipefail
 if [ -d ".venv" ]; then
   # shellcheck disable=SC1091
   source ".venv/bin/activate"
+
+  # Add TensorRT libs to library path if installed via pip
+  TENSORRT_LIBS=".venv/lib/python3.12/site-packages/tensorrt_libs"
+  if [ -d "$TENSORRT_LIBS" ]; then
+    export LD_LIBRARY_PATH="${TENSORRT_LIBS}:${LD_LIBRARY_PATH:-}"
+  fi
 fi
 
-# Model configuration (can be overridden via env)
+# =============================================================================
+# Model Configuration
+# =============================================================================
 export FACE_MODEL_NAME="${FACE_MODEL_NAME:-buffalo_l}"
 export FACE_DET_SIZE="${FACE_DET_SIZE:-1024}"
 
-# Tune CPU thread usage and workers
+# TensorRT acceleration (2-3x faster inference)
+# First startup is slow (~30-60s) while TensorRT builds optimized engines
+# Engines are cached on disk (*.engine / *.profile, see .gitignore) for subsequent runs
+export USE_TENSORRT="${USE_TENSORRT:-true}"
+
+# =============================================================================
+# Performance Tuning
+# =============================================================================
 CPU_CORES="$(nproc || echo 4)"
-DEFAULT_WORKERS="${CPU_CORES}"
-if [ "$DEFAULT_WORKERS" -lt 2 ]; then
-  DEFAULT_WORKERS=2
-fi
 
-export OMP_NUM_THREADS="${OMP_NUM_THREADS:-2}"
-export MKL_NUM_THREADS="${MKL_NUM_THREADS:-2}"
+# GPU inference is the bottleneck - use 1 worker to avoid loading multiple
+# copies of the model into GPU memory. Concurrency is handled via thread pool.
+DEFAULT_WORKERS=1
+
+# Thread pool for blocking operations (GPU inference, image decode)
+# 4 threads allows overlapping I/O with GPU work
+export INFERENCE_THREADS="${INFERENCE_THREADS:-4}"
+
+# CPU threading for numpy/BLAS operations
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
+export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"
+
+# =============================================================================
+# HTTP Connection Pool
+# =============================================================================
+export HTTP_POOL_MAX_CONNECTIONS="${HTTP_POOL_MAX_CONNECTIONS:-100}"
+export HTTP_POOL_MAX_KEEPALIVE="${HTTP_POOL_MAX_KEEPALIVE:-20}"
+
+# =============================================================================
+# Image Processing
+# =============================================================================
+# Max dimension for input images (reject larger)
+export MAX_IMAGE_DIMENSION="${MAX_IMAGE_DIMENSION:-4096}"
+# Downscale large images to this size before processing (det_size handles rest)
+export TARGET_MAX_DIMENSION="${TARGET_MAX_DIMENSION:-2048}"
+
+# =============================================================================
+# Server Configuration
+# =============================================================================
 WORKERS="${UVICORN_WORKERS:-$DEFAULT_WORKERS}"
-# Match genealog-api FACE_SERVICE_URL: http://host.docker.internal:18081
 PORT="${PORT:-18081}"
 
-echo "Starting face service on port ${PORT} with ${WORKERS} workers (CPU cores: ${CPU_CORES})"
+echo "Starting face service on port ${PORT} with ${WORKERS} workers"
+echo "  Model: ${FACE_MODEL_NAME}, det_size: ${FACE_DET_SIZE}, TensorRT: ${USE_TENSORRT}"
+echo "  Thread pool: ${INFERENCE_THREADS} workers"
+echo "  CPU cores: ${CPU_CORES}"
 
 exec uvicorn app.main:app \
   --host 0.0.0.0 \
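To verify the environment this script sets up, a small diagnostic sketch (not part of the commit): TensorRT only engages if ONNX Runtime lists the provider and the `tensorrt` package can load its libraries from `LD_LIBRARY_PATH`:

```python
import onnxruntime as ort

# On a TensorRT-capable build, "TensorrtExecutionProvider" should appear here.
print(ort.get_available_providers())

try:
    import tensorrt
    print("tensorrt", tensorrt.__version__)
except ImportError as exc:
    # Usually means tensorrt_libs is missing from LD_LIBRARY_PATH
    print("tensorrt not importable:", exc)
```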