Compare commits


No commits in common. "tensorrt-experiment" and "master" have entirely different histories.

9 changed files with 36 additions and 297 deletions

.gitignore

@@ -25,10 +25,6 @@ ENV/
models/
*.onnx
# TensorRT artifacts
*.engine
*.profile
# OS files
.DS_Store
Thumbs.db
@@ -36,3 +32,4 @@ Thumbs.db
# Test/temp files
*.tmp
*.bak

README.md

@@ -1,23 +1,18 @@
## Genealog Face Service
FastAPI-based face embedding microservice using InsightFace + ONNX Runtime GPU. This service generates face embeddings from images and is designed to be called from the `genealog-api` backend via HTTP.
FastAPI-based face embedding and matching microservice using InsightFace + ONNX Runtime GPU. This service is designed to be called from the `genealog-api` backend via HTTP.
### Endpoints
- `GET /healthz`: basic health check and model info.
- `POST /embed-avatar`: JSON body `{ "image_url": "https://..." }`, returns a single best face embedding for an avatar image.
- `POST /embed-image`: JSON body `{ "image_url": "https://..." }`, returns all detected faces and embeddings.
- `POST /test-avatar`: multipart form with fields:
  - `tag`: string tag for logging / correlation
  - `avatar`: avatar image file (face to match)
  - `image`: target image file (search space)
All embeddings are normalized float vectors suitable for cosine-similarity comparison. Face matching/comparison is handled by the calling service (`genealog-api`).
### Features
- **Async HTTP downloads** with retry logic (httpx + tenacity)
- **Image validation**: size limits (20MB max), dimension limits (32px-8192px), decompression bomb protection
- **Robust image decoding**: handles all color modes (RGB, RGBA, L, LA, PA, CMYK, I, F), EXIF orientation correction
- **Face detection fallback**: If no face is detected in `/embed-avatar`, falls back to center crop embedding
- **Embedding validation**: Checks for NaN/Inf values before returning
- **Modular structure**: Clean separation of concerns (config, models, face processing, image handling, routes)
All embeddings are normalized float vectors suitable for cosine-similarity comparison.
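As a minimal illustration (not part of this diff) of how a caller such as `genealog-api` might compare two returned embeddings; the 0.4 threshold and the 512-dim placeholder vectors are assumptions:

```python
import numpy as np

def cosine_similarity(a, b) -> float:
    # The service returns L2-normalized vectors, so cosine similarity
    # reduces to a plain dot product.
    return float(np.dot(np.asarray(a), np.asarray(b)))

# Hypothetical embeddings standing in for two /embed-avatar results.
emb_a = np.random.randn(512); emb_a /= np.linalg.norm(emb_a)
emb_b = np.random.randn(512); emb_b /= np.linalg.norm(emb_b)
print(cosine_similarity(emb_a, emb_b) > 0.4)  # caller-defined match threshold
```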
`/embed-avatar` notes:
@@ -34,14 +29,6 @@ source .venv/bin/activate
pip install -r requirements.txt
```
**Dependencies:**
- `fastapi`, `uvicorn` - Web framework
- `insightface` - Face detection and recognition
- `onnxruntime-gpu` - GPU-accelerated inference
- `httpx` - Async HTTP client for image downloads
- `tenacity` - Retry logic
- `opencv-python-headless`, `numpy`, `Pillow` - Image processing
GPU support assumes:
- WSL2 with GPU enabled.
@@ -73,10 +60,6 @@ You can override via environment variables:
PORT=18081 \
FACE_MODEL_NAME=buffalo_l \
FACE_DET_SIZE=1024 \
MAX_DOWNLOAD_SIZE=20971520 \
MAX_IMAGE_DIMENSION=8192 \
DOWNLOAD_TIMEOUT=15.0 \
MAX_RETRIES=3 \
UVICORN_WORKERS=20 \
./run_face_service.sh
```
@@ -89,26 +72,6 @@ nohup ./run_face_service.sh > face_service.log 2>&1 &
Logs are written to `face_service.log` in the repo root.
### Project Structure
```
genealog-face/
├── app/
│ ├── __init__.py
│ ├── main.py # FastAPI app, lifespan handler, health endpoint
│ ├── config.py # Environment variables and constants
│ ├── models.py # Pydantic request/response models
│ ├── face.py # FaceAnalysis loading, embedding logic
│ ├── image.py # Image download, decode, validation
│ └── routes/
│ ├── __init__.py
│ └── embed.py # /embed-avatar, /embed-image endpoints
├── .gitignore
├── requirements.txt
├── run_face_service.sh
└── README.md
```
### Integration with genealog-api (Docker)
The `genealog-api` service expects this face service to be reachable at `http://host.docker.internal:18081`.

app/config.py

@@ -5,25 +5,16 @@ import os
# Model configuration
MODEL_NAME = os.getenv("FACE_MODEL_NAME", "buffalo_l")
DET_SIZE = int(os.getenv("FACE_DET_SIZE", "1024"))
USE_TENSORRT = os.getenv("USE_TENSORRT", "true").lower() in ("true", "1", "yes")
# Image processing limits
MAX_DOWNLOAD_SIZE = int(os.getenv("MAX_DOWNLOAD_SIZE", 20 * 1024 * 1024)) # 20MB
MAX_IMAGE_DIMENSION = int(os.getenv("MAX_IMAGE_DIMENSION", 4096)) # 4096px (reduced from 8192)
MAX_IMAGE_DIMENSION = int(os.getenv("MAX_IMAGE_DIMENSION", 8192)) # 8192px
MIN_IMAGE_DIMENSION = int(os.getenv("MIN_IMAGE_DIMENSION", 32)) # 32px
TARGET_MAX_DIMENSION = int(os.getenv("TARGET_MAX_DIMENSION", 2048)) # Downscale large images
# HTTP client settings
DOWNLOAD_TIMEOUT = float(os.getenv("DOWNLOAD_TIMEOUT", 15.0)) # 15 seconds
MAX_RETRIES = int(os.getenv("MAX_RETRIES", 3))
# HTTP connection pool settings
HTTP_POOL_MAX_CONNECTIONS = int(os.getenv("HTTP_POOL_MAX_CONNECTIONS", 100))
HTTP_POOL_MAX_KEEPALIVE = int(os.getenv("HTTP_POOL_MAX_KEEPALIVE", 20))
# Thread pool for blocking operations (GPU inference, image decode)
INFERENCE_THREADS = int(os.getenv("INFERENCE_THREADS", 4))
# TODO [PROD]: Add URL allowlist for SSRF protection
# ALLOWED_URL_PATTERNS = os.getenv("ALLOWED_URL_PATTERNS", "").split(",")
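A hypothetical sketch of the allowlist check the TODO describes; `is_url_allowed` and the fnmatch-style host matching are illustrative assumptions, not part of the repo:

```python
from fnmatch import fnmatch
from urllib.parse import urlparse

def is_url_allowed(url: str, patterns: list[str]) -> bool:
    """Accept the URL only if its hostname matches an allowed pattern."""
    host = urlparse(url).hostname or ""
    return any(fnmatch(host, pattern) for pattern in patterns if pattern)

# e.g. is_url_allowed("https://cdn.example.com/a.jpg", ["*.example.com"]) -> True
```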

app/face.py

@@ -1,13 +1,11 @@
"""Face analysis and embedding logic using InsightFace."""
import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from insightface.app import FaceAnalysis
from app.config import DET_SIZE, MODEL_NAME, USE_TENSORRT
from app.config import DET_SIZE, MODEL_NAME
from app.models import BBox
logger = logging.getLogger("face_service")
@@ -15,65 +13,16 @@ logger = logging.getLogger("face_service")
face_app: FaceAnalysis | None = None
def _check_tensorrt_available() -> bool:
    """Check if TensorRT libraries are actually installed."""
    try:
        import tensorrt
        return True
    except ImportError:
        return False


def _get_providers() -> list:
    """Get ONNX Runtime execution providers based on configuration."""
    import onnxruntime as ort

    available_providers = ort.get_available_providers()
    logger.info(f"Available ONNX providers: {available_providers}")
    providers = []
    # Try TensorRT first if enabled and libraries are actually installed
    if USE_TENSORRT and "TensorrtExecutionProvider" in available_providers:
        if _check_tensorrt_available():
            providers.append(
                (
                    "TensorrtExecutionProvider",
                    {
                        "trt_max_workspace_size": 2 * 1024 * 1024 * 1024,  # 2GB
                        "trt_fp16_enable": True,  # FP16 for faster inference
                        "trt_engine_cache_enable": True,  # Cache TensorRT engines
                    },
                )
            )
        else:
            logger.warning("TensorRT requested but libnvinfer.so.10 not found, skipping")
    # CUDA fallback
    if "CUDAExecutionProvider" in available_providers:
        providers.append("CUDAExecutionProvider")
    # CPU fallback (always available)
    providers.append("CPUExecutionProvider")
    logger.info(f"Using providers: {[p[0] if isinstance(p, tuple) else p for p in providers]}")
    return providers


def load_face_app() -> FaceAnalysis:
    """Load and initialize the FaceAnalysis model (singleton)."""
    global face_app
    if face_app is not None:
        return face_app
    providers = _get_providers()
    logger.info(
        f"Loading InsightFace model pack={MODEL_NAME}, det_size={DET_SIZE}, "
        f"tensorrt={USE_TENSORRT}"
    )
    logger.info(f"Loading InsightFace model pack={MODEL_NAME}, det_size={DET_SIZE}")
    fa = FaceAnalysis(
        name=MODEL_NAME,
        providers=providers,
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
    )
    fa.prepare(ctx_id=0, det_size=(DET_SIZE, DET_SIZE))
    face_app = fa
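For context: the tuple form in `_get_providers` above is ONNX Runtime's standard provider-with-options format, the same structure ORT sessions accept directly. A small standalone sketch (`model.onnx` is a placeholder path, not a file in this repo):

```python
import onnxruntime as ort

# Providers are tried in order; a tuple carries per-provider options.
sess = ort.InferenceSession(
    "model.onnx",  # placeholder path
    providers=[
        ("TensorrtExecutionProvider", {"trt_fp16_enable": True}),
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ],
)
```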
@@ -81,16 +30,6 @@ def load_face_app() -> FaceAnalysis:
    return face_app


async def get_faces_async(
    fa: FaceAnalysis,
    img: np.ndarray,
    executor: ThreadPoolExecutor,
) -> list:
    """Run face detection/embedding in thread pool to not block event loop."""
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, fa.get, img)


def to_pixel_bbox(bbox, width: int, height: int) -> BBox:
    """Convert InsightFace bbox to pixel BBox with coordinate clamping."""
    x1, y1, x2, y2 = bbox
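The hunk truncates here. As a hypothetical sketch of the clamping the docstring describes (the `BBox` field names are assumptions, not shown in this diff):

```python
# Hypothetical continuation, for illustration only:
x1 = max(0, min(int(x1), width))
y1 = max(0, min(int(y1), height))
x2 = max(0, min(int(x2), width))
y2 = max(0, min(int(y2), height))
return BBox(x1=x1, y1=y1, x2=x2, y2=y2)
```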

app/image.py

@@ -1,8 +1,6 @@
"""Image download, decoding, and validation utilities."""
import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
import cv2
@@ -23,7 +21,6 @@ from app.config import (
    MAX_IMAGE_DIMENSION,
    MAX_RETRIES,
    MIN_IMAGE_DIMENSION,
    TARGET_MAX_DIMENSION,
)
logger = logging.getLogger("face_service")
@@ -117,9 +114,6 @@ def _decode_image_bytes(data: bytes, source: str) -> np.ndarray:
    img = np.array(pil_image)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    # Downscale large images for faster processing
    img = _maybe_downscale(img)
    logger.info(
        "decode_image_bytes: source=%s shape=%s mode=%s",
        source,
@@ -129,25 +123,6 @@ def _decode_image_bytes(data: bytes, source: str) -> np.ndarray:
    return img


def _maybe_downscale(img: np.ndarray, max_dim: int = TARGET_MAX_DIMENSION) -> np.ndarray:
    """Downscale image if larger than max_dim while preserving aspect ratio."""
    h, w = img.shape[:2]
    if max(h, w) <= max_dim:
        return img
    scale = max_dim / max(h, w)
    new_w = int(w * scale)
    new_h = int(h * scale)
    logger.info(
        "downscaling image from %dx%d to %dx%d (scale=%.2f)",
        w, h, new_w, new_h, scale,
    )
    # Use INTER_AREA for downscaling (best quality)
    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
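With the default `TARGET_MAX_DIMENSION` of 2048, for example, an 8192x6144 input gets scale = 2048 / 8192 = 0.25 and comes out at 2048x1536.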
@retry(
    retry=retry_if_exception_type((httpx.TimeoutException, httpx.NetworkError)),
    stop=stop_after_attempt(MAX_RETRIES),
@@ -183,46 +158,19 @@ async def _download_with_retry(client: httpx.AsyncClient, url: str) -> bytes:
    return content
async def download_image(
    image_url: str,
    client: httpx.AsyncClient | None = None,
    executor: ThreadPoolExecutor | None = None,
) -> np.ndarray:
async def download_image(image_url: str) -> np.ndarray:
    """
    Download and decode an image from URL.

    Features:
    - Async HTTP with connection pooling (uses shared client if provided)
    - Async HTTP with connection pooling
    - Retry with exponential backoff for transient failures
    - Size validation before and after download
    - Async image decoding in thread pool

    Args:
        image_url: URL to download image from
        client: Shared httpx client (falls back to creating new one if None)
        executor: Thread pool for blocking decode (runs sync if None)
    - Comprehensive image decoding
    """
    # Use shared client or create temporary one
    if client is None:
        from app.resources import http_client
        client = http_client
    # Fallback to temporary client if still None (e.g., during tests)
    if client is None:
        async with httpx.AsyncClient(timeout=DOWNLOAD_TIMEOUT) as temp_client:
            return await _download_and_decode(temp_client, image_url, executor)
    return await _download_and_decode(client, image_url, executor)


async def _download_and_decode(
    client: httpx.AsyncClient,
    image_url: str,
    executor: ThreadPoolExecutor | None,
) -> np.ndarray:
    """Internal helper to download and decode image."""
    try:
        data = await _download_with_retry(client, image_url)
        async with httpx.AsyncClient(timeout=DOWNLOAD_TIMEOUT) as client:
            data = await _download_with_retry(client, image_url)
    except httpx.TimeoutException:
        logger.exception("Timeout downloading image")
        raise HTTPException(status_code=408, detail="Timeout downloading image")
@@ -238,15 +186,8 @@ async def _download_and_decode(
        logger.exception("Failed to download image")
        raise HTTPException(status_code=400, detail=f"Failed to download image: {e}")

    # Decode in thread pool to avoid blocking event loop
    try:
        if executor is not None:
            loop = asyncio.get_running_loop()
            img = await loop.run_in_executor(
                executor, _decode_image_bytes, data, image_url
            )
        else:
            img = _decode_image_bytes(data, image_url)
        img = _decode_image_bytes(data, image_url)
    except (ImageDecodeError, ImageValidationError) as e:
        raise HTTPException(status_code=400, detail=str(e))
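The retry decorator is truncated by the diff; the docstring's "exponential backoff" suggests a tenacity wait strategy along these lines. A standalone sketch in which the `wait_exponential` parameters and `_fetch` helper are assumptions:

```python
import httpx
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

@retry(
    retry=retry_if_exception_type((httpx.TimeoutException, httpx.NetworkError)),
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=0.5, max=8.0),  # assumed wait strategy
)
async def _fetch(client: httpx.AsyncClient, url: str) -> bytes:
    # tenacity re-invokes the coroutine on the listed transient errors.
    resp = await client.get(url)
    resp.raise_for_status()
    return resp.content
```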

app/main.py

@@ -1,22 +1,11 @@
"""FastAPI application entry point."""
import logging
from concurrent.futures import ThreadPoolExecutor
from contextlib import asynccontextmanager
import httpx
from fastapi import FastAPI
import app.resources as resources
from app.config import (
    DET_SIZE,
    DOWNLOAD_TIMEOUT,
    HTTP_POOL_MAX_CONNECTIONS,
    HTTP_POOL_MAX_KEEPALIVE,
    INFERENCE_THREADS,
    MODEL_NAME,
    USE_TENSORRT,
)
from app.config import DET_SIZE, MODEL_NAME
from app.face import load_face_app
from app.routes import embed
@@ -26,42 +15,12 @@ logger = logging.getLogger("face_service")
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan handler - load models and shared resources on startup."""
"""Application lifespan handler - load models on startup."""
logger.info("Starting face service...")
# Create HTTP client with connection pooling
limits = httpx.Limits(
max_connections=HTTP_POOL_MAX_CONNECTIONS,
max_keepalive_connections=HTTP_POOL_MAX_KEEPALIVE,
)
resources.http_client = httpx.AsyncClient(
timeout=DOWNLOAD_TIMEOUT,
limits=limits,
# http2=True requires 'h2' package - disable for now
)
logger.info(
f"HTTP client initialized (max_conn={HTTP_POOL_MAX_CONNECTIONS}, "
f"keepalive={HTTP_POOL_MAX_KEEPALIVE})"
)
# Create thread pool for blocking operations (GPU inference, image decode)
resources.inference_executor = ThreadPoolExecutor(
max_workers=INFERENCE_THREADS,
thread_name_prefix="inference",
)
logger.info(f"Thread pool initialized (workers={INFERENCE_THREADS})")
# Load face model (may take time if TensorRT engines need building)
load_face_app()
logger.info("Face service ready")
yield
# Cleanup
logger.info("Shutting down face service...")
await resources.http_client.aclose()
resources.inference_executor.shutdown(wait=True)
logger.info("Cleanup complete")
app = FastAPI(
@@ -81,6 +40,5 @@ def healthz():
"status": "ok",
"model": MODEL_NAME,
"det_size": DET_SIZE,
"tensorrt": USE_TENSORRT,
}

app/resources.py

@@ -1,9 +0,0 @@
"""Shared application resources (HTTP client, thread pool, etc.)."""
from concurrent.futures import ThreadPoolExecutor
import httpx
# Global shared resources (initialized in app lifespan)
http_client: httpx.AsyncClient | None = None
inference_executor: ThreadPoolExecutor | None = None
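Because these globals are rebound at startup, consumers must read them through the module object (as `main.py` does with `import app.resources as resources`); a `from app.resources import http_client` at module scope would copy the value before the lifespan runs. A minimal sketch of the distinction, grounded in standard Python import semantics (`handler` is an illustrative name):

```python
import app.resources as resources

async def handler() -> None:
    # Reads the attribute at call time, so it sees the client set in lifespan.
    client = resources.http_client
    assert client is not None
    # By contrast, `from app.resources import http_client` binds the name
    # once, at import time, when the value is still None.
```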

app/routes/embed.py

@@ -8,13 +8,11 @@ from fastapi import APIRouter, HTTPException
from app.face import (
    fallback_avatar_embedding,
    get_faces_async,
    load_face_app,
    to_pixel_bbox,
    validate_embedding,
)
from app.image import download_image
from app.resources import http_client, inference_executor
from app.models import (
    BBox,
    EmbedAvatarResponse,
@@ -32,16 +30,16 @@ router = APIRouter()
async def embed_avatar(req: EmbedRequest):
    """
    Extract face embedding from an avatar image.

    Returns the largest detected face. If no face is detected,
    falls back to center crop embedding with score=0.0.
    """
    logger.info("embed_avatar: image_url=%s", req.image_url)
    fa = load_face_app()
    img = await download_image(str(req.image_url), http_client, inference_executor)
    img = await download_image(str(req.image_url))
    h, w = img.shape[:2]
    faces = await get_faces_async(fa, img, inference_executor)
    faces = fa.get(img)
    if len(faces) == 0:
        logger.warning(
            "embed_avatar: no faces detected image_url=%s size=%dx%d, using fallback",
@@ -100,15 +98,15 @@ async def embed_avatar(req: EmbedRequest):
async def embed_image(req: EmbedRequest):
    """
    Extract face embeddings from all faces in an image.

    Returns all detected faces sorted by detection score (highest first).
    Returns empty list if no faces detected.
    """
    fa = load_face_app()
    img = await download_image(str(req.image_url), http_client, inference_executor)
    img = await download_image(str(req.image_url))
    h, w = img.shape[:2]
    faces = await get_faces_async(fa, img, inference_executor)
    faces = fa.get(img)
    if len(faces) == 0:
        logger.warning(
            "embed_image: no faces detected image_url=%s size=%dx%d",

run_face_service.sh

@@ -5,66 +5,27 @@ set -euo pipefail
if [ -d ".venv" ]; then
# shellcheck disable=SC1091
source ".venv/bin/activate"
# Add TensorRT libs to library path if installed via pip
TENSORRT_LIBS=".venv/lib/python3.12/site-packages/tensorrt_libs"
if [ -d "$TENSORRT_LIBS" ]; then
export LD_LIBRARY_PATH="${TENSORRT_LIBS}:${LD_LIBRARY_PATH:-}"
fi
fi
# =============================================================================
# Model Configuration
# =============================================================================
# Model configuration (can be overridden via env)
export FACE_MODEL_NAME="${FACE_MODEL_NAME:-buffalo_l}"
export FACE_DET_SIZE="${FACE_DET_SIZE:-1024}"
# TensorRT acceleration (2-3x faster inference)
# First startup is slow (~30-60s) while TensorRT builds optimized engines
# Engines are cached in ~/.cache/onnxruntime/ for subsequent runs
export USE_TENSORRT="${USE_TENSORRT:-true}"
# =============================================================================
# Performance Tuning
# =============================================================================
# Tune CPU thread usage and workers
CPU_CORES="$(nproc || echo 4)"
DEFAULT_WORKERS="${CPU_CORES}"
if [ "$DEFAULT_WORKERS" -lt 2 ]; then
DEFAULT_WORKERS=2
fi
# GPU inference is the bottleneck - use 1 worker to avoid loading multiple
# copies of the model into GPU memory. Concurrency is handled via thread pool.
DEFAULT_WORKERS=1
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-2}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-2}"
# Thread pool for blocking operations (GPU inference, image decode)
# 4 threads allows overlapping I/O with GPU work
export INFERENCE_THREADS="${INFERENCE_THREADS:-4}"
# CPU threading for numpy/BLAS operations
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"
# =============================================================================
# HTTP Connection Pool
# =============================================================================
export HTTP_POOL_MAX_CONNECTIONS="${HTTP_POOL_MAX_CONNECTIONS:-100}"
export HTTP_POOL_MAX_KEEPALIVE="${HTTP_POOL_MAX_KEEPALIVE:-20}"
# =============================================================================
# Image Processing
# =============================================================================
# Max dimension for input images (reject larger)
export MAX_IMAGE_DIMENSION="${MAX_IMAGE_DIMENSION:-4096}"
# Downscale large images to this size before processing (det_size handles rest)
export TARGET_MAX_DIMENSION="${TARGET_MAX_DIMENSION:-2048}"
# =============================================================================
# Server Configuration
# =============================================================================
WORKERS="${UVICORN_WORKERS:-$DEFAULT_WORKERS}"
# Match genealog-api FACE_SERVICE_URL: http://host.docker.internal:18081
PORT="${PORT:-18081}"
echo "Starting face service on port ${PORT} with ${WORKERS} workers"
echo " Model: ${FACE_MODEL_NAME}, det_size: ${FACE_DET_SIZE}, TensorRT: ${USE_TENSORRT}"
echo " Thread pool: ${INFERENCE_THREADS} workers"
echo " CPU cores: ${CPU_CORES}"
echo "Starting face service on port ${PORT} with ${WORKERS} workers (CPU cores: ${CPU_CORES})"
exec uvicorn app.main:app \
--host 0.0.0.0 \