face/app/main.py

"""FastAPI application entry point."""
import logging
from concurrent.futures import ThreadPoolExecutor
from contextlib import asynccontextmanager
import httpx
from fastapi import FastAPI
import app.resources as resources
from app.config import (
    DET_SIZE,
    DOWNLOAD_TIMEOUT,
    HTTP_POOL_MAX_CONNECTIONS,
    HTTP_POOL_MAX_KEEPALIVE,
    INFERENCE_THREADS,
    MODEL_NAME,
    USE_TENSORRT,
)
from app.face import load_face_app
from app.routes import embed

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("face_service")

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler - load models and shared resources on startup."""
    logger.info("Starting face service...")

    # Create HTTP client with connection pooling
    limits = httpx.Limits(
        max_connections=HTTP_POOL_MAX_CONNECTIONS,
        max_keepalive_connections=HTTP_POOL_MAX_KEEPALIVE,
    )
    resources.http_client = httpx.AsyncClient(
        timeout=DOWNLOAD_TIMEOUT,
        limits=limits,
        # http2=True requires 'h2' package - disable for now
    )
    logger.info(
        f"HTTP client initialized (max_conn={HTTP_POOL_MAX_CONNECTIONS}, "
        f"keepalive={HTTP_POOL_MAX_KEEPALIVE})"
    )

    # Create thread pool for blocking operations (GPU inference, image decode)
    resources.inference_executor = ThreadPoolExecutor(
        max_workers=INFERENCE_THREADS,
        thread_name_prefix="inference",
    )
    logger.info(f"Thread pool initialized (workers={INFERENCE_THREADS})")

    # Load face model (may take time if TensorRT engines need building)
    load_face_app()
    logger.info("Face service ready")

    yield

    # Cleanup
    logger.info("Shutting down face service...")
    await resources.http_client.aclose()
    resources.inference_executor.shutdown(wait=True)
    logger.info("Cleanup complete")

app = FastAPI(
    title="Face Service",
    version="1.0.0",
    lifespan=lifespan,
)

# Include routers
app.include_router(embed.router)

@app.get("/healthz")
def healthz():
    """Health check endpoint."""
    return {
        "status": "ok",
        "model": MODEL_NAME,
        "det_size": DET_SIZE,
        "tensorrt": USE_TENSORRT,
    }
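
# Example invocation (a typical way to serve a FastAPI app; the host, port, and
# worker count below are illustrative assumptions, not taken from this repo):
#
#   uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers 1
#
# A single worker is usually preferable here because the lifespan handler loads
# the face model and builds a shared thread pool per process.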