feat(paddleocr-vl): add PaddleOCR-VL full pipeline Docker image and API server, plus integration tests and docker helpers
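The docker helpers in this commit are meant to be consumed from the new integration tests. The following is a minimal sketch of that wiring, not code from the commit itself: the test file location, the Vitest-style describe/beforeAll API, and the skip-on-unavailable behaviour are assumptions for illustration only.

// Hypothetical integration test using the helpers added in this commit.
// Assumes a Vitest/Jest-style runner and Node 18+ (global fetch); only
// ensurePaddleOcrVlFull comes from test/helpers/docker.ts.
import { beforeAll, describe, expect, it } from 'vitest';
import { ensurePaddleOcrVlFull } from '../helpers/docker';

describe('PaddleOCR-VL full pipeline', () => {
  let serviceReady = false;

  // Builds the image if missing, starts the container, and waits for /health.
  beforeAll(async () => {
    serviceReady = await ensurePaddleOcrVlFull();
  }, 700_000); // a little above the 10-minute health timeout

  it('responds on the health endpoint', async () => {
    if (!serviceReady) return; // e.g. no GPU available; skip the assertion
    const res = await fetch('http://localhost:8000/health');
    expect(res.ok).toBe(true);
  });
});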
test/helpers/docker.ts (Normal file, 297 lines)
@@ -0,0 +1,297 @@
import { execSync } from 'child_process';

// Project container names (only manage these)
const PROJECT_CONTAINERS = [
  'paddleocr-vl-test',
  'paddleocr-vl-gpu-test',
  'paddleocr-vl-cpu-test',
  'paddleocr-vl-full-test',
  'minicpm-test',
];

// Image configurations
export interface IImageConfig {
  name: string;
  dockerfile: string;
  buildContext: string;
  containerName: string;
  ports: string[];
  volumes?: string[];
  gpus?: boolean;
  healthEndpoint?: string;
  healthTimeout?: number;
}

export const IMAGES = {
  paddleocrVlGpu: {
    name: 'paddleocr-vl-gpu',
    dockerfile: 'Dockerfile_paddleocr_vl_gpu',
    buildContext: '.',
    containerName: 'paddleocr-vl-test',
    ports: ['8000:8000'],
    volumes: ['ht-huggingface-cache:/root/.cache/huggingface'],
    gpus: true,
    healthEndpoint: 'http://localhost:8000/health',
    healthTimeout: 300000, // 5 minutes for model loading
  } as IImageConfig,

  paddleocrVlCpu: {
    name: 'paddleocr-vl-cpu',
    dockerfile: 'Dockerfile_paddleocr_vl_cpu',
    buildContext: '.',
    containerName: 'paddleocr-vl-test',
    ports: ['8000:8000'],
    volumes: ['ht-huggingface-cache:/root/.cache/huggingface'],
    gpus: false,
    healthEndpoint: 'http://localhost:8000/health',
    healthTimeout: 300000,
  } as IImageConfig,

  minicpm: {
    name: 'minicpm45v',
    dockerfile: 'Dockerfile_minicpm45v',
    buildContext: '.',
    containerName: 'minicpm-test',
    ports: ['11434:11434'],
    volumes: ['ht-ollama-models:/root/.ollama'],
    gpus: true,
    healthEndpoint: 'http://localhost:11434/api/tags',
    healthTimeout: 120000,
  } as IImageConfig,

  // Full PaddleOCR-VL pipeline with PP-DocLayoutV2 + structured JSON output
  paddleocrVlFull: {
    name: 'paddleocr-vl-full',
    dockerfile: 'Dockerfile_paddleocr_vl_full',
    buildContext: '.',
    containerName: 'paddleocr-vl-full-test',
    ports: ['8000:8000'],
    volumes: [
      'ht-huggingface-cache:/root/.cache/huggingface',
      'ht-paddleocr-cache:/root/.paddleocr',
    ],
    gpus: true,
    healthEndpoint: 'http://localhost:8000/health',
    healthTimeout: 600000, // 10 minutes for model loading (vLLM + PP-DocLayoutV2)
  } as IImageConfig,
};

/**
 * Execute a shell command and return its output.
 * With `silent` set, output is captured (stdio 'pipe') and failures return an
 * empty string; otherwise output is streamed to the console and errors are rethrown.
 */
function exec(command: string, silent = false): string {
  try {
    return execSync(command, {
      encoding: 'utf-8',
      stdio: silent ? 'pipe' : 'inherit',
    });
  } catch (err: unknown) {
    if (silent) return '';
    throw err;
  }
}

/**
 * Check if a Docker image exists locally
 */
export function imageExists(imageName: string): boolean {
  const result = exec(`docker images -q ${imageName}`, true);
  return result.trim().length > 0;
}

/**
 * Check if a container is running
 */
export function isContainerRunning(containerName: string): boolean {
  const result = exec(`docker ps --filter "name=^${containerName}$" --format "{{.Names}}"`, true);
  return result.trim() === containerName;
}

/**
 * Check if a container exists (running or stopped)
 */
export function containerExists(containerName: string): boolean {
  const result = exec(`docker ps -a --filter "name=^${containerName}$" --format "{{.Names}}"`, true);
  return result.trim() === containerName;
}

/**
 * Stop and remove a container
 */
export function removeContainer(containerName: string): void {
  if (containerExists(containerName)) {
    console.log(`[Docker] Removing container: ${containerName}`);
    exec(`docker rm -f ${containerName}`, true);
  }
}

/**
 * Stop all project containers that conflict with the required one
 */
export function stopConflictingContainers(requiredContainer: string, requiredPort: string): void {
  // Stop project containers using the same port
  for (const container of PROJECT_CONTAINERS) {
    if (container === requiredContainer) continue;

    if (isContainerRunning(container)) {
      // Check if this container uses the same port
      const ports = exec(`docker port ${container} 2>/dev/null || true`, true);
      if (ports.includes(requiredPort.split(':')[0])) {
        console.log(`[Docker] Stopping conflicting container: ${container}`);
        exec(`docker stop ${container}`, true);
      }
    }
  }
}

/**
 * Build a Docker image
 */
export function buildImage(config: IImageConfig): void {
  console.log(`[Docker] Building image: ${config.name}`);
  const cmd = `docker build --load -f ${config.dockerfile} -t ${config.name} ${config.buildContext}`;
  exec(cmd);
}

/**
 * Start a container from an image
 */
export function startContainer(config: IImageConfig): void {
  // Remove existing container if it exists
  removeContainer(config.containerName);

  console.log(`[Docker] Starting container: ${config.containerName}`);

  const portArgs = config.ports.map((p) => `-p ${p}`).join(' ');
  const volumeArgs = config.volumes?.map((v) => `-v ${v}`).join(' ') || '';
  const gpuArgs = config.gpus ? '--gpus all' : '';

  const cmd = `docker run -d --name ${config.containerName} ${gpuArgs} ${portArgs} ${volumeArgs} ${config.name}`;
  exec(cmd);
}

/**
 * Wait for a container to become healthy
 */
export async function waitForHealth(
  endpoint: string,
  timeoutMs: number = 120000,
  intervalMs: number = 5000
): Promise<boolean> {
  const startTime = Date.now();
  console.log(`[Docker] Waiting for health: ${endpoint}`);

  while (Date.now() - startTime < timeoutMs) {
    try {
      const response = await fetch(endpoint, {
        method: 'GET',
        signal: AbortSignal.timeout(5000),
      });
      if (response.ok) {
        console.log(`[Docker] Service healthy!`);
        return true;
      }
    } catch {
      // Service not ready yet
    }

    const elapsed = Math.round((Date.now() - startTime) / 1000);
    console.log(`[Docker] Waiting... (${elapsed}s)`);
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }

  console.log(`[Docker] Health check timeout after ${timeoutMs / 1000}s`);
  return false;
}

/**
 * Ensure a service is running and healthy
 * - Builds image if missing
 * - Stops conflicting project containers
 * - Starts container if not running
 * - Waits for health check
 */
export async function ensureService(config: IImageConfig): Promise<boolean> {
  console.log(`\n[Docker] Ensuring service: ${config.name}`);

  // Build image if it doesn't exist
  if (!imageExists(config.name)) {
    console.log(`[Docker] Image not found, building...`);
    buildImage(config);
  }

  // Stop conflicting containers on the same port
  const mainPort = config.ports[0];
  stopConflictingContainers(config.containerName, mainPort);

  // Start container if not running
  if (!isContainerRunning(config.containerName)) {
    startContainer(config);
  } else {
    console.log(`[Docker] Container already running: ${config.containerName}`);
  }

  // Wait for health
  if (config.healthEndpoint) {
    return waitForHealth(config.healthEndpoint, config.healthTimeout);
  }

  return true;
}

/**
 * Ensure PaddleOCR-VL GPU service is running
 */
export async function ensurePaddleOcrVlGpu(): Promise<boolean> {
  return ensureService(IMAGES.paddleocrVlGpu);
}

/**
 * Ensure PaddleOCR-VL CPU service is running
 */
export async function ensurePaddleOcrVlCpu(): Promise<boolean> {
  return ensureService(IMAGES.paddleocrVlCpu);
}

/**
 * Ensure MiniCPM service is running
 */
export async function ensureMiniCpm(): Promise<boolean> {
  return ensureService(IMAGES.minicpm);
}

/**
 * Check if GPU is available
 */
export function isGpuAvailable(): boolean {
  try {
    const result = exec('nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null', true);
    return result.trim().length > 0;
  } catch {
    return false;
  }
}

/**
 * Ensure PaddleOCR-VL service (auto-detect GPU/CPU)
 */
export async function ensurePaddleOcrVl(): Promise<boolean> {
  if (isGpuAvailable()) {
    console.log('[Docker] GPU detected, using GPU image');
    return ensurePaddleOcrVlGpu();
  } else {
    console.log('[Docker] No GPU detected, using CPU image');
    return ensurePaddleOcrVlCpu();
  }
}

/**
 * Ensure the PaddleOCR-VL Full Pipeline service is running (PP-DocLayoutV2 + structured output).
 * This is the recommended service for production use; it outputs structured JSON/Markdown.
 */
export async function ensurePaddleOcrVlFull(): Promise<boolean> {
  if (!isGpuAvailable()) {
    console.log('[Docker] WARNING: Full pipeline requires GPU, but none detected');
  }
  return ensureService(IMAGES.paddleocrVlFull);
}
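Because ensureService is driven entirely by IImageConfig, additional backends can reuse the same build/start/health flow. Below is a minimal sketch of such an extension; the Dockerfile, ports, and names are hypothetical and not part of this commit, and the new container name would also need to be added to PROJECT_CONTAINERS inside docker.ts so port-conflict handling covers it.

// Hypothetical extra service config reusing ensureService; all names and ports are placeholders.
import { ensureService, IImageConfig } from './docker';

const newBackend: IImageConfig = {
  name: 'new-backend',
  dockerfile: 'Dockerfile_new_backend',
  buildContext: '.',
  containerName: 'new-backend-test', // would also need adding to PROJECT_CONTAINERS in docker.ts
  ports: ['9000:9000'],
  gpus: false,
  healthEndpoint: 'http://localhost:9000/health',
  healthTimeout: 180000, // 3 minutes
};

export async function ensureNewBackend(): Promise<boolean> {
  return ensureService(newBackend);
}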