Zhen Ye Claude Opus 4.6 committed on
Commit
078b447
·
1 Parent(s): 0ace9ca

feat: add benchmark profiler & roofline analysis system

Browse files

Add hardware extraction, per-frame GPU/CPU profiling with CUDA events,
and automated roofline analysis for detection and segmentation modes.

New endpoints:
- GET /benchmark/hardware — cached hardware specs
- POST /benchmark/profile — per-frame timing breakdown
- POST /benchmark/analysis — full roofline with bottleneck ID

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (5) hide show
  1. app.py +141 -0
  2. requirements.txt +1 -0
  3. utils/hardware_info.py +385 -0
  4. utils/profiler.py +451 -0
  5. utils/roofline.py +294 -0
app.py CHANGED
@@ -1017,5 +1017,146 @@ async def gpu_monitor_endpoint(duration: int = 180, interval: int = 1):
1017
  return StreamingResponse(_stream(), media_type="text/plain")
1018
 
1019
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1020
  if __name__ == "__main__":
1021
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
1017
  return StreamingResponse(_stream(), media_type="text/plain")
1018
 
1019
 
1020
+ # ---------------------------------------------------------------------------
1021
+ # Benchmark Profiler & Roofline Analysis Endpoints
1022
+ # ---------------------------------------------------------------------------
1023
+
1024
@app.get("/benchmark/hardware")
async def benchmark_hardware():
    """Return cached hardware specs as JSON (no video upload required)."""
    import dataclasses

    from utils.hardware_info import get_hardware_info

    # get_hardware_info() is lru_cached; run it in a worker thread so the
    # first (uncached, probing) call does not block the event loop.
    specs = await asyncio.to_thread(get_hardware_info)
    return JSONResponse(dataclasses.asdict(specs))
1032
+
1033
+
1034
@app.post("/benchmark/profile")
async def benchmark_profile(
    video: UploadFile = File(...),
    mode: str = Form("detection"),
    detector: str = Form("hf_yolov8"),
    segmenter: str = Form("gsam2_large"),
    queries: str = Form("person,car,truck"),
    max_frames: int = Form(100),
    warmup_frames: int = Form(5),
    step: int = Form(20),
):
    """Run profiled inference and return per-frame timing breakdown.

    Args:
        video: Video file to profile.
        mode: "detection" or "segmentation".
        detector: Detector key (for detection mode).
        segmenter: Segmenter key (for segmentation mode).
        queries: Comma-separated object classes.
        max_frames: Maximum frames to profile.
        warmup_frames: Warmup frames (detection only).
        step: Keyframe interval (segmentation only).

    Raises:
        HTTPException: 400 on a bad ``mode``, 500 when profiling fails.
    """
    import dataclasses

    from utils.profiler import run_profiled_detection, run_profiled_segmentation

    if mode not in ("detection", "segmentation"):
        raise HTTPException(status_code=400, detail="mode must be 'detection' or 'segmentation'")

    input_path = _save_upload_to_tmp(video)
    await video.close()

    query_list = [q.strip() for q in queries.split(",") if q.strip()]

    try:
        if mode == "detection":
            result = await asyncio.to_thread(
                run_profiled_detection,
                input_path, detector, query_list,
                max_frames=max_frames, warmup_frames=warmup_frames,
            )
        else:
            result = await asyncio.to_thread(
                run_profiled_segmentation,
                input_path, segmenter, query_list,
                max_frames=max_frames, step=step,
            )
    except Exception as exc:
        logging.exception("Profiling failed")
        # Chain the cause so the original traceback survives in the log.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        # Single cleanup point. The original also deleted inside the
        # except block, which made the finally a redundant second delete.
        _safe_delete(input_path)

    out = dataclasses.asdict(result)
    # run_profiled_segmentation may stash stage metrics on a private
    # attribute; dataclasses.asdict() does not pick those up.
    gsam2 = getattr(result, "_gsam2_metrics", None)
    if gsam2:
        out["gsam2_metrics"] = gsam2
    return JSONResponse(out)
1095
+
1096
+
1097
@app.post("/benchmark/analysis")
async def benchmark_analysis(
    video: UploadFile = File(...),
    mode: str = Form("detection"),
    detector: str = Form("hf_yolov8"),
    segmenter: str = Form("gsam2_large"),
    queries: str = Form("person,car,truck"),
    max_frames: int = Form(100),
    warmup_frames: int = Form(5),
    step: int = Form(20),
):
    """Full roofline analysis: hardware + profiling + theoretical ceilings + bottleneck ID.

    Combines hardware extraction, profiled inference, and roofline model
    to identify bottlenecks and provide actionable recommendations.

    Raises:
        HTTPException: 400 on a bad ``mode``, 500 when any stage fails.
    """
    import dataclasses

    from utils.hardware_info import get_hardware_info
    from utils.profiler import run_profiled_detection, run_profiled_segmentation
    from utils.roofline import compute_roofline

    if mode not in ("detection", "segmentation"):
        raise HTTPException(status_code=400, detail="mode must be 'detection' or 'segmentation'")

    input_path = _save_upload_to_tmp(video)
    await video.close()

    query_list = [q.strip() for q in queries.split(",") if q.strip()]

    try:
        # Hardware specs are lru_cached; only the first call probes the system.
        hardware = await asyncio.to_thread(get_hardware_info)

        if mode == "detection":
            profiling = await asyncio.to_thread(
                run_profiled_detection,
                input_path, detector, query_list,
                max_frames=max_frames, warmup_frames=warmup_frames,
            )
        else:
            profiling = await asyncio.to_thread(
                run_profiled_segmentation,
                input_path, segmenter, query_list,
                max_frames=max_frames, step=step,
            )

        roofline = compute_roofline(hardware, profiling)
    except Exception as exc:
        logging.exception("Benchmark analysis failed")
        # Chain the cause so the original traceback survives in the log.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        # Single cleanup point. The original also deleted inside the
        # except block, double-deleting the temp file.
        _safe_delete(input_path)

    return JSONResponse({
        "hardware": dataclasses.asdict(hardware),
        "profiling": dataclasses.asdict(profiling),
        "roofline": dataclasses.asdict(roofline),
    })
1159
+
1160
+
1161
  if __name__ == "__main__":
1162
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
requirements.txt CHANGED
@@ -13,3 +13,4 @@ sentence-transformers
13
  SAM-2 @ git+https://github.com/facebookresearch/sam2.git
14
  hydra-core>=1.3.2
15
  iopath>=0.1.10
 
 
13
  SAM-2 @ git+https://github.com/facebookresearch/sam2.git
14
  hydra-core>=1.3.2
15
  iopath>=0.1.10
16
+ psutil
utils/hardware_info.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hardware specification extraction for roofline analysis.
2
+
3
+ Extracts CPU, GPU, memory, and storage parameters via system tools
4
+ and torch APIs. All functions have try/except fallbacks returning None
5
+ for inaccessible fields.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import platform
11
+ import re
12
+ import subprocess
13
+ from dataclasses import dataclass, field
14
+ from functools import lru_cache
15
+ from typing import Dict, List, Optional
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # CUDA cores per SM by compute capability (major, minor) -> cores_per_sm
20
+ # Kepler through Blackwell
21
+ _CORES_PER_SM: Dict[tuple, int] = {
22
+ (3, 0): 192, (3, 2): 192, (3, 5): 192, (3, 7): 192, # Kepler
23
+ (5, 0): 128, (5, 2): 128, (5, 3): 128, # Maxwell
24
+ (6, 0): 64, (6, 1): 128, (6, 2): 128, # Pascal
25
+ (7, 0): 64, (7, 2): 64, (7, 5): 64, # Volta / Turing
26
+ (8, 0): 64, (8, 6): 128, (8, 7): 128, (8, 9): 128, # Ampere / Ada
27
+ (9, 0): 128, # Hopper
28
+ (10, 0): 128, # Blackwell
29
+ }
30
+
31
+ # PCIe bandwidth (GB/s, unidirectional) by gen and width
32
+ _PCIE_BW: Dict[int, float] = {
33
+ 3: 0.985, # ~1 GB/s per lane
34
+ 4: 1.969,
35
+ 5: 3.938,
36
+ 6: 7.563,
37
+ }
38
+
39
+
40
@dataclass
class CPUInfo:
    """CPU parameters for roofline analysis; a field stays None when undetected."""
    model: Optional[str] = None            # marketing name, e.g. from lscpu "Model name"
    physical_cores: Optional[int] = None   # real cores (psutil, logical=False)
    logical_cores: Optional[int] = None    # hyperthreads included (os.cpu_count)
    frequency_mhz: Optional[float] = None  # current or max reported frequency
    cache_l2_kb: Optional[int] = None
    cache_l3_kb: Optional[int] = None
    architecture: Optional[str] = None     # platform.machine(), e.g. "x86_64"
49
+
50
+
51
@dataclass
class MemoryInfo:
    """System RAM totals plus a rough bandwidth estimate (None = undetected)."""
    total_gb: Optional[float] = None
    available_gb: Optional[float] = None
    estimated_bandwidth_gbps: Optional[float] = None  # heuristic, not measured
56
+
57
+
58
@dataclass
class GPUInfo:
    """Per-GPU specs and derived roofline ceilings (None = undetected)."""
    index: int = 0                                   # CUDA device ordinal
    name: Optional[str] = None
    sm_count: Optional[int] = None                   # streaming multiprocessors
    cuda_cores: Optional[int] = None                 # sm_count * cores-per-SM table
    clock_mhz: Optional[float] = None                # max graphics clock
    memory_clock_mhz: Optional[float] = None
    memory_bus_width_bits: Optional[int] = None
    vram_total_gb: Optional[float] = None
    vram_free_gb: Optional[float] = None
    memory_bandwidth_gbps: Optional[float] = None    # derived from clock * bus width
    fp32_tflops: Optional[float] = None              # derived: cores * clock * 2 (FMA)
    fp16_tflops: Optional[float] = None
    tensor_core_tflops: Optional[float] = None       # rough multiple of fp32
    pcie_gen: Optional[int] = None
    pcie_width: Optional[int] = None
    pcie_bandwidth_gbps: Optional[float] = None      # derived from gen * lanes
    compute_capability: Optional[str] = None         # "major.minor"
    driver_version: Optional[str] = None
    cuda_version: Optional[str] = None               # max CUDA per nvidia-smi banner
79
+
80
+
81
@dataclass
class StorageInfo:
    """Best-effort storage characteristics."""
    storage_type: Optional[str] = None           # "SSD", "HDD", or "Unknown"
    sequential_read_mbps: Optional[float] = None  # rough throughput estimate
85
+
86
+
87
@dataclass
class HardwareInfo:
    """Aggregate hardware profile returned by get_hardware_info()."""
    cpu: CPUInfo = field(default_factory=CPUInfo)
    memory: MemoryInfo = field(default_factory=MemoryInfo)
    gpus: List[GPUInfo] = field(default_factory=list)   # one entry per CUDA device
    storage: StorageInfo = field(default_factory=StorageInfo)
    system: Optional[str] = None                        # "<OS> <kernel release>"
    python_version: Optional[str] = None
    torch_version: Optional[str] = None
    cuda_runtime_version: Optional[str] = None          # torch.version.cuda, if CUDA available
97
+
98
+
99
def _run_cmd(cmd: List[str], timeout: int = 10) -> Optional[str]:
    """Execute *cmd* and return its stripped stdout, or None on any failure.

    Failure covers: missing binary, timeout, OS error, or a non-zero exit.
    """
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None
    return proc.stdout.strip() if proc.returncode == 0 else None
110
+
111
+
112
def _nvidia_smi_query(*fields: str) -> Optional[Dict[str, str]]:
    """Query nvidia-smi for *fields* and map field name -> value.

    Only the first output line (GPU 0) is parsed; returns None when
    nvidia-smi is unavailable or the output does not match the request.
    """
    joined = ",".join(fields)
    raw = _run_cmd([
        "nvidia-smi",
        f"--query-gpu={joined}",
        "--format=csv,noheader,nounits",
    ])
    if not raw:
        return None
    first_line = raw.split("\n")[0]
    values = [part.strip() for part in first_line.split(",")]
    # Malformed / truncated output: bail rather than mis-associate fields.
    if len(values) != len(fields):
        return None
    return dict(zip(fields, values))
126
+
127
+
128
def _parse_lscpu_cache_kb(value: str) -> Optional[int]:
    """Parse an lscpu cache field (e.g. "4 MiB", "512 KiB") into KiB, or None.

    A single "M" test covers both "MiB" and "M" spellings; the original
    code's `"MiB" in val or "M" in val` was redundant.
    """
    m = re.search(r"([\d.]+)", value)
    if not m:
        return None
    size = float(m.group(1))
    if "M" in value:
        size *= 1024
    return int(size)


def get_cpu_info() -> CPUInfo:
    """Collect CPU model, core counts, frequency, and cache sizes.

    Best-effort: any field that cannot be determined is left as None.
    Uses psutil when available, plus lscpu (Linux) or sysctl (macOS).
    """
    info = CPUInfo()
    try:
        info.architecture = platform.machine()
        info.logical_cores = os.cpu_count()

        try:
            import psutil
            info.physical_cores = psutil.cpu_count(logical=False)
            freq = psutil.cpu_freq()
            if freq:
                # Prefer the live frequency; fall back to the rated max.
                info.frequency_mhz = freq.current or freq.max
        except ImportError:
            pass

        system = platform.system()
        if system == "Linux":
            out = _run_cmd(["lscpu"])
            if out:
                for line in out.split("\n"):
                    if "Model name" in line:
                        info.model = line.split(":", 1)[1].strip()
                    elif "L2 cache" in line:
                        info.cache_l2_kb = _parse_lscpu_cache_kb(line.split(":", 1)[1])
                    elif "L3 cache" in line:
                        info.cache_l3_kb = _parse_lscpu_cache_kb(line.split(":", 1)[1])
        elif system == "Darwin":
            brand = _run_cmd(["sysctl", "-n", "machdep.cpu.brand_string"])
            if brand:
                info.model = brand
            # sysctl reports cache sizes in bytes.
            for key, attr in (("hw.l2cachesize", "cache_l2_kb"),
                              ("hw.l3cachesize", "cache_l3_kb")):
                raw = _run_cmd(["sysctl", "-n", key])
                if raw:
                    try:
                        setattr(info, attr, int(raw) // 1024)
                    except ValueError:
                        pass
    except Exception:
        logger.debug("CPU info extraction partially failed", exc_info=True)
    return info
185
+
186
+
187
def get_memory_info() -> MemoryInfo:
    """Report system RAM totals plus a rough bandwidth estimate.

    Prefers psutil; falls back to /proc/meminfo on Linux without psutil.
    """
    info = MemoryInfo()
    try:
        try:
            import psutil
        except ImportError:
            psutil = None

        if psutil is not None:
            vm = psutil.virtual_memory()
            info.total_gb = round(vm.total / (1024 ** 3), 2)
            info.available_gb = round(vm.available / (1024 ** 3), 2)
        elif os.path.exists("/proc/meminfo"):
            # Linux fallback: values in /proc/meminfo are in KiB.
            with open("/proc/meminfo") as fh:
                for line in fh:
                    if line.startswith("MemTotal:"):
                        info.total_gb = round(int(line.split()[1]) / (1024 ** 2), 2)
                    elif line.startswith("MemAvailable:"):
                        info.available_gb = round(int(line.split()[1]) / (1024 ** 2), 2)

        # Without dmidecode the DIMM type is unknown; assume conservative
        # dual-channel DDR4 (~40 GB/s) rather than guessing DDR5.
        if info.total_gb:
            info.estimated_bandwidth_gbps = 40.0
    except Exception:
        logger.debug("Memory info extraction partially failed", exc_info=True)
    return info
214
+
215
+
216
def _parse_smi_float(raw: str) -> Optional[float]:
    """Parse one nvidia-smi numeric field; None on "[N/A]" or garbage."""
    try:
        return float(raw)
    except (ValueError, TypeError):
        return None


def get_gpu_info() -> List[GPUInfo]:
    """Enumerate CUDA GPUs with specs and derived roofline parameters.

    Combines torch device properties with per-GPU nvidia-smi queries.
    Derived figures (bandwidth, TFLOPS, PCIe) are computed only when
    all of their inputs were successfully read.
    """
    gpus: List[GPUInfo] = []
    try:
        import torch
        if not torch.cuda.is_available():
            return gpus

        device_count = torch.cuda.device_count()

        # Driver / CUDA versions are system-global: query once.
        driver_version = None
        smi_cuda_version = None
        nv = _nvidia_smi_query("driver_version")
        if nv:
            driver_version = nv.get("driver_version")
        # The banner reports the max CUDA runtime the driver supports.
        smi_out = _run_cmd(["nvidia-smi"])
        if smi_out:
            m = re.search(r"CUDA Version:\s+([\d.]+)", smi_out)
            if m:
                smi_cuda_version = m.group(1)

        for i in range(device_count):
            gpu = GPUInfo(index=i)
            props = torch.cuda.get_device_properties(i)

            gpu.name = props.name
            gpu.sm_count = props.multi_processor_count
            # FIX: torch's attribute is total_memory, not total_mem. The old
            # spelling raised AttributeError, which the outer except swallowed,
            # so the whole GPU list silently came back empty.
            gpu.vram_total_gb = round(props.total_memory / (1024 ** 3), 2)
            cc = (props.major, props.minor)
            gpu.compute_capability = f"{props.major}.{props.minor}"
            gpu.driver_version = driver_version
            gpu.cuda_version = smi_cuda_version

            cores_per_sm = _CORES_PER_SM.get(cc)
            if cores_per_sm and gpu.sm_count:
                gpu.cuda_cores = gpu.sm_count * cores_per_sm

            # Per-GPU queries: clocks, bus width, PCIe link, free VRAM.
            nv_data = _run_cmd([
                "nvidia-smi",
                f"--id={i}",
                "--query-gpu=clocks.max.graphics,clocks.max.memory,memory.bus_width,pcie.link.gen.current,pcie.link.width.current,memory.free",
                "--format=csv,noheader,nounits",
            ])
            if nv_data:
                parts = [p.strip() for p in nv_data.split(",")]
                if len(parts) >= 6:
                    gpu.clock_mhz = _parse_smi_float(parts[0])
                    gpu.memory_clock_mhz = _parse_smi_float(parts[1])
                    bus = _parse_smi_float(parts[2])
                    gpu.memory_bus_width_bits = int(bus) if bus is not None else None
                    gen = _parse_smi_float(parts[3])
                    gpu.pcie_gen = int(gen) if gen is not None else None
                    width = _parse_smi_float(parts[4])
                    gpu.pcie_width = int(width) if width is not None else None
                    free_mib = _parse_smi_float(parts[5])
                    gpu.vram_free_gb = round(free_mib / 1024, 2) if free_mib is not None else None

            # Derived: memory bandwidth, GB/s (DDR: 2 transfers per clock).
            if gpu.memory_clock_mhz and gpu.memory_bus_width_bits:
                gpu.memory_bandwidth_gbps = round(
                    gpu.memory_clock_mhz * gpu.memory_bus_width_bits * 2 / 8 / 1000, 1
                )

            # Derived: FP32 TFLOPS = cores * clock * 2 (FMA) / 1e6.
            if gpu.cuda_cores and gpu.clock_mhz:
                gpu.fp32_tflops = round(gpu.cuda_cores * gpu.clock_mhz * 2 / 1e6, 2)
                # FP16 rate is roughly 2x FP32 from Volta (cc 7.x) onward.
                if props.major >= 7:
                    gpu.fp16_tflops = round(gpu.fp32_tflops * 2, 2)
                else:
                    gpu.fp16_tflops = gpu.fp32_tflops

            # Rough tensor-core ceiling: 8x FP32 on Ampere+, 4x Volta/Turing.
            if gpu.fp32_tflops:
                if props.major >= 8:
                    gpu.tensor_core_tflops = round(gpu.fp32_tflops * 8, 2)
                elif props.major >= 7:
                    gpu.tensor_core_tflops = round(gpu.fp32_tflops * 4, 2)

            # Derived: PCIe bandwidth = per-lane rate * lane count.
            if gpu.pcie_gen and gpu.pcie_width:
                per_lane = _PCIE_BW.get(gpu.pcie_gen, 0)
                gpu.pcie_bandwidth_gbps = round(per_lane * gpu.pcie_width, 2)

            gpus.append(gpu)

    except Exception:
        logger.debug("GPU info extraction partially failed", exc_info=True)
    return gpus
325
+
326
+
327
def get_storage_info() -> StorageInfo:
    """Detect storage type (SSD/HDD) and estimate throughput.

    NOTE(review): the dd test copies /dev/zero -> /dev/null, which measures
    memory/pipe throughput, not actual disk reads — treat the number as an
    upper bound. A true disk benchmark would read a real file with cold
    caches; confirm whether that precision is needed here.
    """
    info = StorageInfo()
    try:
        system = platform.system()
        if system == "Linux":
            # lsblk ROTA=1 means rotational (HDD); first block device wins.
            out = _run_cmd(["lsblk", "-d", "-o", "NAME,ROTA", "--noheadings"])
            if out:
                for line in out.strip().split("\n"):
                    parts = line.split()
                    if len(parts) == 2:
                        info.storage_type = "HDD" if parts[1] == "1" else "SSD"
                        break

            # dd reports throughput on stderr, so invoke subprocess directly
            # (_run_cmd only captures stdout). FIX: the original ran the same
            # dd twice and discarded the first run's result entirely.
            try:
                result = subprocess.run(
                    ["dd", "if=/dev/zero", "of=/dev/null", "bs=1M", "count=256"],
                    capture_output=True, text=True, timeout=15,
                )
                m = re.search(r"([\d.]+)\s*(GB|MB)/s", result.stderr)
                if m:
                    speed = float(m.group(1))
                    if m.group(2) == "GB":
                        speed *= 1000
                    info.sequential_read_mbps = round(speed, 0)
            except Exception:
                pass
        elif system == "Darwin":
            info.storage_type = "SSD"  # modern Macs ship NVMe SSDs
    except Exception:
        logger.debug("Storage info extraction partially failed", exc_info=True)
    return info
367
+
368
+
369
@lru_cache(maxsize=1)
def get_hardware_info() -> HardwareInfo:
    """Collect and cache the complete hardware profile for this machine.

    The lru_cache means the (potentially slow) system probing happens once
    per process; callers can invoke this freely afterwards.
    """
    import torch

    return HardwareInfo(
        cpu=get_cpu_info(),
        memory=get_memory_info(),
        gpus=get_gpu_info(),
        storage=get_storage_info(),
        system=f"{platform.system()} {platform.release()}",
        python_version=platform.python_version(),
        torch_version=torch.__version__,
        cuda_runtime_version=torch.version.cuda if torch.cuda.is_available() else None,
    )
utils/profiler.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Per-frame GPU/CPU profiling for detection and segmentation pipelines.
2
+
3
+ Provides CUDA event-based timing and decomposed profiling for
4
+ transformers-based and opaque (YOLO) detectors. Runs in a dedicated
5
+ single-threaded path for accurate, reproducible measurements.
6
+ """
7
+
8
+ import logging
9
+ import statistics
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from typing import Dict, List, Optional, Sequence
13
+
14
+ import cv2
15
+ import numpy as np
16
+ import torch
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Detectors whose predict() can be decomposed into processor -> model -> post_process
21
+ _DECOMPOSABLE_DETECTORS = {"detr_resnet50", "grounding_dino"}
22
+ # Detectors with opaque predict() calls (YOLO-based)
23
+ _OPAQUE_DETECTORS = {"hf_yolov8", "drone_yolo"}
24
+
25
+
26
@dataclass
class TimingStats:
    """Summary statistics (milliseconds) over a list of timing samples."""
    min_ms: float = 0.0
    max_ms: float = 0.0
    mean_ms: float = 0.0
    std_ms: float = 0.0   # sample stdev; 0.0 when fewer than 2 samples
    p50_ms: float = 0.0
    p95_ms: float = 0.0
    p99_ms: float = 0.0
    count: int = 0

    @staticmethod
    def from_samples(samples: List[float]) -> "TimingStats":
        """Aggregate raw samples; an empty input yields an all-zero result.

        Percentiles use nearest-rank indexing on the sorted samples.
        """
        if not samples:
            return TimingStats()
        ordered = sorted(samples)
        n = len(ordered)
        return TimingStats(
            min_ms=ordered[0],
            max_ms=ordered[-1],
            mean_ms=statistics.mean(ordered),
            std_ms=statistics.stdev(ordered) if n > 1 else 0.0,
            p50_ms=ordered[n // 2],
            p95_ms=ordered[int(n * 0.95)],
            p99_ms=ordered[int(n * 0.99)],
            count=n,
        )
54
+
55
+
56
@dataclass
class FrameTiming:
    """Timing breakdown for one frame; every duration is in milliseconds."""
    frame_idx: int = 0
    decode_ms: float = 0.0       # video frame decode cost (representative)
    preprocess_ms: float = 0.0   # CPU: image processor / resize
    transfer_ms: float = 0.0     # host -> device copy; -1.0 when not separable
    gpu_kernel_ms: float = 0.0   # GPU model forward pass
    postprocess_ms: float = 0.0  # CPU: post-processing + NMS
    total_ms: float = 0.0        # sum of phases (or wall time for opaque path)
    num_detections: int = 0
67
+
68
+
69
@dataclass
class ProfilingResult:
    """Complete profiling output for one video run."""
    detector_name: str = ""      # detector or segmenter registry key
    mode: str = ""               # "detection" or "segmentation"
    total_frames: int = 0        # frames consumed (warmup included)
    warmup_frames: int = 0
    profiled_frames: int = 0     # frames with recorded timings
    video_resolution: str = ""   # "WxH"
    video_fps: float = 0.0

    # Raw per-frame measurements
    frame_timings: List[FrameTiming] = field(default_factory=list)

    # Aggregates per phase
    decode_stats: TimingStats = field(default_factory=TimingStats)
    preprocess_stats: TimingStats = field(default_factory=TimingStats)
    transfer_stats: TimingStats = field(default_factory=TimingStats)
    gpu_kernel_stats: TimingStats = field(default_factory=TimingStats)
    postprocess_stats: TimingStats = field(default_factory=TimingStats)
    total_stats: TimingStats = field(default_factory=TimingStats)

    # GPU memory footprint (MiB)
    gpu_peak_memory_mb: float = 0.0
    gpu_allocated_mb: float = 0.0

    # Throughput summary
    avg_fps: float = 0.0
    avg_detections_per_frame: float = 0.0
98
+
99
+
100
class CudaTimer:
    """GPU interval timer built on CUDA events.

    start()/stop() enqueue timing events on the current CUDA stream without
    blocking the host; elapsed_ms() synchronizes on the end event and then
    reads the measured interval.
    """

    def __init__(self):
        self._start = torch.cuda.Event(enable_timing=True)
        self._end = torch.cuda.Event(enable_timing=True)

    def start(self):
        """Record the start event on the current stream (non-blocking)."""
        self._start.record()

    def stop(self):
        """Record the end event on the current stream (non-blocking)."""
        self._end.record()

    def elapsed_ms(self) -> float:
        """Block until the end event completes; return elapsed milliseconds."""
        self._end.synchronize()
        return self._start.elapsed_time(self._end)
120
+
121
+
122
def _profile_decomposed(detector, frame: np.ndarray, queries: Sequence[str]) -> FrameTiming:
    """Profile a transformers-based detector with decomposed phases.

    Works for DETR and Grounding DINO where we can separate:
    processor(image) -> .to(device) -> model(**inputs) -> post_process()
    """
    timing = FrameTiming()

    # 1. Preprocess (CPU): color conversion + HF processor
    t0 = time.perf_counter()
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    if not hasattr(detector, "processor"):
        # Unknown detector layout: report preprocess-only timing.
        timing.preprocess_ms = (time.perf_counter() - t0) * 1000
        # FIX: total_ms was left at 0.0 on this path, skewing aggregates.
        timing.total_ms = timing.preprocess_ms
        return timing

    processor = detector.processor
    if hasattr(detector, "_build_prompt"):
        # Grounding DINO: text-conditioned detection
        prompt = detector._build_prompt(queries)
        inputs = processor(images=frame_rgb, text=prompt, return_tensors="pt")
    else:
        # DETR: image-only
        inputs = processor(images=frame_rgb, return_tensors="pt")
    timing.preprocess_ms = (time.perf_counter() - t0) * 1000

    # 2. CPU -> GPU transfer
    transfer_timer = CudaTimer()
    transfer_timer.start()
    inputs = {key: value.to(detector.device) for key, value in inputs.items()}
    transfer_timer.stop()
    timing.transfer_ms = transfer_timer.elapsed_ms()

    # 3. GPU forward pass
    kernel_timer = CudaTimer()
    kernel_timer.start()
    with torch.no_grad():
        outputs = detector.model(**inputs)
    kernel_timer.stop()
    timing.gpu_kernel_ms = kernel_timer.elapsed_ms()

    # 4. Post-process. NOTE(review): measured with perf_counter and no CUDA
    # sync; target_sizes lives on-device, so any GPU work launched here is
    # attributed as host wall time — confirm this is acceptable precision.
    t0 = time.perf_counter()
    target_sizes = torch.tensor([frame.shape[:2]], device=detector.device)
    if hasattr(detector, "_post_process"):
        # Grounding DINO custom post-process
        processed_list = detector._post_process(outputs, inputs["input_ids"], target_sizes)
    else:
        # DETR standard post-process
        processed_list = detector.processor.post_process_object_detection(
            outputs, threshold=detector.score_threshold, target_sizes=target_sizes,
        )
    result = detector._parse_single_result(processed_list[0])
    timing.postprocess_ms = (time.perf_counter() - t0) * 1000
    timing.num_detections = len(result.boxes)

    timing.total_ms = (
        timing.preprocess_ms + timing.transfer_ms
        + timing.gpu_kernel_ms + timing.postprocess_ms
    )
    return timing
180
+
181
+
182
def _profile_opaque(detector, frame: np.ndarray, queries: Sequence[str]) -> FrameTiming:
    """Profile an opaque detector (YOLO) whose predict() internals aren't separable.

    gpu_kernel_ms brackets the whole predict() call with CUDA events, so it
    also includes any CPU work predict() performs between the two events;
    transfer_ms is set to the -1.0 sentinel meaning "not measurable".
    """
    timing = FrameTiming()
    cuda_timer = CudaTimer()

    t0 = time.perf_counter()
    cuda_timer.start()
    result = detector.predict(frame, queries)
    cuda_timer.stop()
    wall_ms = (time.perf_counter() - t0) * 1000

    timing.gpu_kernel_ms = cuda_timer.elapsed_ms()
    timing.preprocess_ms = 0.0   # folded into gpu_kernel_ms
    timing.transfer_ms = -1.0    # sentinel: not separable for this detector
    # FIX: float floor (was max(0, ...)) so the field stays float when clamped.
    timing.postprocess_ms = max(0.0, wall_ms - timing.gpu_kernel_ms)
    timing.total_ms = wall_ms
    timing.num_detections = len(result.boxes)

    return timing
203
+
204
+
205
def run_profiled_detection(
    video_path: str,
    detector_name: str,
    queries: List[str],
    max_frames: int = 100,
    warmup_frames: int = 5,
) -> ProfilingResult:
    """Run profiled detection on a video file.

    Single-threaded profiling path (not injected into the multi-threaded
    production pipeline) for accurate, reproducible measurements.

    Args:
        video_path: Path to the input video.
        detector_name: Registry key understood by load_detector().
        queries: Object classes to detect.
        max_frames: Hard cap on frames processed (warmup included).
        warmup_frames: Leading frames that run but are not recorded.

    Returns:
        ProfilingResult with per-frame timings and aggregate statistics.
    """
    from models.model_loader import load_detector
    from utils.video import VideoReader

    result = ProfilingResult(
        detector_name=detector_name,
        mode="detection",
        warmup_frames=warmup_frames,
    )

    detector = load_detector(detector_name)
    device = getattr(detector, "device", None)
    has_cuda = device is not None and str(device).startswith("cuda")
    if not has_cuda:
        logger.warning("No CUDA device found for profiling; GPU timings will be 0")

    reader = VideoReader(video_path)
    result.video_resolution = f"{reader.width}x{reader.height}"
    result.video_fps = reader.fps

    is_decomposable = detector_name in _DECOMPOSABLE_DETECTORS

    if has_cuda:
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    frame_timings: List[FrameTiming] = []
    frame_idx = 0

    # NOTE: the original body declared t_decode_start / decode_ms / t_before
    # and never used them; removed as dead code.
    for frame in reader:
        if frame_idx >= max_frames:
            break

        if frame_idx < warmup_frames:
            # Warmup: run the prediction but discard the measurement.
            if is_decomposable:
                _profile_decomposed(detector, frame, queries)
            else:
                _profile_opaque(detector, frame, queries)
            frame_idx += 1
            continue

        if is_decomposable:
            timing = _profile_decomposed(detector, frame, queries)
        else:
            timing = _profile_opaque(detector, frame, queries)
        timing.frame_idx = frame_idx

        # Frames from VideoReader arrive pre-decoded, so decode cost is
        # benchmarked once (first profiled frame) by re-reading a frame
        # with a fresh capture; later frames reuse that representative cost.
        if frame_idx == warmup_frames:
            cap = cv2.VideoCapture(video_path)
            if cap.isOpened():
                td0 = time.perf_counter()
                cap.read()
                timing.decode_ms = (time.perf_counter() - td0) * 1000
            cap.release()  # safe on an unopened capture too
        elif frame_timings:
            timing.decode_ms = frame_timings[0].decode_ms

        frame_timings.append(timing)
        frame_idx += 1

    reader.close()

    # Aggregate
    result.total_frames = frame_idx
    result.profiled_frames = len(frame_timings)
    result.frame_timings = frame_timings

    if frame_timings:
        result.decode_stats = TimingStats.from_samples([t.decode_ms for t in frame_timings])
        result.preprocess_stats = TimingStats.from_samples([t.preprocess_ms for t in frame_timings])
        # transfer_ms == -1.0 is the "not measurable" sentinel (opaque path)
        transfer_samples = [t.transfer_ms for t in frame_timings if t.transfer_ms >= 0]
        result.transfer_stats = TimingStats.from_samples(transfer_samples)
        result.gpu_kernel_stats = TimingStats.from_samples([t.gpu_kernel_ms for t in frame_timings])
        result.postprocess_stats = TimingStats.from_samples([t.postprocess_ms for t in frame_timings])
        result.total_stats = TimingStats.from_samples([t.total_ms for t in frame_timings])

        result.avg_fps = 1000.0 / result.total_stats.mean_ms if result.total_stats.mean_ms > 0 else 0.0
        result.avg_detections_per_frame = statistics.mean(t.num_detections for t in frame_timings)

    # GPU memory footprint
    if has_cuda:
        torch.cuda.synchronize()
        result.gpu_peak_memory_mb = round(torch.cuda.max_memory_allocated() / (1024 ** 2), 1)
        result.gpu_allocated_mb = round(torch.cuda.memory_allocated() / (1024 ** 2), 1)

    return result
323
+
324
+
325
def run_profiled_segmentation(
    video_path: str,
    segmenter_name: str,
    queries: List[str],
    max_frames: int = 100,
    step: int = 20,
) -> ProfilingResult:
    """Run profiled segmentation (GSAM2) on a video file.

    Profiles the GSAM2 stages: GDINO keyframe detection,
    SAM2 image prediction, SAM2 video propagation.

    Args:
        video_path: Path to the input video file.
        segmenter_name: GSAM2 variant identifier (e.g. "gsam2_base").
        queries: Text queries for grounded detection.
        max_frames: Maximum number of frames to process.
        step: Keyframe interval for GDINO re-detection.

    Returns:
        ProfilingResult whose per-frame timings are synthesized from the
        aggregate stage metrics reported by the GSAM2 pipeline.

    Raises:
        ValueError: If the video cannot be opened.
    """
    import os
    import tempfile
    import threading

    result = ProfilingResult(
        detector_name=segmenter_name,
        mode="segmentation",
        warmup_frames=0,
    )

    # Open the video only to capture metadata; the tracking pipeline
    # re-opens it internally.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Cannot open video: {video_path}")
    result.video_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    result.video_resolution = f"{int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))}x{int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))}"
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()

    # CAP_PROP_FRAME_COUNT can be 0 or negative for streams / containers
    # with an unknown length; fall back to max_frames in that case so the
    # synthetic timing loop below still produces output.
    result.total_frames = min(total, max_frames) if total > 0 else max_frames

    has_cuda = torch.cuda.is_available()
    if has_cuda:
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    # Aggregate stage metrics filled in (under `lock`) by the GSAM2
    # pipeline via the private _perf_metrics hook.
    metrics = {
        "end_to_end_ms": 0.0,
        "frame_extraction_ms": 0.0,
        "model_load_ms": 0.0,
        "init_state_ms": 0.0,
        "tracking_total_ms": 0.0,
        "gdino_total_ms": 0.0,
        "sam_image_total_ms": 0.0,
        "sam_video_total_ms": 0.0,
        "id_reconciliation_ms": 0.0,
        "render_total_ms": 0.0,
        "writer_total_ms": 0.0,
        "gpu_peak_mem_mb": 0.0,
    }
    lock = threading.Lock()

    # Throwaway output file: we only care about the timing metrics.
    fd, output_path = tempfile.mkstemp(prefix="profile_seg_", suffix=".mp4")
    os.close(fd)

    try:
        from inference import run_grounded_sam2_tracking
        run_grounded_sam2_tracking(
            video_path,
            output_path,
            queries,
            segmenter_name=segmenter_name,
            step=step,
            enable_gpt=False,
            max_frames=max_frames,
            _perf_metrics=metrics,
            _perf_lock=lock,
        )
    except Exception as e:
        logger.error("Profiled segmentation failed: %s", e)
        raise
    finally:
        # Best-effort cleanup of the temp render target.
        try:
            os.remove(output_path)
        except OSError:
            pass

    # Convert the aggregate GSAM2 metrics into synthetic per-frame
    # timings so the downstream roofline analysis can treat detection
    # and segmentation uniformly.
    n_frames = result.total_frames
    n_keyframes = max(1, n_frames // step)

    if n_frames > 0:
        # n_keyframes is always >= 1 by construction, so no zero guard
        # is needed for the keyframe averages.
        avg_gdino = metrics["gdino_total_ms"] / n_keyframes
        avg_sam_img = metrics["sam_image_total_ms"] / n_keyframes
        avg_sam_vid = metrics["sam_video_total_ms"] / max(1, n_frames - n_keyframes)
        avg_render = metrics["render_total_ms"] / n_frames
        avg_decode = metrics["frame_extraction_ms"] / n_frames

        for i in range(n_frames):
            ft = FrameTiming(frame_idx=i)
            if i % step == 0:
                # Keyframe: GDINO detection + SAM2 image prediction.
                ft.preprocess_ms = avg_gdino
                ft.gpu_kernel_ms = avg_sam_img
            else:
                # Propagated frame: SAM2 video propagation only.
                ft.gpu_kernel_ms = avg_sam_vid
            ft.postprocess_ms = avg_render
            ft.decode_ms = avg_decode
            ft.total_ms = ft.decode_ms + ft.preprocess_ms + ft.gpu_kernel_ms + ft.postprocess_ms
            result.frame_timings.append(ft)

    result.profiled_frames = len(result.frame_timings)

    if result.frame_timings:
        result.decode_stats = TimingStats.from_samples([t.decode_ms for t in result.frame_timings])
        result.preprocess_stats = TimingStats.from_samples([t.preprocess_ms for t in result.frame_timings])
        result.gpu_kernel_stats = TimingStats.from_samples([t.gpu_kernel_ms for t in result.frame_timings])
        result.postprocess_stats = TimingStats.from_samples([t.postprocess_ms for t in result.frame_timings])
        result.total_stats = TimingStats.from_samples([t.total_ms for t in result.frame_timings])
        result.avg_fps = 1000.0 / result.total_stats.mean_ms if result.total_stats.mean_ms > 0 else 0

    # Additional GSAM2-specific metrics stored as metadata for roofline.
    result._gsam2_metrics = metrics  # type: ignore[attr-defined]

    if has_cuda:
        torch.cuda.synchronize()
        # Take the max of the torch allocator peak and the pipeline's own
        # reported peak (they can differ when the pipeline uses NVML).
        result.gpu_peak_memory_mb = max(
            round(torch.cuda.max_memory_allocated() / (1024 ** 2), 1),
            metrics.get("gpu_peak_mem_mb", 0),
        )
        result.gpu_allocated_mb = round(torch.cuda.memory_allocated() / (1024 ** 2), 1)

    return result
utils/roofline.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Roofline model analysis for detection and segmentation pipelines.
2
+
3
+ Computes theoretical maximum throughput, identifies bottlenecks, and
4
+ provides actionable recommendations based on hardware specs and
5
+ profiling measurements.
6
+ """
7
+
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from typing import Dict, List, Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
# Approximate GFLOPs per forward pass at reference resolution (640x480 for YOLO, 800x800 for DETR)
# These are rough estimates; actual FLOPs depend on input resolution and model variant.
# Keys must match the detector/segmenter names stored in ProfilingResult.detector_name;
# unknown models fall back to 0.0 in compute_roofline.
_MODEL_FLOPS: Dict[str, float] = {
    # Detection models (GFLOPs per frame)
    "hf_yolov8": 78.9,  # YOLOv8m ~79 GFLOPs at 640px
    "detr_resnet50": 86.0,  # DETR-R50 ~86 GFLOPs at 800px
    "grounding_dino": 172.0,  # Grounding DINO-B ~172 GFLOPs
    "drone_yolo": 78.9,  # Same arch as YOLOv8m

    # Segmentation models (GFLOPs per keyframe)
    "gsam2_small": 48.0,  # SAM2 small encoder
    "gsam2_base": 96.0,  # SAM2 base encoder
    "gsam2_large": 200.0,  # SAM2 large encoder
    "gsam2_tiny": 24.0,  # SAM2 tiny encoder
}

# Approximate bytes moved per forward pass (weights + activations + I/O).
# Used together with _MODEL_FLOPS to estimate operational intensity
# (FLOPs per byte) for roofline classification.
_MODEL_BYTES: Dict[str, float] = {
    # In MB — approximate weight size + activation memory
    "hf_yolov8": 52.0,
    "detr_resnet50": 166.0,
    "grounding_dino": 340.0,
    "drone_yolo": 52.0,
    "gsam2_small": 92.0,
    "gsam2_base": 180.0,
    "gsam2_large": 400.0,
    "gsam2_tiny": 46.0,
}
42
+
43
+
44
@dataclass
class BottleneckBreakdown:
    """Per-phase bottleneck identification for one pipeline stage."""
    # Pipeline stage name: "decode", "preprocess", "transfer",
    # "gpu_kernel", or "postprocess".
    phase: str = ""
    # Mean wall-clock time spent in this phase per frame, in milliseconds.
    time_ms: float = 0.0
    # Fraction of total pipeline time (0.0-1.0) attributed to this phase.
    fraction: float = 0.0
    # True for the single slowest phase in the breakdown.
    is_bottleneck: bool = False
51
+
52
+
53
@dataclass
class RooflineResult:
    """Complete roofline analysis output.

    Combines hardware ceilings, workload characteristics, achieved
    performance, bottleneck classification, and tuning recommendations
    into one result object (serialized for the /benchmark/analysis API).
    """
    # Hardware ceilings (taken from the first GPU; zero when none found)
    peak_fp32_tflops: float = 0.0
    peak_fp16_tflops: float = 0.0
    peak_memory_bandwidth_gbps: float = 0.0
    # Ridge point of the roofline: peak FP32 TFLOPS / peak bandwidth in TB/s,
    # i.e. the operational intensity where the compute and memory roofs meet.
    ridge_point_flop_per_byte: float = 0.0

    # Workload characteristics (looked up from _MODEL_FLOPS/_MODEL_BYTES)
    model_name: str = ""
    model_gflops: float = 0.0
    model_bytes_mb: float = 0.0
    operational_intensity: float = 0.0  # FLOPs / bytes_moved

    # Achieved performance, derived from measured mean GPU-kernel time
    achieved_tflops: float = 0.0
    achieved_bandwidth_gbps: float = 0.0

    # Bottleneck analysis. Values set by compute_roofline are one of:
    # "decode-bound", "transfer-bound", "preprocess-bound",
    # "postprocess-bound", "memory-bound", "compute-bound", "unknown".
    primary_bottleneck: str = ""
    bottleneck_explanation: str = ""
    phase_breakdown: List[BottleneckBreakdown] = field(default_factory=list)

    # Throughput
    theoretical_max_fps: float = 0.0  # 1000 / slowest-phase ms (ideal pipelining)
    observed_fps: float = 0.0
    utilization_pct: float = 0.0  # observed / theoretical * 100

    # GPU memory
    gpu_peak_memory_mb: float = 0.0
    gpu_vram_total_mb: float = 0.0
    memory_utilization_pct: float = 0.0

    # Actionable tuning recommendations derived from the bottleneck class
    recommendations: List[str] = field(default_factory=list)

    # GSAM2-specific metrics (populated for segmentation mode)
    gsam2_metrics: Optional[Dict] = None
92
+
93
+
94
def compute_roofline(hardware, profiling) -> RooflineResult:
    """Compute roofline analysis from hardware info and profiling results.

    Args:
        hardware: HardwareInfo dataclass from hardware_info.py; only
            ``hardware.gpus[0]`` is consulted for the ceilings.
        profiling: ProfilingResult dataclass from profiler.py; mean phase
            timings, FPS, and peak GPU memory are read from it.

    Returns:
        RooflineResult with theoretical ceilings, achieved performance,
        bottleneck identification, and recommendations.
    """
    result = RooflineResult()
    result.model_name = profiling.detector_name

    # --- Hardware ceilings (use first GPU) ---
    # Multi-GPU hosts are intentionally reduced to GPU 0; the profiled
    # pipelines run on a single device.
    if hardware.gpus:
        gpu = hardware.gpus[0]
        # `or 0.0` guards against None fields when specs are unknown.
        result.peak_fp32_tflops = gpu.fp32_tflops or 0.0
        result.peak_fp16_tflops = gpu.fp16_tflops or 0.0
        result.peak_memory_bandwidth_gbps = gpu.memory_bandwidth_gbps or 0.0
        if gpu.vram_total_gb:
            result.gpu_vram_total_mb = gpu.vram_total_gb * 1024
    else:
        logger.warning("No GPU info available; roofline will have zero ceilings")

    # Ridge point: where compute and memory roofs intersect
    if result.peak_memory_bandwidth_gbps > 0:
        # peak_tflops / peak_bw (TB/s) = FLOPs/byte
        peak_tbps = result.peak_memory_bandwidth_gbps / 1000  # GB/s -> TB/s
        if peak_tbps > 0:
            result.ridge_point_flop_per_byte = result.peak_fp32_tflops / peak_tbps

    # --- Workload characteristics ---
    # Unknown model names fall back to 0.0, which disables the
    # intensity-based memory/compute sub-classification below.
    model_key = profiling.detector_name
    result.model_gflops = _MODEL_FLOPS.get(model_key, 0.0)
    result.model_bytes_mb = _MODEL_BYTES.get(model_key, 0.0)

    if result.model_bytes_mb > 0:
        # Operational intensity = FLOPs / bytes_moved
        bytes_moved = result.model_bytes_mb * 1e6  # MB -> bytes
        flops = result.model_gflops * 1e9  # GFLOPs -> FLOPs
        result.operational_intensity = flops / bytes_moved if bytes_moved > 0 else 0

    # --- Achieved performance ---
    # Based on the mean measured GPU-kernel time and the static model
    # FLOP/byte estimates; these are rough utilization figures, not
    # counter-based measurements.
    gpu_kernel_ms = profiling.gpu_kernel_stats.mean_ms if profiling.gpu_kernel_stats.count > 0 else 0
    if gpu_kernel_ms > 0 and result.model_gflops > 0:
        # Achieved TFLOPS = GFLOPs / (kernel_time_s)
        kernel_time_s = gpu_kernel_ms / 1000
        result.achieved_tflops = round(result.model_gflops / kernel_time_s / 1000, 4)

    if gpu_kernel_ms > 0 and result.model_bytes_mb > 0:
        kernel_time_s = gpu_kernel_ms / 1000
        result.achieved_bandwidth_gbps = round(result.model_bytes_mb / kernel_time_s / 1000, 2)

    # --- Per-phase bottleneck breakdown ---
    phases = [
        ("decode", profiling.decode_stats.mean_ms),
        ("preprocess", profiling.preprocess_stats.mean_ms),
    ]
    # Only include transfer if we have valid measurements
    # (transfer_ms is negative when CUDA-event timing was unavailable).
    if profiling.transfer_stats.count > 0 and profiling.transfer_stats.mean_ms >= 0:
        phases.append(("transfer", profiling.transfer_stats.mean_ms))
    phases.extend([
        ("gpu_kernel", profiling.gpu_kernel_stats.mean_ms),
        ("postprocess", profiling.postprocess_stats.mean_ms),
    ])

    total_phase_ms = sum(ms for _, ms in phases)
    max_phase_name = ""
    max_phase_ms = 0

    # Build the breakdown while tracking the slowest phase; on a tie the
    # earliest phase in the list wins (strict > comparison).
    for name, ms in phases:
        bb = BottleneckBreakdown(
            phase=name,
            time_ms=round(ms, 3),
            fraction=round(ms / total_phase_ms, 4) if total_phase_ms > 0 else 0,
        )
        if ms > max_phase_ms:
            max_phase_ms = ms
            max_phase_name = name
        result.phase_breakdown.append(bb)

    # Mark bottleneck phase
    for bb in result.phase_breakdown:
        if bb.phase == max_phase_name:
            bb.is_bottleneck = True

    # --- Primary bottleneck classification ---
    # The slowest phase determines the coarse class; gpu_kernel is
    # further split into memory- vs compute-bound via the roofline
    # ridge-point test when intensity data is available.
    if max_phase_name == "decode":
        result.primary_bottleneck = "decode-bound"
        result.bottleneck_explanation = (
            f"Video decoding ({max_phase_ms:.1f}ms) is the slowest phase. "
            "GPU is waiting for frames. Consider hardware-accelerated decoding (NVDEC) "
            "or reducing input resolution."
        )
    elif max_phase_name == "transfer":
        result.primary_bottleneck = "transfer-bound"
        result.bottleneck_explanation = (
            f"CPU->GPU data transfer ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider using pinned memory, reducing input tensor size, or "
            "overlapping transfer with computation."
        )
    elif max_phase_name == "gpu_kernel":
        # Sub-classify: memory-bound vs compute-bound
        if result.operational_intensity > 0 and result.ridge_point_flop_per_byte > 0:
            if result.operational_intensity < result.ridge_point_flop_per_byte:
                result.primary_bottleneck = "memory-bound"
                result.bottleneck_explanation = (
                    f"GPU kernel ({max_phase_ms:.1f}ms) is memory-bandwidth limited. "
                    f"Operational intensity ({result.operational_intensity:.1f} FLOP/byte) "
                    f"is below the ridge point ({result.ridge_point_flop_per_byte:.1f} FLOP/byte). "
                    "Consider model quantization (FP16/INT8), reducing batch size, "
                    "or using a more compute-dense model."
                )
            else:
                result.primary_bottleneck = "compute-bound"
                # NOTE: the `if/else` below selects between two whole
                # message strings; the division is only evaluated on the
                # peak > 0 branch, so it cannot divide by zero.
                result.bottleneck_explanation = (
                    f"GPU kernel ({max_phase_ms:.1f}ms) is compute-limited. "
                    f"Achieved {result.achieved_tflops:.2f} TFLOPS out of "
                    f"{result.peak_fp32_tflops:.2f} TFLOPS peak "
                    f"({result.achieved_tflops / result.peak_fp32_tflops * 100:.1f}% utilization). "
                    "Consider FP16 inference, TensorRT optimization, or a smaller model."
                    if result.peak_fp32_tflops > 0
                    else "Consider a faster GPU or a smaller model."
                )
        else:
            result.primary_bottleneck = "compute-bound"
            result.bottleneck_explanation = (
                f"GPU kernel ({max_phase_ms:.1f}ms) dominates pipeline time."
            )
    elif max_phase_name == "preprocess":
        result.primary_bottleneck = "preprocess-bound"
        result.bottleneck_explanation = (
            f"CPU preprocessing ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider GPU-accelerated preprocessing or reducing input resolution."
        )
    elif max_phase_name == "postprocess":
        result.primary_bottleneck = "postprocess-bound"
        result.bottleneck_explanation = (
            f"CPU post-processing/NMS ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider batched NMS on GPU or raising the confidence threshold."
        )
    else:
        # All phase times were zero (or breakdown empty).
        result.primary_bottleneck = "unknown"
        result.bottleneck_explanation = "Unable to determine primary bottleneck."

    # --- Throughput ---
    # Theoretical max FPS = 1000 / max(phase_times)
    # (i.e. throughput under perfect overlap of all other phases).
    if max_phase_ms > 0:
        result.theoretical_max_fps = round(1000 / max_phase_ms, 2)
    result.observed_fps = round(profiling.avg_fps, 2)
    if result.theoretical_max_fps > 0:
        result.utilization_pct = round(result.observed_fps / result.theoretical_max_fps * 100, 1)

    # --- GPU memory ---
    result.gpu_peak_memory_mb = profiling.gpu_peak_memory_mb
    if result.gpu_vram_total_mb > 0:
        result.memory_utilization_pct = round(
            result.gpu_peak_memory_mb / result.gpu_vram_total_mb * 100, 1
        )

    # --- GSAM2 metrics ---
    # The profiler attaches these as a private attribute only in
    # segmentation mode; pass them through when present.
    gsam2_metrics = getattr(profiling, "_gsam2_metrics", None)
    if gsam2_metrics:
        result.gsam2_metrics = gsam2_metrics

    # --- Recommendations ---
    recs = []

    # Bottleneck-specific recommendations
    if result.primary_bottleneck == "decode-bound":
        recs.append("Use NVIDIA NVDEC for hardware-accelerated video decoding")
        recs.append("Reduce input video resolution before processing")
    elif result.primary_bottleneck == "transfer-bound":
        recs.append("Use torch.cuda pinned memory for faster CPU->GPU transfers")
        recs.append("Pre-allocate GPU tensors and reuse across frames")
    elif result.primary_bottleneck == "memory-bound":
        recs.append("Enable FP16 (half-precision) inference to reduce memory bandwidth pressure")
        recs.append("Consider INT8 quantization via TensorRT for further speedup")
    elif result.primary_bottleneck == "compute-bound":
        recs.append("Enable FP16 inference (2x theoretical throughput on Volta+ GPUs)")
        recs.append("Consider TensorRT or torch.compile() for kernel fusion")
        if result.peak_fp32_tflops > 0 and result.achieved_tflops / result.peak_fp32_tflops < 0.3:
            recs.append("Low GPU utilization — consider increasing batch size or using a multi-stream pipeline")

    # General recommendations
    if result.memory_utilization_pct > 80:
        recs.append(f"GPU memory utilization is high ({result.memory_utilization_pct:.0f}%); "
                    "reduce batch size or use gradient checkpointing to avoid OOM")
    elif result.memory_utilization_pct > 0 and result.memory_utilization_pct < 30:
        recs.append(f"GPU memory utilization is low ({result.memory_utilization_pct:.0f}%); "
                    "consider processing multiple streams or increasing batch size")

    if profiling.mode == "detection" and profiling.avg_fps < profiling.video_fps:
        recs.append(
            f"Processing speed ({profiling.avg_fps:.1f} FPS) is below video frame rate "
            f"({profiling.video_fps:.1f} FPS); consider frame skipping or a faster model"
        )

    result.recommendations = recs
    return result