Spaces:
Paused
Paused
Zhen Ye Claude Opus 4.6 committed on
Commit ·
078b447
1
Parent(s): 0ace9ca
feat: add benchmark profiler & roofline analysis system
Browse filesAdd hardware extraction, per-frame GPU/CPU profiling with CUDA events,
and automated roofline analysis for detection and segmentation modes.
New endpoints:
- GET /benchmark/hardware — cached hardware specs
- POST /benchmark/profile — per-frame timing breakdown
- POST /benchmark/analysis — full roofline with bottleneck ID
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- app.py +141 -0
- requirements.txt +1 -0
- utils/hardware_info.py +385 -0
- utils/profiler.py +451 -0
- utils/roofline.py +294 -0
app.py
CHANGED
|
@@ -1017,5 +1017,146 @@ async def gpu_monitor_endpoint(duration: int = 180, interval: int = 1):
|
|
| 1017 |
return StreamingResponse(_stream(), media_type="text/plain")
|
| 1018 |
|
| 1019 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1020 |
if __name__ == "__main__":
|
| 1021 |
uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
|
|
|
|
| 1017 |
return StreamingResponse(_stream(), media_type="text/plain")
|
| 1018 |
|
| 1019 |
|
| 1020 |
+
# ---------------------------------------------------------------------------
|
| 1021 |
+
# Benchmark Profiler & Roofline Analysis Endpoints
|
| 1022 |
+
# ---------------------------------------------------------------------------
|
| 1023 |
+
|
| 1024 |
+
@app.get("/benchmark/hardware")
async def benchmark_hardware():
    """Return hardware specs JSON (no video needed, cached)."""
    import dataclasses
    from utils.hardware_info import get_hardware_info

    # Extraction shells out to nvidia-smi/lscpu, so keep it off the event loop.
    specs = await asyncio.to_thread(get_hardware_info)
    return JSONResponse(dataclasses.asdict(specs))
|
| 1032 |
+
|
| 1033 |
+
|
| 1034 |
+
@app.post("/benchmark/profile")
async def benchmark_profile(
    video: UploadFile = File(...),
    mode: str = Form("detection"),
    detector: str = Form("hf_yolov8"),
    segmenter: str = Form("gsam2_large"),
    queries: str = Form("person,car,truck"),
    max_frames: int = Form(100),
    warmup_frames: int = Form(5),
    step: int = Form(20),
):
    """Run profiled inference and return per-frame timing breakdown.

    Args:
        video: Video file to profile.
        mode: "detection" or "segmentation".
        detector: Detector key (for detection mode).
        segmenter: Segmenter key (for segmentation mode).
        queries: Comma-separated object classes.
        max_frames: Maximum frames to profile.
        warmup_frames: Warmup frames (detection only).
        step: Keyframe interval (segmentation only).
    """
    import dataclasses
    from utils.profiler import run_profiled_detection, run_profiled_segmentation

    if mode not in ("detection", "segmentation"):
        raise HTTPException(status_code=400, detail="mode must be 'detection' or 'segmentation'")

    input_path = _save_upload_to_tmp(video)
    await video.close()

    query_list = [q.strip() for q in queries.split(",") if q.strip()]

    try:
        if mode == "detection":
            result = await asyncio.to_thread(
                run_profiled_detection,
                input_path, detector, query_list,
                max_frames=max_frames, warmup_frames=warmup_frames,
            )
        else:
            result = await asyncio.to_thread(
                run_profiled_segmentation,
                input_path, segmenter, query_list,
                max_frames=max_frames, step=step,
            )
    except Exception as exc:
        logging.exception("Profiling failed")
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        # `finally` runs on both success and failure, so this is the single
        # deletion point. (Previously the except branch ALSO called
        # _safe_delete, deleting the temp file twice on the error path.)
        _safe_delete(input_path)

    # Serialize the ProfilingResult dataclass for the JSON response.
    out = dataclasses.asdict(result)
    # Include GSAM2 metrics if the profiler attached them (segmentation mode).
    gsam2 = getattr(result, "_gsam2_metrics", None)
    if gsam2:
        out["gsam2_metrics"] = gsam2
    return JSONResponse(out)
|
| 1095 |
+
|
| 1096 |
+
|
| 1097 |
+
@app.post("/benchmark/analysis")
async def benchmark_analysis(
    video: UploadFile = File(...),
    mode: str = Form("detection"),
    detector: str = Form("hf_yolov8"),
    segmenter: str = Form("gsam2_large"),
    queries: str = Form("person,car,truck"),
    max_frames: int = Form(100),
    warmup_frames: int = Form(5),
    step: int = Form(20),
):
    """Full roofline analysis: hardware + profiling + theoretical ceilings + bottleneck ID.

    Combines hardware extraction, profiled inference, and roofline model
    to identify bottlenecks and provide actionable recommendations.
    """
    import dataclasses
    from utils.hardware_info import get_hardware_info
    from utils.profiler import run_profiled_detection, run_profiled_segmentation
    from utils.roofline import compute_roofline

    if mode not in ("detection", "segmentation"):
        raise HTTPException(status_code=400, detail="mode must be 'detection' or 'segmentation'")

    input_path = _save_upload_to_tmp(video)
    await video.close()

    query_list = [q.strip() for q in queries.split(",") if q.strip()]

    try:
        # Get hardware info (cached, fast).
        hardware = await asyncio.to_thread(get_hardware_info)

        # Run the profiled inference pass for the requested mode.
        if mode == "detection":
            profiling = await asyncio.to_thread(
                run_profiled_detection,
                input_path, detector, query_list,
                max_frames=max_frames, warmup_frames=warmup_frames,
            )
        else:
            profiling = await asyncio.to_thread(
                run_profiled_segmentation,
                input_path, segmenter, query_list,
                max_frames=max_frames, step=step,
            )

        # Combine hardware ceilings with measured timings.
        roofline = compute_roofline(hardware, profiling)
    except Exception as exc:
        logging.exception("Benchmark analysis failed")
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        # `finally` runs on both paths — single deletion point. (Previously
        # the except branch deleted the file and finally deleted it again.)
        _safe_delete(input_path)

    return JSONResponse({
        "hardware": dataclasses.asdict(hardware),
        "profiling": dataclasses.asdict(profiling),
        "roofline": dataclasses.asdict(roofline),
    })
|
| 1159 |
+
|
| 1160 |
+
|
| 1161 |
if __name__ == "__main__":
|
| 1162 |
uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
|
requirements.txt
CHANGED
|
@@ -13,3 +13,4 @@ sentence-transformers
|
|
| 13 |
SAM-2 @ git+https://github.com/facebookresearch/sam2.git
|
| 14 |
hydra-core>=1.3.2
|
| 15 |
iopath>=0.1.10
|
|
|
|
|
|
| 13 |
SAM-2 @ git+https://github.com/facebookresearch/sam2.git
|
| 14 |
hydra-core>=1.3.2
|
| 15 |
iopath>=0.1.10
|
| 16 |
+
psutil
|
utils/hardware_info.py
ADDED
|
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hardware specification extraction for roofline analysis.
|
| 2 |
+
|
| 3 |
+
Extracts CPU, GPU, memory, and storage parameters via system tools
|
| 4 |
+
and torch APIs. All functions have try/except fallbacks returning None
|
| 5 |
+
for inaccessible fields.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import os
|
| 10 |
+
import platform
|
| 11 |
+
import re
|
| 12 |
+
import subprocess
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
from functools import lru_cache
|
| 15 |
+
from typing import Dict, List, Optional
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
# CUDA cores per SM by compute capability (major, minor) -> cores_per_sm.
# Covers Kepler through Blackwell. Used to derive total CUDA core counts
# (and from them FP32 TFLOPS) since torch does not expose cores directly.
_CORES_PER_SM: Dict[tuple, int] = {
    (3, 0): 192, (3, 2): 192, (3, 5): 192, (3, 7): 192,  # Kepler
    (5, 0): 128, (5, 2): 128, (5, 3): 128,  # Maxwell
    (6, 0): 64, (6, 1): 128, (6, 2): 128,  # Pascal
    (7, 0): 64, (7, 2): 64, (7, 5): 64,  # Volta / Turing
    (8, 0): 64, (8, 6): 128, (8, 7): 128, (8, 9): 128,  # Ampere / Ada
    (9, 0): 128,  # Hopper
    (10, 0): 128,  # Blackwell
}

# PCIe bandwidth (GB/s, unidirectional) PER LANE, keyed by PCIe generation;
# multiply by link width to get total link bandwidth.
_PCIE_BW: Dict[int, float] = {
    3: 0.985,  # ~1 GB/s per lane
    4: 1.969,
    5: 3.938,
    6: 7.563,
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@dataclass
class CPUInfo:
    """CPU identity and topology; fields are None when not determinable."""
    model: Optional[str] = None  # e.g. "Intel(R) Xeon(R) ..." from lscpu/sysctl
    physical_cores: Optional[int] = None  # from psutil, excludes SMT siblings
    logical_cores: Optional[int] = None  # os.cpu_count()
    frequency_mhz: Optional[float] = None  # current (or max) clock via psutil
    cache_l2_kb: Optional[int] = None  # aggregate L2 as reported by the OS
    cache_l3_kb: Optional[int] = None  # aggregate L3 as reported by the OS
    architecture: Optional[str] = None  # platform.machine(), e.g. "x86_64"


@dataclass
class MemoryInfo:
    """System RAM totals and a rough bandwidth estimate."""
    total_gb: Optional[float] = None
    available_gb: Optional[float] = None
    estimated_bandwidth_gbps: Optional[float] = None  # heuristic, not measured


@dataclass
class GPUInfo:
    """Per-GPU specs; derived fields (TFLOPS, bandwidth) are estimates."""
    index: int = 0  # CUDA device ordinal
    name: Optional[str] = None
    sm_count: Optional[int] = None  # streaming multiprocessors
    cuda_cores: Optional[int] = None  # sm_count * cores-per-SM lookup
    clock_mhz: Optional[float] = None  # max graphics clock (nvidia-smi)
    memory_clock_mhz: Optional[float] = None  # max memory clock (nvidia-smi)
    memory_bus_width_bits: Optional[int] = None
    vram_total_gb: Optional[float] = None
    vram_free_gb: Optional[float] = None
    memory_bandwidth_gbps: Optional[float] = None  # derived: clock * bus * 2 / 8
    fp32_tflops: Optional[float] = None  # derived: cores * clock * 2 (FMA)
    fp16_tflops: Optional[float] = None  # heuristic 2x FP32 on Volta+
    tensor_core_tflops: Optional[float] = None  # heuristic 4x/8x FP32
    pcie_gen: Optional[int] = None
    pcie_width: Optional[int] = None  # number of lanes
    pcie_bandwidth_gbps: Optional[float] = None  # derived: per-lane BW * width
    compute_capability: Optional[str] = None  # "major.minor"
    driver_version: Optional[str] = None
    cuda_version: Optional[str] = None  # max CUDA supported by the driver


@dataclass
class StorageInfo:
    storage_type: Optional[str] = None  # "SSD" or "HDD" or "Unknown"
    sequential_read_mbps: Optional[float] = None  # rough throughput estimate


@dataclass
class HardwareInfo:
    """Aggregate snapshot of the host hardware, serialized for /benchmark APIs."""
    cpu: CPUInfo = field(default_factory=CPUInfo)
    memory: MemoryInfo = field(default_factory=MemoryInfo)
    gpus: List[GPUInfo] = field(default_factory=list)  # empty when no CUDA
    storage: StorageInfo = field(default_factory=StorageInfo)
    system: Optional[str] = None  # "<OS> <kernel release>"
    python_version: Optional[str] = None
    torch_version: Optional[str] = None
    cuda_runtime_version: Optional[str] = None  # torch.version.cuda, if CUDA
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _run_cmd(cmd: List[str], timeout: int = 10) -> Optional[str]:
|
| 100 |
+
"""Run a shell command and return stdout, or None on failure."""
|
| 101 |
+
try:
|
| 102 |
+
result = subprocess.run(
|
| 103 |
+
cmd, capture_output=True, text=True, timeout=timeout,
|
| 104 |
+
)
|
| 105 |
+
if result.returncode == 0:
|
| 106 |
+
return result.stdout.strip()
|
| 107 |
+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
| 108 |
+
pass
|
| 109 |
+
return None
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _nvidia_smi_query(*fields: str) -> Optional[Dict[str, str]]:
    """Query nvidia-smi for *fields* and map field name -> value.

    Only the first GPU's row is parsed. Returns None when nvidia-smi is
    unavailable, fails, or yields an unexpected column count.
    """
    raw = _run_cmd([
        "nvidia-smi",
        f"--query-gpu={','.join(fields)}",
        "--format=csv,noheader,nounits",
    ])
    if not raw:
        return None
    first_row = raw.split("\n")[0]
    columns = [col.strip() for col in first_row.split(",")]
    if len(columns) != len(fields):
        return None
    return dict(zip(fields, columns))
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _parse_lscpu_cache_kb(value: str) -> Optional[int]:
    """Parse an lscpu cache size string (e.g. '512 KiB', '8 MiB') into KiB.

    Returns None when no number is present. Sizes labelled with 'M'/'MiB'
    are scaled to KiB; anything else is assumed to already be in KiB.
    """
    m = re.search(r"([\d.]+)", value)
    if not m:
        return None
    kb = float(m.group(1))
    if "MiB" in value or "M" in value:
        kb *= 1024
    return int(kb)


def get_cpu_info() -> CPUInfo:
    """Collect CPU model, core counts, frequency, and cache sizes.

    Uses psutil when available, plus `lscpu` on Linux or `sysctl` on macOS.
    Fields that cannot be determined are left as None; any unexpected
    failure degrades to a partially-filled CPUInfo rather than raising.
    """
    info = CPUInfo()
    try:
        info.architecture = platform.machine()
        info.logical_cores = os.cpu_count()

        try:
            import psutil
            info.physical_cores = psutil.cpu_count(logical=False)
            freq = psutil.cpu_freq()
            if freq:
                info.frequency_mhz = freq.current or freq.max
        except ImportError:
            pass  # psutil is optional; physical cores / freq stay None

        system = platform.system()
        if system == "Linux":
            out = _run_cmd(["lscpu"])
            if out:
                for line in out.split("\n"):
                    if "Model name" in line:
                        info.model = line.split(":", 1)[1].strip()
                    elif "L2 cache" in line:
                        parsed = _parse_lscpu_cache_kb(line.split(":", 1)[1].strip())
                        if parsed is not None:
                            info.cache_l2_kb = parsed
                    elif "L3 cache" in line:
                        parsed = _parse_lscpu_cache_kb(line.split(":", 1)[1].strip())
                        if parsed is not None:
                            info.cache_l3_kb = parsed
        elif system == "Darwin":
            brand = _run_cmd(["sysctl", "-n", "machdep.cpu.brand_string"])
            if brand:
                info.model = brand
            # sysctl reports cache sizes in bytes; convert to KiB.
            for sysctl_key, attr in (
                ("hw.l2cachesize", "cache_l2_kb"),
                ("hw.l3cachesize", "cache_l3_kb"),
            ):
                raw = _run_cmd(["sysctl", "-n", sysctl_key])
                if raw:
                    try:
                        setattr(info, attr, int(raw) // 1024)
                    except ValueError:
                        pass
    except Exception:
        logger.debug("CPU info extraction partially failed", exc_info=True)
    return info
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def get_memory_info() -> MemoryInfo:
    """Collect system RAM totals plus a coarse bandwidth estimate."""
    info = MemoryInfo()
    try:
        try:
            import psutil
        except ImportError:
            psutil = None

        if psutil is not None:
            vm = psutil.virtual_memory()
            info.total_gb = round(vm.total / (1024 ** 3), 2)
            info.available_gb = round(vm.available / (1024 ** 3), 2)
        elif os.path.exists("/proc/meminfo"):
            # Fallback without psutil: parse /proc/meminfo (Linux only);
            # values there are reported in KiB.
            with open("/proc/meminfo") as fh:
                for row in fh:
                    if row.startswith("MemTotal:"):
                        info.total_gb = round(int(row.split()[1]) / (1024 ** 2), 2)
                    elif row.startswith("MemAvailable:"):
                        info.available_gb = round(int(row.split()[1]) / (1024 ** 2), 2)

        # Rough estimate: DDR4 ~40 GB/s, DDR5 ~60 GB/s. Without dmidecode the
        # actual generation is unknown, so stay with a conservative DDR4
        # dual-channel figure.
        if info.total_gb:
            info.estimated_bandwidth_gbps = 40.0
    except Exception:
        logger.debug("Memory info extraction partially failed", exc_info=True)
    return info
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def _smi_float(value: str) -> Optional[float]:
    """Best-effort float parse of an nvidia-smi CSV cell; None for '[N/A]' etc."""
    try:
        return float(value)
    except (ValueError, TypeError):
        return None


def _smi_int(value: str) -> Optional[int]:
    """Best-effort int parse of an nvidia-smi CSV cell; None for '[N/A]' etc."""
    try:
        return int(value)
    except (ValueError, TypeError):
        return None


def get_gpu_info() -> List[GPUInfo]:
    """Enumerate CUDA devices and derive roofline-relevant specs.

    Combines torch device properties with per-GPU nvidia-smi queries and
    derives memory bandwidth, FP32/FP16/tensor-core TFLOPS (estimates),
    and PCIe link bandwidth. Returns an empty list when CUDA is
    unavailable or extraction fails entirely.
    """
    gpus: List[GPUInfo] = []
    try:
        import torch
        if not torch.cuda.is_available():
            return gpus

        device_count = torch.cuda.device_count()

        # Driver version and max-supported CUDA runtime from nvidia-smi.
        driver_version = None
        smi_cuda_version = None
        nv = _nvidia_smi_query("driver_version")
        if nv:
            driver_version = nv.get("driver_version")
        smi_out = _run_cmd(["nvidia-smi"])
        if smi_out:
            m = re.search(r"CUDA Version:\s+([\d.]+)", smi_out)
            if m:
                smi_cuda_version = m.group(1)

        for i in range(device_count):
            gpu = GPUInfo(index=i)
            props = torch.cuda.get_device_properties(i)

            gpu.name = props.name
            gpu.sm_count = props.multi_processor_count
            # BUG FIX: the torch property is `total_memory`; the previous
            # `props.total_mem` raised AttributeError, which the broad
            # except below swallowed — silently aborting ALL GPU detection.
            gpu.vram_total_gb = round(props.total_memory / (1024 ** 3), 2)
            cc = (props.major, props.minor)
            gpu.compute_capability = f"{props.major}.{props.minor}"
            gpu.driver_version = driver_version
            gpu.cuda_version = smi_cuda_version

            # CUDA cores = SMs * architecture-specific cores-per-SM.
            cores_per_sm = _CORES_PER_SM.get(cc)
            if cores_per_sm and gpu.sm_count:
                gpu.cuda_cores = gpu.sm_count * cores_per_sm

            # Per-GPU nvidia-smi query: clocks, bus width, PCIe link, free VRAM.
            nv_data = _run_cmd([
                "nvidia-smi",
                f"--id={i}",
                "--query-gpu=clocks.max.graphics,clocks.max.memory,memory.bus_width,pcie.link.gen.current,pcie.link.width.current,memory.free",
                "--format=csv,noheader,nounits",
            ])
            if nv_data:
                parts = [p.strip() for p in nv_data.split(",")]
                if len(parts) >= 6:
                    gpu.clock_mhz = _smi_float(parts[0])
                    gpu.memory_clock_mhz = _smi_float(parts[1])
                    gpu.memory_bus_width_bits = _smi_int(parts[2])
                    gpu.pcie_gen = _smi_int(parts[3])
                    gpu.pcie_width = _smi_int(parts[4])
                    free_mb = _smi_float(parts[5])  # memory.free is in MiB
                    if free_mb is not None:
                        gpu.vram_free_gb = round(free_mb / 1024, 2)

            # Derived memory bandwidth (GDDR/HBM are double data rate):
            # mem_clock(MHz) * bus_width(bits) * 2 / 8 (bits->bytes) / 1000 -> GB/s
            if gpu.memory_clock_mhz and gpu.memory_bus_width_bits:
                gpu.memory_bandwidth_gbps = round(
                    gpu.memory_clock_mhz * gpu.memory_bus_width_bits * 2 / 8 / 1000, 1
                )

            # Derived FP32 TFLOPS = cores * clock(MHz) * 2 (FMA) / 1e6.
            if gpu.cuda_cores and gpu.clock_mhz:
                gpu.fp32_tflops = round(gpu.cuda_cores * gpu.clock_mhz * 2 / 1e6, 2)
                # FP16 is typically 2x FP32 on Volta (CC 7.x) and newer.
                if props.major >= 7:
                    gpu.fp16_tflops = round(gpu.fp32_tflops * 2, 2)
                else:
                    gpu.fp16_tflops = gpu.fp32_tflops

            # Tensor-core TFLOPS (rough: 8x FP32 on Ampere+, 4x on Volta/Turing).
            if gpu.fp32_tflops:
                if props.major >= 8:
                    gpu.tensor_core_tflops = round(gpu.fp32_tflops * 8, 2)
                elif props.major >= 7:
                    gpu.tensor_core_tflops = round(gpu.fp32_tflops * 4, 2)

            # Derived PCIe link bandwidth = per-lane bandwidth * lane count.
            if gpu.pcie_gen and gpu.pcie_width:
                per_lane = _PCIE_BW.get(gpu.pcie_gen, 0)
                gpu.pcie_bandwidth_gbps = round(per_lane * gpu.pcie_width, 2)

            gpus.append(gpu)

    except Exception:
        logger.debug("GPU info extraction partially failed", exc_info=True)
    return gpus
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def get_storage_info() -> StorageInfo:
    """Detect storage type (SSD vs HDD) and a rough throughput figure.

    On Linux the root block device's rotational flag distinguishes HDD
    (ROTA=1) from SSD/NVMe.

    NOTE(review): the dd throughput test copies /dev/zero -> /dev/null,
    which measures in-memory copy bandwidth, NOT actual disk sequential
    read speed — treat sequential_read_mbps as an optimistic placeholder
    until a real file-backed benchmark replaces it.
    """
    info = StorageInfo()
    try:
        system = platform.system()
        if system == "Linux":
            # Rotational flag of the first listed block device.
            out = _run_cmd(["lsblk", "-d", "-o", "NAME,ROTA", "--noheadings"])
            if out:
                for line in out.strip().split("\n"):
                    parts = line.split()
                    if len(parts) == 2:
                        info.storage_type = "HDD" if parts[1] == "1" else "SSD"
                        break

            # dd prints its throughput summary on stderr, so run it via
            # subprocess directly rather than _run_cmd (stdout-only).
            # A previous version also ran dd a first time through _run_cmd
            # and discarded the result — that dead duplicate run is removed.
            try:
                result = subprocess.run(
                    ["dd", "if=/dev/zero", "of=/dev/null", "bs=1M", "count=256"],
                    capture_output=True, text=True, timeout=15,
                )
                m = re.search(r"([\d.]+)\s*(GB|MB)/s", result.stderr)
                if m:
                    speed = float(m.group(1))
                    if m.group(2) == "GB":
                        speed *= 1000
                    info.sequential_read_mbps = round(speed, 0)
            except Exception:
                pass  # best-effort; leave sequential_read_mbps as None
        elif system == "Darwin":
            info.storage_type = "SSD"  # Modern Macs use NVMe SSDs
    except Exception:
        logger.debug("Storage info extraction partially failed", exc_info=True)
    return info
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
@lru_cache(maxsize=1)
def get_hardware_info() -> HardwareInfo:
    """Aggregate all hardware info into one snapshot (cached after first call)."""
    import torch

    cuda_version = torch.version.cuda if torch.cuda.is_available() else None
    return HardwareInfo(
        cpu=get_cpu_info(),
        memory=get_memory_info(),
        gpus=get_gpu_info(),
        storage=get_storage_info(),
        system=f"{platform.system()} {platform.release()}",
        python_version=platform.python_version(),
        torch_version=torch.__version__,
        cuda_runtime_version=cuda_version,
    )
|
utils/profiler.py
ADDED
|
@@ -0,0 +1,451 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Per-frame GPU/CPU profiling for detection and segmentation pipelines.
|
| 2 |
+
|
| 3 |
+
Provides CUDA event-based timing and decomposed profiling for
|
| 4 |
+
transformers-based and opaque (YOLO) detectors. Runs in a dedicated
|
| 5 |
+
single-threaded path for accurate, reproducible measurements.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import statistics
|
| 10 |
+
import time
|
| 11 |
+
from dataclasses import dataclass, field
|
| 12 |
+
from typing import Dict, List, Optional, Sequence
|
| 13 |
+
|
| 14 |
+
import cv2
|
| 15 |
+
import numpy as np
|
| 16 |
+
import torch
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
# Detectors whose predict() can be decomposed into separate, individually
# timed phases: processor -> model forward -> post_process.
_DECOMPOSABLE_DETECTORS = {"detr_resnet50", "grounding_dino"}
# Detectors with opaque predict() calls (YOLO-based); these can only be
# timed end-to-end, without a per-phase breakdown.
_OPAQUE_DETECTORS = {"hf_yolov8", "drone_yolo"}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class TimingStats:
    """Aggregate statistics for a set of measurements (in ms)."""
    min_ms: float = 0.0
    max_ms: float = 0.0
    mean_ms: float = 0.0
    std_ms: float = 0.0
    p50_ms: float = 0.0
    p95_ms: float = 0.0
    p99_ms: float = 0.0
    count: int = 0

    @staticmethod
    def from_samples(samples: List[float]) -> "TimingStats":
        """Build aggregate stats from raw samples; all-zero stats when empty.

        Percentiles are taken by direct index into the sorted samples
        (p50 at n//2, p95/p99 at floor(n * q)).
        """
        if not samples:
            return TimingStats()
        ordered = sorted(samples)
        total = len(ordered)
        spread = statistics.stdev(ordered) if total > 1 else 0.0
        return TimingStats(
            min_ms=ordered[0],
            max_ms=ordered[-1],
            mean_ms=statistics.mean(ordered),
            std_ms=spread,
            p50_ms=ordered[total // 2],
            p95_ms=ordered[int(total * 0.95)],
            p99_ms=ordered[int(total * 0.99)],
            count=total,
        )
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@dataclass
class FrameTiming:
    """Timing breakdown for a single frame (all values in ms)."""
    frame_idx: int = 0  # index of the frame within the video
    decode_ms: float = 0.0  # CPU: video frame decode
    preprocess_ms: float = 0.0  # CPU: image processor / resize
    transfer_ms: float = 0.0  # CPU->GPU data transfer
    gpu_kernel_ms: float = 0.0  # GPU model forward pass
    postprocess_ms: float = 0.0  # CPU: post-processing + NMS
    total_ms: float = 0.0  # end-to-end wall time for this frame
    num_detections: int = 0  # detections produced for this frame
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@dataclass
class ProfilingResult:
    """Full profiling result for a video, serialized by the /benchmark APIs."""
    detector_name: str = ""  # detector/segmenter key that was profiled
    mode: str = ""  # "detection" or "segmentation"
    total_frames: int = 0  # frames in the source video
    warmup_frames: int = 0  # frames excluded from stats (detection only)
    profiled_frames: int = 0  # frames actually measured
    video_resolution: str = ""  # e.g. "1920x1080"
    video_fps: float = 0.0  # source video frame rate

    # Per-frame timings (one FrameTiming per profiled frame)
    frame_timings: List[FrameTiming] = field(default_factory=list)

    # Aggregate stats per pipeline phase, computed over frame_timings
    decode_stats: TimingStats = field(default_factory=TimingStats)
    preprocess_stats: TimingStats = field(default_factory=TimingStats)
    transfer_stats: TimingStats = field(default_factory=TimingStats)
    gpu_kernel_stats: TimingStats = field(default_factory=TimingStats)
    postprocess_stats: TimingStats = field(default_factory=TimingStats)
    total_stats: TimingStats = field(default_factory=TimingStats)

    # GPU memory (MB)
    gpu_peak_memory_mb: float = 0.0
    gpu_allocated_mb: float = 0.0

    # Throughput
    avg_fps: float = 0.0  # achieved processing rate, not the video's fps
    avg_detections_per_frame: float = 0.0
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class CudaTimer:
    """Non-blocking GPU timer using CUDA events, with a CPU fallback.

    On CUDA hosts, records start/stop events on the current CUDA stream and
    synchronizes lazily on ``elapsed_ms()`` so timing does not serialize the
    stream prematurely. On CPU-only hosts (where ``torch.cuda.Event`` cannot
    be used), falls back to ``time.perf_counter`` wall-clock timing instead
    of raising — the profiling path stays usable everywhere.
    """

    def __init__(self):
        # CUDA events require a working CUDA runtime; guard so profiling
        # does not crash on machines without a GPU.
        self._use_cuda = torch.cuda.is_available()
        if self._use_cuda:
            self._start = torch.cuda.Event(enable_timing=True)
            self._end = torch.cuda.Event(enable_timing=True)
        else:
            self._t0 = 0.0
            self._t1 = 0.0

    def start(self) -> None:
        """Mark the beginning of the timed region."""
        if self._use_cuda:
            self._start.record()
        else:
            self._t0 = time.perf_counter()

    def stop(self) -> None:
        """Mark the end of the timed region."""
        if self._use_cuda:
            self._end.record()
        else:
            self._t1 = time.perf_counter()

    def elapsed_ms(self) -> float:
        """Return elapsed time in milliseconds (synchronizes on CUDA)."""
        if self._use_cuda:
            # Lazy sync: only block when the measurement is actually read.
            self._end.synchronize()
            return self._start.elapsed_time(self._end)
        return (self._t1 - self._t0) * 1000.0
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def _profile_decomposed(detector, frame: np.ndarray, queries: Sequence[str]) -> FrameTiming:
    """Profile a transformers-based detector with decomposed phases.

    Works for DETR and Grounding DINO where we can separate:
    processor(image) -> .to(device) -> model(**inputs) -> post_process()

    Args:
        detector: Loaded detector wrapper; expected to expose ``processor``,
            ``model``, ``device`` and, for Grounding DINO, ``_build_prompt``
            and ``_post_process``.
        frame: BGR frame as produced by OpenCV.
        queries: Text queries (used only by text-conditioned detectors).

    Returns:
        FrameTiming with preprocess/transfer/kernel/postprocess breakdown.
        ``decode_ms`` is left at 0 (decoding happens outside this function).
    """
    timing = FrameTiming()

    # 1. Preprocess (CPU): color conversion + HF image processor.
    t0 = time.perf_counter()
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    if hasattr(detector, "processor"):
        processor = detector.processor
        if hasattr(detector, "_build_prompt"):
            # Grounding DINO: text-conditioned, needs a prompt string.
            prompt = detector._build_prompt(queries)
            inputs = processor(images=frame_rgb, text=prompt, return_tensors="pt")
        else:
            # DETR: image-only input.
            inputs = processor(images=frame_rgb, return_tensors="pt")
    else:
        # Detector has no separable processor — nothing more we can measure.
        timing.preprocess_ms = (time.perf_counter() - t0) * 1000
        return timing
    timing.preprocess_ms = (time.perf_counter() - t0) * 1000

    # 2. Transfer to GPU (timed with CUDA events so async copies are counted).
    cuda_timer_transfer = CudaTimer()
    cuda_timer_transfer.start()
    inputs = {key: value.to(detector.device) for key, value in inputs.items()}
    cuda_timer_transfer.stop()
    timing.transfer_ms = cuda_timer_transfer.elapsed_ms()

    # 3. GPU forward pass (inference only, no autograd bookkeeping).
    cuda_timer_kernel = CudaTimer()
    cuda_timer_kernel.start()
    with torch.no_grad():
        outputs = detector.model(**inputs)
    cuda_timer_kernel.stop()
    timing.gpu_kernel_ms = cuda_timer_kernel.elapsed_ms()

    # 4. Post-process (CPU): box decoding / thresholding back at image size.
    t0 = time.perf_counter()
    target_sizes = torch.tensor([frame.shape[:2]], device=detector.device)
    if hasattr(detector, "_post_process"):
        # Grounding DINO: needs input_ids to map logits back to phrases.
        processed_list = detector._post_process(outputs, inputs["input_ids"], target_sizes)
    else:
        # DETR: standard HF object-detection post-processing.
        processed_list = detector.processor.post_process_object_detection(
            outputs, threshold=detector.score_threshold, target_sizes=target_sizes,
        )
    result = detector._parse_single_result(processed_list[0])
    timing.postprocess_ms = (time.perf_counter() - t0) * 1000
    timing.num_detections = len(result.boxes)

    # Total excludes decode (measured by the caller, if at all).
    timing.total_ms = timing.preprocess_ms + timing.transfer_ms + timing.gpu_kernel_ms + timing.postprocess_ms
    return timing
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def _profile_opaque(detector, frame: np.ndarray, queries: Sequence[str]) -> FrameTiming:
    """Profile an opaque detector (YOLO) where internals aren't separable.

    The whole ``predict()`` call is wrapped in CUDA events (when a GPU is
    available) plus a wall clock. ``gpu_kernel_ms`` therefore includes
    preprocessing/transfer done inside ``predict``; the CPU remainder
    (wall - kernel) is attributed to ``postprocess_ms``.

    Args:
        detector: Loaded detector exposing ``predict(frame, queries)``.
        frame: BGR frame as produced by OpenCV.
        queries: Text queries forwarded to the detector.

    Returns:
        FrameTiming; ``transfer_ms`` is -1.0 (not separable) and on
        CPU-only hosts ``gpu_kernel_ms`` is 0 with all time in postprocess.
    """
    timing = FrameTiming()

    # CUDA events are only valid with a CUDA runtime; on CPU-only hosts
    # fall back to wall-clock only instead of crashing on event creation.
    cuda_timer = CudaTimer() if torch.cuda.is_available() else None

    t0 = time.perf_counter()
    if cuda_timer is not None:
        cuda_timer.start()
    result = detector.predict(frame, queries)
    if cuda_timer is not None:
        cuda_timer.stop()
    wall_ms = (time.perf_counter() - t0) * 1000

    timing.gpu_kernel_ms = cuda_timer.elapsed_ms() if cuda_timer is not None else 0.0
    timing.preprocess_ms = 0.0  # Included in gpu_kernel
    timing.transfer_ms = -1.0  # Not separable
    # Clamp: event timing and wall clock can disagree by sub-ms jitter.
    timing.postprocess_ms = max(0.0, wall_ms - timing.gpu_kernel_ms)
    timing.total_ms = wall_ms
    timing.num_detections = len(result.boxes)

    return timing
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def run_profiled_detection(
    video_path: str,
    detector_name: str,
    queries: List[str],
    max_frames: int = 100,
    warmup_frames: int = 5,
) -> ProfilingResult:
    """Run profiled detection on a video file.

    Single-threaded profiling path (not injected into the multi-threaded
    production pipeline) for accurate, reproducible measurements.

    Args:
        video_path: Path to the input video.
        detector_name: Registry name passed to ``load_detector``.
        queries: Text queries forwarded to the detector.
        max_frames: Hard cap on frames processed (warmup included).
        warmup_frames: Leading frames run but excluded from statistics
            (lets cudnn autotune / lazy init settle).

    Returns:
        ProfilingResult with per-frame timings, aggregate stats, GPU peak
        memory, and throughput figures.
    """
    from models.model_loader import load_detector
    from utils.video import VideoReader

    result = ProfilingResult(
        detector_name=detector_name,
        mode="detection",
        warmup_frames=warmup_frames,
    )

    # Load detector and determine whether GPU timing is meaningful.
    detector = load_detector(detector_name)
    device = getattr(detector, "device", None)
    has_cuda = device is not None and str(device).startswith("cuda")
    if not has_cuda:
        logger.warning("No CUDA device found for profiling; GPU timings will be 0")

    # Open video and capture metadata.
    reader = VideoReader(video_path)
    result.video_resolution = f"{reader.width}x{reader.height}"
    result.video_fps = reader.fps

    is_decomposable = detector_name in _DECOMPOSABLE_DETECTORS

    # Reset peak-memory tracking so gpu_peak_memory_mb reflects this run only.
    if has_cuda:
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    frame_timings: List[FrameTiming] = []
    frame_idx = 0

    for frame in reader:
        if frame_idx >= max_frames:
            break

        if frame_idx < warmup_frames:
            # Warmup: run prediction but don't record.
            if is_decomposable:
                _profile_decomposed(detector, frame, queries)
            else:
                _profile_opaque(detector, frame, queries)
            frame_idx += 1
            continue

        # Profile prediction with the appropriate strategy.
        if is_decomposable:
            timing = _profile_decomposed(detector, frame, queries)
        else:
            timing = _profile_opaque(detector, frame, queries)
        timing.frame_idx = frame_idx

        # VideoReader pre-decodes frames, so decode cost is invisible here.
        # Measure a representative decode once (at the first profiled frame)
        # by re-reading with a fresh capture — note this reads frame 0, not
        # the current frame, so it is an approximation — then reuse that
        # cost for subsequent frames so decode_stats is not skewed by zeros.
        if frame_idx == warmup_frames:
            cap = cv2.VideoCapture(video_path)
            if cap.isOpened():
                td0 = time.perf_counter()
                cap.read()
                timing.decode_ms = (time.perf_counter() - td0) * 1000
            cap.release()
        elif frame_timings:
            timing.decode_ms = frame_timings[0].decode_ms

        frame_timings.append(timing)
        frame_idx += 1

    reader.close()

    # Aggregate results over the profiled (non-warmup) frames.
    result.total_frames = frame_idx
    result.profiled_frames = len(frame_timings)
    result.frame_timings = frame_timings

    if frame_timings:
        result.decode_stats = TimingStats.from_samples([t.decode_ms for t in frame_timings])
        result.preprocess_stats = TimingStats.from_samples([t.preprocess_ms for t in frame_timings])
        # transfer_ms == -1.0 marks "not separable"; exclude those samples.
        transfer_samples = [t.transfer_ms for t in frame_timings if t.transfer_ms >= 0]
        result.transfer_stats = TimingStats.from_samples(transfer_samples)
        result.gpu_kernel_stats = TimingStats.from_samples([t.gpu_kernel_ms for t in frame_timings])
        result.postprocess_stats = TimingStats.from_samples([t.postprocess_ms for t in frame_timings])
        result.total_stats = TimingStats.from_samples([t.total_ms for t in frame_timings])

        result.avg_fps = 1000.0 / result.total_stats.mean_ms if result.total_stats.mean_ms > 0 else 0
        result.avg_detections_per_frame = statistics.mean([t.num_detections for t in frame_timings])

    # GPU memory high-water mark for the whole run.
    if has_cuda:
        torch.cuda.synchronize()
        result.gpu_peak_memory_mb = round(torch.cuda.max_memory_allocated() / (1024 ** 2), 1)
        result.gpu_allocated_mb = round(torch.cuda.memory_allocated() / (1024 ** 2), 1)

    return result
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def run_profiled_segmentation(
    video_path: str,
    segmenter_name: str,
    queries: List[str],
    max_frames: int = 100,
    step: int = 20,
) -> ProfilingResult:
    """Run profiled segmentation (GSAM2) on a video file.

    Profiles the GSAM2 stages: GDINO keyframe detection,
    SAM2 image prediction, SAM2 video propagation.

    Unlike detection profiling, per-frame numbers here are *synthetic*:
    the pipeline reports aggregate stage totals via ``_perf_metrics``,
    which are spread evenly over keyframes / propagated frames.

    Args:
        video_path: Path to the input video.
        segmenter_name: Registry name of the GSAM2 variant.
        queries: Text queries forwarded to the pipeline.
        max_frames: Cap on frames processed.
        step: Keyframe interval (every ``step``-th frame runs GDINO + SAM2 image).

    Returns:
        ProfilingResult with synthetic per-frame timings plus the raw
        GSAM2 stage metrics attached as ``_gsam2_metrics``.

    Raises:
        ValueError: If the video cannot be opened.
    """
    import tempfile
    import os

    result = ProfilingResult(
        detector_name=segmenter_name,
        mode="segmentation",
        warmup_frames=0,
    )

    # Open video only to read metadata, then release immediately.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Cannot open video: {video_path}")
    result.video_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    result.video_resolution = f"{int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))}x{int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))}"
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()

    result.total_frames = min(total, max_frames)

    has_cuda = torch.cuda.is_available()
    if has_cuda:
        # Track peak memory for this run only.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    # Run GSAM2 with perf metrics. The pipeline fills this dict in place;
    # the lock guards concurrent updates from its worker threads.
    import threading

    metrics = {
        "end_to_end_ms": 0.0,
        "frame_extraction_ms": 0.0,
        "model_load_ms": 0.0,
        "init_state_ms": 0.0,
        "tracking_total_ms": 0.0,
        "gdino_total_ms": 0.0,
        "sam_image_total_ms": 0.0,
        "sam_video_total_ms": 0.0,
        "id_reconciliation_ms": 0.0,
        "render_total_ms": 0.0,
        "writer_total_ms": 0.0,
        "gpu_peak_mem_mb": 0.0,
    }
    lock = threading.Lock()

    # The pipeline requires an output path; we only need the timings,
    # so write to a temp file and delete it afterwards.
    fd, output_path = tempfile.mkstemp(prefix="profile_seg_", suffix=".mp4")
    os.close(fd)

    try:
        from inference import run_grounded_sam2_tracking
        run_grounded_sam2_tracking(
            video_path,
            output_path,
            queries,
            segmenter_name=segmenter_name,
            step=step,
            enable_gpt=False,
            max_frames=max_frames,
            _perf_metrics=metrics,
            _perf_lock=lock,
        )
    except Exception as e:
        logger.error("Profiled segmentation failed: %s", e)
        raise
    finally:
        try:
            os.remove(output_path)
        except OSError:
            pass

    # Convert GSAM2 aggregate metrics to a FrameTiming-like structure.
    n_frames = result.total_frames
    n_keyframes = max(1, n_frames // step)

    # Create synthetic per-frame timings: keyframes carry GDINO + SAM2-image
    # cost, all other frames carry the SAM2 video-propagation cost.
    if n_frames > 0:
        avg_gdino = metrics["gdino_total_ms"] / n_keyframes if n_keyframes else 0
        avg_sam_img = metrics["sam_image_total_ms"] / n_keyframes if n_keyframes else 0
        avg_sam_vid = metrics["sam_video_total_ms"] / max(1, n_frames - n_keyframes)
        avg_render = metrics["render_total_ms"] / n_frames

        for i in range(n_frames):
            ft = FrameTiming(frame_idx=i)
            is_keyframe = (i % step == 0)
            if is_keyframe:
                # GDINO detection mapped onto "preprocess" for the roofline.
                ft.preprocess_ms = avg_gdino
                ft.gpu_kernel_ms = avg_sam_img
            else:
                ft.gpu_kernel_ms = avg_sam_vid
            ft.postprocess_ms = avg_render
            ft.decode_ms = metrics["frame_extraction_ms"] / n_frames
            ft.total_ms = ft.decode_ms + ft.preprocess_ms + ft.gpu_kernel_ms + ft.postprocess_ms
            result.frame_timings.append(ft)

    result.profiled_frames = len(result.frame_timings)

    if result.frame_timings:
        result.decode_stats = TimingStats.from_samples([t.decode_ms for t in result.frame_timings])
        result.preprocess_stats = TimingStats.from_samples([t.preprocess_ms for t in result.frame_timings])
        result.gpu_kernel_stats = TimingStats.from_samples([t.gpu_kernel_ms for t in result.frame_timings])
        result.postprocess_stats = TimingStats.from_samples([t.postprocess_ms for t in result.frame_timings])
        result.total_stats = TimingStats.from_samples([t.total_ms for t in result.frame_timings])
        result.avg_fps = 1000.0 / result.total_stats.mean_ms if result.total_stats.mean_ms > 0 else 0

    # Additional GSAM2-specific metrics stored as dynamic metadata;
    # read back via getattr in roofline.compute_roofline.
    result._gsam2_metrics = metrics  # type: ignore[attr-defined]

    if has_cuda:
        torch.cuda.synchronize()
        # Prefer whichever peak is larger: our counter or the pipeline's own.
        result.gpu_peak_memory_mb = max(
            round(torch.cuda.max_memory_allocated() / (1024 ** 2), 1),
            metrics.get("gpu_peak_mem_mb", 0),
        )
        result.gpu_allocated_mb = round(torch.cuda.memory_allocated() / (1024 ** 2), 1)

    return result
|
utils/roofline.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Roofline model analysis for detection and segmentation pipelines.
|
| 2 |
+
|
| 3 |
+
Computes theoretical maximum throughput, identifies bottlenecks, and
|
| 4 |
+
provides actionable recommendations based on hardware specs and
|
| 5 |
+
profiling measurements.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import Dict, List, Optional
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
# Approximate GFLOPs per forward pass at reference resolution (640x480 for YOLO, 800x800 for DETR).
# These are rough estimates; actual FLOPs depend on input resolution and model variant.
# NOTE(review): figures below are ballpark numbers for roofline classification
# only — verify against the exact checkpoints in use before trusting absolutes.
_MODEL_FLOPS: Dict[str, float] = {
    # Detection models (GFLOPs per frame)
    "hf_yolov8": 78.9,        # YOLOv8m ~79 GFLOPs at 640px
    "detr_resnet50": 86.0,    # DETR-R50 ~86 GFLOPs at 800px
    "grounding_dino": 172.0,  # Grounding DINO-B ~172 GFLOPs
    "drone_yolo": 78.9,       # Same arch as YOLOv8m

    # Segmentation models (GFLOPs per keyframe)
    "gsam2_small": 48.0,   # SAM2 small encoder
    "gsam2_base": 96.0,    # SAM2 base encoder
    "gsam2_large": 200.0,  # SAM2 large encoder
    "gsam2_tiny": 24.0,    # SAM2 tiny encoder
}

# Approximate bytes moved per forward pass (weights + activations + I/O).
# Used with _MODEL_FLOPS to estimate operational intensity (FLOP/byte).
_MODEL_BYTES: Dict[str, float] = {
    # In MB — approximate weight size + activation memory
    "hf_yolov8": 52.0,
    "detr_resnet50": 166.0,
    "grounding_dino": 340.0,
    "drone_yolo": 52.0,
    "gsam2_small": 92.0,
    "gsam2_base": 180.0,
    "gsam2_large": 400.0,
    "gsam2_tiny": 46.0,
}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class BottleneckBreakdown:
    """Per-phase bottleneck identification."""
    phase: str = ""        # "decode", "preprocess", "transfer", "gpu_kernel", "postprocess"
    time_ms: float = 0.0   # mean time spent in this phase per frame
    fraction: float = 0.0  # Fraction of total pipeline time (0..1)
    is_bottleneck: bool = False  # True for exactly one phase: the slowest
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
class RooflineResult:
    """Complete roofline analysis output."""
    # Hardware ceilings (taken from the first GPU)
    peak_fp32_tflops: float = 0.0
    peak_fp16_tflops: float = 0.0
    peak_memory_bandwidth_gbps: float = 0.0
    ridge_point_flop_per_byte: float = 0.0  # = peak_tflops / peak_bw; compute vs memory roof crossover

    # Workload characteristics (from the _MODEL_FLOPS / _MODEL_BYTES tables)
    model_name: str = ""
    model_gflops: float = 0.0
    model_bytes_mb: float = 0.0
    operational_intensity: float = 0.0  # FLOPs / bytes_moved

    # Achieved performance (derived from measured kernel time)
    achieved_tflops: float = 0.0
    achieved_bandwidth_gbps: float = 0.0

    # Bottleneck analysis
    primary_bottleneck: str = ""  # e.g. "decode-bound", "transfer-bound", "memory-bound", "compute-bound"
    bottleneck_explanation: str = ""  # human-readable rationale with numbers
    phase_breakdown: List[BottleneckBreakdown] = field(default_factory=list)

    # Throughput
    theoretical_max_fps: float = 0.0  # 1000 / slowest phase (perfect overlap assumption)
    observed_fps: float = 0.0
    utilization_pct: float = 0.0      # observed / theoretical * 100

    # GPU memory
    gpu_peak_memory_mb: float = 0.0
    gpu_vram_total_mb: float = 0.0
    memory_utilization_pct: float = 0.0

    # Actionable recommendations derived from the classification above
    recommendations: List[str] = field(default_factory=list)

    # GSAM2-specific metrics (populated for segmentation mode)
    gsam2_metrics: Optional[Dict] = None
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def compute_roofline(hardware, profiling) -> RooflineResult:
    """Compute roofline analysis from hardware info and profiling results.

    Args:
        hardware: HardwareInfo dataclass from hardware_info.py
        profiling: ProfilingResult dataclass from profiler.py

    Returns:
        RooflineResult with theoretical ceilings, achieved performance,
        bottleneck identification, and recommendations.
    """
    result = RooflineResult()
    result.model_name = profiling.detector_name

    # --- Hardware ceilings (use first GPU) ---
    if hardware.gpus:
        gpu = hardware.gpus[0]
        # `or 0.0` guards spec fields that hardware extraction left as None.
        result.peak_fp32_tflops = gpu.fp32_tflops or 0.0
        result.peak_fp16_tflops = gpu.fp16_tflops or 0.0
        result.peak_memory_bandwidth_gbps = gpu.memory_bandwidth_gbps or 0.0
        if gpu.vram_total_gb:
            result.gpu_vram_total_mb = gpu.vram_total_gb * 1024
    else:
        logger.warning("No GPU info available; roofline will have zero ceilings")

    # Ridge point: where compute and memory roofs intersect
    if result.peak_memory_bandwidth_gbps > 0:
        # peak_tflops / peak_bw (TB/s) = FLOPs/byte
        peak_tbps = result.peak_memory_bandwidth_gbps / 1000  # GB/s -> TB/s
        if peak_tbps > 0:
            result.ridge_point_flop_per_byte = result.peak_fp32_tflops / peak_tbps

    # --- Workload characteristics (static lookup tables; 0.0 if unknown model) ---
    model_key = profiling.detector_name
    result.model_gflops = _MODEL_FLOPS.get(model_key, 0.0)
    result.model_bytes_mb = _MODEL_BYTES.get(model_key, 0.0)

    if result.model_bytes_mb > 0:
        # Operational intensity = FLOPs / bytes_moved
        bytes_moved = result.model_bytes_mb * 1e6  # MB -> bytes
        flops = result.model_gflops * 1e9  # GFLOPs -> FLOPs
        result.operational_intensity = flops / bytes_moved if bytes_moved > 0 else 0

    # --- Achieved performance (from the measured mean kernel time) ---
    gpu_kernel_ms = profiling.gpu_kernel_stats.mean_ms if profiling.gpu_kernel_stats.count > 0 else 0
    if gpu_kernel_ms > 0 and result.model_gflops > 0:
        # Achieved TFLOPS = GFLOPs / (kernel_time_s)
        kernel_time_s = gpu_kernel_ms / 1000
        result.achieved_tflops = round(result.model_gflops / kernel_time_s / 1000, 4)

    if gpu_kernel_ms > 0 and result.model_bytes_mb > 0:
        kernel_time_s = gpu_kernel_ms / 1000
        result.achieved_bandwidth_gbps = round(result.model_bytes_mb / kernel_time_s / 1000, 2)

    # --- Per-phase bottleneck breakdown ---
    phases = [
        ("decode", profiling.decode_stats.mean_ms),
        ("preprocess", profiling.preprocess_stats.mean_ms),
    ]
    # Only include transfer if we have valid measurements
    # (opaque detectors mark transfer as not separable).
    if profiling.transfer_stats.count > 0 and profiling.transfer_stats.mean_ms >= 0:
        phases.append(("transfer", profiling.transfer_stats.mean_ms))
    phases.extend([
        ("gpu_kernel", profiling.gpu_kernel_stats.mean_ms),
        ("postprocess", profiling.postprocess_stats.mean_ms),
    ])

    total_phase_ms = sum(ms for _, ms in phases)
    max_phase_name = ""
    max_phase_ms = 0

    # Build the breakdown and track the slowest phase in one pass.
    for name, ms in phases:
        bb = BottleneckBreakdown(
            phase=name,
            time_ms=round(ms, 3),
            fraction=round(ms / total_phase_ms, 4) if total_phase_ms > 0 else 0,
        )
        if ms > max_phase_ms:
            max_phase_ms = ms
            max_phase_name = name
        result.phase_breakdown.append(bb)

    # Mark bottleneck phase
    for bb in result.phase_breakdown:
        if bb.phase == max_phase_name:
            bb.is_bottleneck = True

    # --- Primary bottleneck classification ---
    if max_phase_name == "decode":
        result.primary_bottleneck = "decode-bound"
        result.bottleneck_explanation = (
            f"Video decoding ({max_phase_ms:.1f}ms) is the slowest phase. "
            "GPU is waiting for frames. Consider hardware-accelerated decoding (NVDEC) "
            "or reducing input resolution."
        )
    elif max_phase_name == "transfer":
        result.primary_bottleneck = "transfer-bound"
        result.bottleneck_explanation = (
            f"CPU->GPU data transfer ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider using pinned memory, reducing input tensor size, or "
            "overlapping transfer with computation."
        )
    elif max_phase_name == "gpu_kernel":
        # Sub-classify: memory-bound vs compute-bound via the roofline model —
        # below the ridge point, bandwidth limits; above it, compute limits.
        if result.operational_intensity > 0 and result.ridge_point_flop_per_byte > 0:
            if result.operational_intensity < result.ridge_point_flop_per_byte:
                result.primary_bottleneck = "memory-bound"
                result.bottleneck_explanation = (
                    f"GPU kernel ({max_phase_ms:.1f}ms) is memory-bandwidth limited. "
                    f"Operational intensity ({result.operational_intensity:.1f} FLOP/byte) "
                    f"is below the ridge point ({result.ridge_point_flop_per_byte:.1f} FLOP/byte). "
                    "Consider model quantization (FP16/INT8), reducing batch size, "
                    "or using a more compute-dense model."
                )
            else:
                result.primary_bottleneck = "compute-bound"
                # The else arm of the conditional avoids dividing by a zero peak.
                result.bottleneck_explanation = (
                    f"GPU kernel ({max_phase_ms:.1f}ms) is compute-limited. "
                    f"Achieved {result.achieved_tflops:.2f} TFLOPS out of "
                    f"{result.peak_fp32_tflops:.2f} TFLOPS peak "
                    f"({result.achieved_tflops / result.peak_fp32_tflops * 100:.1f}% utilization). "
                    "Consider FP16 inference, TensorRT optimization, or a smaller model."
                    if result.peak_fp32_tflops > 0
                    else "Consider a faster GPU or a smaller model."
                )
        else:
            # No FLOP/byte data for this model — default to compute-bound.
            result.primary_bottleneck = "compute-bound"
            result.bottleneck_explanation = (
                f"GPU kernel ({max_phase_ms:.1f}ms) dominates pipeline time."
            )
    elif max_phase_name == "preprocess":
        result.primary_bottleneck = "preprocess-bound"
        result.bottleneck_explanation = (
            f"CPU preprocessing ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider GPU-accelerated preprocessing or reducing input resolution."
        )
    elif max_phase_name == "postprocess":
        result.primary_bottleneck = "postprocess-bound"
        result.bottleneck_explanation = (
            f"CPU post-processing/NMS ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider batched NMS on GPU or raising the confidence threshold."
        )
    else:
        result.primary_bottleneck = "unknown"
        result.bottleneck_explanation = "Unable to determine primary bottleneck."

    # --- Throughput ---
    # Theoretical max FPS = 1000 / max(phase_times)
    # (assumes perfect overlap of all other phases with the slowest one).
    if max_phase_ms > 0:
        result.theoretical_max_fps = round(1000 / max_phase_ms, 2)
    result.observed_fps = round(profiling.avg_fps, 2)
    if result.theoretical_max_fps > 0:
        result.utilization_pct = round(result.observed_fps / result.theoretical_max_fps * 100, 1)

    # --- GPU memory ---
    result.gpu_peak_memory_mb = profiling.gpu_peak_memory_mb
    if result.gpu_vram_total_mb > 0:
        result.memory_utilization_pct = round(
            result.gpu_peak_memory_mb / result.gpu_vram_total_mb * 100, 1
        )

    # --- GSAM2 metrics (dynamic attribute set by run_profiled_segmentation) ---
    gsam2_metrics = getattr(profiling, "_gsam2_metrics", None)
    if gsam2_metrics:
        result.gsam2_metrics = gsam2_metrics

    # --- Recommendations ---
    recs = []

    # Bottleneck-specific recommendations
    if result.primary_bottleneck == "decode-bound":
        recs.append("Use NVIDIA NVDEC for hardware-accelerated video decoding")
        recs.append("Reduce input video resolution before processing")
    elif result.primary_bottleneck == "transfer-bound":
        recs.append("Use torch.cuda pinned memory for faster CPU->GPU transfers")
        recs.append("Pre-allocate GPU tensors and reuse across frames")
    elif result.primary_bottleneck == "memory-bound":
        recs.append("Enable FP16 (half-precision) inference to reduce memory bandwidth pressure")
        recs.append("Consider INT8 quantization via TensorRT for further speedup")
    elif result.primary_bottleneck == "compute-bound":
        recs.append("Enable FP16 inference (2x theoretical throughput on Volta+ GPUs)")
        recs.append("Consider TensorRT or torch.compile() for kernel fusion")
        if result.peak_fp32_tflops > 0 and result.achieved_tflops / result.peak_fp32_tflops < 0.3:
            recs.append("Low GPU utilization — consider increasing batch size or using a multi-stream pipeline")

    # General recommendations
    if result.memory_utilization_pct > 80:
        recs.append(f"GPU memory utilization is high ({result.memory_utilization_pct:.0f}%); "
                    "reduce batch size or use gradient checkpointing to avoid OOM")
    elif result.memory_utilization_pct > 0 and result.memory_utilization_pct < 30:
        recs.append(f"GPU memory utilization is low ({result.memory_utilization_pct:.0f}%); "
                    "consider processing multiple streams or increasing batch size")

    if profiling.mode == "detection" and profiling.avg_fps < profiling.video_fps:
        recs.append(
            f"Processing speed ({profiling.avg_fps:.1f} FPS) is below video frame rate "
            f"({profiling.video_fps:.1f} FPS); consider frame skipping or a faster model"
        )

    result.recommendations = recs
    return result
|