Zhen Ye Claude Opus 4.6 committed on
Commit
078b447
·
1 Parent(s): 0ace9ca

feat: add benchmark profiler & roofline analysis system

Browse files

Add hardware extraction, per-frame GPU/CPU profiling with CUDA events,
and automated roofline analysis for detection and segmentation modes.

New endpoints:
- GET /benchmark/hardware — cached hardware specs
- POST /benchmark/profile — per-frame timing breakdown
- POST /benchmark/analysis — full roofline with bottleneck ID

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (5) hide show
  1. app.py +141 -0
  2. requirements.txt +1 -0
  3. utils/hardware_info.py +385 -0
  4. utils/profiler.py +451 -0
  5. utils/roofline.py +294 -0
app.py CHANGED
@@ -1017,5 +1017,146 @@ async def gpu_monitor_endpoint(duration: int = 180, interval: int = 1):
1017
  return StreamingResponse(_stream(), media_type="text/plain")
1018
 
1019
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1020
  if __name__ == "__main__":
1021
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
1017
  return StreamingResponse(_stream(), media_type="text/plain")
1018
 
1019
 
1020
+ # ---------------------------------------------------------------------------
1021
+ # Benchmark Profiler & Roofline Analysis Endpoints
1022
+ # ---------------------------------------------------------------------------
1023
+
1024
@app.get("/benchmark/hardware")
async def benchmark_hardware():
    """Return cached hardware specs as JSON (no video upload required)."""
    import dataclasses

    from utils.hardware_info import get_hardware_info

    # get_hardware_info() is lru_cached; run it in a worker thread so the
    # first (uncached, probing) call does not block the event loop.
    specs = await asyncio.to_thread(get_hardware_info)
    return JSONResponse(dataclasses.asdict(specs))
1032
+
1033
+
1034
@app.post("/benchmark/profile")
async def benchmark_profile(
    video: UploadFile = File(...),
    mode: str = Form("detection"),
    detector: str = Form("hf_yolov8"),
    segmenter: str = Form("gsam2_large"),
    queries: str = Form("person,car,truck"),
    max_frames: int = Form(100),
    warmup_frames: int = Form(5),
    step: int = Form(20),
):
    """Run profiled inference and return per-frame timing breakdown.

    Args:
        video: Video file to profile.
        mode: "detection" or "segmentation".
        detector: Detector key (for detection mode).
        segmenter: Segmenter key (for segmentation mode).
        queries: Comma-separated object classes.
        max_frames: Maximum frames to profile.
        warmup_frames: Warmup frames (detection only).
        step: Keyframe interval (segmentation only).

    Raises:
        HTTPException: 400 on a bad ``mode``, 500 when profiling fails.
    """
    import dataclasses

    from utils.profiler import run_profiled_detection, run_profiled_segmentation

    if mode not in ("detection", "segmentation"):
        raise HTTPException(status_code=400, detail="mode must be 'detection' or 'segmentation'")

    input_path = _save_upload_to_tmp(video)
    await video.close()

    query_list = [q.strip() for q in queries.split(",") if q.strip()]

    try:
        if mode == "detection":
            result = await asyncio.to_thread(
                run_profiled_detection,
                input_path, detector, query_list,
                max_frames=max_frames, warmup_frames=warmup_frames,
            )
        else:
            result = await asyncio.to_thread(
                run_profiled_segmentation,
                input_path, segmenter, query_list,
                max_frames=max_frames, step=step,
            )
    except Exception as exc:
        logging.exception("Profiling failed")
        # Chain the cause so the original traceback survives in the log.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        # Single cleanup point. The original also deleted inside the
        # except block, which made the finally a redundant second delete.
        _safe_delete(input_path)

    out = dataclasses.asdict(result)
    # run_profiled_segmentation may stash stage metrics on a private
    # attribute; dataclasses.asdict() does not pick those up.
    gsam2 = getattr(result, "_gsam2_metrics", None)
    if gsam2:
        out["gsam2_metrics"] = gsam2
    return JSONResponse(out)
1095
+
1096
+
1097
@app.post("/benchmark/analysis")
async def benchmark_analysis(
    video: UploadFile = File(...),
    mode: str = Form("detection"),
    detector: str = Form("hf_yolov8"),
    segmenter: str = Form("gsam2_large"),
    queries: str = Form("person,car,truck"),
    max_frames: int = Form(100),
    warmup_frames: int = Form(5),
    step: int = Form(20),
):
    """Full roofline analysis: hardware + profiling + theoretical ceilings + bottleneck ID.

    Combines hardware extraction, profiled inference, and roofline model
    to identify bottlenecks and provide actionable recommendations.

    Raises:
        HTTPException: 400 on a bad ``mode``, 500 when any stage fails.
    """
    import dataclasses

    from utils.hardware_info import get_hardware_info
    from utils.profiler import run_profiled_detection, run_profiled_segmentation
    from utils.roofline import compute_roofline

    if mode not in ("detection", "segmentation"):
        raise HTTPException(status_code=400, detail="mode must be 'detection' or 'segmentation'")

    input_path = _save_upload_to_tmp(video)
    await video.close()

    query_list = [q.strip() for q in queries.split(",") if q.strip()]

    try:
        # Hardware specs are lru_cached; only the first call probes the system.
        hardware = await asyncio.to_thread(get_hardware_info)

        if mode == "detection":
            profiling = await asyncio.to_thread(
                run_profiled_detection,
                input_path, detector, query_list,
                max_frames=max_frames, warmup_frames=warmup_frames,
            )
        else:
            profiling = await asyncio.to_thread(
                run_profiled_segmentation,
                input_path, segmenter, query_list,
                max_frames=max_frames, step=step,
            )

        roofline = compute_roofline(hardware, profiling)
    except Exception as exc:
        logging.exception("Benchmark analysis failed")
        # Chain the cause so the original traceback survives in the log.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        # Single cleanup point. The original also deleted inside the
        # except block, double-deleting the temp file.
        _safe_delete(input_path)

    return JSONResponse({
        "hardware": dataclasses.asdict(hardware),
        "profiling": dataclasses.asdict(profiling),
        "roofline": dataclasses.asdict(roofline),
    })
1159
+
1160
+
1161
  if __name__ == "__main__":
1162
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
requirements.txt CHANGED
@@ -13,3 +13,4 @@ sentence-transformers
13
  SAM-2 @ git+https://github.com/facebookresearch/sam2.git
14
  hydra-core>=1.3.2
15
  iopath>=0.1.10
 
 
13
  SAM-2 @ git+https://github.com/facebookresearch/sam2.git
14
  hydra-core>=1.3.2
15
  iopath>=0.1.10
16
+ psutil
utils/hardware_info.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hardware specification extraction for roofline analysis.
2
+
3
+ Extracts CPU, GPU, memory, and storage parameters via system tools
4
+ and torch APIs. All functions have try/except fallbacks returning None
5
+ for inaccessible fields.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import platform
11
+ import re
12
+ import subprocess
13
+ from dataclasses import dataclass, field
14
+ from functools import lru_cache
15
+ from typing import Dict, List, Optional
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # CUDA cores per SM by compute capability (major, minor) -> cores_per_sm
20
+ # Kepler through Blackwell
21
+ _CORES_PER_SM: Dict[tuple, int] = {
22
+ (3, 0): 192, (3, 2): 192, (3, 5): 192, (3, 7): 192, # Kepler
23
+ (5, 0): 128, (5, 2): 128, (5, 3): 128, # Maxwell
24
+ (6, 0): 64, (6, 1): 128, (6, 2): 128, # Pascal
25
+ (7, 0): 64, (7, 2): 64, (7, 5): 64, # Volta / Turing
26
+ (8, 0): 64, (8, 6): 128, (8, 7): 128, (8, 9): 128, # Ampere / Ada
27
+ (9, 0): 128, # Hopper
28
+ (10, 0): 128, # Blackwell
29
+ }
30
+
31
+ # PCIe bandwidth (GB/s, unidirectional) by gen and width
32
+ _PCIE_BW: Dict[int, float] = {
33
+ 3: 0.985, # ~1 GB/s per lane
34
+ 4: 1.969,
35
+ 5: 3.938,
36
+ 6: 7.563,
37
+ }
38
+
39
+
40
@dataclass
class CPUInfo:
    """CPU parameters for roofline analysis; a field stays None when undetected."""
    model: Optional[str] = None            # marketing name, e.g. from lscpu "Model name"
    physical_cores: Optional[int] = None   # real cores (psutil, logical=False)
    logical_cores: Optional[int] = None    # hyperthreads included (os.cpu_count)
    frequency_mhz: Optional[float] = None  # current or max reported frequency
    cache_l2_kb: Optional[int] = None
    cache_l3_kb: Optional[int] = None
    architecture: Optional[str] = None     # platform.machine(), e.g. "x86_64"
49
+
50
+
51
@dataclass
class MemoryInfo:
    """System RAM totals plus a rough bandwidth estimate (None = undetected)."""
    total_gb: Optional[float] = None
    available_gb: Optional[float] = None
    estimated_bandwidth_gbps: Optional[float] = None  # heuristic, not measured
56
+
57
+
58
@dataclass
class GPUInfo:
    """Per-GPU specs and derived roofline ceilings (None = undetected)."""
    index: int = 0                                   # CUDA device ordinal
    name: Optional[str] = None
    sm_count: Optional[int] = None                   # streaming multiprocessors
    cuda_cores: Optional[int] = None                 # sm_count * cores-per-SM table
    clock_mhz: Optional[float] = None                # max graphics clock
    memory_clock_mhz: Optional[float] = None
    memory_bus_width_bits: Optional[int] = None
    vram_total_gb: Optional[float] = None
    vram_free_gb: Optional[float] = None
    memory_bandwidth_gbps: Optional[float] = None    # derived from clock * bus width
    fp32_tflops: Optional[float] = None              # derived: cores * clock * 2 (FMA)
    fp16_tflops: Optional[float] = None
    tensor_core_tflops: Optional[float] = None       # rough multiple of fp32
    pcie_gen: Optional[int] = None
    pcie_width: Optional[int] = None
    pcie_bandwidth_gbps: Optional[float] = None      # derived from gen * lanes
    compute_capability: Optional[str] = None         # "major.minor"
    driver_version: Optional[str] = None
    cuda_version: Optional[str] = None               # max CUDA per nvidia-smi banner
79
+
80
+
81
@dataclass
class StorageInfo:
    """Best-effort storage characteristics."""
    storage_type: Optional[str] = None           # "SSD", "HDD", or "Unknown"
    sequential_read_mbps: Optional[float] = None  # rough throughput estimate
85
+
86
+
87
@dataclass
class HardwareInfo:
    """Aggregate hardware profile returned by get_hardware_info()."""
    cpu: CPUInfo = field(default_factory=CPUInfo)
    memory: MemoryInfo = field(default_factory=MemoryInfo)
    gpus: List[GPUInfo] = field(default_factory=list)   # one entry per CUDA device
    storage: StorageInfo = field(default_factory=StorageInfo)
    system: Optional[str] = None                        # "<OS> <kernel release>"
    python_version: Optional[str] = None
    torch_version: Optional[str] = None
    cuda_runtime_version: Optional[str] = None          # torch.version.cuda, if CUDA available
97
+
98
+
99
def _run_cmd(cmd: List[str], timeout: int = 10) -> Optional[str]:
    """Execute *cmd* and return its stripped stdout, or None on any failure.

    Failure covers: missing binary, timeout, OS error, or a non-zero exit.
    """
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None
    return proc.stdout.strip() if proc.returncode == 0 else None
110
+
111
+
112
def _nvidia_smi_query(*fields: str) -> Optional[Dict[str, str]]:
    """Query nvidia-smi for *fields* and map field name -> value.

    Only the first output line (GPU 0) is parsed; returns None when
    nvidia-smi is unavailable or the output does not match the request.
    """
    joined = ",".join(fields)
    raw = _run_cmd([
        "nvidia-smi",
        f"--query-gpu={joined}",
        "--format=csv,noheader,nounits",
    ])
    if not raw:
        return None
    first_line = raw.split("\n")[0]
    values = [part.strip() for part in first_line.split(",")]
    # Malformed / truncated output: bail rather than mis-associate fields.
    if len(values) != len(fields):
        return None
    return dict(zip(fields, values))
126
+
127
+
128
def _parse_lscpu_cache_kb(value: str) -> Optional[int]:
    """Parse an lscpu cache field (e.g. "4 MiB", "512 KiB") into KiB, or None.

    A single "M" test covers both "MiB" and "M" spellings; the original
    code's `"MiB" in val or "M" in val` was redundant.
    """
    m = re.search(r"([\d.]+)", value)
    if not m:
        return None
    size = float(m.group(1))
    if "M" in value:
        size *= 1024
    return int(size)


def get_cpu_info() -> CPUInfo:
    """Collect CPU model, core counts, frequency, and cache sizes.

    Best-effort: any field that cannot be determined is left as None.
    Uses psutil when available, plus lscpu (Linux) or sysctl (macOS).
    """
    info = CPUInfo()
    try:
        info.architecture = platform.machine()
        info.logical_cores = os.cpu_count()

        try:
            import psutil
            info.physical_cores = psutil.cpu_count(logical=False)
            freq = psutil.cpu_freq()
            if freq:
                # Prefer the live frequency; fall back to the rated max.
                info.frequency_mhz = freq.current or freq.max
        except ImportError:
            pass

        system = platform.system()
        if system == "Linux":
            out = _run_cmd(["lscpu"])
            if out:
                for line in out.split("\n"):
                    if "Model name" in line:
                        info.model = line.split(":", 1)[1].strip()
                    elif "L2 cache" in line:
                        info.cache_l2_kb = _parse_lscpu_cache_kb(line.split(":", 1)[1])
                    elif "L3 cache" in line:
                        info.cache_l3_kb = _parse_lscpu_cache_kb(line.split(":", 1)[1])
        elif system == "Darwin":
            brand = _run_cmd(["sysctl", "-n", "machdep.cpu.brand_string"])
            if brand:
                info.model = brand
            # sysctl reports cache sizes in bytes.
            for key, attr in (("hw.l2cachesize", "cache_l2_kb"),
                              ("hw.l3cachesize", "cache_l3_kb")):
                raw = _run_cmd(["sysctl", "-n", key])
                if raw:
                    try:
                        setattr(info, attr, int(raw) // 1024)
                    except ValueError:
                        pass
    except Exception:
        logger.debug("CPU info extraction partially failed", exc_info=True)
    return info
185
+
186
+
187
def get_memory_info() -> MemoryInfo:
    """Report system RAM totals plus a rough bandwidth estimate.

    Prefers psutil; falls back to /proc/meminfo on Linux without psutil.
    """
    info = MemoryInfo()
    try:
        try:
            import psutil
        except ImportError:
            psutil = None

        if psutil is not None:
            vm = psutil.virtual_memory()
            info.total_gb = round(vm.total / (1024 ** 3), 2)
            info.available_gb = round(vm.available / (1024 ** 3), 2)
        elif os.path.exists("/proc/meminfo"):
            # Linux fallback: values in /proc/meminfo are in KiB.
            with open("/proc/meminfo") as fh:
                for line in fh:
                    if line.startswith("MemTotal:"):
                        info.total_gb = round(int(line.split()[1]) / (1024 ** 2), 2)
                    elif line.startswith("MemAvailable:"):
                        info.available_gb = round(int(line.split()[1]) / (1024 ** 2), 2)

        # Without dmidecode the DIMM type is unknown; assume conservative
        # dual-channel DDR4 (~40 GB/s) rather than guessing DDR5.
        if info.total_gb:
            info.estimated_bandwidth_gbps = 40.0
    except Exception:
        logger.debug("Memory info extraction partially failed", exc_info=True)
    return info
214
+
215
+
216
def _parse_smi_float(raw: str) -> Optional[float]:
    """Parse one nvidia-smi numeric field; None on "[N/A]" or garbage."""
    try:
        return float(raw)
    except (ValueError, TypeError):
        return None


def get_gpu_info() -> List[GPUInfo]:
    """Enumerate CUDA GPUs with specs and derived roofline parameters.

    Combines torch device properties with per-GPU nvidia-smi queries.
    Derived figures (bandwidth, TFLOPS, PCIe) are computed only when
    all of their inputs were successfully read.
    """
    gpus: List[GPUInfo] = []
    try:
        import torch
        if not torch.cuda.is_available():
            return gpus

        device_count = torch.cuda.device_count()

        # Driver / CUDA versions are system-global: query once.
        driver_version = None
        smi_cuda_version = None
        nv = _nvidia_smi_query("driver_version")
        if nv:
            driver_version = nv.get("driver_version")
        # The banner reports the max CUDA runtime the driver supports.
        smi_out = _run_cmd(["nvidia-smi"])
        if smi_out:
            m = re.search(r"CUDA Version:\s+([\d.]+)", smi_out)
            if m:
                smi_cuda_version = m.group(1)

        for i in range(device_count):
            gpu = GPUInfo(index=i)
            props = torch.cuda.get_device_properties(i)

            gpu.name = props.name
            gpu.sm_count = props.multi_processor_count
            # FIX: torch's attribute is total_memory, not total_mem. The old
            # spelling raised AttributeError, which the outer except swallowed,
            # so the whole GPU list silently came back empty.
            gpu.vram_total_gb = round(props.total_memory / (1024 ** 3), 2)
            cc = (props.major, props.minor)
            gpu.compute_capability = f"{props.major}.{props.minor}"
            gpu.driver_version = driver_version
            gpu.cuda_version = smi_cuda_version

            cores_per_sm = _CORES_PER_SM.get(cc)
            if cores_per_sm and gpu.sm_count:
                gpu.cuda_cores = gpu.sm_count * cores_per_sm

            # Per-GPU queries: clocks, bus width, PCIe link, free VRAM.
            nv_data = _run_cmd([
                "nvidia-smi",
                f"--id={i}",
                "--query-gpu=clocks.max.graphics,clocks.max.memory,memory.bus_width,pcie.link.gen.current,pcie.link.width.current,memory.free",
                "--format=csv,noheader,nounits",
            ])
            if nv_data:
                parts = [p.strip() for p in nv_data.split(",")]
                if len(parts) >= 6:
                    gpu.clock_mhz = _parse_smi_float(parts[0])
                    gpu.memory_clock_mhz = _parse_smi_float(parts[1])
                    bus = _parse_smi_float(parts[2])
                    gpu.memory_bus_width_bits = int(bus) if bus is not None else None
                    gen = _parse_smi_float(parts[3])
                    gpu.pcie_gen = int(gen) if gen is not None else None
                    width = _parse_smi_float(parts[4])
                    gpu.pcie_width = int(width) if width is not None else None
                    free_mib = _parse_smi_float(parts[5])
                    gpu.vram_free_gb = round(free_mib / 1024, 2) if free_mib is not None else None

            # Derived: memory bandwidth, GB/s (DDR: 2 transfers per clock).
            if gpu.memory_clock_mhz and gpu.memory_bus_width_bits:
                gpu.memory_bandwidth_gbps = round(
                    gpu.memory_clock_mhz * gpu.memory_bus_width_bits * 2 / 8 / 1000, 1
                )

            # Derived: FP32 TFLOPS = cores * clock * 2 (FMA) / 1e6.
            if gpu.cuda_cores and gpu.clock_mhz:
                gpu.fp32_tflops = round(gpu.cuda_cores * gpu.clock_mhz * 2 / 1e6, 2)
                # FP16 rate is roughly 2x FP32 from Volta (cc 7.x) onward.
                if props.major >= 7:
                    gpu.fp16_tflops = round(gpu.fp32_tflops * 2, 2)
                else:
                    gpu.fp16_tflops = gpu.fp32_tflops

            # Rough tensor-core ceiling: 8x FP32 on Ampere+, 4x Volta/Turing.
            if gpu.fp32_tflops:
                if props.major >= 8:
                    gpu.tensor_core_tflops = round(gpu.fp32_tflops * 8, 2)
                elif props.major >= 7:
                    gpu.tensor_core_tflops = round(gpu.fp32_tflops * 4, 2)

            # Derived: PCIe bandwidth = per-lane rate * lane count.
            if gpu.pcie_gen and gpu.pcie_width:
                per_lane = _PCIE_BW.get(gpu.pcie_gen, 0)
                gpu.pcie_bandwidth_gbps = round(per_lane * gpu.pcie_width, 2)

            gpus.append(gpu)

    except Exception:
        logger.debug("GPU info extraction partially failed", exc_info=True)
    return gpus
325
+
326
+
327
def get_storage_info() -> StorageInfo:
    """Detect storage type (SSD/HDD) and estimate throughput.

    NOTE(review): the dd test copies /dev/zero -> /dev/null, which measures
    memory/pipe throughput, not actual disk reads — treat the number as an
    upper bound. A true disk benchmark would read a real file with cold
    caches; confirm whether that precision is needed here.
    """
    info = StorageInfo()
    try:
        system = platform.system()
        if system == "Linux":
            # lsblk ROTA=1 means rotational (HDD); first block device wins.
            out = _run_cmd(["lsblk", "-d", "-o", "NAME,ROTA", "--noheadings"])
            if out:
                for line in out.strip().split("\n"):
                    parts = line.split()
                    if len(parts) == 2:
                        info.storage_type = "HDD" if parts[1] == "1" else "SSD"
                        break

            # dd reports throughput on stderr, so invoke subprocess directly
            # (_run_cmd only captures stdout). FIX: the original ran the same
            # dd twice and discarded the first run's result entirely.
            try:
                result = subprocess.run(
                    ["dd", "if=/dev/zero", "of=/dev/null", "bs=1M", "count=256"],
                    capture_output=True, text=True, timeout=15,
                )
                m = re.search(r"([\d.]+)\s*(GB|MB)/s", result.stderr)
                if m:
                    speed = float(m.group(1))
                    if m.group(2) == "GB":
                        speed *= 1000
                    info.sequential_read_mbps = round(speed, 0)
            except Exception:
                pass
        elif system == "Darwin":
            info.storage_type = "SSD"  # modern Macs ship NVMe SSDs
    except Exception:
        logger.debug("Storage info extraction partially failed", exc_info=True)
    return info
367
+
368
+
369
@lru_cache(maxsize=1)
def get_hardware_info() -> HardwareInfo:
    """Collect and cache the complete hardware profile for this machine.

    The lru_cache means the (potentially slow) system probing happens once
    per process; callers can invoke this freely afterwards.
    """
    import torch

    return HardwareInfo(
        cpu=get_cpu_info(),
        memory=get_memory_info(),
        gpus=get_gpu_info(),
        storage=get_storage_info(),
        system=f"{platform.system()} {platform.release()}",
        python_version=platform.python_version(),
        torch_version=torch.__version__,
        cuda_runtime_version=torch.version.cuda if torch.cuda.is_available() else None,
    )
utils/profiler.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Per-frame GPU/CPU profiling for detection and segmentation pipelines.
2
+
3
+ Provides CUDA event-based timing and decomposed profiling for
4
+ transformers-based and opaque (YOLO) detectors. Runs in a dedicated
5
+ single-threaded path for accurate, reproducible measurements.
6
+ """
7
+
8
+ import logging
9
+ import statistics
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from typing import Dict, List, Optional, Sequence
13
+
14
+ import cv2
15
+ import numpy as np
16
+ import torch
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Detectors whose predict() can be decomposed into processor -> model -> post_process
21
+ _DECOMPOSABLE_DETECTORS = {"detr_resnet50", "grounding_dino"}
22
+ # Detectors with opaque predict() calls (YOLO-based)
23
+ _OPAQUE_DETECTORS = {"hf_yolov8", "drone_yolo"}
24
+
25
+
26
@dataclass
class TimingStats:
    """Summary statistics (milliseconds) over a list of timing samples."""
    min_ms: float = 0.0
    max_ms: float = 0.0
    mean_ms: float = 0.0
    std_ms: float = 0.0   # sample stdev; 0.0 when fewer than 2 samples
    p50_ms: float = 0.0
    p95_ms: float = 0.0
    p99_ms: float = 0.0
    count: int = 0

    @staticmethod
    def from_samples(samples: List[float]) -> "TimingStats":
        """Aggregate raw samples; an empty input yields an all-zero result.

        Percentiles use nearest-rank indexing on the sorted samples.
        """
        if not samples:
            return TimingStats()
        ordered = sorted(samples)
        n = len(ordered)
        return TimingStats(
            min_ms=ordered[0],
            max_ms=ordered[-1],
            mean_ms=statistics.mean(ordered),
            std_ms=statistics.stdev(ordered) if n > 1 else 0.0,
            p50_ms=ordered[n // 2],
            p95_ms=ordered[int(n * 0.95)],
            p99_ms=ordered[int(n * 0.99)],
            count=n,
        )
54
+
55
+
56
@dataclass
class FrameTiming:
    """Timing breakdown for one frame; every duration is in milliseconds."""
    frame_idx: int = 0
    decode_ms: float = 0.0       # video frame decode cost (representative)
    preprocess_ms: float = 0.0   # CPU: image processor / resize
    transfer_ms: float = 0.0     # host -> device copy; -1.0 when not separable
    gpu_kernel_ms: float = 0.0   # GPU model forward pass
    postprocess_ms: float = 0.0  # CPU: post-processing + NMS
    total_ms: float = 0.0        # sum of phases (or wall time for opaque path)
    num_detections: int = 0
67
+
68
+
69
@dataclass
class ProfilingResult:
    """Complete profiling output for one video run."""
    detector_name: str = ""      # detector or segmenter registry key
    mode: str = ""               # "detection" or "segmentation"
    total_frames: int = 0        # frames consumed (warmup included)
    warmup_frames: int = 0
    profiled_frames: int = 0     # frames with recorded timings
    video_resolution: str = ""   # "WxH"
    video_fps: float = 0.0

    # Raw per-frame measurements
    frame_timings: List[FrameTiming] = field(default_factory=list)

    # Aggregates per phase
    decode_stats: TimingStats = field(default_factory=TimingStats)
    preprocess_stats: TimingStats = field(default_factory=TimingStats)
    transfer_stats: TimingStats = field(default_factory=TimingStats)
    gpu_kernel_stats: TimingStats = field(default_factory=TimingStats)
    postprocess_stats: TimingStats = field(default_factory=TimingStats)
    total_stats: TimingStats = field(default_factory=TimingStats)

    # GPU memory footprint (MiB)
    gpu_peak_memory_mb: float = 0.0
    gpu_allocated_mb: float = 0.0

    # Throughput summary
    avg_fps: float = 0.0
    avg_detections_per_frame: float = 0.0
98
+
99
+
100
class CudaTimer:
    """GPU interval timer built on CUDA events.

    start()/stop() enqueue timing events on the current CUDA stream without
    blocking the host; elapsed_ms() synchronizes on the end event and then
    reads the measured interval.
    """

    def __init__(self):
        self._start = torch.cuda.Event(enable_timing=True)
        self._end = torch.cuda.Event(enable_timing=True)

    def start(self):
        """Record the start event on the current stream (non-blocking)."""
        self._start.record()

    def stop(self):
        """Record the end event on the current stream (non-blocking)."""
        self._end.record()

    def elapsed_ms(self) -> float:
        """Block until the end event completes; return elapsed milliseconds."""
        self._end.synchronize()
        return self._start.elapsed_time(self._end)
120
+
121
+
122
def _profile_decomposed(detector, frame: np.ndarray, queries: Sequence[str]) -> FrameTiming:
    """Profile a transformers-based detector with decomposed phases.

    Works for DETR and Grounding DINO where we can separate:
    processor(image) -> .to(device) -> model(**inputs) -> post_process()
    """
    timing = FrameTiming()

    # 1. Preprocess (CPU): color conversion + HF processor
    t0 = time.perf_counter()
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    if not hasattr(detector, "processor"):
        # Unknown detector layout: report preprocess-only timing.
        timing.preprocess_ms = (time.perf_counter() - t0) * 1000
        # FIX: total_ms was left at 0.0 on this path, skewing aggregates.
        timing.total_ms = timing.preprocess_ms
        return timing

    processor = detector.processor
    if hasattr(detector, "_build_prompt"):
        # Grounding DINO: text-conditioned detection
        prompt = detector._build_prompt(queries)
        inputs = processor(images=frame_rgb, text=prompt, return_tensors="pt")
    else:
        # DETR: image-only
        inputs = processor(images=frame_rgb, return_tensors="pt")
    timing.preprocess_ms = (time.perf_counter() - t0) * 1000

    # 2. CPU -> GPU transfer
    transfer_timer = CudaTimer()
    transfer_timer.start()
    inputs = {key: value.to(detector.device) for key, value in inputs.items()}
    transfer_timer.stop()
    timing.transfer_ms = transfer_timer.elapsed_ms()

    # 3. GPU forward pass
    kernel_timer = CudaTimer()
    kernel_timer.start()
    with torch.no_grad():
        outputs = detector.model(**inputs)
    kernel_timer.stop()
    timing.gpu_kernel_ms = kernel_timer.elapsed_ms()

    # 4. Post-process. NOTE(review): measured with perf_counter and no CUDA
    # sync; target_sizes lives on-device, so any GPU work launched here is
    # attributed as host wall time — confirm this is acceptable precision.
    t0 = time.perf_counter()
    target_sizes = torch.tensor([frame.shape[:2]], device=detector.device)
    if hasattr(detector, "_post_process"):
        # Grounding DINO custom post-process
        processed_list = detector._post_process(outputs, inputs["input_ids"], target_sizes)
    else:
        # DETR standard post-process
        processed_list = detector.processor.post_process_object_detection(
            outputs, threshold=detector.score_threshold, target_sizes=target_sizes,
        )
    result = detector._parse_single_result(processed_list[0])
    timing.postprocess_ms = (time.perf_counter() - t0) * 1000
    timing.num_detections = len(result.boxes)

    timing.total_ms = (
        timing.preprocess_ms + timing.transfer_ms
        + timing.gpu_kernel_ms + timing.postprocess_ms
    )
    return timing
180
+
181
+
182
def _profile_opaque(detector, frame: np.ndarray, queries: Sequence[str]) -> FrameTiming:
    """Profile an opaque detector (YOLO) whose predict() internals aren't separable.

    gpu_kernel_ms brackets the whole predict() call with CUDA events, so it
    also includes any CPU work predict() performs between the two events;
    transfer_ms is set to the -1.0 sentinel meaning "not measurable".
    """
    timing = FrameTiming()
    cuda_timer = CudaTimer()

    t0 = time.perf_counter()
    cuda_timer.start()
    result = detector.predict(frame, queries)
    cuda_timer.stop()
    wall_ms = (time.perf_counter() - t0) * 1000

    timing.gpu_kernel_ms = cuda_timer.elapsed_ms()
    timing.preprocess_ms = 0.0   # folded into gpu_kernel_ms
    timing.transfer_ms = -1.0    # sentinel: not separable for this detector
    # FIX: float floor (was max(0, ...)) so the field stays float when clamped.
    timing.postprocess_ms = max(0.0, wall_ms - timing.gpu_kernel_ms)
    timing.total_ms = wall_ms
    timing.num_detections = len(result.boxes)

    return timing
203
+
204
+
205
def run_profiled_detection(
    video_path: str,
    detector_name: str,
    queries: List[str],
    max_frames: int = 100,
    warmup_frames: int = 5,
) -> ProfilingResult:
    """Run profiled detection on a video file.

    Single-threaded profiling path (not injected into the multi-threaded
    production pipeline) for accurate, reproducible measurements.

    Args:
        video_path: Path to the input video.
        detector_name: Registry key understood by load_detector().
        queries: Object classes to detect.
        max_frames: Hard cap on frames processed (warmup included).
        warmup_frames: Leading frames that run but are not recorded.

    Returns:
        ProfilingResult with per-frame timings and aggregate statistics.
    """
    from models.model_loader import load_detector
    from utils.video import VideoReader

    result = ProfilingResult(
        detector_name=detector_name,
        mode="detection",
        warmup_frames=warmup_frames,
    )

    detector = load_detector(detector_name)
    device = getattr(detector, "device", None)
    has_cuda = device is not None and str(device).startswith("cuda")
    if not has_cuda:
        logger.warning("No CUDA device found for profiling; GPU timings will be 0")

    reader = VideoReader(video_path)
    result.video_resolution = f"{reader.width}x{reader.height}"
    result.video_fps = reader.fps

    is_decomposable = detector_name in _DECOMPOSABLE_DETECTORS

    if has_cuda:
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    frame_timings: List[FrameTiming] = []
    frame_idx = 0

    # NOTE: the original body declared t_decode_start / decode_ms / t_before
    # and never used them; removed as dead code.
    for frame in reader:
        if frame_idx >= max_frames:
            break

        if frame_idx < warmup_frames:
            # Warmup: run the prediction but discard the measurement.
            if is_decomposable:
                _profile_decomposed(detector, frame, queries)
            else:
                _profile_opaque(detector, frame, queries)
            frame_idx += 1
            continue

        if is_decomposable:
            timing = _profile_decomposed(detector, frame, queries)
        else:
            timing = _profile_opaque(detector, frame, queries)
        timing.frame_idx = frame_idx

        # Frames from VideoReader arrive pre-decoded, so decode cost is
        # benchmarked once (first profiled frame) by re-reading a frame
        # with a fresh capture; later frames reuse that representative cost.
        if frame_idx == warmup_frames:
            cap = cv2.VideoCapture(video_path)
            if cap.isOpened():
                td0 = time.perf_counter()
                cap.read()
                timing.decode_ms = (time.perf_counter() - td0) * 1000
            cap.release()  # safe on an unopened capture too
        elif frame_timings:
            timing.decode_ms = frame_timings[0].decode_ms

        frame_timings.append(timing)
        frame_idx += 1

    reader.close()

    # Aggregate
    result.total_frames = frame_idx
    result.profiled_frames = len(frame_timings)
    result.frame_timings = frame_timings

    if frame_timings:
        result.decode_stats = TimingStats.from_samples([t.decode_ms for t in frame_timings])
        result.preprocess_stats = TimingStats.from_samples([t.preprocess_ms for t in frame_timings])
        # transfer_ms == -1.0 is the "not measurable" sentinel (opaque path)
        transfer_samples = [t.transfer_ms for t in frame_timings if t.transfer_ms >= 0]
        result.transfer_stats = TimingStats.from_samples(transfer_samples)
        result.gpu_kernel_stats = TimingStats.from_samples([t.gpu_kernel_ms for t in frame_timings])
        result.postprocess_stats = TimingStats.from_samples([t.postprocess_ms for t in frame_timings])
        result.total_stats = TimingStats.from_samples([t.total_ms for t in frame_timings])

        result.avg_fps = 1000.0 / result.total_stats.mean_ms if result.total_stats.mean_ms > 0 else 0.0
        result.avg_detections_per_frame = statistics.mean(t.num_detections for t in frame_timings)

    # GPU memory footprint
    if has_cuda:
        torch.cuda.synchronize()
        result.gpu_peak_memory_mb = round(torch.cuda.max_memory_allocated() / (1024 ** 2), 1)
        result.gpu_allocated_mb = round(torch.cuda.memory_allocated() / (1024 ** 2), 1)

    return result
323
+
324
+
325
def run_profiled_segmentation(
    video_path: str,
    segmenter_name: str,
    queries: List[str],
    max_frames: int = 100,
    step: int = 20,
) -> ProfilingResult:
    """Run profiled segmentation (GSAM2) on a video file.

    Profiles the GSAM2 stages: GDINO keyframe detection,
    SAM2 image prediction, SAM2 video propagation.

    Args:
        video_path: Path to the input video file.
        segmenter_name: GSAM2 variant identifier (e.g. "gsam2_base").
        queries: Text queries for grounded detection.
        max_frames: Maximum number of frames to process.
        step: Keyframe interval for GDINO re-detection.

    Returns:
        ProfilingResult whose per-frame timings are synthesized from the
        aggregate stage metrics reported by the GSAM2 pipeline.

    Raises:
        ValueError: If the video cannot be opened.
    """
    import os
    import tempfile
    import threading

    result = ProfilingResult(
        detector_name=segmenter_name,
        mode="segmentation",
        warmup_frames=0,
    )

    # Open the video only to capture metadata; the tracking pipeline
    # re-opens it internally.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Cannot open video: {video_path}")
    result.video_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    result.video_resolution = f"{int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))}x{int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))}"
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()

    # CAP_PROP_FRAME_COUNT can be 0 or negative for streams / containers
    # with an unknown length; fall back to max_frames in that case so the
    # synthetic timing loop below still produces output.
    result.total_frames = min(total, max_frames) if total > 0 else max_frames

    has_cuda = torch.cuda.is_available()
    if has_cuda:
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    # Aggregate stage metrics filled in (under `lock`) by the GSAM2
    # pipeline via the private _perf_metrics hook.
    metrics = {
        "end_to_end_ms": 0.0,
        "frame_extraction_ms": 0.0,
        "model_load_ms": 0.0,
        "init_state_ms": 0.0,
        "tracking_total_ms": 0.0,
        "gdino_total_ms": 0.0,
        "sam_image_total_ms": 0.0,
        "sam_video_total_ms": 0.0,
        "id_reconciliation_ms": 0.0,
        "render_total_ms": 0.0,
        "writer_total_ms": 0.0,
        "gpu_peak_mem_mb": 0.0,
    }
    lock = threading.Lock()

    # Throwaway output file: we only care about the timing metrics.
    fd, output_path = tempfile.mkstemp(prefix="profile_seg_", suffix=".mp4")
    os.close(fd)

    try:
        from inference import run_grounded_sam2_tracking
        run_grounded_sam2_tracking(
            video_path,
            output_path,
            queries,
            segmenter_name=segmenter_name,
            step=step,
            enable_gpt=False,
            max_frames=max_frames,
            _perf_metrics=metrics,
            _perf_lock=lock,
        )
    except Exception as e:
        logger.error("Profiled segmentation failed: %s", e)
        raise
    finally:
        # Best-effort cleanup of the temp render target.
        try:
            os.remove(output_path)
        except OSError:
            pass

    # Convert the aggregate GSAM2 metrics into synthetic per-frame
    # timings so the downstream roofline analysis can treat detection
    # and segmentation uniformly.
    n_frames = result.total_frames
    n_keyframes = max(1, n_frames // step)

    if n_frames > 0:
        # n_keyframes is always >= 1 by construction, so no zero guard
        # is needed for the keyframe averages.
        avg_gdino = metrics["gdino_total_ms"] / n_keyframes
        avg_sam_img = metrics["sam_image_total_ms"] / n_keyframes
        avg_sam_vid = metrics["sam_video_total_ms"] / max(1, n_frames - n_keyframes)
        avg_render = metrics["render_total_ms"] / n_frames
        avg_decode = metrics["frame_extraction_ms"] / n_frames

        for i in range(n_frames):
            ft = FrameTiming(frame_idx=i)
            if i % step == 0:
                # Keyframe: GDINO detection + SAM2 image prediction.
                ft.preprocess_ms = avg_gdino
                ft.gpu_kernel_ms = avg_sam_img
            else:
                # Propagated frame: SAM2 video propagation only.
                ft.gpu_kernel_ms = avg_sam_vid
            ft.postprocess_ms = avg_render
            ft.decode_ms = avg_decode
            ft.total_ms = ft.decode_ms + ft.preprocess_ms + ft.gpu_kernel_ms + ft.postprocess_ms
            result.frame_timings.append(ft)

    result.profiled_frames = len(result.frame_timings)

    if result.frame_timings:
        result.decode_stats = TimingStats.from_samples([t.decode_ms for t in result.frame_timings])
        result.preprocess_stats = TimingStats.from_samples([t.preprocess_ms for t in result.frame_timings])
        result.gpu_kernel_stats = TimingStats.from_samples([t.gpu_kernel_ms for t in result.frame_timings])
        result.postprocess_stats = TimingStats.from_samples([t.postprocess_ms for t in result.frame_timings])
        result.total_stats = TimingStats.from_samples([t.total_ms for t in result.frame_timings])
        result.avg_fps = 1000.0 / result.total_stats.mean_ms if result.total_stats.mean_ms > 0 else 0

    # Additional GSAM2-specific metrics stored as metadata for roofline.
    result._gsam2_metrics = metrics  # type: ignore[attr-defined]

    if has_cuda:
        torch.cuda.synchronize()
        # Take the max of the torch allocator peak and the pipeline's own
        # reported peak (they can differ when the pipeline uses NVML).
        result.gpu_peak_memory_mb = max(
            round(torch.cuda.max_memory_allocated() / (1024 ** 2), 1),
            metrics.get("gpu_peak_mem_mb", 0),
        )
        result.gpu_allocated_mb = round(torch.cuda.memory_allocated() / (1024 ** 2), 1)

    return result
utils/roofline.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Roofline model analysis for detection and segmentation pipelines.
2
+
3
+ Computes theoretical maximum throughput, identifies bottlenecks, and
4
+ provides actionable recommendations based on hardware specs and
5
+ profiling measurements.
6
+ """
7
+
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from typing import Dict, List, Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
# Approximate GFLOPs per forward pass at reference resolution (640x480 for YOLO, 800x800 for DETR)
# These are rough estimates; actual FLOPs depend on input resolution and model variant.
# Keys must match the detector/segmenter names stored in ProfilingResult.detector_name;
# unknown models fall back to 0.0 in compute_roofline.
_MODEL_FLOPS: Dict[str, float] = {
    # Detection models (GFLOPs per frame)
    "hf_yolov8": 78.9,  # YOLOv8m ~79 GFLOPs at 640px
    "detr_resnet50": 86.0,  # DETR-R50 ~86 GFLOPs at 800px
    "grounding_dino": 172.0,  # Grounding DINO-B ~172 GFLOPs
    "drone_yolo": 78.9,  # Same arch as YOLOv8m

    # Segmentation models (GFLOPs per keyframe)
    "gsam2_small": 48.0,  # SAM2 small encoder
    "gsam2_base": 96.0,  # SAM2 base encoder
    "gsam2_large": 200.0,  # SAM2 large encoder
    "gsam2_tiny": 24.0,  # SAM2 tiny encoder
}

# Approximate bytes moved per forward pass (weights + activations + I/O).
# Used together with _MODEL_FLOPS to estimate operational intensity
# (FLOPs per byte) for roofline classification.
_MODEL_BYTES: Dict[str, float] = {
    # In MB — approximate weight size + activation memory
    "hf_yolov8": 52.0,
    "detr_resnet50": 166.0,
    "grounding_dino": 340.0,
    "drone_yolo": 52.0,
    "gsam2_small": 92.0,
    "gsam2_base": 180.0,
    "gsam2_large": 400.0,
    "gsam2_tiny": 46.0,
}
42
+
43
+
44
@dataclass
class BottleneckBreakdown:
    """Per-phase bottleneck identification for one pipeline stage."""
    # Pipeline stage name: "decode", "preprocess", "transfer",
    # "gpu_kernel", or "postprocess".
    phase: str = ""
    # Mean wall-clock time spent in this phase per frame, in milliseconds.
    time_ms: float = 0.0
    # Fraction of total pipeline time (0.0-1.0) attributed to this phase.
    fraction: float = 0.0
    # True for the single slowest phase in the breakdown.
    is_bottleneck: bool = False
51
+
52
+
53
@dataclass
class RooflineResult:
    """Complete roofline analysis output.

    Combines hardware ceilings, workload characteristics, achieved
    performance, bottleneck classification, and tuning recommendations
    into one result object (serialized for the /benchmark/analysis API).
    """
    # Hardware ceilings (taken from the first GPU; zero when none found)
    peak_fp32_tflops: float = 0.0
    peak_fp16_tflops: float = 0.0
    peak_memory_bandwidth_gbps: float = 0.0
    # Ridge point of the roofline: peak FP32 TFLOPS / peak bandwidth in TB/s,
    # i.e. the operational intensity where the compute and memory roofs meet.
    ridge_point_flop_per_byte: float = 0.0

    # Workload characteristics (looked up from _MODEL_FLOPS/_MODEL_BYTES)
    model_name: str = ""
    model_gflops: float = 0.0
    model_bytes_mb: float = 0.0
    operational_intensity: float = 0.0  # FLOPs / bytes_moved

    # Achieved performance, derived from measured mean GPU-kernel time
    achieved_tflops: float = 0.0
    achieved_bandwidth_gbps: float = 0.0

    # Bottleneck analysis. Values set by compute_roofline are one of:
    # "decode-bound", "transfer-bound", "preprocess-bound",
    # "postprocess-bound", "memory-bound", "compute-bound", "unknown".
    primary_bottleneck: str = ""
    bottleneck_explanation: str = ""
    phase_breakdown: List[BottleneckBreakdown] = field(default_factory=list)

    # Throughput
    theoretical_max_fps: float = 0.0  # 1000 / slowest-phase ms (ideal pipelining)
    observed_fps: float = 0.0
    utilization_pct: float = 0.0  # observed / theoretical * 100

    # GPU memory
    gpu_peak_memory_mb: float = 0.0
    gpu_vram_total_mb: float = 0.0
    memory_utilization_pct: float = 0.0

    # Actionable tuning recommendations derived from the bottleneck class
    recommendations: List[str] = field(default_factory=list)

    # GSAM2-specific metrics (populated for segmentation mode)
    gsam2_metrics: Optional[Dict] = None
92
+
93
+
94
def compute_roofline(hardware, profiling) -> RooflineResult:
    """Compute roofline analysis from hardware info and profiling results.

    Args:
        hardware: HardwareInfo dataclass from hardware_info.py; only
            ``hardware.gpus[0]`` is consulted for the ceilings.
        profiling: ProfilingResult dataclass from profiler.py; mean phase
            timings, FPS, and peak GPU memory are read from it.

    Returns:
        RooflineResult with theoretical ceilings, achieved performance,
        bottleneck identification, and recommendations.
    """
    result = RooflineResult()
    result.model_name = profiling.detector_name

    # --- Hardware ceilings (use first GPU) ---
    # Multi-GPU hosts are intentionally reduced to GPU 0; the profiled
    # pipelines run on a single device.
    if hardware.gpus:
        gpu = hardware.gpus[0]
        # `or 0.0` guards against None fields when specs are unknown.
        result.peak_fp32_tflops = gpu.fp32_tflops or 0.0
        result.peak_fp16_tflops = gpu.fp16_tflops or 0.0
        result.peak_memory_bandwidth_gbps = gpu.memory_bandwidth_gbps or 0.0
        if gpu.vram_total_gb:
            result.gpu_vram_total_mb = gpu.vram_total_gb * 1024
    else:
        logger.warning("No GPU info available; roofline will have zero ceilings")

    # Ridge point: where compute and memory roofs intersect
    if result.peak_memory_bandwidth_gbps > 0:
        # peak_tflops / peak_bw (TB/s) = FLOPs/byte
        peak_tbps = result.peak_memory_bandwidth_gbps / 1000  # GB/s -> TB/s
        if peak_tbps > 0:
            result.ridge_point_flop_per_byte = result.peak_fp32_tflops / peak_tbps

    # --- Workload characteristics ---
    # Unknown model names fall back to 0.0, which disables the
    # intensity-based memory/compute sub-classification below.
    model_key = profiling.detector_name
    result.model_gflops = _MODEL_FLOPS.get(model_key, 0.0)
    result.model_bytes_mb = _MODEL_BYTES.get(model_key, 0.0)

    if result.model_bytes_mb > 0:
        # Operational intensity = FLOPs / bytes_moved
        bytes_moved = result.model_bytes_mb * 1e6  # MB -> bytes
        flops = result.model_gflops * 1e9  # GFLOPs -> FLOPs
        result.operational_intensity = flops / bytes_moved if bytes_moved > 0 else 0

    # --- Achieved performance ---
    # Based on the mean measured GPU-kernel time and the static model
    # FLOP/byte estimates; these are rough utilization figures, not
    # counter-based measurements.
    gpu_kernel_ms = profiling.gpu_kernel_stats.mean_ms if profiling.gpu_kernel_stats.count > 0 else 0
    if gpu_kernel_ms > 0 and result.model_gflops > 0:
        # Achieved TFLOPS = GFLOPs / (kernel_time_s)
        kernel_time_s = gpu_kernel_ms / 1000
        result.achieved_tflops = round(result.model_gflops / kernel_time_s / 1000, 4)

    if gpu_kernel_ms > 0 and result.model_bytes_mb > 0:
        kernel_time_s = gpu_kernel_ms / 1000
        result.achieved_bandwidth_gbps = round(result.model_bytes_mb / kernel_time_s / 1000, 2)

    # --- Per-phase bottleneck breakdown ---
    phases = [
        ("decode", profiling.decode_stats.mean_ms),
        ("preprocess", profiling.preprocess_stats.mean_ms),
    ]
    # Only include transfer if we have valid measurements
    # (transfer_ms is negative when CUDA-event timing was unavailable).
    if profiling.transfer_stats.count > 0 and profiling.transfer_stats.mean_ms >= 0:
        phases.append(("transfer", profiling.transfer_stats.mean_ms))
    phases.extend([
        ("gpu_kernel", profiling.gpu_kernel_stats.mean_ms),
        ("postprocess", profiling.postprocess_stats.mean_ms),
    ])

    total_phase_ms = sum(ms for _, ms in phases)
    max_phase_name = ""
    max_phase_ms = 0

    # Build the breakdown while tracking the slowest phase; on a tie the
    # earliest phase in the list wins (strict > comparison).
    for name, ms in phases:
        bb = BottleneckBreakdown(
            phase=name,
            time_ms=round(ms, 3),
            fraction=round(ms / total_phase_ms, 4) if total_phase_ms > 0 else 0,
        )
        if ms > max_phase_ms:
            max_phase_ms = ms
            max_phase_name = name
        result.phase_breakdown.append(bb)

    # Mark bottleneck phase
    for bb in result.phase_breakdown:
        if bb.phase == max_phase_name:
            bb.is_bottleneck = True

    # --- Primary bottleneck classification ---
    # The slowest phase determines the coarse class; gpu_kernel is
    # further split into memory- vs compute-bound via the roofline
    # ridge-point test when intensity data is available.
    if max_phase_name == "decode":
        result.primary_bottleneck = "decode-bound"
        result.bottleneck_explanation = (
            f"Video decoding ({max_phase_ms:.1f}ms) is the slowest phase. "
            "GPU is waiting for frames. Consider hardware-accelerated decoding (NVDEC) "
            "or reducing input resolution."
        )
    elif max_phase_name == "transfer":
        result.primary_bottleneck = "transfer-bound"
        result.bottleneck_explanation = (
            f"CPU->GPU data transfer ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider using pinned memory, reducing input tensor size, or "
            "overlapping transfer with computation."
        )
    elif max_phase_name == "gpu_kernel":
        # Sub-classify: memory-bound vs compute-bound
        if result.operational_intensity > 0 and result.ridge_point_flop_per_byte > 0:
            if result.operational_intensity < result.ridge_point_flop_per_byte:
                result.primary_bottleneck = "memory-bound"
                result.bottleneck_explanation = (
                    f"GPU kernel ({max_phase_ms:.1f}ms) is memory-bandwidth limited. "
                    f"Operational intensity ({result.operational_intensity:.1f} FLOP/byte) "
                    f"is below the ridge point ({result.ridge_point_flop_per_byte:.1f} FLOP/byte). "
                    "Consider model quantization (FP16/INT8), reducing batch size, "
                    "or using a more compute-dense model."
                )
            else:
                result.primary_bottleneck = "compute-bound"
                # NOTE: the `if/else` below selects between two whole
                # message strings; the division is only evaluated on the
                # peak > 0 branch, so it cannot divide by zero.
                result.bottleneck_explanation = (
                    f"GPU kernel ({max_phase_ms:.1f}ms) is compute-limited. "
                    f"Achieved {result.achieved_tflops:.2f} TFLOPS out of "
                    f"{result.peak_fp32_tflops:.2f} TFLOPS peak "
                    f"({result.achieved_tflops / result.peak_fp32_tflops * 100:.1f}% utilization). "
                    "Consider FP16 inference, TensorRT optimization, or a smaller model."
                    if result.peak_fp32_tflops > 0
                    else "Consider a faster GPU or a smaller model."
                )
        else:
            result.primary_bottleneck = "compute-bound"
            result.bottleneck_explanation = (
                f"GPU kernel ({max_phase_ms:.1f}ms) dominates pipeline time."
            )
    elif max_phase_name == "preprocess":
        result.primary_bottleneck = "preprocess-bound"
        result.bottleneck_explanation = (
            f"CPU preprocessing ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider GPU-accelerated preprocessing or reducing input resolution."
        )
    elif max_phase_name == "postprocess":
        result.primary_bottleneck = "postprocess-bound"
        result.bottleneck_explanation = (
            f"CPU post-processing/NMS ({max_phase_ms:.1f}ms) is the slowest phase. "
            "Consider batched NMS on GPU or raising the confidence threshold."
        )
    else:
        # All phase times were zero (or breakdown empty).
        result.primary_bottleneck = "unknown"
        result.bottleneck_explanation = "Unable to determine primary bottleneck."

    # --- Throughput ---
    # Theoretical max FPS = 1000 / max(phase_times)
    # (i.e. throughput under perfect overlap of all other phases).
    if max_phase_ms > 0:
        result.theoretical_max_fps = round(1000 / max_phase_ms, 2)
    result.observed_fps = round(profiling.avg_fps, 2)
    if result.theoretical_max_fps > 0:
        result.utilization_pct = round(result.observed_fps / result.theoretical_max_fps * 100, 1)

    # --- GPU memory ---
    result.gpu_peak_memory_mb = profiling.gpu_peak_memory_mb
    if result.gpu_vram_total_mb > 0:
        result.memory_utilization_pct = round(
            result.gpu_peak_memory_mb / result.gpu_vram_total_mb * 100, 1
        )

    # --- GSAM2 metrics ---
    # The profiler attaches these as a private attribute only in
    # segmentation mode; pass them through when present.
    gsam2_metrics = getattr(profiling, "_gsam2_metrics", None)
    if gsam2_metrics:
        result.gsam2_metrics = gsam2_metrics

    # --- Recommendations ---
    recs = []

    # Bottleneck-specific recommendations
    if result.primary_bottleneck == "decode-bound":
        recs.append("Use NVIDIA NVDEC for hardware-accelerated video decoding")
        recs.append("Reduce input video resolution before processing")
    elif result.primary_bottleneck == "transfer-bound":
        recs.append("Use torch.cuda pinned memory for faster CPU->GPU transfers")
        recs.append("Pre-allocate GPU tensors and reuse across frames")
    elif result.primary_bottleneck == "memory-bound":
        recs.append("Enable FP16 (half-precision) inference to reduce memory bandwidth pressure")
        recs.append("Consider INT8 quantization via TensorRT for further speedup")
    elif result.primary_bottleneck == "compute-bound":
        recs.append("Enable FP16 inference (2x theoretical throughput on Volta+ GPUs)")
        recs.append("Consider TensorRT or torch.compile() for kernel fusion")
        if result.peak_fp32_tflops > 0 and result.achieved_tflops / result.peak_fp32_tflops < 0.3:
            recs.append("Low GPU utilization — consider increasing batch size or using a multi-stream pipeline")

    # General recommendations
    if result.memory_utilization_pct > 80:
        recs.append(f"GPU memory utilization is high ({result.memory_utilization_pct:.0f}%); "
                    "reduce batch size or use gradient checkpointing to avoid OOM")
    elif result.memory_utilization_pct > 0 and result.memory_utilization_pct < 30:
        recs.append(f"GPU memory utilization is low ({result.memory_utilization_pct:.0f}%); "
                    "consider processing multiple streams or increasing batch size")

    if profiling.mode == "detection" and profiling.avg_fps < profiling.video_fps:
        recs.append(
            f"Processing speed ({profiling.avg_fps:.1f} FPS) is below video frame rate "
            f"({profiling.video_fps:.1f} FPS); consider frame skipping or a faster model"
        )

    result.recommendations = recs
    return result