-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathraft_tracker_backend.py
More file actions
408 lines (332 loc) · 15.6 KB
/
raft_tracker_backend.py
File metadata and controls
408 lines (332 loc) · 15.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
RAFT Tracker Backend - Isolated Subprocess
This script runs in a completely separate Python process.
ONNX imports here will NEVER contaminate the calling code.
"""
# Force UTF-8 output encoding on Windows.
# The default console code page there cannot encode every character this
# script may print, so both standard streams are switched to UTF-8 with
# unencodable characters replaced rather than raising.
import sys

if sys.platform == 'win32':
    import io

    def _force_utf8(stream):
        """Return *stream* switched to UTF-8 output with errors='replace'.

        Prefers the in-place ``reconfigure`` API, which keeps the existing
        wrapper (and its buffering settings) instead of stacking a second
        wrapper on the same underlying buffer. Streams that are missing
        (``None``) or that expose neither ``reconfigure`` nor ``buffer`` are
        returned unchanged instead of raising ``AttributeError``.
        """
        if stream is None:
            return stream
        if hasattr(stream, 'reconfigure'):
            stream.reconfigure(encoding='utf-8', errors='replace')
            return stream
        if hasattr(stream, 'buffer'):
            # Fallback for stream objects without reconfigure().
            return io.TextIOWrapper(stream.buffer, encoding='utf-8', errors='replace')
        return stream

    sys.stdout = _force_utf8(sys.stdout)
    sys.stderr = _force_utf8(sys.stderr)
# CRITICAL: onnxruntime must be imported BEFORE torch to avoid DLL conflicts on Windows
try:
import onnxruntime as ort
except ImportError:
ort = None
print("Warning: onnxruntime not found. DML (iGPU) mode will not work.")
import numpy as np
import os
import sys
import torch
import torch.nn.functional as F
import torchvision.models.optical_flow as of
from multiprocessing import shared_memory
import argparse
# ============================================================================
# HYBRID RAFT TRACKER
# ============================================================================
class RaftTracker:
    """
    Hybrid RAFT Tracker with Integrated Sampling. Handles CUDA and DML backends.

    The optical-flow network is wrapped together with a ``grid_sample`` call so
    that a single forward pass returns the flow vectors at the requested point
    locations only. Depending on ``device`` the wrapped model runs either as a
    TorchScript trace (modes ``'cuda'`` / ``'cpu'``) or as an ONNX graph inside
    an onnxruntime DirectML session (mode ``'dml'``). Exported/traced models
    are cached on disk under ``MODEL_DIR``.
    """

    MODEL_DIR = 'models'

    def __init__(self, H, W, num_landmarks, dtype=torch.float16, device='cuda', model_type='small'):
        """Build the tracker and load (or create and cache) its backend model.

        Args:
            H: Frame height in pixels.
            W: Frame width in pixels.
            num_landmarks: Number of points the preallocated mask buffers hold.
            dtype: Torch dtype used for the image buffers and the model.
            device: Requested device; ``'igpu'``/``'dml'``/``'directml'``
                select DirectML (only if onnxruntime imported), anything
                containing ``'cuda'`` selects CUDA when available, otherwise
                the tracker falls back to CPU.
            model_type: ``'small'`` selects RAFT-small; any other value
                selects RAFT-large.
        """
        self.device = device
        self.device_str = str(device).lower()
        self.dtype = dtype
        self.H = H
        self.W = W

        # 1. Logic & Config
        if model_type.lower() == 'small':
            self.weights = of.Raft_Small_Weights.DEFAULT
            self.builder = of.raft_small
            self.cache_base = "raft_small"
        else:
            self.weights = of.Raft_Large_Weights.DEFAULT
            self.builder = of.raft_large
            self.cache_base = "raft_large"

        # 2. Execution Device Setup
        if self.device_str in ['igpu', 'dml', 'directml'] and ort is not None:
            # The network runs inside onnxruntime; all torch-side pre/post
            # processing stays on the CPU.
            self.mode = 'dml'
            self.exec_device = torch.device('cpu')
            print(f"[RAFT] Mode: DML (iGPU/ONNX). Logic execution on CPU.")
        elif torch.cuda.is_available() and 'cuda' in self.device_str:
            self.mode = 'cuda'
            self.exec_device = torch.device('cuda', torch.cuda.current_device())
            print(f"[RAFT] Mode: Standard CUDA ({self.exec_device}).")
        else:
            self.mode = 'cpu'
            self.exec_device = torch.device('cpu')
            print(f"[RAFT] Mode: Standard CPU.")

        # 3. Caching Config: the cache name encodes model, resolution and dtype.
        self.cache_base = f"{self.cache_base}_{H}x{W}_{str(self.dtype).split('.')[-1]}_w"
        os.makedirs(self.MODEL_DIR, exist_ok=True)

        # 4. Normalization Constants (maps [0, 1] pixel values to [-1, 1])
        self.mean = torch.tensor([0.5, 0.5, 0.5], dtype=self.dtype, device=self.exec_device).view(1, 3, 1, 1)
        self.std = torch.tensor([0.5, 0.5, 0.5], dtype=self.dtype, device=self.exec_device).view(1, 3, 1, 1)

        # 5. Buffer Allocations (reused on every track_points call)
        self.img1 = torch.empty((1, 3, H, W), dtype=self.dtype, device=self.exec_device)
        self.img2 = torch.empty((1, 3, H, W), dtype=self.dtype, device=self.exec_device)

        # Bounds logic: tracked points are valid only inside a frame shrunk by
        # a tenth of the image size on each side.
        self.bound_w = torch.tensor(W - W // 10, dtype=torch.float32, device=self.exec_device)
        self.bound_h = torch.tensor(H - H // 10, dtype=torch.float32, device=self.exec_device)
        self.min_pad_w = torch.tensor(W // 10, dtype=torch.float32, device=self.exec_device)
        self.min_pad_h = torch.tensor(H // 10, dtype=torch.float32, device=self.exec_device)

        # Pre-allocate masks
        self.valid_mask = torch.empty(num_landmarks, dtype=torch.bool, device=self.exec_device)
        self.temp_bool = torch.empty(num_landmarks, dtype=torch.bool, device=self.exec_device)
        self.zero_points = torch.empty((0, 2), dtype=torch.float32, device=self.exec_device)
        self.zero_mask = torch.empty(0, dtype=torch.bool, device=self.exec_device)

        # 6. Init Backend
        if self.mode == 'dml':
            self._init_dml()
        else:
            self._init_cuda()
        print(f"OK RAFT Tracker Ready: {H}x{W}")

    @staticmethod
    def _build_sampling_model(builder, weights):
        """Return a module that runs RAFT and samples the final flow at given points.

        The module's ``forward(img1, img2, norm_grid)`` takes the last flow
        estimate produced by the network, samples it bilinearly at
        ``norm_grid`` (normalized coordinates, ``align_corners=True``) and
        returns the result with the point axis before the channel axis.
        """
        class RaftWithSampling(torch.nn.Module):
            def __init__(self, builder, weights):
                super().__init__()
                self.model = builder(weights=weights)

            def forward(self, img1, img2, norm_grid):
                # The model returns one flow estimate per refinement step;
                # only the last one is used.
                flow = self.model(img1, img2)[-1]
                sampled = F.grid_sample(
                    flow.to(dtype=torch.float32), norm_grid, mode='bilinear',
                    padding_mode='border', align_corners=True
                )
                return sampled.squeeze(2).permute(0, 2, 1)

        return RaftWithSampling(builder, weights)

    def _init_dml(self):
        """Create the onnxruntime DirectML session, exporting the ONNX file if missing."""
        onnx_file = f"{self.cache_base}_sampled_opset16.onnx"
        onnx_path = os.path.join(self.MODEL_DIR, onnx_file)
        if not os.path.exists(onnx_path):
            print(f"[RAFT] Tracing & Exporting ONNX...")
            self._export_onnx_with_sampling(onnx_path)

        print(f"[RAFT] Loading DirectML Session...")
        opts = ort.SessionOptions()
        # 1. ENABLE OPTIMIZATIONS
        opts.enable_mem_pattern = True
        opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # 2. Set provider options
        providers = [('DmlExecutionProvider', {
            'device_id': 0,
        })]
        self.session = ort.InferenceSession(onnx_path, sess_options=opts, providers=providers)
        # 3. Setup IO Binding (Pre-calculating shapes helps)
        self.binding = self.session.io_binding()
        self.input_names = [node.name for node in self.session.get_inputs()]
        self.output_names = [node.name for node in self.session.get_outputs()]
        print(f"[RAFT] DirectML Session Ready")

    def _export_onnx_with_sampling(self, path):
        """Export the RAFT-with-sampling model to ONNX (opset 16) at ``path``.

        The number of sampled points is exported as a dynamic axis. The
        export runs on CUDA when available and otherwise on the CPU, so the
        DirectML path no longer requires a CUDA device just to create the
        ONNX file.
        NOTE(review): float16 export on CPU depends on the installed torch
        build supporting half-precision CPU kernels - confirm on the target.
        """
        export_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model = self._build_sampling_model(self.builder, self.weights).eval()
        model = model.to(export_device)
        if self.dtype == torch.float16:
            model = model.half()

        H, W = self.H, self.W
        # Dummy inputs only fix shapes/dtypes for tracing; 128 is a placeholder
        # point count for the dynamic 'num_points' axis.
        d_img = torch.randn(1, 3, H, W, dtype=self.dtype, device=export_device)
        d_grid = torch.randn(1, 1, 128, 2, dtype=torch.float32, device=export_device)
        print(f"[RAFT] Exporting...")
        torch.onnx.export(
            model, (d_img, d_img, d_grid), path,
            input_names=["img1", "img2", "norm_grid"],
            output_names=["sampled_delta"],
            opset_version=16,
            do_constant_folding=True,
            dynamic_axes={'norm_grid': {2: 'num_points'}, 'sampled_delta': {1: 'num_points'}}
        )
        print(f"[RAFT] Export complete.")

    def _init_cuda(self):
        """Load the cached TorchScript model, or trace, freeze and cache a new one.

        Used for both the ``'cuda'`` and ``'cpu'`` modes; the model lives on
        ``self.exec_device``.
        """
        jit_file = f"{self.cache_base}_sampled_jit.pt"
        jit_path = os.path.join(self.MODEL_DIR, jit_file)
        if os.path.exists(jit_path):
            print(f"[RAFT] Loading JIT from cache...")
            self.traced_model = torch.jit.load(jit_path, map_location=self.exec_device).eval()
            self.traced_model = torch.jit.freeze(self.traced_model)
        else:
            print(f"[RAFT] Tracing CUDA JIT with Sampling...")
            model = self._build_sampling_model(self.builder, self.weights).to(self.exec_device).eval()
            if self.dtype == torch.float16:
                model = model.half()

            H, W = self.H, self.W
            d_img = torch.randn(1, 3, H, W, device=self.exec_device, dtype=self.dtype)
            d_grid = torch.randn(1, 1, 128, 2, device=self.exec_device, dtype=torch.float32)
            with torch.no_grad(), torch.amp.autocast('cuda', dtype=self.dtype):
                self.traced_model = torch.jit.trace(model, (d_img, d_img, d_grid), check_trace=False)
            self.traced_model.eval()
            self.traced_model = torch.jit.freeze(self.traced_model)
            torch.jit.save(self.traced_model, jit_path)
            print(f"[RAFT] Saved JIT to {jit_path}")

    @torch.inference_mode()
    def track_points(self, img1, img2, points):
        """Move ``points`` from ``img1`` to ``img2`` along the estimated optical flow.

        Args:
            img1, img2: (3, H, W) RGB/BGR tensor with values on a 0-255 scale
                (they are divided by 255 and normalized with mean/std 0.5).
            points: (N, 2) coordinates [x, y] in pixels. Not modified.

        Returns:
            Tuple ``(new_points, mask, None, None)``:
            ``new_points`` is ``points`` plus the sampled flow; ``mask`` is a
            bool tensor that is True where the new point lies inside the
            valid region ``[W//10, W - W//10) x [H//10, H - H//10)``.
            When ``N`` fits the preallocated buffers, ``mask`` is a view of
            an internal buffer and is overwritten by the next call.
            For ``N == 0`` the preallocated empty tensors are returned.
        """
        if points.shape[0] == 0:
            return self.zero_points, self.zero_mask, None, None

        # 1. Coordinate Normalization
        # grid_sample with align_corners=True maps -1/+1 to the centers of the
        # first/last pixel, so pixel coordinates must be scaled by the full
        # image extent (W - 1, H - 1), not by the validity bounds.
        norm_grid = points.unsqueeze(0).unsqueeze(1).clone()
        norm_grid[..., 0].mul_(2.0).div_(float(max(self.W - 1, 1))).sub_(1.0)
        norm_grid[..., 1].mul_(2.0).div_(float(max(self.H - 1, 1))).sub_(1.0)

        # 2. Image Prep (in place, into the preallocated buffers)
        i1_norm = self.img1.copy_(img1[None]).div_(255.0).sub_(self.mean).div_(self.std)
        i2_norm = self.img2.copy_(img2[None]).div_(255.0).sub_(self.mean).div_(self.std)

        # 3. Inference
        if self.mode == 'dml':
            inputs = {
                self.input_names[0]: i1_norm.cpu().numpy(),
                self.input_names[1]: i2_norm.cpu().numpy(),
                self.input_names[2]: norm_grid.cpu().numpy()
            }
            res = self.session.run(None, inputs)
            delta = torch.from_numpy(res[0]).to(self.exec_device)
        else:
            ctx = torch.amp.autocast('cuda', dtype=self.dtype) if self.dtype == torch.float16 else torch.no_grad()
            with ctx:
                delta = self.traced_model(i1_norm, i2_norm, norm_grid)

        # 4. Update
        new_points = points + delta[0]

        # 5. Masking
        x = new_points[:, 0]
        y = new_points[:, 1]
        N = x.shape[0]
        if N <= self.valid_mask.shape[0]:
            # Common case: reuse the preallocated buffers.
            mask = self.valid_mask[:N]
            scratch = self.temp_bool[:N]
        else:
            # More points than were preallocated: allocate instead of failing
            # on an undersized out= buffer.
            mask = torch.empty(N, dtype=torch.bool, device=x.device)
            scratch = torch.empty(N, dtype=torch.bool, device=x.device)

        torch.ge(x, self.min_pad_w, out=mask)
        torch.lt(x, self.bound_w, out=scratch)
        mask.bitwise_and_(scratch)
        torch.ge(y, self.min_pad_h, out=scratch)
        mask.bitwise_and_(scratch)
        torch.lt(y, self.bound_h, out=scratch)
        mask.bitwise_and_(scratch)
        return new_points, mask, None, None
# ============================================================================
# SUBPROCESS MAIN LOOP
# ============================================================================
def main():
    """Run the RAFT tracker backend loop driven by shared memory and file events.

    Command-line arguments give the names of two existing shared-memory
    segments, the frame geometry, the point count, the dtype/device/model
    selection and the names of three file-based events (start, done, stop).

    Input segment layout (as read here): image 1 as (3, H, W) of the selected
    dtype at offset 0, image 2 right after it, then the points as (N, 2)
    float32. Output segment layout (as written here): new points as (N, 2)
    float32 at offset 0 and the validity mask as (N,) bool at offset
    ``pts_size``.

    After the tracker is initialized the "done" event is set once. Then, for
    every "start" signal, one frame pair is tracked, the results are written
    and "done" is set again. The loop ends when the "stop" event is set.
    """
    import time  # function-scope import: only the polling event helper needs it

    parser = argparse.ArgumentParser(description='RAFT Tracker Backend Subprocess')
    parser.add_argument('--shm-in', required=True, help='Input shared memory name')
    parser.add_argument('--shm-out', required=True, help='Output shared memory name')
    parser.add_argument('--height', type=int, required=True)
    parser.add_argument('--width', type=int, required=True)
    parser.add_argument('--num-points', type=int, required=True)
    parser.add_argument('--dtype', required=True, choices=['float16', 'float32'])
    parser.add_argument('--device', required=True)
    parser.add_argument('--model-type', required=True, choices=['small', 'large'])
    # Event names (cross-platform file-based)
    parser.add_argument('--evt-start', required=True)
    parser.add_argument('--evt-done', required=True)
    parser.add_argument('--evt-stop', required=True)
    args = parser.parse_args()

    # Reconstruct dtype
    dtype = torch.float16 if args.dtype == 'float16' else torch.float32
    np_dtype = np.float16 if dtype == torch.float16 else np.float32
    d_size = 2 if dtype == torch.float16 else 4
    H, W, N = args.height, args.width, args.num_points

    # Shared memory layout
    img_size = 3 * H * W * d_size
    # NOTE(review): the points are always float32, yet this offset scales with
    # the image dtype size. It is left untouched because it presumably has to
    # match the layout used by the process that created the segments - confirm.
    pts_size = N * 4 * d_size

    # Attach to shared memory
    shm_in = shared_memory.SharedMemory(name=args.shm_in)
    shm_out = shared_memory.SharedMemory(name=args.shm_out)

    # Reconstruct file-based events
    class SimpleEvent:
        """File-based event for cross-process signaling"""

        def __init__(self, name):
            self.name = name
            self.path = os.path.join(os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else '.',
                                     f"raft_event_{name}.flag")
            # Use temp directory on Windows
            if sys.platform == 'win32':
                import tempfile
                self.path = os.path.join(tempfile.gettempdir(), f"raft_event_{name}.flag")

        def set(self):
            """Mark the event as set by writing a 0x01 byte to the flag file."""
            with open(self.path, 'wb') as f:
                f.write(b'\x01')

        def clear(self):
            """Mark the event as cleared (best effort; file errors are ignored)."""
            try:
                with open(self.path, 'wb') as f:
                    f.write(b'\x00')
            except OSError:
                pass

        def wait(self, timeout=None):
            """Poll until the event is set; return False if ``timeout`` seconds pass first.

            ``timeout=None`` waits indefinitely.
            """
            started = time.monotonic()
            while True:
                if self.is_set():
                    return True
                if timeout is not None and (time.monotonic() - started) > timeout:
                    return False
                time.sleep(0.001)

        def is_set(self):
            """Return True if the flag file exists and starts with a 0x01 byte."""
            try:
                with open(self.path, 'rb') as f:
                    return f.read(1) == b'\x01'
            except OSError:
                # Missing file or a transient sharing error: treat as "not set".
                return False

    evt_start = SimpleEvent(args.evt_start)
    evt_done = SimpleEvent(args.evt_done)
    evt_stop = SimpleEvent(args.evt_stop)

    # Initialize tracker
    print("[Backend] Initializing tracker...")
    sys.stdout.flush()  # Force flush before slow operation
    tracker = RaftTracker(H, W, N, dtype=dtype, device=args.device, model_type=args.model_type)
    print("[Backend] Initialization complete. Ready for frames.")
    sys.stdout.flush()  # Force flush
    evt_done.set()  # Signal init complete

    # Every object that references the shared buffers is bound to a local name
    # declared here so that all of them can be dropped before closing.
    np_img1 = np_img2 = np_pts = np_out_pts = np_out_mask = None
    t_img1 = t_img2 = t_pts = None
    try:
        # The views are loop-invariant: they alias the shared buffers, so they
        # always show whatever the other process wrote most recently.
        np_img1 = np.ndarray((3, H, W), dtype=np_dtype, buffer=shm_in.buf, offset=0)
        np_img2 = np.ndarray((3, H, W), dtype=np_dtype, buffer=shm_in.buf, offset=img_size)
        np_pts = np.ndarray((N, 2), dtype=np.float32, buffer=shm_in.buf, offset=img_size + img_size)
        np_out_pts = np.ndarray((N, 2), dtype=np.float32, buffer=shm_out.buf, offset=0)
        np_out_mask = np.ndarray((N,), dtype=np.bool_, buffer=shm_out.buf, offset=pts_size)

        while True:
            # Wait for the start signal, but keep checking the stop flag so a
            # stop request is honored even if no further start signal arrives.
            while not evt_start.wait(timeout=0.05):
                if evt_stop.is_set():
                    break
            if evt_stop.is_set():
                break
            evt_start.clear()

            # Read inputs from shared memory
            t_img1 = torch.from_numpy(np_img1).to(tracker.exec_device)
            t_img2 = torch.from_numpy(np_img2).to(tracker.exec_device)
            t_pts = torch.from_numpy(np_pts).to(tracker.exec_device)

            # Run tracking
            new_pts, mask, _, _ = tracker.track_points(t_img1, t_img2, t_pts)

            # Write outputs
            np_out_pts[:] = new_pts.cpu().numpy()
            np_out_mask[:] = mask.cpu().numpy().astype(np.bool_)

            # Signal done
            evt_done.set()
    except KeyboardInterrupt:
        print("[Backend] Interrupted")
    except Exception as e:
        print(f"[Backend] ERROR: {e}")
        import traceback
        traceback.print_exc()
    finally:
        print("[Backend] Shutting down.")
        # SharedMemory.close() raises BufferError while anything still exports
        # its buffer, so release every NumPy/Torch view first.
        t_img1 = t_img2 = t_pts = None
        np_img1 = np_img2 = np_pts = np_out_pts = np_out_mask = None
        for shm in (shm_in, shm_out):
            try:
                shm.close()
            except BufferError as exc:
                print(f"[Backend] WARNING: could not release shared memory {shm.name}: {exc}")
# Script entry point: run the backend loop only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()