Commit 2bf34f5

PyTorch Hub amp.autocast() inference (#2641)
I think this should help speed up CUDA inference, as currently models may be running in FP32 inference mode on CUDA devices unnecessarily.
1 parent ee16983 commit 2bf34f5
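
For context, a minimal sketch (not part of this commit) of the pattern the change applies: the forward pass is wrapped in amp.autocast so CUDA ops run in mixed precision while CPU inference is unaffected. The hub entrypoint names are the standard ultralytics/yolov5 ones; the image URL is illustrative.

import torch
from torch.cuda import amp

# Load a YOLOv5 model from PyTorch Hub (standard ultralytics entrypoint).
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
device = next(model.parameters()).device  # cuda:0 if available, else cpu

img = 'https://ultralytics.com/images/zidane.jpg'  # illustrative input
with torch.no_grad(), amp.autocast(enabled=device.type != 'cpu'):
    results = model(img)  # CUDA kernels run in FP16 where safe; CPU stays FP32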

1 file changed: +9 −8 lines changed

models/common.py

Lines changed: 9 additions & 8 deletions
@@ -8,6 +8,7 @@
 import torch
 import torch.nn as nn
 from PIL import Image
+from torch.cuda import amp

 from utils.datasets import letterbox
 from utils.general import non_max_suppression, make_divisible, scale_coords, xyxy2xywh
@@ -219,17 +220,17 @@ def forward(self, imgs, size=640, augment=False, profile=False):
         x = torch.from_numpy(x).to(p.device).type_as(p) / 255.  # uint8 to fp16/32
         t.append(time_synchronized())

-        # Inference
-        with torch.no_grad():
+        with torch.no_grad(), amp.autocast(enabled=p.device.type != 'cpu'):
+            # Inference
             y = self.model(x, augment, profile)[0]  # forward
-        t.append(time_synchronized())
+            t.append(time_synchronized())

-        # Post-process
-        y = non_max_suppression(y, conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)  # NMS
-        for i in range(n):
-            scale_coords(shape1, y[i][:, :4], shape0[i])
-        t.append(time_synchronized())
+            # Post-process
+            y = non_max_suppression(y, conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)  # NMS
+            for i in range(n):
+                scale_coords(shape1, y[i][:, :4], shape0[i])

+        t.append(time_synchronized())
         return Detections(imgs, y, files, t, self.names, x.shape)
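
The enabled=p.device.type != 'cpu' guard works because autocast is a no-op when enabled=False, so the same code path serves both devices. A small self-contained illustration (an assumed toy module, not code from this repo):

import torch
from torch.cuda import amp

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
conv = torch.nn.Conv2d(3, 16, 3).to(device)
x = torch.randn(1, 3, 640, 640, device=device)

with torch.no_grad(), amp.autocast(enabled=device.type != 'cpu'):
    y = conv(x)

print(y.dtype)  # torch.float16 on CUDA (conv is autocast to half), torch.float32 on CPU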
