可视化 anchor-target 赋值¶
[1]:
import os
import random
from pathlib import Path
import numpy as np
import torch
# Export the CUDA device-selection variables, then keep every tensor created
# in this notebook on the CPU.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "4",
    }
)
device = torch.device("cpu")
[2]:
# Reproducibility: seed each random number generator the notebook relies on
# (PyTorch, Python's `random`, NumPy), each with its own fixed seed.
TORCH_SEED, PYTHON_SEED, NUMPY_SEED = 24, 5, 95
torch.manual_seed(TORCH_SEED)
random.seed(PYTHON_SEED)
np.random.seed(NUMPY_SEED)
[3]:
import io
import contextlib
import cv2
from torchvision.ops import box_convert
[4]:
from yolort.data.data_module import COCODetectionDataModule
from yolort.models.transform import YOLOTransform
from yolort.utils.image_utils import (
color_list,
plot_one_box,
cv2_imshow,
load_names,
parse_single_image,
parse_images,
)
为测试设定 coco128 dataset 和 dataloader¶
[5]:
# COCO label names. The position of a name in the tuple is the integer label
# used to look it up when boxes are drawn (`LABELS[label]`). The headings
# below are only a rough grouping for readability.
LABELS = (
    'person',
    # vehicles
    'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    # street objects
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    # animals
    'bird', 'cat', 'dog', 'horse', 'sheep',
    'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    # accessories
    'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
    # sports
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    # tableware
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    # food
    'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    # furniture
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    # electronics
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    # appliances
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    # household items
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
)
# Color palette from `color_list()`; the plotting cells pick one entry per class with `label % len(COLORS)`.
COLORS = color_list()
[8]:
# Acquire the images and labels used for the demonstration.
#
# The dataset root was a hard-coded absolute path on one machine. It can now be
# overridden with the YOLORT_DATA_PATH environment variable; the original
# location stays the default, so existing setups keep working unchanged.
data_path = Path(os.environ.get('YOLORT_DATA_PATH', '/media/pc/data/4tb/lxw/tests/datasets'))
# NOTE(review): the variables are named `coco128_*` but the directory is called
# 'coco' - confirm it really holds the coco128 subset and that its annotation
# JSON is complete (the recorded run of this cell ended in a JSONDecodeError).
coco128_dirname = 'coco'
coco128_path = data_path / coco128_dirname
image_root = coco128_path / 'images' / 'train2017'
annotation_path = coco128_path / 'annotations'
batch_size = 8

# Fail early with a readable message instead of a deep traceback from the
# dataset constructor when the dataset has not been prepared.
for required_dir in (image_root, annotation_path):
    if not required_dir.is_dir():
        raise FileNotFoundError(
            f"Dataset directory not found: {required_dir}. "
            "Set YOLORT_DATA_PATH to the folder that contains the dataset."
        )

# Swallow whatever is printed to stdout while the dataset is being constructed
# so the notebook output stays clean.
with contextlib.redirect_stdout(io.StringIO()):
    datamodule = COCODetectionDataModule(
        image_root,
        anno_path=annotation_path,
        skip_val_set=True,
        batch_size=batch_size,
    )
---------------------------------------------------------------------------
JSONDecodeError Traceback (most recent call last)
/media/pc/data/4tb/lxw/books/yolov5-rt-stack/docs/source/notebooks/anchor-label-assignment-visualization.ipynb Cell 8' in <cell line: 10>()
<a href='vscode-notebook-cell://ssh-remote%2Bxin/media/pc/data/4tb/lxw/books/yolov5-rt-stack/docs/source/notebooks/anchor-label-assignment-visualization.ipynb#ch0000007vscode-remote?line=7'>8</a> batch_size = 8
<a href='vscode-notebook-cell://ssh-remote%2Bxin/media/pc/data/4tb/lxw/books/yolov5-rt-stack/docs/source/notebooks/anchor-label-assignment-visualization.ipynb#ch0000007vscode-remote?line=9'>10</a> with contextlib.redirect_stdout(io.StringIO()):
---> <a href='vscode-notebook-cell://ssh-remote%2Bxin/media/pc/data/4tb/lxw/books/yolov5-rt-stack/docs/source/notebooks/anchor-label-assignment-visualization.ipynb#ch0000007vscode-remote?line=10'>11</a> datamodule = COCODetectionDataModule(
<a href='vscode-notebook-cell://ssh-remote%2Bxin/media/pc/data/4tb/lxw/books/yolov5-rt-stack/docs/source/notebooks/anchor-label-assignment-visualization.ipynb#ch0000007vscode-remote?line=11'>12</a> image_root,
<a href='vscode-notebook-cell://ssh-remote%2Bxin/media/pc/data/4tb/lxw/books/yolov5-rt-stack/docs/source/notebooks/anchor-label-assignment-visualization.ipynb#ch0000007vscode-remote?line=12'>13</a> anno_path=annotation_path,
<a href='vscode-notebook-cell://ssh-remote%2Bxin/media/pc/data/4tb/lxw/books/yolov5-rt-stack/docs/source/notebooks/anchor-label-assignment-visualization.ipynb#ch0000007vscode-remote?line=13'>14</a> skip_val_set=True,
<a href='vscode-notebook-cell://ssh-remote%2Bxin/media/pc/data/4tb/lxw/books/yolov5-rt-stack/docs/source/notebooks/anchor-label-assignment-visualization.ipynb#ch0000007vscode-remote?line=14'>15</a> batch_size=batch_size,
<a href='vscode-notebook-cell://ssh-remote%2Bxin/media/pc/data/4tb/lxw/books/yolov5-rt-stack/docs/source/notebooks/anchor-label-assignment-visualization.ipynb#ch0000007vscode-remote?line=15'>16</a> )
File /media/pc/data/4tb/lxw/anaconda3/envs/torch/lib/python3.10/site-packages/yolort/data/data_module.py:111, in COCODetectionDataModule.__init__(self, data_path, anno_path, num_classes, data_task, train_set, val_set, skip_train_set, skip_val_set, train_transform, val_transform, batch_size, num_workers, *args, **kwargs)
107 train_ann_file = anno_path / f"{data_task}_{train_set}.json"
108 val_ann_file = anno_path / f"{data_task}_{val_set}.json"
110 train_dataset = (
--> 111 None if skip_train_set else COCODetection(data_path, train_ann_file, train_transform())
112 )
113 val_dataset = None if skip_val_set else COCODetection(data_path, val_ann_file, val_transform())
115 super().__init__(
116 train_dataset=train_dataset,
117 val_dataset=val_dataset,
(...)
121 **kwargs,
122 )
File /media/pc/data/4tb/lxw/anaconda3/envs/torch/lib/python3.10/site-packages/yolort/data/coco.py:16, in COCODetection.__init__(self, img_folder, ann_file, transforms, return_masks)
15 def __init__(self, img_folder, ann_file, transforms, return_masks=False):
---> 16 super().__init__(img_folder, ann_file)
17 self._transforms = transforms
19 json_category_id_to_contiguous_id = {v: i for i, v in enumerate(self.coco.getCatIds())}
File /media/pc/data/4tb/lxw/anaconda3/envs/torch/lib/python3.10/site-packages/torchvision/datasets/coco.py:36, in CocoDetection.__init__(self, root, annFile, transform, target_transform, transforms)
33 super().__init__(root, transforms, transform, target_transform)
34 from pycocotools.coco import COCO
---> 36 self.coco = COCO(annFile)
37 self.ids = list(sorted(self.coco.imgs.keys()))
File /media/pc/data/4tb/lxw/anaconda3/envs/torch/lib/python3.10/site-packages/pycocotools-2.0.4-py3.10-linux-x86_64.egg/pycocotools/coco.py:82, in COCO.__init__(self, annotation_file)
80 tic = time.time()
81 with open(annotation_file, 'r') as f:
---> 82 dataset = json.load(f)
83 assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
84 print('Done (t={:0.2f}s)'.format(time.time()- tic))
File /media/pc/data/4tb/lxw/anaconda3/envs/torch/lib/python3.10/json/__init__.py:293, in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
274 def load(fp, *, cls=None, object_hook=None, parse_float=None,
275 parse_int=None, parse_constant=None, object_pairs_hook=None, **kw):
276 """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
277 a JSON document) to a Python object.
278
(...)
291 kwarg; otherwise ``JSONDecoder`` is used.
292 """
--> 293 return loads(fp.read(),
294 cls=cls, object_hook=object_hook,
295 parse_float=parse_float, parse_int=parse_int,
296 parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File /media/pc/data/4tb/lxw/anaconda3/envs/torch/lib/python3.10/json/__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
341 s = s.decode(detect_encoding(s), 'surrogatepass')
343 if (cls is None and object_hook is None and
344 parse_int is None and parse_float is None and
345 parse_constant is None and object_pairs_hook is None and not kw):
--> 346 return _default_decoder.decode(s)
347 if cls is None:
348 cls = JSONDecoder
File /media/pc/data/4tb/lxw/anaconda3/envs/torch/lib/python3.10/json/decoder.py:337, in JSONDecoder.decode(self, s, _w)
332 def decode(self, s, _w=WHITESPACE.match):
333 """Return the Python representation of ``s`` (a ``str`` instance
334 containing a JSON document).
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):
File /media/pc/data/4tb/lxw/anaconda3/envs/torch/lib/python3.10/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx)
353 obj, end = self.scan_once(s, idx)
354 except StopIteration as err:
--> 355 raise JSONDecodeError("Expecting value", s, err.value) from None
356 return obj, end
JSONDecodeError: Expecting value: line 1 column 455147521 (char 455147520)
[ ]:
# Wrap the training dataloader in an iterator so that batches can be pulled one at a time with next().
test_dataloader = iter(datamodule.train_dataloader())
采样图片和目标¶
[ ]:
# Pull one batch: `images` plus the matching per-image `annotations` (read below through their 'boxes' and 'labels' entries).
images, annotations = next(test_dataloader)
[ ]:
# Pick one image of the batch at random and draw its ground-truth boxes on it.
idx = random.randrange(batch_size)

# plot_one_box draws on a BGR image, hence the color conversion (for visualization only).
rgb_image = parse_single_image(images[idx])
img_raw = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)

sample_boxes = annotations[idx]['boxes'].tolist()
sample_labels = annotations[idx]['labels'].tolist()
for box, label in zip(sample_boxes, sample_labels):
    box_color = COLORS[label % len(COLORS)]  # wrap around when there are more classes than colors
    img_raw = plot_one_box(box, img_raw, color=box_color, label=LABELS[label])

cv2_imshow(img_raw, imshow_scale=0.5)
Training Batch in Pipeline¶
[ ]:
from yolort.models import yolov5s
# Build a YOLOv5s model and switch it to training mode in one step
# (`train()` hands back the module it was called on).
model = yolov5s().train()
[ ]:
# Run the model's own transform on the batch. Below, `samples.tensors` is used as the batched network input and
# `targets` is read as a 2-D tensor with columns (image index, class, cx, cy, w, h).
samples, targets = model.transform(images, annotations)
[ ]:
# Convert the batched tensor back into displayable images; presumably one RGB array per sample, since the result
# is indexed with `idx` and passed through cv2.COLOR_RGB2BGR below - TODO confirm.
inputs = parse_images(samples.tensors)
[ ]:
# Draw the targets of the sampled image as they look after the training transform.

# Rows of `targets` that belong to the sampled image (column 0 holds the image index).
attach_idx = torch.where(targets[:, 0].to(dtype=torch.int32) == idx)[0]

img_training = cv2.cvtColor(inputs[idx], cv2.COLOR_RGB2BGR)  # For visualization
img_h, img_w = img_training.shape[:2]
targets_training = targets[attach_idx]

# After box_convert the layout is (x1, y1, x2, y2): x coordinates scale with
# the image width and y coordinates with the image height. The former
# (h, w, h, w) order swapped the two axes and was only right for square images.
scale_xyxy = torch.tensor([img_w, img_h, img_w, img_h])

# Columns 2: are the normalized cxcywh box, column 1 is the class index.
for box, label in zip(targets_training[:, 2:], targets_training[:, 1]):
    label = int(label.tolist())
    box = box_convert(box, in_fmt='cxcywh', out_fmt='xyxy')
    box = (box * scale_xyxy).tolist()
    img_training = plot_one_box(box, img_training, color=COLORS[label % len(COLORS)], label=LABELS[label])

cv2_imshow(img_training, imshow_scale=0.5)
提取中间特征¶
[ ]:
from yolort.utils import FeatureExtractor
[ ]:
# Wrap the inner model so that a forward pass hands back the results of the
# layers named in `return_layers` (keyed by layer name), then run it on the
# transformed batch.
yolo_features = FeatureExtractor(model.model, return_layers=['backbone', 'head'])
intermediate_features = yolo_features(samples.tensors, targets)

# Split the result into backbone features and head outputs.
features, head_outputs = (
    intermediate_features[layer_name] for layer_name in ('backbone', 'head')
)
获取 Anchors 和 Strides¶
[ ]:
# One entry of `head_outputs` per detection layer.
num_layers = len(head_outputs)

anchor_generator = model.model.anchor_generator
tensor_options = dict(dtype=torch.float32, device=device)
anchors_raw = torch.as_tensor(anchor_generator.anchor_grids, **tensor_options)
strides = torch.as_tensor(anchor_generator.strides, **tensor_options)

# Reshape to (layer, anchor, 2) and divide every layer's anchors by that
# layer's stride (presumably converting the sizes to grid-cell units).
anchors = anchors_raw.view(num_layers, -1, 2) / strides.view(-1, 1, 1)
Assign Targets to Anchors¶
[ ]:
# Set up the state for target assignment; input targets are rows of
# (image, class, x, y, w, h).
anchor_grids = model.model.anchor_generator.anchor_grids
num_anchors = len(anchor_grids)  # number of anchors
num_targets = len(targets)  # number of targets

# Accumulators, one entry appended per detection layer by the assignment loop.
targets_cls = []
targets_box = []
anchors_encode = []
indices = []
grid_assigner = []  # Anchor Visualization

# Multipliers applied to the 7 target columns; entries 2:6 are overwritten per
# layer to move normalized boxes into grid space.
gain = torch.ones(7, device=device)

# Anchor index of every (anchor, target) pair, same as .repeat_interleave(num_targets)
ai = torch.arange(num_anchors, device=device).float().view(num_anchors, 1).repeat(1, num_targets)
# Replicate the targets once per anchor and append the anchor index as a 7th column.
targets_append = torch.cat((targets.repeat(num_anchors, 1, 1), ai[:, :, None]), dim=2)

g = 0.5  # bias
# Offsets: the cell itself plus its four direct neighbours, scaled by the bias.
off = g * torch.tensor(
    [
        [0, 0],
        [1, 0], [0, 1], [-1, 0], [0, -1],  # j,k,l,m
        # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm (diagonals, disabled)
    ],
    device=device,
).float()
[ ]:
# A target is matched to an anchor only if its width and height ratios to that anchor (taken in both directions) are all below this value.
anchor_threshold = 4.0
The image is subdivided into a grid of square cells, and the coordinates stored in grid[] are those of the upper-left corner of each cell.
The neural network predicts \(x\), \(y\) offsets in the range \((0, 1)\) (enforced by the sigmoid), which covers exactly one cell and is centered at 0.5. Multiplying by two lets the predicted \(x\), \(y\) coordinates cover a larger range that reaches slightly outside the cell – otherwise it is difficult to detect objects centered on cell boundaries. Subtracting 0.5 then shifts the resulting range to \((-0.5, 1.5)\), which is centered on the cell's own interval \((0, 1)\).
[ ]:
# Assign targets to anchors, one detection layer at a time. Every iteration
# appends one entry to `indices`, `targets_box`, `grid_assigner`,
# `anchors_encode` and `targets_cls`.
for i in range(num_layers):
    anchors_per_layer = anchors[i]
    # Feature-map height/width of this layer as plain Python ints. They are
    # used as clamp bounds below: clamping an int64 tensor with the float
    # tensor `gain[k] - 1` raises "result type Float can't be cast to the
    # desired output type Long" on recent PyTorch releases.
    grid_h, grid_w = head_outputs[i].shape[2:4]
    gain[2:6] = torch.tensor(head_outputs[i].shape)[[3, 2, 3, 2]]  # xyxy gain

    # Match targets to anchors
    targets_with_gain = targets_append * gain
    if num_targets:
        # Matches: keep (anchor, target) pairs whose worst width/height ratio,
        # in either direction, is below the threshold
        ratios_wh = targets_with_gain[:, :, 4:6] / anchors_per_layer[:, None]  # wh ratio
        ratios_filtering = torch.max(ratios_wh, 1. / ratios_wh).max(2)[0]
        inds = torch.where(ratios_filtering < anchor_threshold)
        targets_with_gain = targets_with_gain[inds]  # filter

        # Offsets: besides its own cell, a target is also assigned to the
        # neighbouring cells selected by the masks below
        grid_xy = targets_with_gain[:, 2:4]  # grid xy
        grid_xy_inverse = gain[[2, 3]] - grid_xy  # inverse
        inds_jk = (grid_xy % 1. < g) & (grid_xy > 1.)
        inds_lm = (grid_xy_inverse % 1. < g) & (grid_xy_inverse > 1.)
        inds_ones = torch.ones_like(inds_jk[:, 0])[:, None]
        # Rows line up with `off`: (own cell, j, k, l, m)
        inds = torch.cat((inds_ones, inds_jk, inds_lm), dim=1).T
        targets_with_gain = targets_with_gain.repeat((5, 1, 1))[inds]
        offsets = (torch.zeros_like(grid_xy)[None] + off[:, None])[inds]
    else:
        targets_with_gain = targets_append[0]
        offsets = torch.tensor(0, device=device)

    # Define
    bc = targets_with_gain[:, :2].long().T  # image, class
    grid_xy = targets_with_gain[:, 2:4]  # grid xy
    grid_wh = targets_with_gain[:, 4:6]  # grid wh
    grid_ij = (grid_xy - offsets).long()

    # Append
    a = targets_with_gain[:, 6].long()  # anchor indices
    # image, anchor, grid indices (row, column). The clamps work in place on
    # views of `grid_ij`, so the box offsets computed next use the clamped
    # cells - keep this statement before the `targets_box` append.
    indices.append((
        bc[0],
        a,
        grid_ij[:, 1].clamp_(0, grid_h - 1),
        grid_ij[:, 0].clamp_(0, grid_w - 1),
    ))
    targets_box.append(torch.cat((grid_xy - grid_ij, grid_wh), 1))  # box
    grid_assigner.append(torch.cat((grid_xy, grid_wh), 1))
    anchors_encode.append(anchors_per_layer[a])  # anchors
    targets_cls.append(bc[1])  # class
Anchor Visualization¶
[ ]:
from yolort.utils.image_utils import anchor_match_visualize
[ ]:
# Render the anchor/target matches computed above; presumably yields one image per sample, since the result is indexed with `idx` in the next cell - TODO confirm.
images_with_anchor = anchor_match_visualize(samples.tensors, grid_assigner, indices, anchors_encode, head_outputs)
[ ]:
# Display the anchor-match rendering of the sampled image (same `idx` and `imshow_scale` as the earlier plots).
cv2_imshow(images_with_anchor[idx], imshow_scale=0.5)
View this document as a notebook: https://github.com/zhiqwang/yolov5-rt-stack/blob/main/notebooks/anchor-label-assignment-visualization.ipynb