
Commit 4a34243

Add MS COCO utils
1 parent fcb68f0

File tree

4 files changed: +299 additions, -7 deletions

README.md

Lines changed: 14 additions & 7 deletions
@@ -299,23 +299,30 @@ with tf.Session() as sess:
   * `TinyYOLOv2VOC`: `TinyYOLOv2(inputs, TinyDarknet19)`,
   * `FasterRCNN_ZF_VOC`: `FasterRCNN(inputs, ZF)`,
   * `FasterRCNN_VGG16_VOC`: `FasterRCNN(inputs, VGG16, stem_out='conv5/3')`.
-- The mAPs were obtained with TensorNets on **PASCAL VOC2007 test set** and may slightly differ from the original ones.
+- The mAPs were obtained with TensorNets and may slightly differ from the original ones.
 - The test input sizes were the numbers reported as the best in the papers:
   * `YOLOv3`, `YOLOv2`: 416x416
   * `FasterRCNN`: min\_shorter\_side=600, max\_longer\_side=1000
 - The sizes stand for the rounded numbers of parameters.
 - The computation times were measured on NVIDIA Tesla P100 (3584 cores, 16 GB global memory) with cuDNN 6.0 and CUDA 8.0.
-  * Speed: milliseconds only for network inferences of a 416x416 single image
+  * Speed: milliseconds only for network inferences of a 416x416 or 608x608 single image
   * FPS: 1000 / speed

-| | mAP | Size | Speed | FPS | References |
+| PASCAL VOC2007 test | mAP | Size | Speed | FPS | References |
 |------------------------------------------------------------------------|--------|--------|-------|-------|------------|
-| [YOLOv3VOC](tensornets/references/yolos.py#L175) | 0.7423 | 62M | 24.09 | 41.51 | [[paper]](https://pjreddie.com/media/files/papers/YOLOv3.pdf) [[darknet]](https://pjreddie.com/darknet/yolo/) [[darkflow]](https://github.com/thtrieu/darkflow) |
-| [YOLOv2VOC](tensornets/references/yolos.py#L195) | 0.7320 | 51M | 14.75 | 67.80 | [[paper]](https://arxiv.org/abs/1612.08242) [[darknet]](https://pjreddie.com/darknet/yolov2/) [[darkflow]](https://github.com/thtrieu/darkflow) |
-| [TinyYOLOv2VOC](tensornets/references/yolos.py#L205) | 0.5303 | 16M | 6.534 | 153.0 | [[paper]](https://arxiv.org/abs/1612.08242) [[darknet]](https://pjreddie.com/darknet/yolov2/) [[darkflow]](https://github.com/thtrieu/darkflow) |
+| [YOLOv3VOC(416)](tensornets/references/yolos.py#L175) | 0.7423 | 62M | 24.09 | 41.51 | [[paper]](https://pjreddie.com/media/files/papers/YOLOv3.pdf) [[darknet]](https://pjreddie.com/darknet/yolo/) [[darkflow]](https://github.com/thtrieu/darkflow) |
+| [YOLOv2VOC(416)](tensornets/references/yolos.py#L195) | 0.7320 | 51M | 14.75 | 67.80 | [[paper]](https://arxiv.org/abs/1612.08242) [[darknet]](https://pjreddie.com/darknet/yolov2/) [[darkflow]](https://github.com/thtrieu/darkflow) |
+| [TinyYOLOv2VOC(416)](tensornets/references/yolos.py#L205) | 0.5303 | 16M | 6.534 | 153.0 | [[paper]](https://arxiv.org/abs/1612.08242) [[darknet]](https://pjreddie.com/darknet/yolov2/) [[darkflow]](https://github.com/thtrieu/darkflow) |
 | [FasterRCNN\_ZF\_VOC](tensornets/references/rcnns.py#L151) | 0.4466 | 59M | 241.4 | 3.325 | [[paper]](https://arxiv.org/abs/1506.01497) [[caffe]](https://github.com/rbgirshick/py-faster-rcnn) [[roi-pooling]](https://github.com/deepsense-ai/roi-pooling) |
 | [FasterRCNN\_VGG16\_VOC](tensornets/references/rcnns.py#L187) | 0.6872 | 137M | 300.7 | 4.143 | [[paper]](https://arxiv.org/abs/1506.01497) [[caffe]](https://github.com/rbgirshick/py-faster-rcnn) [[roi-pooling]](https://github.com/deepsense-ai/roi-pooling) |

+| MS COCO val2014 | mAP | Size | Speed | FPS | References |
+|------------------------------------------------------------------------|--------|--------|-------|-------|------------|
+| [YOLOv3COCO(608)](tensornets/references/yolos.py#L167) | 0.6016 | 62M | 60.66 | 16.49 | [[paper]](https://pjreddie.com/media/files/papers/YOLOv3.pdf) [[darknet]](https://pjreddie.com/darknet/yolo/) [[darkflow]](https://github.com/thtrieu/darkflow) |
+| [YOLOv3COCO(416)](tensornets/references/yolos.py#L167) | 0.6028 | 62M | 40.23 | 24.85 | [[paper]](https://pjreddie.com/media/files/papers/YOLOv3.pdf) [[darknet]](https://pjreddie.com/darknet/yolo/) [[darkflow]](https://github.com/thtrieu/darkflow) |
+| [YOLOv2COCO(608)](tensornets/references/yolos.py#L187) | 0.5189 | 51M | 45.88 | 21.80 | [[paper]](https://arxiv.org/abs/1612.08242) [[darknet]](https://pjreddie.com/darknet/yolov2/) [[darkflow]](https://github.com/thtrieu/darkflow) |
+| [YOLOv2COCO(416)](tensornets/references/yolos.py#L187) | 0.4922 | 51M | 21.66 | 46.17 | [[paper]](https://arxiv.org/abs/1612.08242) [[darknet]](https://pjreddie.com/darknet/yolov2/) [[darkflow]](https://github.com/thtrieu/darkflow) |
+
 ## News 📰

 - PNASNetlarge is released, [12 May 2018](https://github.com/taehoonlee/tensornets/commit/e2e0f0f7791731d3b7dfa989cae569c15a22cdd6).

@@ -329,6 +336,6 @@ with tf.Session() as sess:
 - Add image classification models (PolyNet).
 - Add object detection models (MaskRCNN, SSD).
 - Add image segmentation models (FCN, UNet).
-- Add image datasets (COCO, OpenImages).
+- Add image datasets (OpenImages).
 - Add style transfer examples which can be coupled with any network in TensorNets.
 - Add speech and language models with representative datasets (WaveNet, ByteNet).
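The FasterRCNN test-size rule noted above (scale the shorter side to 600 unless that would push the longer side past 1000) is the same scaling implemented by `load` in the new `tensornets/datasets/coco.py` below. A minimal illustration of the rule; `rescale_factor` is a hypothetical helper written for this example, not part of the commit:

```python
import numpy as np

def rescale_factor(height, width, min_shorter_side=600, max_longer_side=1000):
    # Scale the shorter side up to `min_shorter_side` ...
    scale = float(min_shorter_side) / np.min((height, width))
    # ... but cap the scale so the longer side stays within `max_longer_side`.
    if round(scale * np.max((height, width))) > max_longer_side:
        scale = float(max_longer_side) / np.max((height, width))
    return scale

print(rescale_factor(375, 500))   # 1.6    (500 * 1.6 = 800 <= 1000)
print(rescale_factor(600, 1200))  # 0.8333 (the longer-side cap kicks in)
```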

tensornets/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 from __future__ import absolute_import

+from . import coco
 from . import imagenet
 from . import voc

tensornets/datasets/coco.names

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
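These 80 names keep the darknet-style spellings (e.g. `motorbike`, `aeroplane`, `sofa`, `tvmonitor`) rather than the official COCO display names, and their order defines the class indices used throughout `coco.py`. A quick sketch, assuming the package is importable as `tensornets.datasets`:

```python
from tensornets.datasets import coco

print(len(coco.classnames))         # 80
print(coco.classidx('person'))      # 0
print(coco.classidx('toothbrush'))  # 79
```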

tensornets/datasets/coco.py

Lines changed: 204 additions & 0 deletions
@@ -0,0 +1,204 @@
"""Collection of MS COCO utils

The code was adapted from [py-faster-rcnn](https://github.com/
rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py).
"""
from __future__ import division

import os
import numpy as np

try:
    import cv2
except ImportError:
    cv2 = None

try:
    from pycocotools.coco import COCO
except ImportError:
    COCO = None

try:
    xrange  # Python 2
except NameError:
    xrange = range  # Python 3


# Cache of `pycocotools.coco.COCO` handles, keyed by data_name.
metas = {}

with open(os.path.join(os.path.dirname(__file__), 'coco.names'), 'r') as f:
    classnames = [line.rstrip() for line in f.readlines()]


def classidx(classname):
    return dict((k, i) for (i, k) in enumerate(classnames))[classname]


def area(box):
    # Box area in the inclusive-pixel convention (+1 on each side).
    if box.ndim == 1:
        return (box[2] - box[0] + 1.) * (box[3] - box[1] + 1.)
    else:
        return (box[:, 2] - box[:, 0] + 1.) * (box[:, 3] - box[:, 1] + 1.)


def get_files(data_dir, data_name, total_num=None):
    assert COCO is not None, '`datasets.coco` requires `pycocotools`.'
    if data_name not in metas:
        metas[data_name] = COCO("%s/annotations/instances_%s.json" %
                                (data_dir, data_name))
    images = metas[data_name].imgs
    fileids = list(images.keys())  # list() so that slicing works on Python 3
    if total_num is not None:
        fileids = fileids[:total_num]
    files = [images[i]['file_name'] for i in fileids]
    return fileids, files


def get_annotations(data_dir, data_name, ids):
    assert COCO is not None, '`datasets.coco` requires `pycocotools`.'
    if data_name not in metas:
        metas[data_name] = COCO("%s/annotations/instances_%s.json" %
                                (data_dir, data_name))
    # Map the sparse COCO category ids to contiguous indices in [0, 80).
    cmap = dict([(b, a) for (a, b) in enumerate(metas[data_name].getCatIds())])
    annotations = {}
    for i in ids:
        annids = metas[data_name].getAnnIds(imgIds=i, iscrowd=None)
        objs = metas[data_name].loadAnns(annids)
        annotations[i] = [[] for _ in range(80)]
        width = metas[data_name].imgs[i]['width']
        height = metas[data_name].imgs[i]['height']
        for obj in objs:
            # Clip each box to the image boundary and keep only
            # non-degenerate ones.
            x1 = np.max((0, obj['bbox'][0]))
            y1 = np.max((0, obj['bbox'][1]))
            x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1))))
            y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1))))
            if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
                obj_struct = {'bbox': [x1, y1, x2, y2]}
                cidx = cmap[obj['category_id']]
                annotations[i][cidx].append(obj_struct)
    return annotations


def load(data_dir, data_name, min_shorter_side=None, max_longer_side=1000,
         batch_size=1, total_num=None):
    assert cv2 is not None, '`load` requires `cv2`.'
    _, files = get_files(data_dir, data_name, total_num)
    total_num = len(files)

    # Note: images are yielded one at a time as [1, H, W, 3] arrays;
    # `batch_size` only controls the stride over the file list.
    for batch_start in range(0, total_num, batch_size):
        x = cv2.imread("%s/%s/%s" % (data_dir, data_name, files[batch_start]))
        # Scale the shorter side to `min_shorter_side`, but cap the scale
        # so that the longer side never exceeds `max_longer_side`.
        if min_shorter_side is not None:
            scale = float(min_shorter_side) / np.min(x.shape[:2])
        else:
            scale = 1.0
        if round(scale * np.max(x.shape[:2])) > max_longer_side:
            scale = float(max_longer_side) / np.max(x.shape[:2])
        x = cv2.resize(x, None, None, fx=scale, fy=scale,
                       interpolation=cv2.INTER_LINEAR)
        x = np.array([x], dtype=np.float32)
        scale = np.array([scale], dtype=np.float32)
        yield x, scale
        del x


def evaluate_class(ids, scores, boxes, annotations, files, ovthresh):
    """Computes a PASCAL-VOC-style 11-point interpolated AP for one class."""
    if scores.shape[0] == 0:
        return 0.0, np.zeros(len(ids)), np.zeros(len(ids))

    # Extract gt objects for this class. COCO has no difficulty flags,
    # so every ground-truth box counts (all marked 0).
    diff = [np.array([0 for obj in annotations[filename]])
            for filename in files]
    total = sum([sum(x == 0) for x in diff])
    detected = dict(zip(files, [[False] * len(x) for x in diff]))

    # Sort detections by decreasing confidence.
    sorted_ind = np.argsort(-scores)
    ids = ids[sorted_ind]
    boxes = boxes[sorted_ind, :]

    # Go down dets and mark TPs and FPs.
    tp_list = []
    fp_list = []
    for d in range(len(ids)):
        actual = np.array([x['bbox'] for x in annotations[ids[d]]])
        difficult = np.array([0 for x in annotations[ids[d]]])

        if actual.size > 0:
            # IoU between this detection and every gt box in its image.
            iw = np.maximum(np.minimum(actual[:, 2], boxes[d, 2]) -
                            np.maximum(actual[:, 0], boxes[d, 0]) + 1, 0)
            ih = np.maximum(np.minimum(actual[:, 3], boxes[d, 3]) -
                            np.maximum(actual[:, 1], boxes[d, 1]) + 1, 0)
            inters = iw * ih
            overlaps = inters / (area(actual) + area(boxes[d, :]) - inters)
            jmax = np.argmax(overlaps)
            ovmax = overlaps[jmax]
        else:
            ovmax = -np.inf

        # A detection is a true positive only the first time it matches
        # a not-yet-detected gt box with IoU above the threshold;
        # duplicates and low-overlap detections are false positives.
        tp = 0.
        fp = 0.
        if ovmax > ovthresh:
            if difficult[jmax] == 0:
                if not detected[ids[d]][jmax]:
                    tp = 1.
                    detected[ids[d]][jmax] = True
                else:
                    fp = 1.
        else:
            fp = 1.
        tp_list.append(tp)
        fp_list.append(fp)

    tp = np.cumsum(tp_list)
    fp = np.cumsum(fp_list)
    recall = tp / float(total)
    precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    # 11-point interpolation: mean of the maximum precision over the
    # recall levels 0.0, 0.1, ..., 1.0.
    ap = np.mean([0 if np.sum(recall >= t) == 0
                  else np.max(precision[recall >= t])
                  for t in np.linspace(0, 1, 11)])

    return ap, precision, recall


def evaluate(results, data_dir, data_name, ovthresh=0.5, verbose=True):
    fileids, _ = get_files(data_dir, data_name)
    fileids = fileids[:len(results)]
    annotations = get_annotations(data_dir, data_name, fileids)
    aps = []

    for c in range(80):
        # Flatten the per-image detections of class `c` into parallel
        # arrays of image ids, confidences, and boxes.
        ids = []
        scores = []
        boxes = []
        for (i, fileid) in enumerate(fileids):
            pred = results[i][c]
            if pred.shape[0] > 0:
                for k in xrange(pred.shape[0]):
                    ids.append(fileid)
                    scores.append(pred[k, -1])
                    boxes.append(pred[k, :4] + 1)
        ids = np.array(ids)
        scores = np.array(scores)
        boxes = np.array(boxes)
        _annotations = dict((k, v[c]) for (k, v) in annotations.items())
        ap, _, _ = evaluate_class(ids, scores, boxes, _annotations,
                                  fileids, ovthresh)
        aps += [ap]

    # Render the per-class APs and their mean as a Markdown table.
    strs = ''
    for c in range(80):
        strs += "| %6s " % classnames[c][:6]
    strs += '|\n'

    for ap in aps:
        strs += '|--------'
    strs += '|\n'

    for ap in aps:
        strs += "| %.4f " % ap
    strs += '|\n'

    strs += "Mean = %.4f" % np.mean(aps)
    return strs
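Taken together, a hedged end-to-end sketch of how these utilities compose. `detect` is a hypothetical stand-in for any detector that returns, for each of the 80 classes, an `(N, 5)` array of `[x1, y1, x2, y2, score]` rows (the shape `evaluate` consumes); the directory layout is the standard MS COCO one assumed by `get_files` and `load`:

```python
from tensornets.datasets import coco

data_dir = '/path/to/mscoco'  # expects {data_dir}/val2014/ and {data_dir}/annotations/
results = []

# `load` yields one image at a time as a [1, H, W, 3] float32 array
# together with the resize scale that was applied to it.
for (img, scale) in coco.load(data_dir, 'val2014', total_num=10):
    preds = detect(img, scale)  # hypothetical: list of 80 arrays of shape (N, 5)
    results.append(preds)

# `evaluate` pairs `results` with the first len(results) image ids and
# returns a Markdown table of per-class APs plus their mean.
print(coco.evaluate(results, data_dir, 'val2014', ovthresh=0.5))
```

Note that `evaluate_class` computes a PASCAL-VOC-style 11-point interpolated AP at a single IoU threshold (0.5 by default), so the resulting numbers are not directly comparable to the official COCO metric averaged over IoU 0.50:0.95.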
