Commit a43dff4d authored by nikhil_rayaprolu

Initial commit

"""
Mask R-CNN
Base Configurations class.
Copyright (c) 2017 Matterport, Inc.
Licensed under the MIT License (see LICENSE for details)
Written by Waleed Abdulla
"""
import math
import numpy as np
# Base Configuration Class
# Don't use this class directly. Instead, sub-class it and override
# the configurations you need to change.
class Config(object):
"""Base configuration class. For custom configurations, create a
sub-class that inherits from this one and override properties
that need to be changed.
"""
# Name the configurations. For example, 'COCO', 'Experiment 3', ...etc.
# Useful if your code needs to do things differently depending on which
# experiment is running.
NAME = None # Override in sub-classes
# NUMBER OF GPUs to use. For CPU training, use 1
GPU_COUNT = 1
# Number of images to train with on each GPU. A 12GB GPU can typically
# handle 2 images of 1024x1024px.
# Adjust based on your GPU memory and image sizes. Use the highest
# number that your GPU can handle for best performance.
IMAGES_PER_GPU = 2
# Number of training steps per epoch
# This doesn't need to match the size of the training set. Tensorboard
# updates are saved at the end of each epoch, so setting this to a
# smaller number means getting more frequent TensorBoard updates.
# Validation stats are also calculated at each epoch end and they
# might take a while, so don't set this too small to avoid spending
# a lot of time on validation stats.
STEPS_PER_EPOCH = 1000
# Number of validation steps to run at the end of every training epoch.
# A bigger number improves accuracy of validation stats, but slows
# down the training.
VALIDATION_STEPS = 50
# Backbone network architecture
# Supported values are: resnet50, resnet101
BACKBONE = "resnet101"
# The strides of each layer of the FPN Pyramid. These values
# are based on a Resnet101 backbone.
BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# Number of classification classes (including background)
NUM_CLASSES = 1 # Override in sub-classes
# Length of square anchor side in pixels
RPN_ANCHOR_SCALES = (32, 64, 128, 256, 512)
# Ratios of anchors at each cell (width/height)
# A value of 1 represents a square anchor, and 0.5 a taller-than-wide anchor
RPN_ANCHOR_RATIOS = [0.5, 1, 2]
# Anchor stride
# If 1 then anchors are created for each cell in the backbone feature map.
# If 2, then anchors are created for every other cell, and so on.
RPN_ANCHOR_STRIDE = 1
# Non-max suppression threshold to filter RPN proposals.
# You can increase this during training to generate more proposals.
RPN_NMS_THRESHOLD = 0.7
# How many anchors per image to use for RPN training
RPN_TRAIN_ANCHORS_PER_IMAGE = 256
# ROIs kept after non-maximum suppression (training and inference)
POST_NMS_ROIS_TRAINING = 2000
POST_NMS_ROIS_INFERENCE = 1000
# If enabled, resizes instance masks to a smaller size to reduce
# memory load. Recommended when using high-resolution images.
USE_MINI_MASK = True
MINI_MASK_SHAPE = (56, 56) # (height, width) of the mini-mask
# Input image resizing
# Generally, use the "square" resizing mode for training and inference;
# it should work well in most cases. In this mode, images are scaled
# up such that the small side equals IMAGE_MIN_DIM, while ensuring that the
# scaling doesn't make the long side exceed IMAGE_MAX_DIM. Then the image is
# padded with zeros to make it a square so multiple images can be put
# in one batch.
# Available resizing modes:
# none: No resizing or padding. Return the image unchanged.
# square: Resize and pad with zeros to get a square image
# of size [max_dim, max_dim].
# pad64: Pads width and height with zeros to make them multiples of 64.
# If IMAGE_MIN_DIM is not None, then scale the small side to
# that size before padding. IMAGE_MAX_DIM is ignored in this mode.
# The multiple of 64 is needed to ensure smooth scaling of feature
# maps up and down the 6 levels of the FPN pyramid (2**6=64).
IMAGE_RESIZE_MODE = "square"
IMAGE_MIN_DIM = 800
IMAGE_MAX_DIM = 1024
# Image mean (RGB)
MEAN_PIXEL = np.array([123.7, 116.8, 103.9])
# Number of ROIs per image to feed to classifier/mask heads
# The Mask RCNN paper uses 512 but often the RPN doesn't generate
# enough positive proposals to fill this and keep a positive:negative
# ratio of 1:3. You can increase the number of proposals by adjusting
# the RPN NMS threshold.
TRAIN_ROIS_PER_IMAGE = 200
# Percent of positive ROIs used to train classifier/mask heads
ROI_POSITIVE_RATIO = 0.33
# Pooled ROIs
POOL_SIZE = 7
MASK_POOL_SIZE = 14
# Shape of output mask
# To change this you also need to change the neural network mask branch
MASK_SHAPE = [28, 28]
# Maximum number of ground truth instances to use in one image
MAX_GT_INSTANCES = 100
# Bounding box refinement standard deviation for RPN and final detections.
RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# Max number of final detections
DETECTION_MAX_INSTANCES = 100
# Minimum probability value to accept a detected instance
# ROIs below this threshold are skipped
DETECTION_MIN_CONFIDENCE = 0.7
# Non-maximum suppression threshold for detection
DETECTION_NMS_THRESHOLD = 0.3
# Learning rate and momentum
# The Mask RCNN paper uses lr=0.02, but on TensorFlow it causes
# weights to explode. Likely due to differences in optimizer
# implementation.
LEARNING_RATE = 0.001
LEARNING_MOMENTUM = 0.9
# Weight decay regularization
WEIGHT_DECAY = 0.0001
# Use RPN ROIs or externally generated ROIs for training
# Keep this True for most situations. Set to False if you want to train
# the head branches on ROI generated by code rather than the ROIs from
# the RPN. For example, to debug the classifier head without having to
# train the RPN.
USE_RPN_ROIS = True
# Train or freeze batch normalization layers
# None: Train BN layers. This is the normal mode
# False: Freeze BN layers. Good when using a small batch size
# True: (don't use). Set layer in training mode even when running inference
TRAIN_BN = False # Defaulting to False since batch size is often small
# Gradient norm clipping
GRADIENT_CLIP_NORM = 5.0
def __init__(self):
"""Set values of computed attributes."""
# Effective batch size
self.BATCH_SIZE = self.IMAGES_PER_GPU * self.GPU_COUNT
# Input image size
self.IMAGE_SHAPE = np.array(
[self.IMAGE_MAX_DIM, self.IMAGE_MAX_DIM, 3])
# Image meta data length
# See compose_image_meta() for details
self.IMAGE_META_SIZE = 1 + 3 + 3 + 4 + 1 + self.NUM_CLASSES
def display(self):
"""Display Configuration values."""
print("\nConfigurations:")
for a in dir(self):
if not a.startswith("__") and not callable(getattr(self, a)):
print("{:30} {}".format(a, getattr(self, a)))
print("\n")
from mrcnn import utils
import numpy as np
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from pycocotools import mask as maskUtils
import os
class MappingChallengeDataset(utils.Dataset):
def load_dataset(self, dataset_dir, load_small=False, return_coco=True):
""" Loads dataset released for the crowdAI Mapping Challenge(https://www.crowdai.org/challenges/mapping-challenge)
Params:
- dataset_dir : root directory of the dataset (can point to the train/val folder)
- load_small : Boolean flag that controls whether the full annotation file is loaded into memory,
or only a small subset of the annotations
"""
self.load_small = load_small
if self.load_small:
annotation_path = os.path.join(dataset_dir, "annotation-small.json")
else:
annotation_path = os.path.join(dataset_dir, "annotation.json")
image_dir = os.path.join(dataset_dir, "images")
print("Annotation Path ", annotation_path)
print("Image Dir ", image_dir)
assert os.path.exists(annotation_path) and os.path.exists(image_dir)
self.coco = COCO(annotation_path)
self.image_dir = image_dir
# Load all classes (Only Building in this version)
classIds = self.coco.getCatIds()
# Load all images
image_ids = list(self.coco.imgs.keys())
# register classes
for _class_id in classIds:
self.add_class("crowdai-mapping-challenge", _class_id, self.coco.loadCats(_class_id)[0]["name"])
# Register Images
for _img_id in image_ids:
assert(os.path.exists(os.path.join(image_dir, self.coco.imgs[_img_id]['file_name'])))
self.add_image(
"crowdai-mapping-challenge", image_id=_img_id,
path=os.path.join(image_dir, self.coco.imgs[_img_id]['file_name']),
width=self.coco.imgs[_img_id]["width"],
height=self.coco.imgs[_img_id]["height"],
annotations=self.coco.loadAnns(self.coco.getAnnIds(
imgIds=[_img_id],
catIds=classIds,
iscrowd=None)))
if return_coco:
return self.coco
def load_mask(self, image_id):
""" Loads instance mask for a given image
This function converts masks from the COCO format to a
bitmap of shape [height, width, instances]
Params:
- image_id : reference id for a given image
Returns:
masks : A bool array of shape [height, width, instances] with
one mask per instance
class_ids : a 1D array of classIds of the corresponding instance masks
(In this version of the challenge it will be of shape [instances] and always be filled with the class-id of the "Building" class.)
"""
image_info = self.image_info[image_id]
assert image_info["source"] == "crowdai-mapping-challenge"
instance_masks = []
class_ids = []
annotations = self.image_info[image_id]["annotations"]
# Build mask of shape [height, width, instance_count] and list
# of class IDs that correspond to each channel of the mask.
for annotation in annotations:
class_id = self.map_source_class_id(
"crowdai-mapping-challenge.{}".format(annotation['category_id']))
if class_id:
m = self.annToMask(annotation, image_info["height"],
image_info["width"])
# Some objects are so small that they're less than 1 pixel area
# and end up rounded out. Skip those objects.
if m.max() < 1:
continue
# Ignore the notion of "iscrowd" as specified in the COCO format,
# as we do not have that annotation in the current version of the dataset.
instance_masks.append(m)
class_ids.append(class_id)
# Pack instance masks into an array
if class_ids:
mask = np.stack(instance_masks, axis=2)
class_ids = np.array(class_ids, dtype=np.int32)
return mask, class_ids
else:
# Call super class to return an empty mask
return super(MappingChallengeDataset, self).load_mask(image_id)
def image_reference(self, image_id):
"""Return a reference for a particular image
Ideally this function is supposed to return a URL,
but in this case we will simply return the image_id.
"""
return "crowdai-mapping-challenge::{}".format(image_id)
# The following two functions are from pycocotools with a few changes.
def annToRLE(self, ann, height, width):
"""
Convert an annotation, which can be polygons or uncompressed RLE, to RLE.
:return: RLE (run-length encoding) of the annotation
"""
segm = ann['segmentation']
if isinstance(segm, list):
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = maskUtils.frPyObjects(segm, height, width)
rle = maskUtils.merge(rles)
elif isinstance(segm['counts'], list):
# uncompressed RLE
rle = maskUtils.frPyObjects(segm, height, width)
else:
# rle
rle = ann['segmentation']
return rle
def annToMask(self, ann, height, width):
"""
Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
:return: binary mask (numpy 2D array)
"""
rle = self.annToRLE(ann, height, width)
m = maskUtils.decode(rle)
return m
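# --- Example usage (illustrative sketch, not part of the original file).
# It assumes the directory layout expected by load_dataset() above:
# <dataset_dir>/annotation.json (or annotation-small.json) and
# <dataset_dir>/images/. The "data/train" path is hypothetical; prepare() is
# the utils.Dataset method that builds the internal class/image indices.
#
# dataset_train = MappingChallengeDataset()
# dataset_train.load_dataset("data/train", load_small=True)
# dataset_train.prepare()
# masks, class_ids = dataset_train.load_mask(dataset_train.image_ids[0])
# print(masks.shape, class_ids)   # (height, width, instances), e.g. [1 1 ...]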
from pycocotools.coco import COCO
from mrcnn.cocoeval import COCOeval
from pycocotools import mask as maskUtils
import time
import numpy as np
############################################################
# COCO Evaluation
############################################################
def build_coco_results(dataset, image_ids, rois, class_ids, scores, masks):
"""Arrange resutls to match COCO specs in http://cocodataset.org/#format
"""
# If no results, return an empty list
if rois is None:
return []
results = []
for image_id in image_ids:
# Loop through detections
for i in range(rois.shape[0]):
class_id = class_ids[i]
score = scores[i]
bbox = np.around(rois[i], 1)
mask = masks[:, :, i]
result = {
"image_id": image_id,
"category_id": dataset.get_source_class_id(class_id, "crowdai-mapping-challenge"),
"bbox": [bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]],
"score": score,
"segmentation": maskUtils.encode(np.asfortranarray(mask)).encode('utf-8')
}
results.append(result)
return results
def evaluate_coco(model, dataset, coco, eval_type="bbox", limit=0, image_ids=None):
"""Runs official COCO evaluation.
dataset: A Dataset object with validation data
eval_type: "bbox" or "segm" for bounding box or segmentation evaluation
limit: if not 0, it's the number of images to use for evaluation
"""
# Pick COCO images from the dataset
image_ids = image_ids or dataset.image_ids
# Limit to a subset
if limit:
image_ids = image_ids[:limit]
# Get corresponding COCO image IDs.
coco_image_ids = [dataset.image_info[id]["id"] for id in image_ids]
t_prediction = 0
t_start = time.time()
results = []
for i, image_id in enumerate(image_ids):
# Load image
image = dataset.load_image(image_id)
# Run detection
t = time.time()
print("="*100)
print("Image shape ", image.shape)
r = model.detect([image])
r = r[0]
t_prediction += (time.time() - t)
print("Prediction time : ", (time.time() - t))
# Convert results to COCO format
image_results = build_coco_results(dataset, coco_image_ids[i:i + 1],
r["rois"], r["class_ids"],
r["scores"], r["masks"])
print("Number of detections : ", len(r["rois"]))
print("Classes Predicted : ", r["class_ids"])
print("Scores : ", r["scores"])
results.extend(image_results)
# Load results. This modifies results with additional attributes.
coco_results = coco.loadRes(results)
# Evaluate
cocoEval = COCOeval(coco, coco_results, eval_type)
cocoEval.params.imgIds = coco_image_ids
cocoEval.evaluate()
cocoEval.accumulate()
ap = cocoEval._summarize(ap=1, iouThr=0.5, areaRng="all", maxDets=100)
ar = cocoEval._summarize(ap=0, areaRng="all", maxDets=100)
print("Precision : ", ap, " Recall : ", ar)
print("Prediction time: {}. Average {}/image".format(
t_prediction, t_prediction / len(image_ids)))
print("Total time: ", time.time() - t_start)
"""
Mask R-CNN
Multi-GPU Support for Keras.
Copyright (c) 2017 Matterport, Inc.
Licensed under the MIT License (see LICENSE for details)
Written by Waleed Abdulla
Ideas and small code snippets from these sources:
https://github.com/fchollet/keras/issues/2436
https://medium.com/@kuza55/transparent-multi-gpu-training-on-tensorflow-with-keras-8b0016fd9012
https://github.com/avolkov1/keras_experiments/blob/master/keras_exp/multigpu/
https://github.com/fchollet/keras/blob/master/keras/utils/training_utils.py
"""
import tensorflow as tf
import keras.backend as K
import keras.layers as KL
import keras.models as KM
class ParallelModel(KM.Model):
"""Subclasses the standard Keras Model and adds multi-GPU support.
It works by creating a copy of the model on each GPU. Then it slices
the inputs and sends a slice to each copy of the model, and then
merges the outputs together and applies the loss on the combined
outputs.
"""
def __init__(self, keras_model, gpu_count):
"""Class constructor.
keras_model: The Keras model to parallelize
gpu_count: Number of GPUs. Must be > 1
"""
self.inner_model = keras_model
self.gpu_count = gpu_count
merged_outputs = self.make_parallel()
super(ParallelModel, self).__init__(inputs=self.inner_model.inputs,
outputs=merged_outputs)
def __getattribute__(self, attrname):
"""Redirect loading and saving methods to the inner model. That's where
the weights are stored."""
if 'load' in attrname or 'save' in attrname:
return getattr(self.inner_model, attrname)
return super(ParallelModel, self).__getattribute__(attrname)
def summary(self, *args, **kwargs):
"""Override summary() to display summaries of both, the wrapper
and inner models."""
super(ParallelModel, self).summary(*args, **kwargs)
self.inner_model.summary(*args, **kwargs)
def make_parallel(self):
"""Creates a new wrapper model that consists of multiple replicas of
the original model placed on different GPUs.
"""
# Slice inputs. Slice inputs on the CPU to avoid sending a copy
# of the full inputs to all GPUs. Saves on bandwidth and memory.
input_slices = {name: tf.split(x, self.gpu_count)
for name, x in zip(self.inner_model.input_names,
self.inner_model.inputs)}
output_names = self.inner_model.output_names
outputs_all = []
for i in range(len(self.inner_model.outputs)):
outputs_all.append([])
# Run the model call() on each GPU to place the ops there
for i in range(self.gpu_count):
with tf.device('/gpu:%d' % i):
with tf.name_scope('tower_%d' % i):
# Run a slice of inputs through this replica
zipped_inputs = zip(self.inner_model.input_names,
self.inner_model.inputs)
inputs = [
KL.Lambda(lambda s: input_slices[name][i],
output_shape=lambda s: (None,) + s[1:])(tensor)
for name, tensor in zipped_inputs]
# Create the model replica and get the outputs
outputs = self.inner_model(inputs)
if not isinstance(outputs, list):
outputs = [outputs]
# Save the outputs for merging back together later
for l, o in enumerate(outputs):
outputs_all[l].append(o)
# Merge outputs on CPU
with tf.device('/cpu:0'):
merged = []
for outputs, name in zip(outputs_all, output_names):
# If outputs are numbers without dimensions, add a batch dim.
def add_dim(tensor):
"""Add a dimension to tensors that don't have any."""
if K.int_shape(tensor) == ():
return KL.Lambda(lambda t: K.reshape(t, [1, 1]))(tensor)
return tensor
outputs = list(map(add_dim, outputs))
# Concatenate
merged.append(KL.Concatenate(axis=0, name=name)(outputs))
return merged
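# --- Example usage (illustrative sketch, not part of the original file).
# Wrapping an existing Keras model so each batch is split across gpu_count
# GPUs, as the class docstring above describes. `some_keras_model` is a
# placeholder for any KM.Model instance.
#
# model = ParallelModel(some_keras_model, gpu_count=2)
# model.compile(optimizer="sgd", loss="categorical_crossentropy")
#
# Note: the batch size must be divisible by gpu_count, because
# make_parallel() slices each input tensor with tf.split().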
if __name__ == "__main__":
# Testing code below. It creates a simple model to train on MNIST and
# tries to run it on 2 GPUs. It saves the graph so it can be viewed
# in TensorBoard. Run it as:
#
# python3 parallel_model.py
import os
import numpy as np
import keras.optimizers
from keras.datasets import mnist
from keras.preprocessing.image import ImageDataGenerator
GPU_COUNT = 2
# Root directory of the project
ROOT_DIR = os.path.abspath("../")
# Directory to save logs and trained model
MODEL_DIR = os.path.join(ROOT_DIR, "logs")
def build_model(x_train, num_classes):
# Reset default graph. Keras leaves old ops in the graph,
# which are ignored for execution but clutter graph
# visualization in TensorBoard.
tf.reset_default_graph()
inputs = KL.Input(shape=x_train.shape[1:], name="input_image")
x = KL.Conv2D(32, (3, 3), activation='relu', padding="same",
name="conv1")(inputs)
x = KL.Conv2D(64, (3, 3), activation='relu', padding="same",
name="conv2")(x)
x = KL.MaxPooling2D(pool_size=(2, 2), name="pool1")(x)
x = KL.Flatten(name="flat1")(x)
x = KL.Dense(128, activation='relu', name="dense1")(x)
x = KL.Dense(num_classes, activation='softmax', name="dense2")(x)
return KM.Model(inputs, x, "digit_classifier_model")
# Load MNIST Data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = np.expand_dims(x_train, -1).astype('float32') / 255
x_test = np.expand_dims(x_test, -1).astype('float32') / 255
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
# Build data generator and model
datagen = ImageDataGenerator()