
SSD Development

June 8, 2016

On my GitHub I used the SSD detection algorithm to train a vehicle and pedestrian detection model on the KITTI dataset. Below are some of the small modules I modified along the way.

How to test a trained model

SSD provides a VideoData layer that reads webcam video for detection. Here we modify this layer so that it also supports detection on local video files. The first change is the layer definition in caffe.proto:

message VideoDataParameter{
  enum VideoType {
    WEBCAM = 0;
    LOCAL_SOURCE = 1;
  }
  optional VideoType video_type = 1 [default = WEBCAM];
  optional int32 device_id = 2 [default = 0];
  optional string source = 3;
}

This defines a LOCAL_SOURCE value of VideoType for reading local video, together with the corresponding path parameter optional string source = 3. At test time, set video_type to LOCAL_SOURCE in the VideoData layer and point source at the video file; with the small source changes shown below, the layer then supports local video detection. Note that after editing caffe.proto you must rebuild Caffe so that the new parameter is compiled in.

For example:

layer {
  name: "data"
  type: "VideoData"
  top: "data"
  transform_param {
    mean_value: 104
    mean_value: 117
    mean_value: 123
    resize_param {
      prob: 1
      resize_mode: WARP
      height: 600
      width: 600
      interp_mode: LINEAR
    }
  }
  data_param {
    batch_size: 1
  }
  video_data_param {
    video_type: WEBCAM
    device_id: 0
  }
}

The layer above reads webcam video.

layer {
  name: "data"
  type: "VideoData"
  top: "data"
  transform_param {
    mean_value: 104
    mean_value: 117
    mean_value: 123
    resize_param {
      prob: 1
      resize_mode: WARP
      height: 600
      width: 600
      interp_mode: LINEAR
    }
  }
  data_param {
    batch_size: 1
  }
  video_data_param {
    video_type: LOCAL_SOURCE
    #device_id: 0
    source: "/home/bsl/Debug/ssd_caffe/04050833_2639.MP4"
  }
}

This one reads a local video file.
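
Since the layer simply opens the source with cv::VideoCapture, it is worth first confirming that your OpenCV build can actually decode the file, because the layer aborts with LOG(FATAL) if it cannot. A minimal standalone check (plain OpenCV, reusing the video path from above):

import cv2

# Verify that OpenCV can open and decode the file the VideoData
# layer will read through cv::VideoCapture.
cap = cv2.VideoCapture('/home/bsl/Debug/ssd_caffe/04050833_2639.MP4')
assert cap.isOpened(), 'OpenCV failed to open the video source'
ok, frame = cap.read()
assert ok and frame is not None, 'could not decode the first frame'
print frame.shape  # (height, width, 3), BGR order
cap.release()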

The modified VideoData layer source is given below:

#ifdef USE_OPENCV
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#endif  // USE_OPENCV

#include <stdint.h>
#include <algorithm>
#include <map>
#include <vector>

#include "caffe/data_transformer.hpp"
#include "caffe/layers/video_data_layer.hpp"
#include "caffe/util/benchmark.hpp"

namespace caffe {

template <typename Dtype>
VideoDataLayer<Dtype>::VideoDataLayer(const LayerParameter& param)
  : BasePrefetchingDataLayer<Dtype>(param) {
}

template <typename Dtype>
VideoDataLayer<Dtype>::~VideoDataLayer() {
  this->StopInternalThread();
  if (video_type_ == VideoDataParameter_VideoType_WEBCAM) {
    cap_.release();
  }
  // Added: release the capture for LOCAL_SOURCE as well
  else if (video_type_ == VideoDataParameter_VideoType_LOCAL_SOURCE) {
    cap_.release();
  }
}

template <typename Dtype>
void VideoDataLayer<Dtype>::DataLayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  const int batch_size = this->layer_param_.data_param().batch_size();
  const VideoDataParameter& video_data_param =
      this->layer_param_.video_data_param();
  video_type_ = video_data_param.video_type();

  vector<int> top_shape;
  if (video_type_ == VideoDataParameter_VideoType_WEBCAM) {
    const int device_id = video_data_param.device_id();
    if (!cap_.open(device_id)) {
      LOG(FATAL) << "Failed to open webcam: " << device_id;
    }
    // Read an image, and use it to initialize the top blob.
    cv::Mat cv_img;
    cap_ >> cv_img;
    CHECK(cv_img.data) << "Could not load image from webcam!";
    // Use data_transformer to infer the expected blob shape from a cv_image.
    top_shape = this->data_transformer_->InferBlobShape(cv_img);
    this->transformed_data_.Reshape(top_shape);
  }
  // Added: initialization for LOCAL_SOURCE
  else if (video_type_ == VideoDataParameter_VideoType_LOCAL_SOURCE) {
    const string source = video_data_param.source();
    if (!cap_.open(source)) {
      LOG(FATAL) << "Failed to open source: "<<source ;
    }
    // Read an image, and use it to initialize the top blob.
    cv::Mat cv_img;
    cap_ >> cv_img;
    CHECK(cv_img.data) << "Could not load image from source file!";
    // Use data_transformer to infer the expected blob shape from a cv_image.
    top_shape = this->data_transformer_->InferBlobShape(cv_img);
    this->transformed_data_.Reshape(top_shape);
  }
  top_shape[0] = batch_size;
  top[0]->Reshape(top_shape);
  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
    this->prefetch_[i].data_.Reshape(top_shape);
  }
  LOG(INFO) << "output data size: " << top[0]->num() << ","
      << top[0]->channels() << "," << top[0]->height() << ","
      << top[0]->width();
  // label
  if (this->output_labels_) {
    vector<int> label_shape(1, batch_size);
    top[1]->Reshape(label_shape);
    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
      this->prefetch_[i].label_.Reshape(label_shape);
    }
  }
}

// This function is called on prefetch thread
template<typename Dtype>
void VideoDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
  CPUTimer batch_timer;
  batch_timer.Start();
  double read_time = 0;
  double trans_time = 0;
  CPUTimer timer;
  CHECK(batch->data_.count());
  CHECK(this->transformed_data_.count());

  // Reshape according to the first frame of each batch;
  // on single input batches this allows for inputs of varying dimension.
  const int batch_size = this->layer_param_.data_param().batch_size();
  vector<int> top_shape;
  if (video_type_ == VideoDataParameter_VideoType_WEBCAM) {
    cv::Mat cv_img;
    cap_ >> cv_img;
    CHECK(cv_img.data) << "Could not load image from webcam!";
    // Use data_transformer to infer the expected blob shape from a cv_img.
    top_shape = this->data_transformer_->InferBlobShape(cv_img);
  }

  else if (video_type_ == VideoDataParameter_VideoType_LOCAL_SOURCE) {
    cv::Mat cv_img;
    cap_ >> cv_img;
    CHECK(cv_img.data) << "Could not load image from local source!";
    // Use data_transformer to infer the expected blob shape from a cv_img.
    top_shape = this->data_transformer_->InferBlobShape(cv_img);
  }

  this->transformed_data_.Reshape(top_shape);
  // Reshape batch according to the batch_size.
  top_shape[0] = batch_size;
  batch->data_.Reshape(top_shape);

  Dtype* top_data = batch->data_.mutable_cpu_data();
  Dtype* top_label = NULL;  // suppress warnings about uninitialized variables
  if (this->output_labels_) {
    top_label = batch->label_.mutable_cpu_data();
  }

  for (int item_id = 0; item_id < batch_size; ++item_id) {
    timer.Start();
    if (video_type_ == VideoDataParameter_VideoType_WEBCAM) {
      cv::Mat cv_img;
      cap_ >> cv_img;
      CHECK(cv_img.data) << "Could not load image from webcam!";
      read_time += timer.MicroSeconds();
      timer.Start();
      // Apply transformations (mirror, crop...) to the image
      int offset = batch->data_.offset(item_id);
      this->transformed_data_.set_cpu_data(top_data + offset);
      this->data_transformer_->Transform(cv_img, &(this->transformed_data_));
      trans_time += timer.MicroSeconds();
    }
    else if (video_type_ == VideoDataParameter_VideoType_LOCAL_SOURCE) {
      cv::Mat cv_img;
      cap_ >> cv_img;
      CHECK(cv_img.data) << "Could not load image from local source!";
      read_time += timer.MicroSeconds();
      timer.Start();
      // Apply transformations (mirror, crop...) to the image
      int offset = batch->data_.offset(item_id);
      this->transformed_data_.set_cpu_data(top_data + offset);
      this->data_transformer_->Transform(cv_img, &(this->transformed_data_));
      trans_time += timer.MicroSeconds();
    }
    if (this->output_labels_) {
      top_label[item_id] = 0;
    }
  }
  timer.Stop();
  batch_timer.Stop();
  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
}

INSTANTIATE_CLASS(VideoDataLayer);
REGISTER_LAYER_CLASS(VideoData);

}  // namespace caffe

How to train and test

Training and testing:

cd /home/bsl/Debug/ssd_caffe
# --weights is the model to fine-tune from;
# --iterations=7481 evaluates all 7481 test images
./build/tools/caffe train \
--solver="/home/bsl/Debug/ssd_caffe/models/VGGNet/KITTI/SSD_600x150/solver.prototxt" \
--weights="/home/bsl/Debug/ssd_caffe/models/VGGNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel" \
--iterations=7481 \
--gpu 0 2>&1 | tee /home/bsl/Debug/ssd_caffe/jobs/VGGNet/KITTI/SSD_600x150/VGG_KITTI_SSD_600x150.log

Solver file:

train_net: "/home/bsl/Debug/ssd_caffe/models/VGGNet/KITTI/SSD_600x150/train.prototxt"
test_net: "/home/bsl/Debug/ssd_caffe/models/VGGNet/KITTI/SSD_600x150/test.prototxt"
test_iter: 7481
test_interval: 10000
base_lr: 0.001
display: 10
max_iter: 60000
lr_policy: "step"
gamma: 0.1
momentum: 0.9
weight_decay: 0.0005
stepsize: 40000
snapshot: 40000
snapshot_prefix: "/home/bsl/Debug/ssd_caffe/models/VGGNet/KITTI/SSD_600x150/VGG_KITTI_SSD_600x150"
solver_mode: GPU
device_id: 0
debug_info: false
snapshot_after_train: true
test_initialization: false
average_loss: 10
iter_size: 1
type: "SGD"
eval_type: "detection"
ap_version: "11point"
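
For reference, lr_policy: "step" means the learning rate at iteration i is base_lr * gamma^floor(i / stepsize). A quick sketch of the schedule the settings above produce:

# Caffe "step" policy: lr = base_lr * gamma^floor(iter / stepsize)
base_lr, gamma, stepsize = 0.001, 0.1, 40000

def step_lr(it):
    return base_lr * gamma ** (it // stepsize)

for it in (0, 39999, 40000, 59999):
    print 'iter %d: lr = %g' % (it, step_lr(it))
# lr stays at 0.001 for iterations 0-39999, then drops to 0.0001 until max_iter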

Testing directly with a trained model

cd /home/bsl/Debug/ssd_caffe
# --weights here is the trained model
./build/tools/caffe train \
--solver="/home/bsl/Debug/ssd_caffe/models/VGGNet/KITTI/SSD_600x150/solver_test.prototxt" \
--weights="/home/bsl/Debug/ssd_caffe/models/VGGNet/KITTI/SSD_600x150/VGG_KITTI_SSD_600x150_iter_60000.caffemodel" \
--iterations=7481 \
--gpu 0 2>&1 | tee /home/bsl/Debug/ssd_caffe/jobs/VGGNet/KITTI/SSD_600x150/VGG_KITTI_SSD_600x150_TEST.log

Solver file:

train_net: "/home/bsl/Debug/ssd_caffe/models/VGGNet/KITTI/SSD_600x150/train.prototxt"
test_net: "/home/bsl/Debug/ssd_caffe/models/VGGNet/KITTI/SSD_600x150/test.prototxt"
test_iter: 7481
test_interval: 10000
base_lr: 0.001
display: 10
max_iter: 0 # with max_iter at 0 no training is done; the solver goes straight to the final test pass (TestAll runs because iteration 0 is a multiple of test_interval)
lr_policy: "step"
gamma: 0.1
momentum: 0.9
weight_decay: 0.0005
stepsize: 40000
snapshot: 40000
snapshot_prefix: "/home/bsl/Debug/ssd_caffe/models/VGGNet/KITTI/SSD_600x150/VGG_KITTI_SSD_600x150"
solver_mode: GPU
device_id: 0
debug_info: false
snapshot_after_train: true
test_initialization: false
average_loss: 10
iter_size: 1
type: "SGD"
eval_type: "detection"
ap_version: "11point"

How to expose a Python interface

Since the whole SSD project is written in C++, it is inconvenient to use directly, so we can write an SSD interface modeled on Caffe's Python API. The Python code is given below.

Detecting images

import sys
sys.path.append('/home/bsl/Debug/ssd_caffe/python/')  # add Caffe's Python bindings to the path
import caffe
import os
import numpy as np
import cv2
import time

# a simple timer class for measuring detection time
class Timer(object):
    """A simple timer."""
    def __init__(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.

    def tic(self):
        # using time.time instead of time.clock because time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        if average:
            return self.average_time
        else:
            return self.diff

caffe_root = "/home/bsl/Debug/ssd_caffe/"
if os.path.isfile(caffe_root + 'models/VGGNet/KITTI/SSD_600x150/VGG_KITTI_SSD_600x150_iter_60000.caffemodel'):
    print 'Model found.'
else:
    print 'Model not found'
model_def = caffe_root + 'models/VGGNet/KITTI/SSD_600x150/deploy.prototxt'
model_weights = caffe_root + 'models/VGGNet/KITTI/SSD_600x150/VGG_KITTI_SSD_600x150_iter_60000.caffemodel'

net = caffe.Net(model_def,model_weights,caffe.TEST)
caffe.set_device(0)  # if we have multiple GPUs, pick the first one
caffe.set_mode_gpu()
mu = np.array([104, 117, 123])
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2,0,1))  # move image channels to outermost dimension
transformer.set_mean('data', mu)            # subtract the dataset-mean value in each channel
transformer.set_raw_scale('data', 255)      # rescale from [0, 1] to [0, 255]
transformer.set_channel_swap('data', (2,1,0))  # swap channels from RGB to BGR
net.blobs['data'].reshape(1,3,150, 600)

test_image_path=caffe_root+'data/KITTI/training/data_object_image_2/training/image_2'
color=[(255,0,0),(0,255,0),(0,0,255)]
visualize_threshold=0.6
for parent, dirnames, filenames in os.walk(test_image_path):
    for filename in filenames:
        timer = Timer()
        img_path=caffe_root + 'data/KITTI/training/data_object_image_2/training/image_2/'+filename
        result_path=caffe_root + 'data/KITTI/results/'+filename
        image = caffe.io.load_image(img_path)
        transformed_image = transformer.preprocess('data', image)
        net.blobs['data'].data[...] = transformed_image
        timer.tic()
        output = net.forward()  # detection_out: 1x1xNx7, rows of (image_id, label, confidence, xmin, ymin, xmax, ymax)
        timer.toc()
        shape=output['detection_out'].shape
        detectors=output['detection_out'].reshape(shape[2],shape[3])
        #visualize
        img=cv2.imread(img_path)
        size=img.shape
        for i in xrange(detectors.shape[0]):
            if detectors[i][2]>=visualize_threshold:
                xmin=int(detectors[i][3]*size[1])
                ymin=int(detectors[i][4]*size[0])
                xmax=int(detectors[i][5]*size[1])
                ymax=int(detectors[i][6]*size[0])
                label=detectors[i][1]
                rect_start=(xmin,ymin)
                rect_end=(xmax,ymax)
                cv2.rectangle(img, rect_start, rect_end, color[int(label-1)], 2)
        #cv2.imshow('image',img)
        #cv2.waitKey(0)
        cv2.imwrite(result_path,img)
        print ('Detection took {:.3f}s').format(timer.total_time)
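
The visualization loop above can be factored into a small helper that turns the normalized detection_out rows into pixel-space boxes. A sketch (the function name and its default threshold are mine, not part of the original script):

def parse_detections(detection_out, img_height, img_width, threshold=0.6):
    """Convert SSD detection_out (1x1xNx7) into a list of
    (label, confidence, xmin, ymin, xmax, ymax) in pixel coordinates."""
    boxes = []
    for image_id, label, conf, xmin, ymin, xmax, ymax in detection_out.reshape(-1, 7):
        if conf >= threshold:
            boxes.append((int(label), float(conf),
                          int(xmin * img_width), int(ymin * img_height),
                          int(xmax * img_width), int(ymax * img_height)))
    return boxes

# usage: boxes = parse_detections(output['detection_out'], size[0], size[1])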

Detecting video

import sys
sys.path.append('/home/bsl/Debug/ssd_caffe/python/')
import caffe
import os
import numpy as np
import cv2
import time

class Timer(object):
    """A simple timer."""
    def __init__(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.

    def tic(self):
        # using time.time instead of time.clock because time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        if average:
            return self.average_time
        else:
            return self.diff
caffe_root = "/home/bsl/Debug/ssd_caffe/"
if os.path.isfile(caffe_root + 'models/VGGNet/KITTI/SSD_600x150/VGG_KITTI_SSD_600x150_iter_60000.caffemodel'):
    print 'Model found.'
else:
    print 'Model not found'
model_def = caffe_root + 'models/VGGNet/KITTI/SSD_600x150/deploy_large.prototxt'
model_weights = caffe_root + 'models/VGGNet/KITTI/SSD_600x150/VGG_KITTI_SSD_600x150_iter_60000.caffemodel'

net = caffe.Net(model_def,model_weights,caffe.TEST)
caffe.set_device(0)  # if we have multiple GPUs, pick the first one
caffe.set_mode_gpu()
mu = np.array([104, 117, 123])
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2,0,1))  # move image channels to outermost dimension
transformer.set_mean('data', mu)            # subtract the dataset-mean value in each channel
transformer.set_raw_scale('data', 255)      # rescale from [0, 1] to [0, 255]
transformer.set_channel_swap('data', (2,1,0))  # swap channels from RGB to BGR
net.blobs['data'].reshape(1,3,270, 480)  # can be adjusted as appropriate

test_image_path=caffe_root+'data/KITTI/training/data_object_image_2/testing/image_2'
color=[(255,0,0),(0,255,0),(0,0,255)]
visualize_threshold=0.6


dir_name='04041652_2624.MP4'
dir_root=os.path.join(caffe_root,dir_name)
videoCapture = cv2.VideoCapture(dir_root)
fps = videoCapture.get(cv2.cv.CV_CAP_PROP_FPS)
#fps=25
size = (int(videoCapture.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT))/2,int(videoCapture.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH))/2) # (height/2, width/2)
success, frame = videoCapture.read()


#cv2.cv.CV_FOURCC('I','4','2','0') avi
#cv2.cv.CV_FOURCC('P','I','M','1') avi
#cv2.cv.CV_FOURCC('M','J','P','G') avi
#cv2.cv.CV_FOURCC('T','H','E','O') ogv
#cv2.cv.CV_FOURCC('F','L','V','1') flv
video=cv2.VideoWriter(dir_name, cv2.cv.CV_FOURCC('M','J','P','G'), int(fps),(size[1],size[0])) # VideoWriter expects (width, height); size is (height/2, width/2)

while success:
    timer=Timer()
    # scale to [0,1] floats; note frame is BGR, while the transformer was set up
    # for RGB input, so its channel_swap re-orders an already-BGR image
    image=frame/255.
    #image = caffe.io.load_image(img_path)
    transformed_image = transformer.preprocess('data', image)
    net.blobs['data'].data[...] = transformed_image
    timer.tic()
    output = net.forward()  # detection_out: 1x1xNx7, rows of (image_id, label, confidence, xmin, ymin, xmax, ymax)
    timer.toc()
    shape=output['detection_out'].shape
    detectors=output['detection_out'].reshape(shape[2],shape[3])
    #visualize
    img=cv2.resize(frame,(size[1],size[0]))
    for i in xrange(detectors.shape[0]):
        if detectors[i][2]>=visualize_threshold:
            xmin=int(detectors[i][3]*size[1])
            ymin=int(detectors[i][4]*size[0])
            xmax=int(detectors[i][5]*size[1])
            ymax=int(detectors[i][6]*size[0])
            label=detectors[i][1]
            rect_start=(xmin,ymin)
            rect_end=(xmax,ymax)
            cv2.rectangle(img, rect_start, rect_end, color[int(label-1)], 2)
    video.write(img)  # save the annotated frame; the VideoWriter above was otherwise unused
    cv2.imshow('image',img)
    cv2.waitKey(1)
    print ('Detection took {:.3f}s').format(timer.total_time)
    success, frame = videoCapture.read()

video.release()
videoCapture.release()

If you found this write-up helpful, a tip would be appreciated! For academic discussion or pointers, add me on WeChat: manutdzou, and note your purpose.
