ncnn Source Code Analysis, Part 6
This post walks through the forward pass of several common operators: abs, bias, argmax, concat, convolution, pooling, and batchnorm.
1. abs
// AbsVal layer: single input, single output, and the input can be modified in place
int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads) // parallelize over channels with OpenMP
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q); // pointer to the start of the current channel's data
for (int i=0; i<size; i++)
{
if (ptr[i] < 0)
ptr[i] = - ptr[i]; // negate negative values; non-negative values stay unchanged
}
}
return 0;
}
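As a side note, a layer like this can be exercised on its own, outside a full Net. Below is a minimal sketch of that; the include paths and the standalone-usage pattern are my own illustration, not part of the original post:
#include "layer.h"  // assumed ncnn include paths
#include "mat.h"
#include "option.h"

int main()
{
    // create the layer by its registered type name
    ncnn::Layer* abs_layer = ncnn::create_layer("AbsVal");

    ncnn::Option opt;
    opt.num_threads = 1;

    // 4x4x3 float blob filled with -1.f
    ncnn::Mat m(4, 4, 3);
    m.fill(-1.f);

    abs_layer->forward_inplace(m, opt); // every element becomes 1.f

    delete abs_layer;
    return 0;
}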
2. bias
int Bias::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q); // pointer to the start of each channel's data
float bias = bias_data[q]; // per-channel bias loaded from the model; every element of the channel gets the same bias
for (int i=0; i<size; i++)
{
ptr[i] += bias; // add the bias
}
}
return 0;
}
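Layers that carry trained weights need two more steps before forward: parameters go in through a ParamDict and weights through a ModelBin. A minimal sketch along the same lines; it assumes bias_data_size sits at param id 0 (as in the ncnn version this post reads) and the include paths are assumptions:
#include <cstdio>
#include "layer.h"
#include "mat.h"
#include "modelbin.h"
#include "option.h"
#include "paramdict.h"

int main()
{
    ncnn::Layer* bias_layer = ncnn::create_layer("Bias");

    // param id 0 is assumed to be bias_data_size (one bias per channel)
    ncnn::ParamDict pd;
    pd.set(0, 3);
    bias_layer->load_param(pd);

    // per-channel biases: 1, 2, 3
    ncnn::Mat bias_data(3);
    bias_data[0] = 1.f;
    bias_data[1] = 2.f;
    bias_data[2] = 3.f;
    ncnn::Mat weights[1] = {bias_data};
    bias_layer->load_model(ncnn::ModelBinFromMatArray(weights));

    ncnn::Option opt;
    opt.num_threads = 1;
    bias_layer->create_pipeline(opt);

    // 2x2x3 blob of zeros; after forward each channel holds its bias
    ncnn::Mat m(2, 2, 3);
    m.fill(0.f);
    bias_layer->forward_inplace(m, opt);
    printf("%f %f %f\n", m.channel(0)[0], m.channel(1)[0], m.channel(2)[0]); // prints 1.000000 2.000000 3.000000

    bias_layer->destroy_pipeline(opt);
    delete bias_layer;
    return 0;
}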
3. argmax
// The layer has two parameters: the first controls whether the max values are output alongside their indices, the second is how many of the largest elements (topk) to keep
int ArgMax::load_param(const ParamDict& pd)
{
out_max_val = pd.get(0, 0); // whether to also output the max values (not just their indices)
topk = pd.get(1, 1); // number of largest elements to keep
return 0;
}
int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int size = bottom_blob.total(); // total number of elements in the input blob
// create the output blob
if (out_max_val)
top_blob.create(topk, 2, 4u, opt.blob_allocator); // topk values + topk corresponding indices
else
top_blob.create(topk, 1, 4u, opt.blob_allocator); // only the topk indices, no values
if (top_blob.empty())
return -100;
const float* ptr = bottom_blob;
// partial sort topk with index, optional value
std::vector< std::pair<float, int> > vec;
vec.resize(size);
for (int i=0; i<size; i++)
{
vec[i] = std::make_pair(ptr[i], i); // (value, index) pair for every element of the input blob
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::greater< std::pair<float, int> >()); // sort by value in descending order; only the first topk entries end up ordered
// write out the topk largest entries
float* outptr = top_blob;
if (out_max_val)
{
float* valptr = outptr + topk; // the first topk slots hold the values, the next topk slots hold their indices in the input blob
for (int i=0; i<topk; i++)
{
outptr[i] = vec[i].first; // store the value
valptr[i] = vec[i].second; // store the index
}
}
else
{
for (int i=0; i<topk; i++)
{
outptr[i] = vec[i].second; // only the indices are stored
}
}
return 0;
}
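The core trick is std::partial_sort over (value, index) pairs: std::greater compares the pairs by value first, so only the topk largest entries get fully ordered and the original positions come along for free. A standalone sketch of just that piece:
#include <algorithm>
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

int main()
{
    const float data[] = {0.1f, 0.7f, 0.3f, 0.9f, 0.5f};
    const int size = 5;
    const int topk = 3;

    std::vector< std::pair<float, int> > vec(size);
    for (int i = 0; i < size; i++)
        vec[i] = std::make_pair(data[i], i); // (value, original index)

    // order only the first topk entries, largest value first
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater< std::pair<float, int> >());

    for (int i = 0; i < topk; i++)
        printf("value %.1f at index %d\n", vec[i].first, vec[i].second);
    // prints 0.9 at index 3, 0.7 at index 1, 0.5 at index 4
    return 0;
}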
4. concat
int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
int dims = bottom_blobs[0].dims;
size_t elemsize = bottom_blobs[0].elemsize;
if (dims == 1) // axis == 0
{
int top_w = 0; // total output width
for (size_t b = 0; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob = bottom_blobs[b];
top_w += bottom_blob.w;
}
// create the output blob
Mat& top_blob = top_blobs[0];
top_blob.create(top_w, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
// copy each input blob into the output, one after another
unsigned char* outptr = top_blob;
for (size_t b = 0; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob = bottom_blobs[b];
int w = bottom_blob.w;
const unsigned char* ptr = bottom_blob;
memcpy(outptr, ptr, w * elemsize);
outptr += w * elemsize;
}
return 0;
}
if (dims == 2 && axis == 0){
// ...
}
if (dims == 2 && axis == 1){
// ...
}
if (dims == 3 && axis == 0){
// ...
}
if (dims == 3 && axis == 1){
// ...
}
if (dims == 3 && axis == 2){
// ...
}
return 0;
}
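The elided branches all follow the same copy pattern; only the copy granularity changes with dims and axis. For example, the dims == 3, axis == 0 branch concatenates along the channel dimension, roughly as sketched below (a simplified sketch, not the verbatim ncnn code):
#include <cstring>
#include <vector>
#include "mat.h"    // assumed ncnn include paths
#include "option.h"

// Sketch of channel-axis concatenation for 3-D blobs (dims == 3, axis == 0).
static int concat_channels(const std::vector<ncnn::Mat>& bottom_blobs, ncnn::Mat& top_blob,
                           const ncnn::Option& opt)
{
    int w = bottom_blobs[0].w;
    int h = bottom_blobs[0].h;
    size_t elemsize = bottom_blobs[0].elemsize;

    // total number of output channels
    int top_channels = 0;
    for (size_t b = 0; b < bottom_blobs.size(); b++)
        top_channels += bottom_blobs[b].c;

    top_blob.create(w, h, top_channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int q = 0; // next free output channel
    for (size_t b = 0; b < bottom_blobs.size(); b++)
    {
        const ncnn::Mat& bottom_blob = bottom_blobs[b];
        int channels = bottom_blob.c;
        size_t size = bottom_blob.cstep * channels; // element count, including per-channel padding

        const unsigned char* ptr = bottom_blob;
        unsigned char* outptr = top_blob.channel(q);
        memcpy(outptr, ptr, size * elemsize);

        q += channels;
    }
    return 0;
}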
5. Convolution
int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// convolve with NxN kernel
// value = value + bias
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
{
return forward_int8(bottom_blob, top_blob, opt);
}
// flattened blob, implement as InnerProduct
//...
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
// NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
Mat bottom_blob_bordered;
make_padding(bottom_blob, bottom_blob_bordered, opt);
if (bottom_blob_bordered.empty())
return -100;
w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;
int outw = (w - kernel_extent_w) / stride_w + 1;
int outh = (h - kernel_extent_h) / stride_h + 1;
const int maxk = kernel_w * kernel_h;
// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}
// allocate the output blob
top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < num_output; p++) // for each output channel
{
float* outptr = top_blob.channel(p);
for (int i = 0; i < outh; i++) // output rows
{
for (int j = 0; j < outw; j++) // output columns
{
float sum = 0.f;
if (bias_term)
sum = bias_data[p];
const float* kptr = (const float*)weight_data + maxk * channels * p; // start of the kernel weights for output channel p
for (int q = 0; q < channels; q++) // for each input channel
{
const Mat m = bottom_blob_bordered.channel(q);
const float* sptr = m.row(i * stride_h) + j * stride_w; // top-left corner of the receptive field for this output pixel
for (int k = 0; k < maxk; k++) // 29.23
{
float val = sptr[space_ofs[k]]; // 20.72
float w = kptr[k];
sum += val * w; // 41.45
}
kptr += maxk;
}
// activation function (elided)
// ......
outptr[j] = sum;
}
outptr += outw;
}
}
return 0;
}
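The space_ofs table is what keeps the inner loop flat: for each of the maxk kernel taps it stores the element offset from the top-left corner of the receptive field, with dilation and the padded input width w already baked in. A standalone sketch that just prints the table (the sizes are made-up values for illustration):
#include <cstdio>
#include <vector>

int main()
{
    const int w = 8;                          // padded input width (made-up)
    const int kernel_w = 3, kernel_h = 3;     // 3x3 kernel
    const int dilation_w = 2, dilation_h = 2; // dilated convolution
    const int maxk = kernel_w * kernel_h;

    std::vector<int> space_ofs(maxk);
    int p1 = 0;
    int p2 = 0;
    int gap = w * dilation_h - kernel_w * dilation_w; // jump from the end of one kernel row to the start of the next
    for (int i = 0; i < kernel_h; i++)
    {
        for (int j = 0; j < kernel_w; j++)
        {
            space_ofs[p1] = p2;
            p1++;
            p2 += dilation_w; // step right by the dilation
        }
        p2 += gap; // step down dilation_h rows of the padded input
    }

    for (int k = 0; k < maxk; k++)
        printf("%d ", space_ofs[k]); // prints: 0 2 4 16 18 20 32 34 36
    printf("\n");
    return 0;
}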
6. pooling
int Pooling::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// max value in NxN window
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
if (global_pooling)
{
// ...
}
Mat bottom_blob_bordered;
make_padding(bottom_blob, bottom_blob_bordered, opt);
if (bottom_blob_bordered.empty())
return -100;
w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;
int outw = (w - kernel_w) / stride_w + 1;
int outh = (h - kernel_h) / stride_h + 1;
// allocate the output blob
top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
const int maxk = kernel_w * kernel_h;
// kernel offsets
std::vector<int> _space_ofs(maxk); // flat offsets of the pooling-window taps
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w - kernel_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2++;
}
p2 += gap;
}
}
if (pooling_type == PoolMethod_MAX)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++) // for each channel
{
const Mat m = bottom_blob_bordered.channel(q);
float* outptr = top_blob.channel(q);
for (int i = 0; i < outh; i++) // output rows
{
for (int j = 0; j < outw; j++) // output columns
{
const float* sptr = m.row(i * stride_h) + j * stride_w; // top-left corner of the pooling window
float max = sptr[0];
// take the max over the pooling window
for (int k = 0; k < maxk; k++)
{
float val = sptr[space_ofs[k]];
max = std::max(max, val);
}
outptr[j] = max;
}
outptr += outw;
}
}
}
else if (pooling_type == PoolMethod_AVE)
{
// ...
}
return 0;
}
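The elided global_pooling branch needs none of the window/offset machinery: it reduces each channel to a single value by taking the max (or the mean) over all w * h elements, roughly as sketched below (a simplified sketch of the elided branch, not the verbatim ncnn code):
#include <algorithm>
#include "mat.h"    // assumed ncnn include paths
#include "option.h"

// Sketch of global max pooling: one output value per input channel.
static int global_max_pooling(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob,
                              const ncnn::Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    // 1-D output blob with one element per channel
    top_blob.create(channels, bottom_blob.elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float max = ptr[0];
        for (int i = 0; i < size; i++)
            max = std::max(max, ptr[i]);
        top_blob[q] = max; // average pooling would write sum / size instead
    }
    return 0;
}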
7. BatchNorm
// As the code below shows, inference-time batchnorm is just value = b * value + a per channel, so it can be folded into a preceding convolution
int BatchNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
// a = bias - slope * mean / sqrt(var)
// b = slope / sqrt(var)
// value = b * value + a
int dims = bottom_top_blob.dims;
if (dims == 1) // 1-D input
{
// ...
}
if (dims == 2) // 2-D input
{
// ...
}
if (dims == 3) // 3-D input
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++) // channels is a layer parameter loaded in load_param
{
float* ptr = bottom_top_blob.channel(q);
float a = a_data[q];
float b = b_data[q];
for (int i=0; i<size; i++)
{
ptr[i] = b * ptr[i] + a;
}
}
}
return 0;
}
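The comments at the top of the function spell out where a_data and b_data come from: the learned scale (slope), shift (bias), running mean and running variance are folded once, at model-load time, into a single multiply-add per channel. A minimal sketch of that folding (whether and how eps is added under the square root is an assumption here):
#include <cmath>
#include <cstddef>
#include <vector>

// Fold BatchNorm parameters into per-channel (a, b) so that inference is
// just value = b * value + a.
static void fold_batchnorm(const std::vector<float>& slope, // gamma
                           const std::vector<float>& mean,
                           const std::vector<float>& var,
                           const std::vector<float>& bias,  // beta
                           float eps,
                           std::vector<float>& a_data,
                           std::vector<float>& b_data)
{
    const size_t channels = slope.size();
    a_data.resize(channels);
    b_data.resize(channels);
    for (size_t q = 0; q < channels; q++)
    {
        float sqrt_var = sqrtf(var[q] + eps);                 // assumed: eps added under the sqrt
        b_data[q] = slope[q] / sqrt_var;                      // b = slope / sqrt(var)
        a_data[q] = bias[q] - slope[q] * mean[q] / sqrt_var;  // a = bias - slope * mean / sqrt(var)
    }
}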