ncnn Source Code Analysis 6

This post walks through the forward pass of a few common operators: abs, bias, argmax, concat, convolution, pooling, and batchnorm.

1. abs

// AbsVal layer: single input, single output, and the input can be modified in place
int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)  // parallelize across channels with OpenMP
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q); // pointer to the start of this channel's data

        for (int i=0; i<size; i++)
        {
            if (ptr[i] < 0)
                ptr[i] = - ptr[i]; // negate negative values; non-negative values are left unchanged
        }
    }

    return 0;
}
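
For orientation, here is a minimal sketch of driving a single layer by hand through the public Layer interface (create_layer / create_pipeline / forward_inplace). The blob size and fill value are made up for illustration, and error checking is omitted:

#include "layer.h" // ncnn::create_layer, ncnn::Layer
#include "mat.h"   // ncnn::Mat, ncnn::Option

int main()
{
    // look up the layer by its registered type name
    ncnn::Layer* absval = ncnn::create_layer("AbsVal");

    ncnn::Option opt;
    opt.num_threads = 4;
    absval->create_pipeline(opt);

    // a made-up 4x4x3 blob filled with -1.f; forward_inplace overwrites it with |x|
    ncnn::Mat blob(4, 4, 3);
    blob.fill(-1.f);
    absval->forward_inplace(blob, opt);

    absval->destroy_pipeline(opt);
    delete absval;
    return 0;
}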

2. bias

int Bias::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);  // pointer to the start of this channel's data

        float bias = bias_data[q];   // per-channel bias loaded from the model; the same value is added to every element of the channel

        for (int i=0; i<size; i++)
        {
            ptr[i] += bias; // add the bias
        }
    }

    return 0;
}

3. argmax

// two layer parameters: out_max_val controls whether the top-k values are stored alongside their indices, topk is how many of the largest elements to keep
int ArgMax::load_param(const ParamDict& pd)
{
    out_max_val = pd.get(0, 0); // whether to also output the max values (the indices are always produced)
    topk = pd.get(1, 1);        // how many of the largest elements to return
   
    return 0;
}
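
In a .param file these two fields show up as the 0= and 1= key/value pairs on the ArgMax line. A hypothetical entry (layer and blob names made up) asking for the top 3 values together with their indices would look like:

ArgMax  argmax_0  1 1  fc_out  argmax_out  0=1 1=3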

int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int size = bottom_blob.total(); // total number of elements in the input blob
    
    // allocate the output blob
    if (out_max_val)
        top_blob.create(topk, 2, 4u, opt.blob_allocator); // row 0: topk values, row 1: their indices
    else
        top_blob.create(topk, 1, 4u, opt.blob_allocator); // only the topk indices
    if (top_blob.empty())
        return -100;

    const float* ptr = bottom_blob;
    
    // partial sort topk with index, optional value
    std::vector< std::pair<float, int> > vec;
    vec.resize(size);
    for (int i=0; i<size; i++)
    {
        vec[i] = std::make_pair(ptr[i], i); // (value, index) pair for each input element
    }
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater< std::pair<float, int> >()); // sort descending by value; only the first topk entries end up ordered

    // write out the topk largest elements
    float* outptr = top_blob;
    if (out_max_val)
    {
        float* valptr = outptr + topk; // first topk slots hold the values, the next topk slots hold their indices in the input blob
        for (int i=0; i<topk; i++)
        {
            outptr[i] = vec[i].first;  // store the value
            valptr[i] = vec[i].second; // store the index
        }
    }
    else
    {
        for (int i=0; i<topk; i++)
        {
            outptr[i] = vec[i].second; // store only the index
        }
    }

    return 0;
}
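
The top-k trick is nothing more than std::partial_sort over (value, index) pairs. A self-contained check with a made-up 4-element input:

#include <algorithm>
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

int main()
{
    const float data[4] = {0.1f, 0.9f, 0.4f, 0.7f};
    const int size = 4;
    const int topk = 2;

    // pair every value with its index so the index survives the sort
    std::vector< std::pair<float, int> > vec(size);
    for (int i = 0; i < size; i++)
        vec[i] = std::make_pair(data[i], i);

    // only the first topk entries end up sorted (descending by value, then index)
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater< std::pair<float, int> >());

    for (int i = 0; i < topk; i++)
        printf("value %.1f at index %d\n", vec[i].first, vec[i].second);
    // prints: value 0.9 at index 1, then value 0.7 at index 3
    return 0;
}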

4. concat

int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int dims = bottom_blobs[0].dims;
    size_t elemsize = bottom_blobs[0].elemsize;

    if (dims == 1) // axis == 0
    {
        int top_w = 0; // total width of the concatenated output
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }
        // allocate the output blob
        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // copy each input blob into the output, one after another
        unsigned char* outptr = top_blob;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            int w = bottom_blob.w;

            const unsigned char* ptr = bottom_blob;
            memcpy(outptr, ptr, w * elemsize);

            outptr += w * elemsize;
        }

        return 0;
    }

    if (dims == 2 && axis == 0){
        // ...
    }

    if (dims == 2 && axis == 1){
        // ...
    }

    if (dims == 3 && axis == 0){
        //  ...
    }

    if (dims == 3 && axis == 1){
      // ...
    }

    if (dims == 3 && axis == 2){
       // ...
    }

    return 0;
}
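
The elided branches all follow the same pattern: sum up the sizes along the concatenation axis, allocate the output, then memcpy each input into its slot. As a rough sketch (not the verbatim upstream code) of the dims == 3 && axis == 0 case, i.e. channel concatenation, assuming every input shares the same w, h and elemsize:

#include <cstring>
#include <vector>
#include "mat.h"
#include "option.h"

// sketch: concatenate 3-D blobs along the channel axis
static int concat_channels(const std::vector<ncnn::Mat>& bottom_blobs, ncnn::Mat& top_blob, const ncnn::Option& opt)
{
    int w = bottom_blobs[0].w;
    int h = bottom_blobs[0].h;
    size_t elemsize = bottom_blobs[0].elemsize;

    int top_channels = 0;
    for (size_t b = 0; b < bottom_blobs.size(); b++)
        top_channels += bottom_blobs[b].c;

    top_blob.create(w, h, top_channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int q = 0;
    for (size_t b = 0; b < bottom_blobs.size(); b++)
    {
        const ncnn::Mat& bottom_blob = bottom_blobs[b];

        // copy channel by channel so each blob's own channel padding (cstep) is respected
        for (int c = 0; c < bottom_blob.c; c++)
        {
            ncnn::Mat outm = top_blob.channel(q + c);
            const ncnn::Mat inm = bottom_blob.channel(c);
            memcpy(outm.data, inm.data, (size_t)w * h * elemsize);
        }
        q += bottom_blob.c;
    }

    return 0;
}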

5. convolution

int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // convolve with an NxN kernel
    // value = value + bias

    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }

    // flattened blob, implement as InnerProduct
    //...


    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    // NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // allocate the output blob
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++) // for each output channel
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++) // output rows
        {
            for (int j = 0; j < outw; j++) // output columns
            {
                float sum = 0.f;

                if (bias_term)
                    sum = bias_data[p];

                const float* kptr = (const float*)weight_data + maxk * channels * p; // start of the weights for output channel p (maxk weights per input channel)

                for (int q = 0; q < channels; q++) // for each input channel
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++) // 29.23
                    {
                        float val = sptr[space_ofs[k]]; // 20.72
                        float w = kptr[k];
                        sum += val * w; // 41.45
                    }

                    kptr += maxk;
                }
                
                // activation function (elided)
                // ......

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }

    return 0;
}
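
The only subtle part above is the space_ofs table: for each of the maxk kernel taps it pre-computes the offset, in elements, from the window's top-left pixel inside the padded input, so the inner loop stays flat. A standalone check with made-up sizes (padded width 7, 3x3 kernel, dilation 2) shows what it produces:

#include <cstdio>
#include <vector>

int main()
{
    // illustrative values: padded input width, kernel size, dilation
    const int w = 7;
    const int kernel_w = 3, kernel_h = 3;
    const int dilation_w = 2, dilation_h = 2;

    std::vector<int> space_ofs(kernel_w * kernel_h);
    int p1 = 0;
    int p2 = 0;
    int gap = w * dilation_h - kernel_w * dilation_w;
    for (int i = 0; i < kernel_h; i++)
    {
        for (int j = 0; j < kernel_w; j++)
        {
            space_ofs[p1++] = p2;
            p2 += dilation_w;
        }
        p2 += gap;
    }

    // prints: 0 2 4 14 16 18 28 30 32
    // columns advance by dilation_w, rows advance by w * dilation_h
    for (size_t k = 0; k < space_ofs.size(); k++)
        printf("%d ", space_ofs[k]);
    printf("\n");
    return 0;
}

With the same numbers, kernel_extent_w = dilation_w * (kernel_w - 1) + 1 = 5, so for stride 1 the output width is outw = (7 - 5) / 1 + 1 = 3.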

6. pooling

int Pooling::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // max value in NxN window
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    if (global_pooling)
    {
        // ...
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;
    
    // allocate the output blob
    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk); // offsets of the maxk window elements relative to the window's top-left pixel
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w - kernel_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2++;
            }
            p2 += gap;
        }
    }

    if (pooling_type == PoolMethod_MAX)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++) // for each channel
        {
            const Mat m = bottom_blob_bordered.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < outh; i++) // output rows
            {
                for (int j = 0; j < outw; j++) // output columns
                {
                    const float* sptr = m.row(i * stride_h) + j * stride_w; // top-left pixel of this pooling window

                    float max = sptr[0];
                    // take the max over the window
                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[space_ofs[k]];
                        max = std::max(max, val);
                    }

                    outptr[j] = max;
                }

                outptr += outw;
            }
        }
    }
    else if (pooling_type == PoolMethod_AVE)
    {
        // ...
    }

    return 0;
}
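
For completeness, a rough sketch of what the elided PoolMethod_AVE branch does; the real layer additionally handles avgpool_count_include_pad and the padded border, while this simplified version just averages every full window. It assumes top_blob has already been created with the output size and expects the same space_ofs table as above:

#include <vector>
#include "mat.h"
#include "option.h"

// sketch: average pooling over precomputed window offsets (no pad handling)
static void pool_ave(const ncnn::Mat& bordered, ncnn::Mat& top_blob,
                     int outw, int outh, int stride_w, int stride_h,
                     const std::vector<int>& space_ofs, const ncnn::Option& opt)
{
    const int maxk = (int)space_ofs.size();

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < top_blob.c; q++)
    {
        const ncnn::Mat m = bordered.channel(q);
        float* outptr = top_blob.channel(q);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                const float* sptr = m.row(i * stride_h) + j * stride_w;

                // accumulate the window and divide by its area
                float sum = 0.f;
                for (int k = 0; k < maxk; k++)
                    sum += sptr[space_ofs[k]];

                outptr[j] = sum / maxk;
            }

            outptr += outw;
        }
    }
}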

7. batchnorm

// As the comments below show, the forward pass is just value = b * value + a, so it can be fused into a preceding convolution
int BatchNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // a = bias - slope * mean / sqrt(var)
    // b = slope / sqrt(var)
    // value = b * value + a

    int dims = bottom_top_blob.dims;
    
    if (dims == 1) // 1-D
    {
        // ...
    }

    if (dims == 2) // 2-D
    {
        // ...
    }

    if (dims == 3) // 3-D
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;
        int size = w * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            float a = a_data[q];
            float b = b_data[q];

            for (int i=0; i<size; i++)
            {
                ptr[i] = b * ptr[i] + a;
            }
        }
    }

    return 0;
}
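
Deriving a and b from the standard batch-norm formula makes the fusion comment concrete. Writing slope as $\gamma$, bias as $\beta$, and ignoring the small eps the real layer folds into var when it precomputes a_data and b_data:

$$
y = \gamma\,\frac{x-\mu}{\sqrt{\sigma^{2}}}+\beta
  = \underbrace{\frac{\gamma}{\sqrt{\sigma^{2}}}}_{b}\,x
  + \underbrace{\Bigl(\beta-\frac{\gamma\,\mu}{\sqrt{\sigma^{2}}}\Bigr)}_{a}
$$

Since this is a per-channel affine map, it can be folded into a preceding convolution: scale that output channel's weights by b and replace its bias with b * conv_bias + a, which removes the BatchNorm layer entirely at inference time.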
