Skip to content

Commit

Permalink
split convolution winograd transform input output (#3688)
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui authored Apr 10, 2022
1 parent 32560f4 commit 9298d05
Show file tree
Hide file tree
Showing 34 changed files with 5,955 additions and 7,073 deletions.
590 changes: 12 additions & 578 deletions src/layer/arm/convolution_3x3_pack4.h

Large diffs are not rendered by default.

595 changes: 14 additions & 581 deletions src/layer/arm/convolution_3x3_pack4_bf16s.h

Large diffs are not rendered by default.

377 changes: 8 additions & 369 deletions src/layer/arm/convolution_3x3_pack4_fp16s.h

Large diffs are not rendered by default.

310 changes: 6 additions & 304 deletions src/layer/arm/convolution_3x3_pack4to1.h

Large diffs are not rendered by default.

315 changes: 8 additions & 307 deletions src/layer/arm/convolution_3x3_pack4to1_bf16s.h

Large diffs are not rendered by default.

595 changes: 14 additions & 581 deletions src/layer/arm/convolution_3x3_pack8_fp16s.h

Large diffs are not rendered by default.

315 changes: 8 additions & 307 deletions src/layer/arm/convolution_3x3_pack8to1_fp16s.h

Large diffs are not rendered by default.

377 changes: 8 additions & 369 deletions src/layer/arm/convolution_3x3_pack8to4_fp16s.h

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions src/layer/arm/convolution_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
namespace ncnn {

#include "convolution_sgemm.h"
#include "convolution_winograd_transform.h"

#include "convolution_1x1.h"
#include "convolution_2x2.h"
Expand All @@ -39,6 +40,7 @@ namespace ncnn {
#if NCNN_BF16
#include "convolution_bf16s.h"
#include "convolution_sgemm_bf16s.h"
#include "convolution_winograd_transform_bf16s.h"
#include "convolution_1x1_bf16s.h"
#endif // NCNN_BF16

Expand All @@ -56,6 +58,7 @@ namespace ncnn {
#include "convolution_sgemm_pack4.h"
#include "convolution_sgemm_pack1to4.h"
#include "convolution_sgemm_pack4to1.h"
#include "convolution_winograd_transform_pack4.h"
#include "convolution_1x1_pack4.h"
#include "convolution_1x1_pack1to4.h"
#include "convolution_1x1_pack4to1.h"
Expand All @@ -72,6 +75,7 @@ namespace ncnn {
#include "convolution_sgemm_pack4_bf16s.h"
#include "convolution_sgemm_pack1to4_bf16s.h"
#include "convolution_sgemm_pack4to1_bf16s.h"
#include "convolution_winograd_transform_pack4_bf16s.h"
#include "convolution_1x1_pack4_bf16s.h"
#include "convolution_1x1_pack1to4_bf16s.h"
#include "convolution_1x1_pack4to1_bf16s.h"
Expand Down Expand Up @@ -115,6 +119,9 @@ namespace ncnn {
#include "convolution_sgemm_pack8_fp16s.h"
#include "convolution_sgemm_pack8to4_fp16s.h"
#include "convolution_sgemm_pack8to1_fp16s.h"
#include "convolution_winograd_transform_fp16s.h"
#include "convolution_winograd_transform_pack4_fp16s.h"
#include "convolution_winograd_transform_pack8_fp16s.h"
#include "convolution_1x1_fp16s.h"
#include "convolution_1x1_pack4_fp16s.h"
#include "convolution_1x1_pack1to4_fp16s.h"
Expand Down
125 changes: 125 additions & 0 deletions src/layer/arm/convolution_winograd_transform.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv3x3s1_winograd64_transform_output_neon(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt)
{
const int outw = top_blob.w;
const int outh = top_blob.h;
const int outch = top_blob.c;

const int w_tiles = outw / 6;
const int h_tiles = outh / 6;
const int tiles = w_tiles * h_tiles;

const float* biasptr = bias;

// const float otm[6][8] = {
// {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
// {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f},
// {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f}
// };

// 0 = r0 + (r1 + r2) + (r3 + r4) + (r5 + r6) * 32
// 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
// 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
// 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
// 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
// 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
const Mat out0_tm = top_blob_tm.channel(p);
Mat out0 = top_blob.channel(p);

const float bias0 = biasptr ? biasptr[p] : 0.f;

float tmp[6][8];

// tile
for (int i = 0; i < h_tiles; i++)
{
for (int j = 0; j < w_tiles; j++)
{
const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j);
const float* output0_tm_1 = output0_tm_0 + tiles * 1;
const float* output0_tm_2 = output0_tm_0 + tiles * 2;
const float* output0_tm_3 = output0_tm_0 + tiles * 3;
const float* output0_tm_4 = output0_tm_0 + tiles * 4;
const float* output0_tm_5 = output0_tm_0 + tiles * 5;
const float* output0_tm_6 = output0_tm_0 + tiles * 6;
const float* output0_tm_7 = output0_tm_0 + tiles * 7;

// TODO neon optimize
for (int m = 0; m < 8; m++)
{
float tmp024a = output0_tm_1[0] + output0_tm_2[0];
float tmp135a = output0_tm_1[0] - output0_tm_2[0];

float tmp024b = output0_tm_3[0] + output0_tm_4[0];
float tmp135b = output0_tm_3[0] - output0_tm_4[0];

float tmp024c = output0_tm_5[0] + output0_tm_6[0];
float tmp135c = output0_tm_5[0] - output0_tm_6[0];

tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;

tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
tmp[5][m] = output0_tm_7[0] + tmp135a + tmp135b * 32 + tmp135c;

output0_tm_0 += tiles * 8;
output0_tm_1 += tiles * 8;
output0_tm_2 += tiles * 8;
output0_tm_3 += tiles * 8;
output0_tm_4 += tiles * 8;
output0_tm_5 += tiles * 8;
output0_tm_6 += tiles * 8;
output0_tm_7 += tiles * 8;
}

float* output0 = out0.row(i * 6) + j * 6;

for (int m = 0; m < 6; m++)
{
const float* tmp0 = tmp[m];

float tmp024a = tmp0[1] + tmp0[2];
float tmp135a = tmp0[1] - tmp0[2];

float tmp024b = tmp0[3] + tmp0[4];
float tmp135b = tmp0[3] - tmp0[4];

float tmp024c = tmp0[5] + tmp0[6];
float tmp135c = tmp0[5] - tmp0[6];

output0[0] = bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32;
output0[2] = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
output0[4] = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;

output0[1] = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
output0[3] = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
output0[5] = bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c;

output0 += outw;
}
}
}
}
}
125 changes: 125 additions & 0 deletions src/layer/arm/convolution_winograd_transform_bf16s.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv3x3s1_winograd64_transform_output_bf16s_neon(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt)
{
const int outw = top_blob.w;
const int outh = top_blob.h;
const int outch = top_blob.c;

const int w_tiles = outw / 6;
const int h_tiles = outh / 6;
const int tiles = w_tiles * h_tiles;

const float* biasptr = bias;

// const float otm[6][8] = {
// {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
// {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f},
// {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f}
// };

// 0 = r0 + (r1 + r2) + (r3 + r4) + (r5 + r6) * 32
// 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
// 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
// 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
// 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
// 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
const Mat out0_tm = top_blob_tm.channel(p);
Mat out0 = top_blob.channel(p);

const float bias0 = biasptr ? biasptr[p] : 0.f;

float tmp[6][8];

// tile
for (int i = 0; i < h_tiles; i++)
{
for (int j = 0; j < w_tiles; j++)
{
const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j);
const float* output0_tm_1 = output0_tm_0 + tiles * 1;
const float* output0_tm_2 = output0_tm_0 + tiles * 2;
const float* output0_tm_3 = output0_tm_0 + tiles * 3;
const float* output0_tm_4 = output0_tm_0 + tiles * 4;
const float* output0_tm_5 = output0_tm_0 + tiles * 5;
const float* output0_tm_6 = output0_tm_0 + tiles * 6;
const float* output0_tm_7 = output0_tm_0 + tiles * 7;

// TODO neon optimize
for (int m = 0; m < 8; m++)
{
float tmp024a = output0_tm_1[0] + output0_tm_2[0];
float tmp135a = output0_tm_1[0] - output0_tm_2[0];

float tmp024b = output0_tm_3[0] + output0_tm_4[0];
float tmp135b = output0_tm_3[0] - output0_tm_4[0];

float tmp024c = output0_tm_5[0] + output0_tm_6[0];
float tmp135c = output0_tm_5[0] - output0_tm_6[0];

tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;

tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
tmp[5][m] = output0_tm_7[0] + tmp135a + tmp135b * 32 + tmp135c;

output0_tm_0 += tiles * 8;
output0_tm_1 += tiles * 8;
output0_tm_2 += tiles * 8;
output0_tm_3 += tiles * 8;
output0_tm_4 += tiles * 8;
output0_tm_5 += tiles * 8;
output0_tm_6 += tiles * 8;
output0_tm_7 += tiles * 8;
}

unsigned short* output0 = out0.row<unsigned short>(i * 6) + j * 6;

for (int m = 0; m < 6; m++)
{
const float* tmp0 = tmp[m];

float tmp024a = tmp0[1] + tmp0[2];
float tmp135a = tmp0[1] - tmp0[2];

float tmp024b = tmp0[3] + tmp0[4];
float tmp135b = tmp0[3] - tmp0[4];

float tmp024c = tmp0[5] + tmp0[6];
float tmp135c = tmp0[5] - tmp0[6];

output0[0] = float32_to_bfloat16(bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32);
output0[2] = float32_to_bfloat16(bias0 + tmp024a + tmp024b * 4 + tmp024c * 8);
output0[4] = float32_to_bfloat16(bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c);

output0[1] = float32_to_bfloat16(bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16);
output0[3] = float32_to_bfloat16(bias0 + tmp135a + tmp135b * 8 + tmp135c * 4);
output0[5] = float32_to_bfloat16(bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c);

output0 += outw;
}
}
}
}
}
Loading

0 comments on commit 9298d05

Please sign in to comment.