split convolution winograd transform input output (#3688)

Tencent · Apr 10, 2022 · 9298d05 · 9298d05
1 parent 32560f4
commit 9298d05
Show file tree

Hide file tree

Showing 34 changed files with 5,955 additions and 7,073 deletions.
diff --git a/src/layer/arm/convolution_3x3_pack4.h b/src/layer/arm/convolution_3x3_pack4.h
diff --git a/src/layer/arm/convolution_3x3_pack4_bf16s.h b/src/layer/arm/convolution_3x3_pack4_bf16s.h
diff --git a/src/layer/arm/convolution_3x3_pack4_fp16s.h b/src/layer/arm/convolution_3x3_pack4_fp16s.h
diff --git a/src/layer/arm/convolution_3x3_pack4to1.h b/src/layer/arm/convolution_3x3_pack4to1.h
diff --git a/src/layer/arm/convolution_3x3_pack4to1_bf16s.h b/src/layer/arm/convolution_3x3_pack4to1_bf16s.h
diff --git a/src/layer/arm/convolution_3x3_pack8_fp16s.h b/src/layer/arm/convolution_3x3_pack8_fp16s.h
diff --git a/src/layer/arm/convolution_3x3_pack8to1_fp16s.h b/src/layer/arm/convolution_3x3_pack8to1_fp16s.h
diff --git a/src/layer/arm/convolution_3x3_pack8to4_fp16s.h b/src/layer/arm/convolution_3x3_pack8to4_fp16s.h
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
@@ -28,6 +28,7 @@
 namespace ncnn {
 
 #include "convolution_sgemm.h"
+#include "convolution_winograd_transform.h"
 
 #include "convolution_1x1.h"
 #include "convolution_2x2.h"
@@ -39,6 +40,7 @@ namespace ncnn {
 #if NCNN_BF16
 #include "convolution_bf16s.h"
 #include "convolution_sgemm_bf16s.h"
+#include "convolution_winograd_transform_bf16s.h"
 #include "convolution_1x1_bf16s.h"
 #endif // NCNN_BF16
 
@@ -56,6 +58,7 @@ namespace ncnn {
 #include "convolution_sgemm_pack4.h"
 #include "convolution_sgemm_pack1to4.h"
 #include "convolution_sgemm_pack4to1.h"
+#include "convolution_winograd_transform_pack4.h"
 #include "convolution_1x1_pack4.h"
 #include "convolution_1x1_pack1to4.h"
 #include "convolution_1x1_pack4to1.h"
@@ -72,6 +75,7 @@ namespace ncnn {
 #include "convolution_sgemm_pack4_bf16s.h"
 #include "convolution_sgemm_pack1to4_bf16s.h"
 #include "convolution_sgemm_pack4to1_bf16s.h"
+#include "convolution_winograd_transform_pack4_bf16s.h"
 #include "convolution_1x1_pack4_bf16s.h"
 #include "convolution_1x1_pack1to4_bf16s.h"
 #include "convolution_1x1_pack4to1_bf16s.h"
@@ -115,6 +119,9 @@ namespace ncnn {
 #include "convolution_sgemm_pack8_fp16s.h"
 #include "convolution_sgemm_pack8to4_fp16s.h"
 #include "convolution_sgemm_pack8to1_fp16s.h"
+#include "convolution_winograd_transform_fp16s.h"
+#include "convolution_winograd_transform_pack4_fp16s.h"
+#include "convolution_winograd_transform_pack8_fp16s.h"
 #include "convolution_1x1_fp16s.h"
 #include "convolution_1x1_pack4_fp16s.h"
 #include "convolution_1x1_pack1to4_fp16s.h"

diff --git a/src/layer/arm/convolution_winograd_transform.h b/src/layer/arm/convolution_winograd_transform.h
@@ -0,0 +1,125 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv3x3s1_winograd64_transform_output_neon(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt)
+{ // Output transform of the winograd64 3x3s1 path: every group of 64 floats read from top_blob_tm becomes one 6x6 tile of top_blob, with the channel bias added.
+    const int outw = top_blob.w; // all sizes are taken from the destination blob
+    const int outh = top_blob.h;
+    const int outch = top_blob.c;
+
+    const int w_tiles = outw / 6; // integer division: only whole 6-wide tiles are visited
+    const int h_tiles = outh / 6; // NOTE(review): presumably the caller sizes top_blob to a multiple of 6 -- confirm
+    const int tiles = w_tiles * h_tiles; // tiles per channel; also used as the stride between consecutive coefficients of one tile
+
+    const float* biasptr = bias; // treated as optional: a null pointer means no bias (see bias0 below)
+
+    // const float otm[6][8] = {
+    //     {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
+    //     {0.0f,  1.0f,  -1.0f,   2.0f,  -2.0f,  16.0f,-16.0f, 0.0f},
+    //     {0.0f,  1.0f,   1.0f,   4.0f,   4.0f,   8.0f,  8.0f, 0.0f},
+    //     {0.0f,  1.0f,  -1.0f,   8.0f,  -8.0f,   4.0f, -4.0f, 0.0f},
+    //     {0.0f,  1.0f,   1.0f,  16.0f,  16.0f,   2.0f,  2.0f, 0.0f},
+    //     {0.0f,  1.0f,  -1.0f,  32.0f, -32.0f,   1.0f, -1.0f, 1.0f}
+    // };
+
+    // 0 = r0 + (r1 + r2) + (r3 + r4)     + (r5 + r6) * 32
+    // 1 =      (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
+    // 2 =      (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
+    // 3 =      (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
+    // 4 =      (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
+    // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++) // each iteration only touches channel p of both blobs
+    {
+        const Mat out0_tm = top_blob_tm.channel(p);
+        Mat out0 = top_blob.channel(p);
+
+        const float bias0 = biasptr ? biasptr[p] : 0.f; // per-channel bias, 0 when no bias was supplied
+
+        float tmp[6][8]; // result of the first pass: 6 combined values for each of the 8 groups m
+
+        // visit every tile: i is the tile row, j is the tile column
+        for (int i = 0; i < h_tiles; i++)
+        {
+            for (int j = 0; j < w_tiles; j++)
+            {
+                const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); // coefficient 0 of this tile; coefficient k is read k * tiles floats further on
+                const float* output0_tm_1 = output0_tm_0 + tiles * 1;
+                const float* output0_tm_2 = output0_tm_0 + tiles * 2;
+                const float* output0_tm_3 = output0_tm_0 + tiles * 3;
+                const float* output0_tm_4 = output0_tm_0 + tiles * 4;
+                const float* output0_tm_5 = output0_tm_0 + tiles * 5;
+                const float* output0_tm_6 = output0_tm_0 + tiles * 6;
+                const float* output0_tm_7 = output0_tm_0 + tiles * 7;
+
+                // TODO neon optimize -- pass 1: for each of the 8 groups m, combine its 8 coefficients into 6 values using the formulas above
+                for (int m = 0; m < 8; m++)
+                {
+                    float tmp024a = output0_tm_1[0] + output0_tm_2[0]; // sums feed results 0, 2, 4
+                    float tmp135a = output0_tm_1[0] - output0_tm_2[0]; // differences feed results 1, 3, 5
+
+                    float tmp024b = output0_tm_3[0] + output0_tm_4[0];
+                    float tmp135b = output0_tm_3[0] - output0_tm_4[0];
+
+                    float tmp024c = output0_tm_5[0] + output0_tm_6[0];
+                    float tmp135c = output0_tm_5[0] - output0_tm_6[0];
+
+                    tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
+                    tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
+                    tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c; // tmp024c * 2 written as an addition
+
+                    tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16; // tmp135b * 2 written as an addition
+                    tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
+                    tmp[5][m] = output0_tm_7[0] + tmp135a + tmp135b * 32 + tmp135c;
+
+                    output0_tm_0 += tiles * 8; // step all eight pointers to the next group of 8 coefficients
+                    output0_tm_1 += tiles * 8;
+                    output0_tm_2 += tiles * 8;
+                    output0_tm_3 += tiles * 8;
+                    output0_tm_4 += tiles * 8;
+                    output0_tm_5 += tiles * 8;
+                    output0_tm_6 += tiles * 8;
+                    output0_tm_7 += tiles * 8;
+                }
+
+                float* output0 = out0.row(i * 6) + j * 6; // first element of the 6x6 destination tile
+
+                for (int m = 0; m < 6; m++) // pass 2: same formulas applied to each row of tmp, giving one output row of 6 values
+                {
+                    const float* tmp0 = tmp[m];
+
+                    float tmp024a = tmp0[1] + tmp0[2];
+                    float tmp135a = tmp0[1] - tmp0[2];
+
+                    float tmp024b = tmp0[3] + tmp0[4];
+                    float tmp135b = tmp0[3] - tmp0[4];
+
+                    float tmp024c = tmp0[5] + tmp0[6];
+                    float tmp135c = tmp0[5] - tmp0[6];
+
+                    output0[0] = bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32; // bias is added once, in this final pass only
+                    output0[2] = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
+                    output0[4] = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
+
+                    output0[1] = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
+                    output0[3] = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
+                    output0[5] = bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c;
+
+                    output0 += outw; // next output row; assumes rows of out0 are outw floats apart -- TODO confirm against Mat layout
+                }
+            }
+        }
+    }
+}
diff --git a/src/layer/arm/convolution_winograd_transform_bf16s.h b/src/layer/arm/convolution_winograd_transform_bf16s.h
@@ -0,0 +1,125 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv3x3s1_winograd64_transform_output_bf16s_neon(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt)
+{ // bf16 storage variant of the winograd64 3x3s1 output transform: reads floats from top_blob_tm, computes in float, and stores each 6x6 tile of top_blob as bfloat16 with the channel bias added.
+    const int outw = top_blob.w; // all sizes are taken from the destination blob
+    const int outh = top_blob.h;
+    const int outch = top_blob.c;
+
+    const int w_tiles = outw / 6; // integer division: only whole 6-wide tiles are visited
+    const int h_tiles = outh / 6; // NOTE(review): presumably the caller sizes top_blob to a multiple of 6 -- confirm
+    const int tiles = w_tiles * h_tiles; // tiles per channel; also used as the stride between consecutive coefficients of one tile
+
+    const float* biasptr = bias; // treated as optional: a null pointer means no bias (see bias0 below)
+
+    // const float otm[6][8] = {
+    //     {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
+    //     {0.0f,  1.0f,  -1.0f,   2.0f,  -2.0f,  16.0f,-16.0f, 0.0f},
+    //     {0.0f,  1.0f,   1.0f,   4.0f,   4.0f,   8.0f,  8.0f, 0.0f},
+    //     {0.0f,  1.0f,  -1.0f,   8.0f,  -8.0f,   4.0f, -4.0f, 0.0f},
+    //     {0.0f,  1.0f,   1.0f,  16.0f,  16.0f,   2.0f,  2.0f, 0.0f},
+    //     {0.0f,  1.0f,  -1.0f,  32.0f, -32.0f,   1.0f, -1.0f, 1.0f}
+    // };
+
+    // 0 = r0 + (r1 + r2) + (r3 + r4)     + (r5 + r6) * 32
+    // 1 =      (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
+    // 2 =      (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
+    // 3 =      (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
+    // 4 =      (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
+    // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++) // each iteration only touches channel p of both blobs
+    {
+        const Mat out0_tm = top_blob_tm.channel(p);
+        Mat out0 = top_blob.channel(p);
+
+        const float bias0 = biasptr ? biasptr[p] : 0.f; // per-channel bias, 0 when no bias was supplied
+
+        float tmp[6][8]; // result of the first pass, kept in float: 6 combined values for each of the 8 groups m
+
+        // visit every tile: i is the tile row, j is the tile column
+        for (int i = 0; i < h_tiles; i++)
+        {
+            for (int j = 0; j < w_tiles; j++)
+            {
+                const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); // input is read as float here; coefficient k is read k * tiles floats further on
+                const float* output0_tm_1 = output0_tm_0 + tiles * 1;
+                const float* output0_tm_2 = output0_tm_0 + tiles * 2;
+                const float* output0_tm_3 = output0_tm_0 + tiles * 3;
+                const float* output0_tm_4 = output0_tm_0 + tiles * 4;
+                const float* output0_tm_5 = output0_tm_0 + tiles * 5;
+                const float* output0_tm_6 = output0_tm_0 + tiles * 6;
+                const float* output0_tm_7 = output0_tm_0 + tiles * 7;
+
+                // TODO neon optimize -- pass 1: for each of the 8 groups m, combine its 8 coefficients into 6 values using the formulas above
+                for (int m = 0; m < 8; m++)
+                {
+                    float tmp024a = output0_tm_1[0] + output0_tm_2[0]; // sums feed results 0, 2, 4
+                    float tmp135a = output0_tm_1[0] - output0_tm_2[0]; // differences feed results 1, 3, 5
+
+                    float tmp024b = output0_tm_3[0] + output0_tm_4[0];
+                    float tmp135b = output0_tm_3[0] - output0_tm_4[0];
+
+                    float tmp024c = output0_tm_5[0] + output0_tm_6[0];
+                    float tmp135c = output0_tm_5[0] - output0_tm_6[0];
+
+                    tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
+                    tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
+                    tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c; // tmp024c * 2 written as an addition
+
+                    tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16; // tmp135b * 2 written as an addition
+                    tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
+                    tmp[5][m] = output0_tm_7[0] + tmp135a + tmp135b * 32 + tmp135c;
+
+                    output0_tm_0 += tiles * 8; // step all eight pointers to the next group of 8 coefficients
+                    output0_tm_1 += tiles * 8;
+                    output0_tm_2 += tiles * 8;
+                    output0_tm_3 += tiles * 8;
+                    output0_tm_4 += tiles * 8;
+                    output0_tm_5 += tiles * 8;
+                    output0_tm_6 += tiles * 8;
+                    output0_tm_7 += tiles * 8;
+                }
+
+                unsigned short* output0 = out0.row<unsigned short>(i * 6) + j * 6; // first element of the 6x6 destination tile, stored as 16-bit values
+
+                for (int m = 0; m < 6; m++) // pass 2: same formulas applied to each row of tmp, giving one output row of 6 values
+                {
+                    const float* tmp0 = tmp[m];
+
+                    float tmp024a = tmp0[1] + tmp0[2];
+                    float tmp135a = tmp0[1] - tmp0[2];
+
+                    float tmp024b = tmp0[3] + tmp0[4];
+                    float tmp135b = tmp0[3] - tmp0[4];
+
+                    float tmp024c = tmp0[5] + tmp0[6];
+                    float tmp135c = tmp0[5] - tmp0[6];
+
+                    output0[0] = float32_to_bfloat16(bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32); // bias is added in float, then the sum is converted once on store
+                    output0[2] = float32_to_bfloat16(bias0 + tmp024a + tmp024b * 4 + tmp024c * 8);
+                    output0[4] = float32_to_bfloat16(bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c);
+
+                    output0[1] = float32_to_bfloat16(bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16);
+                    output0[3] = float32_to_bfloat16(bias0 + tmp135a + tmp135b * 8 + tmp135c * 4);
+                    output0[5] = float32_to_bfloat16(bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c);
+
+                    output0 += outw; // next output row; assumes rows of out0 are outw elements apart -- TODO confirm against Mat layout
+                }
+            }
+        }
+    }
+}