[BYOC][ACL] Support add operation
Added support for the "add" operation implemented via ACL for fp32 and quantized uint8 data types.
d-smirnov committed Sep 28, 2020
1 parent ef50c0e commit cbfcbe0
Showing 3 changed files with 217 additions and 18 deletions.
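
The sketch below is not part of the diff; it is a minimal, hedged illustration of how the new offload could be exercised from Python, assuming a TVM build with the ACL codegen enabled and the existing partition_for_arm_compute_lib helper. The shape, variable names, and target string are illustrative only.

# Minimal sketch: offload a Relay "add" to the Arm Compute Library (ACL) codegen.
import tvm
from tvm import relay
from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib

# Build a small fp32 add graph.
shape = (2, 2)
a = relay.var("a", shape=shape, dtype="float32")
b = relay.var("b", shape=shape, dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([a, b], relay.add(a, b)))

# Annotate and partition the graph; "add" calls matching the registration added in
# this commit are routed to the target.arm_compute_lib external codegen.
mod = partition_for_arm_compute_lib(mod)

# Compile for an AArch64 target; the partitioned subgraph is executed by the ACL runtime.
target = "llvm -mtriple=aarch64-linux-gnu -mattr=+neon"
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target)
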
20 changes: 20 additions & 0 deletions python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -345,3 +345,23 @@ def maximum(attrs, args):
type_a = args[0].checked_type
type_b = args[1].checked_type
return (type_a.dtype == "float32") and (type_b.dtype == "float32")


@tvm.ir.register_op_attr("add", "target.arm_compute_lib")
def add(attrs, args):
"""Check if the external ACL codegen for add should be used."""
for typ in [args[0].checked_type, args[1].checked_type]:
if typ.dtype not in ["float32"]:
return False

return True


@tvm.ir.register_op_attr("qnn.add", "target.arm_compute_lib")
def qnn_add(attrs, args):
"""Check if the external ACL codegen for add should be used."""
for typ in [args[0].checked_type, args[1].checked_type]:
if typ.dtype not in ["uint8"]:
return False

return True
80 changes: 62 additions & 18 deletions src/runtime/contrib/arm_compute_lib/acl_runtime.cc
@@ -30,6 +30,7 @@

#ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB
#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
#include <arm_compute/runtime/NEON/functions/NEElementwiseOperations.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
@@ -140,8 +141,13 @@ class ACLRuntime : public JSONRuntimeBase {
CreateGlobalPoolingLayer(&layer_, node);
} else if ("reshape" == op_name) {
CreateReshapeLayer(&layer_, node);
    } else if ("maximum" == op_name) {
      CreateMaximumLayer(&layer_, node);
    } else if ("add" == op_name || "qnn.add" == op_name) {
      CreateAddLayer(&layer_, node);
} else {
LOG(FATAL) << "Unsupported op: " << op_name;
}
@@ -416,15 +422,54 @@ class ACLRuntime : public JSONRuntimeBase {
auto function = std::make_shared<arm_compute::NEElementwiseMax>();
function->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0]);
layer->function = function;
}

  /*!
   * \brief Create an add/qnn.add layer.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateAddLayer(CachedLayer* layer, const JSONGraphNode& node) {
auto op_name = node.GetOpName();
if ("add" == op_name) {
layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1]));
layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
} else if ("qnn.add" == op_name) {
layer->inputs.push_back(MakeACLTensorFromJSONEntry(
node.GetInputs()[0], &node.GetInputs()[2], &node.GetInputs()[3]));
layer->inputs.push_back(MakeACLTensorFromJSONEntry(
node.GetInputs()[1], &node.GetInputs()[4], &node.GetInputs()[5]));
layer->outputs.push_back(
MakeACLTensorFromJSONNode(node, &node.GetInputs()[6], &node.GetInputs()[7]));
} else {
LOG(FATAL) << "Unsupported op: " << op_name;
}

    /* Initialise the kernel's inputs, output and conversion policy:
     *   void NEArithmeticAddition::configure(ITensor* input1, ITensor* input2,
     *                                        ITensor* output, ConvertPolicy policy);
     * Supported data types for inputs and output: U8/QASYMM8/S16/F16/F32.
     * arm_compute::ConvertPolicy::SATURATE is used because the add_QASYMM8_QASYMM8_QASYMM8
     * kernel currently always saturates its result.
     */
auto f = std::make_shared<arm_compute::NEArithmeticAddition>();
f->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0],
arm_compute::ConvertPolicy::SATURATE);
layer->function = f;
}

/*! \brief Allow ACL functions to request auxiliary memory from TVM. */
ACLAllocator allocator_;
/*!
* \brief The network layers represented by acl functions.
* \note Currently only supports a single layer.
*/
CachedLayer layer_;
#else
void Run() override {
LOG(FATAL) << "Cannot call run on Arm Compute Library module without runtime enabled. "
@@ -436,19 +481,18 @@ class ACLRuntime : public JSONRuntimeBase {
<< "Please build with USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME.";
}
#endif
};

runtime::Module ACLRuntimeCreate(const String& symbol_name, const String& graph_json,
const Array<String>& const_names) {
auto n = make_object<ACLRuntime>(symbol_name, graph_json, const_names);
return runtime::Module(n);
}

TVM_REGISTER_GLOBAL("runtime.arm_compute_lib_runtime_create").set_body_typed(ACLRuntimeCreate);

TVM_REGISTER_GLOBAL("runtime.module.loadbinary_arm_compute_lib")
    .set_body_typed(JSONRuntimeBase::LoadFromBinary<ACLRuntime>);
} // namespace contrib
} // namespace runtime
} // namespace tvm
135 changes: 135 additions & 0 deletions tests/python/contrib/test_arm_compute_lib/test_add.py
@@ -0,0 +1,135 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Arm Compute Library integration reshape tests."""

import numpy as np

import tvm
import tvm.testing
from tvm import relay

from test_arm_compute_lib.infrastructure import (
skip_runtime_test,
skip_codegen_test,
build_and_run,
verify,
verify_codegen,
)
from test_arm_compute_lib.infrastructure import Device

_qnn_params = {
"lhs_scale": relay.const(0.0156863, "float32"),
"lhs_zero_point": relay.const(127, "int32"),
"rhs_scale": relay.const(0.0117647, "float32"),
"rhs_zero_point": relay.const(85, "int32"),
"output_scale": relay.const(0.0235294, "float32"),
"output_zero_point": relay.const(128, "int32"),
}


def _get_model(shape, dtype, var_names, op, op_params):
a = relay.var(next(var_names), shape=shape, dtype=dtype)
b = relay.var(next(var_names), shape=shape, dtype=dtype)
return op(a, b, **op_params)


def _get_expected_codegen(shape, dtype, op_name, qnn_params):
input_a = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
input_b = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
input_qnn = [
{
"op": "const",
"name": "",
"attrs": {
"shape": [[list(qnn_params[_].data.shape)]],
"dtype": [[qnn_params[_].data.dtype]],
},
}
for _ in qnn_params
]
inputs = [input_a, input_b, *input_qnn]
node = {
"op": "kernel",
"name": op_name,
"inputs": [[_, 0, 0] for _ in range(len(inputs))],
"attrs": {
"num_inputs": str(len(inputs)),
"num_outputs": "1",
"shape": [[list(shape)]],
"dtype": [[dtype]],
},
}

return [*inputs, node]


def test_runtime_add():
Device.load("test_config.json")

if skip_runtime_test():
return

device = Device()
np.random.seed(0)

for dtype, low, high, atol, rtol, op, op_params in [
("float32", -127, 128, 1e-7, 1e-7, relay.add, {}),
("uint8", 0, 255, 0.0, 1.0, relay.qnn.op.add, _qnn_params),
]:
shape = (2, 2)
for inputs in [
{
"a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
"b": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
}
]:
outputs = []
func = _get_model(shape, dtype, iter(inputs), op, op_params)
for acl in [True, False]:
outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0])

config = {
"shape": shape,
"dtype": dtype,
"inputs": inputs,
"operation": op,
"op_params": op_params,
}

# verify_saturation=False as the result of add_QASYMM8_QASYMM8_QASYMM8
# is always saturated currently.
verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=False)


def test_runtime_codegen_add():
if skip_codegen_test():
return

inputs = {"a", "b"}
for dtype, op_name, op, qnn_params in [
("float32", "add", relay.add, {}),
("uint8", "qnn.add", relay.qnn.op.add, _qnn_params),
]:
for shape in [(1, 1), (2, 2, 2), (3, 3, 3, 3)]:
func = _get_model(shape, dtype, iter(inputs), op, qnn_params)
exp_codegen = _get_expected_codegen(shape, dtype, op_name, qnn_params)
verify_codegen(func, exp_codegen, 1)


if __name__ == "__main__":
test_runtime_codegen_add()
test_runtime_add()
