[CPU][ACL] LPT transformations are enabled + FQ decomposition #28981

Status: Open

alvoron wants to merge 31 commits into master from alvoron_fq_decomposition_arm

Commits (31, changes from all commits)
4b05b5c - added lowp_gemm_acl to MM impl priority list (alvoron, Jan 24, 2025)
5ae4c4e - added LPT MatMulTransformation test for arm (alvoron, Feb 7, 2025)
3091bff - add u8 test case and skip it (alvoron, Feb 7, 2025)
2765a27 - Merge branch 'master' into alvoron_matmut_int8 (alvoron, Feb 10, 2025)
c8ae024 - disable fq fusing (alvoron, Feb 11, 2025)
5b2f1df - address comments (alvoron, Feb 12, 2025)
696c3db - remove lowp_gemm_acl (alvoron, Feb 12, 2025)
5d2fc38 - enable lpt on arm and fq decomposition (alvoron, Feb 12, 2025)
93b2582 - disable MatMulTransformation on ARM (alvoron, Feb 13, 2025)
4531f55 - remove 3rd party changes (alvoron, Feb 13, 2025)
abe2bcb - Enable MatMulTransformation against FC nodes only (alvoron, Feb 13, 2025)
64956de - disable lpt (alvoron, Feb 13, 2025)
18d780c - skip all MatMulTransformation tests (alvoron, Feb 13, 2025)
159653c - fq decomposition (alvoron, Feb 13, 2025)
82c9814 - Merge branch 'master' into alvoron_fq_decomposition_arm (alvoron, Feb 14, 2025)
0eaeda4 - Update transformation_pipeline.cpp (alvoron, Feb 14, 2025)
c9809be - adjust tests threshold on arm (alvoron, Feb 14, 2025)
39ce30d - Merge branch 'master' into alvoron_fq_decomposition_arm (alvoron, Feb 15, 2025)
6c8f3c8 - adjust threshold and fix clang (alvoron, Feb 15, 2025)
979ce24 - Merge branch 'master' into alvoron_fq_decomposition_arm (alvoron, Feb 18, 2025)
fbe7849 - implemented MergeEltwiseAndConvert and disabled snippets on int8 (alvoron, Feb 18, 2025)
5963734 - fix MergeEltwiseAndConvert (alvoron, Feb 18, 2025)
3075684 - address comments (alvoron, Feb 19, 2025)
3222eb5 - fix clang and InitGraphStatefulDiffPrimitiveModel test (alvoron, Feb 19, 2025)
36698da - Merge branch 'master' into alvoron_fq_decomposition_arm (alvoron, Feb 19, 2025)
e1293b1 - fix arm32 test (alvoron, Feb 19, 2025)
9d18773 - add arm 32 bit defines (alvoron, Feb 20, 2025)
b82c950 - replace ov::snippets::pass::FakeQuantizeDecomposition with ov::pass::… (alvoron, Feb 26, 2025)
2385139 - remove excessive thresholds (alvoron, Feb 26, 2025)
d1b76b8 - revert ConvertSaturation support (alvoron, Feb 26, 2025)
0c9aedb - split lpt manager into common and arm (alvoron, Feb 27, 2025)
src/plugins/intel_cpu/src/config.h (2 changes: 1 addition & 1 deletion)

@@ -89,7 +89,7 @@ struct Config {
     bool enableNodeSplit = false;
     bool enableHyperThreading = true;
     bool changedHyperThreading = false;
-#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
+#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64)
     LPTransformsMode lpTransformsMode = LPTransformsMode::On;
 #else
     // Currently INT8 mode is not optimized on ARM / RISCV or other non-x86 platforms, fallback to FP32 mode.
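The effect of this one-line change is that AArch64 builds now default to running the low-precision pipeline. A minimal sketch of how the flag would typically be consumed (a hypothetical call site, not the plugin's exact code; `config` and `defaultPrecisions` are assumed to be in scope):

    // Sketch only: gating LPT on the config default changed above.
    const bool useLpt = config.lpTransformsMode == Config::LPTransformsMode::On;
    if (useLpt) {
        transformations.Lpt(defaultPrecisions);  // now reachable on ARM64 builds
    }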
src/plugins/intel_cpu/src/graph_optimizer.cpp (63 changes: 59 additions & 4 deletions)

@@ -77,8 +77,8 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph& graph) {
     FuseMultiplyAndAdd(graph);
     graph.RemoveDroppedNodes();

-    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "MergeConvertAndScaleShift");
-    MergeConvertAndScaleShift(graph);
+    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "MergeConvertAndEltwise");
+    MergeConvertAndEltwise(graph);
     graph.RemoveDroppedNodes();

     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndConvertOnWeights");
@@ -161,6 +161,10 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph& graph) {
     FuseEltwiseAndSimple(graph);
     graph.RemoveDroppedNodes();

+    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "MergeEltwiseAndConvert");
+    MergeEltwiseAndConvert(graph);
+    graph.RemoveDroppedNodes();
+
     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "reshapeRnnSeq");
     reshapeRnnSeq(graph);
     graph.RemoveDroppedNodes();
@@ -678,12 +682,63 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph& graph) {
     }
 }

-void GraphOptimizer::MergeConvertAndScaleShift(Graph& graph) {
+void GraphOptimizer::MergeEltwiseAndConvert(Graph& graph) {
+// The pass is required on arm platforms only
+#if !defined(OPENVINO_ARCH_ARM64)
+    return;
+#endif
+    auto& graphNodes = graph.GetNodes();
+
+    auto parent = graphNodes.begin();
+    while (parent != graphNodes.end()) {
+        CPU_GRAPH_OPTIMIZER_SCOPE(MergeEltwiseAndConvert);
+        auto parentNode = *parent;
+        if (parentNode->getType() != Type::Eltwise) {
+            parent++;
+            continue;
+        }
+
+        const auto& childEdges = parentNode->getChildEdges();
+        if (childEdges.size() != 1) {
+            parent++;
+            continue;
+        }
+
+        const auto edge = childEdges[0].lock();
+        auto childNode = edge->getChild();
+        if (childNode->getType() != Type::Convert) {
+            parent++;
+            continue;
+        }
+
+        if (!one_of(childNode->getOriginalOutputPrecisionAtPort(0),
+                    ov::element::i8,
+                    ov::element::u8,
+                    ov::element::f16,
+                    ov::element::bf16,
+                    ov::element::f32)) {
+            parent++;
+            continue;
+        }
+
+        auto fusedOps = parentNode->getFusedWith();
+        if (!fusedOps.empty()) {
+            fusedOps[fusedOps.size() - 1]->setOriginalOutputPrecisionAtPort(
+                0,
+                childNode->getOriginalOutputPrecisionAtPort(0));
+        }
+        parentNode->setOriginalOutputPrecisionAtPort(0, childNode->getOriginalOutputPrecisionAtPort(0));
+        parentNode->addOriginalLayer(childNode->getOriginalLayers());
+        graph.DropNode(childNode);
+    }
+}
+
+void GraphOptimizer::MergeConvertAndEltwise(Graph& graph) {
     auto& graphNodes = graph.GetNodes();

     auto parent = graphNodes.begin();
     while (parent != graphNodes.end()) {
-        CPU_GRAPH_OPTIMIZER_SCOPE(MergeConvertAndScaleShift);
+        CPU_GRAPH_OPTIMIZER_SCOPE(MergeConvertAndEltwise);
         auto parentNode = *parent;
         if (parentNode->getType() != Type::Convert) {
             parent++;
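For context, the new MergeEltwiseAndConvert pass targets an Eltwise node whose single consumer is a Convert: the Convert is dropped and its destination precision is written onto the Eltwise (or onto the last op fused into it), so the conversion happens inside the eltwise kernel on ARM64. A minimal model exhibiting the pattern, as a sketch using the public OpenVINO API (the helper name is ours):

    #include "openvino/openvino.hpp"

    // Sketch: builds Add(f32) -> Convert(i8), the shape of subgraph that
    // MergeEltwiseAndConvert collapses into a single Eltwise with i8 output.
    std::shared_ptr<ov::Model> make_eltwise_convert_pattern() {
        auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, 16});
        auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, 16});
        auto add = std::make_shared<ov::op::v1::Add>(a, b);  // becomes an Eltwise node in the CPU graph
        auto cvt = std::make_shared<ov::op::v0::Convert>(add, ov::element::i8);  // its only consumer
        return std::make_shared<ov::Model>(ov::OutputVector{cvt}, ov::ParameterVector{a, b});
    }

After the pass runs, no standalone Convert node remains, which is exactly what the InitGraphStatefulDiffPrimitiveModel test below starts to expect on ARM64.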
src/plugins/intel_cpu/src/graph_optimizer.h (3 changes: 2 additions & 1 deletion)

@@ -23,7 +23,8 @@ class GraphOptimizer {
     void FuseConvolutionMatMulDeconvAndBias(Graph& graph);
     void FuseDeconvolutionAndSimpleOperation(Graph& graph);
     void FuseMultiplyAndAdd(Graph& graph);
-    void MergeConvertAndScaleShift(Graph& graph);
+    void MergeEltwiseAndConvert(Graph& graph);
+    void MergeConvertAndEltwise(Graph& graph);
     void FuseFCAndConvertOnWeights(Graph& graph);
     void FuseFCAndTransposeOnWeights(Graph& graph);
     void FuseFullyConnectedAndSimpleOperation(Graph& graph);
src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp

@@ -114,15 +114,24 @@

 // LPT transformations
 #include "low_precision/add.hpp"
+#include "low_precision/avg_pool.hpp"
 #include "low_precision/convert_subtract_constant.hpp"
 #include "low_precision/convolution_backprop_data.hpp"
 #include "low_precision/fold_convert.hpp"
 #include "low_precision/fuse_convert.hpp"
 #include "low_precision/group_convolution.hpp"
+#include "low_precision/interpolate.hpp"
 #include "low_precision/mat_mul.hpp"
+#include "low_precision/max_pool.hpp"
 #include "low_precision/multiply_to_group_convolution.hpp"
+#include "low_precision/mvn.hpp"
 #include "low_precision/network_helper.hpp"
+#include "low_precision/normalize_l2.hpp"
 #include "low_precision/recurrent_cell.hpp"
+#include "low_precision/reduce_max.hpp"
+#include "low_precision/reduce_mean.hpp"
+#include "low_precision/reduce_min.hpp"
+#include "low_precision/reduce_sum.hpp"
 #include "low_precision/rt_info/bias_attribute.hpp"
 #include "transformations/low_precision/mark_dequantization_subgraph.hpp"
@@ -158,6 +167,7 @@
 #include "snippets/pass/explicit_transpose_matmul_inputs.hpp"
 #include "snippets/pass/extract_reshapes_from_mha.hpp"
 #include "snippets/pass/fc_tokenization.hpp"
+#include "snippets/pass/fq_decomposition.hpp"
 #include "snippets/pass/mha_tokenization.hpp"
 #include "snippets/pass/split_dimension_m.hpp"
 #include "snippets/pass/tokenization.hpp"
@@ -421,7 +431,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions) {
     if (config.inferencePrecision == ov::element::f16) {
         precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}};
 #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
-        type_to_fuse_map fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}};
+        type_to_fuse_map fuse_map = {};
 #else
         type_to_fuse_map fuse_map = {{ov::op::PagedAttentionExtension::get_type_info_static(), fuse_type_to_pa}};
 #endif
@@ -749,12 +759,59 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions) {
     manager.run_passes(model);
 }

-void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
-    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Lpt);
-
+void Transformations::runLptPasses(const std::vector<ov::element::Type>& defaultPrecisions) {
     using namespace ov::pass::low_precision;
-    CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
-    OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");
+    ov::pass::Manager lptManager("CPU:LPT");
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    auto supportedPrecisions = std::vector<PrecisionsRestriction>({
+        PrecisionsRestriction::create<ov::opset1::MatMul>(
+            {{{0}, {ov::element::i8}}, {{1}, {ov::element::i8}}}),
+    });
+
+    auto quantizationRestrictions = std::vector<QuantizationGranularityRestriction>();
+
+    CPU_REGISTER_PASS_COMMON(lptManager,
+                             LowPrecision,
+                             supportedPrecisions,
+                             quantizationRestrictions,
+                             LayerTransformation::Params(true, ov::element::f32, defaultPrecisions));
+    CPU_DISABLE_PASS_COMMON(lptManager, AvgPoolTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ConvolutionTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ConvolutionBackpropDataTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, InterpolateTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, GroupConvolutionTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, MaxPoolTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, MVNTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, NormalizeL2Transformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, RecurrentCellTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceMaxTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceMeanTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceMinTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceSumTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, MultiplyToGroupConvolutionTransformation);
+
+    CPU_SET_CALLBACK_COMMON(
+        lptManager,
+        [](const_node_ptr& node) -> bool {
+            return ov::marked_as_bias(node);
+        },
+        AddTransformation);
+
+    // Enable MatMulTransformation against FC nodes only
+    // int8 MatMul is disabled because acl_lowp_matmul_t supports 2D case only
+    // most models have 3D/4D cases, so fallback to jit_gemm_i8 gives worse perf than gemm_acl_f16
+    // oneDNN ticket #2696
+    CPU_SET_CALLBACK_COMMON(
+        lptManager,
+        [&](const_node_ptr& node) -> bool {
+            if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) &&
+                one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) {
+                return false;
+            }
+            return true;
+        },
+        MatMulTransformation);
+#else
     // Only enable conv/group conv signed input on AMX and avx2_vnni_2 platform.
     std::vector<ov::element::Type> input0LowPrecisionList;
     if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) ||
@@ -792,7 +849,6 @@ void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
         {QuantizationGranularityRestriction::create<ov::opset1::Convolution>({0}),
          QuantizationGranularityRestriction::create<ov::opset1::ConvolutionBackpropData>({0})});

-    ov::pass::Manager lptManager("CPU:LPT");
     CPU_REGISTER_PASS_COMMON(lptManager,
                              LowPrecision,
                              supportedPrecisions,
@@ -841,27 +897,20 @@ void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
         },
         FuseConvertTransformation);

-    // Enable MatMulTransformation against FC nodes only
-    // int8 MatMul is disabled because acl_lowp_matmul_t supports 2D case only
-    // most models have 3D/4D cases, so fallback to jit_gemm_i8 gives worse perf than gemm_acl_f16
-    // oneDNN ticket #2696
-    CPU_SET_CALLBACK_ARM(
-        lptManager,
-        [&](const_node_ptr& node) -> bool {
-            if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) &&
-                one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) {
-                return false;
-            }
-            return true;
-        },
-        MatMulTransformation);
-
-    CPU_DISABLE_PASS_ARM(lptManager, RecurrentCellTransformation);
-    CPU_DISABLE_PASS_COMMON(lptManager, MultiplyToGroupConvolutionTransformation);
-
+#endif
     lptManager.run_passes(model);
 }

+void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
+    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Lpt);
+
+    CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
+    OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");
+
+    runLptPasses(defaultPrecisions);
+}
+
 void Transformations::PostLpt() {
     CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, PostLpt);
@@ -1305,7 +1354,7 @@ void Transformations::PostSnippets(void) {
     ov::pass::Manager postSnippetsManager("CPU:PostSnippets");
     postSnippetsManager.set_per_pass_validation(false);
     CPU_REGISTER_PASS_COMMON(postSnippetsManager, ov::pass::FakeQuantizeDecomposition);
-    CPU_SET_CALLBACK_COMMON(
+    CPU_SET_CALLBACK_X64(
         postSnippetsManager,
         [](const_node_ptr& node) -> bool {
             std::string errMsg;
@@ -1325,7 +1374,19 @@ void Transformations::Snippets(void) {
     }

     CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Snippets);
+    // Disable MainSnippets for int8 models on arm platforms
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    using namespace ov::pass::low_precision;
+    static const std::set<levels>& supported_fq_levels = {levels::int4,
+                                                          levels::int4_narrow_range,
+                                                          levels::int8,
+                                                          levels::int8_narrow_range};
+    if (!LowPrecision::isFunctionQuantized(model, supported_fq_levels)) {
+        MainSnippets();
+    }
+#else
     MainSnippets();
+#endif
     PostSnippets();
 }
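The ARM-specific MatMul callback above is the core of the LPT enabling: int8 MatMul is kept only when it will be executed as FullyConnected. A standalone restatement of that predicate, as a sketch (the function name is ours, and the real code uses NetworkHelper::isConstantPath, which accepts more than a direct Constant):

    #include "openvino/openvino.hpp"

    // Sketch: mirrors the MatMulTransformation callback. Returning true means
    // "skip LPT for this node"; int8 is kept only for constant 2D/3D weights,
    // i.e. the cases the CPU plugin maps onto FullyConnected (acl_lowp_matmul_t
    // supports the 2D case only, see oneDNN ticket #2696).
    bool skip_matmul_lpt(const std::shared_ptr<const ov::Node>& node) {
        const auto weights = node->get_input_node_shared_ptr(1);
        const auto rank = node->input_value(1).get_partial_shape().rank();
        const bool fc_like = ov::is_type<ov::op::v0::Constant>(weights) &&  // simplified constant-path check
                             rank.is_static() && (rank.get_length() == 2 || rank.get_length() == 3);
        return !fc_like;
    }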
src/plugins/intel_cpu/src/transformations/transformation_pipeline.h

@@ -38,6 +38,7 @@ class Transformations {
     void PreLpt(const std::vector<ov::element::Type>& defaultPrecisions);

     void Lpt(const std::vector<ov::element::Type>& defaultPrecisions);
+    void runLptPasses(const std::vector<ov::element::Type>& defaultPrecisions);

     void MainSnippets(void);
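The companion change in Transformations::Snippets() shown earlier keeps snippets tokenization away from quantized models on ARM, leaving them to LPT plus the ov::pass::FakeQuantizeDecomposition registered in PostSnippets. The same gate in isolation, as a sketch (the wrapper name is ours):

    #include <memory>
    #include <set>
    #include "low_precision/low_precision.hpp"

    // Sketch: run MainSnippets only if the model carries no FakeQuantize ops
    // at the supported int4/int8 levels.
    bool should_run_main_snippets(const std::shared_ptr<const ov::Model>& model) {
        using namespace ov::pass::low_precision;
        static const std::set<levels> supported_fq_levels = {levels::int4,
                                                             levels::int4_narrow_range,
                                                             levels::int8,
                                                             levels::int8_narrow_range};
        return !LowPrecision::isFunctionQuantized(model, supported_fq_levels);
    }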
@@ -217,7 +217,6 @@ class InitGraphStatefulDiffPrimitiveModel : public InitGraphStatefulModelBase {

         configuration.insert({"SNIPPETS_MODE", "DISABLE"});

-        bool directPair;
         std::tie(inputShapes, directPair) = this->GetParam();

         init_input_shapes(inputShapes);
@@ -250,12 +249,24 @@ class InitGraphStatefulDiffPrimitiveModel : public InitGraphStatefulModelBase {
     }

     void check_init_graph_node() override {
+#if defined(OPENVINO_ARCH_ARM64)
+        // Convert node is fused into Eltwise on arm platforms
+        if (directPair) {
+            CheckNumberOfNodesWithType(compiledModel, "Convert", 0);
+        } else {
+            CheckNumberOfNodesWithType(compiledModel, "Convert", 1);
+        }
+#else
         CheckNumberOfNodesWithType(compiledModel, "Convert", 1);
+#endif
     }

     ov::Shape get_state_shape(size_t i) override {
         return inputShapes[0].second[i];
     }

+private:
+    bool directPair;
 };

 TEST_P(InitGraphStatefulDiffPrimitiveModel, CompareWithRefs) {