feat: ca: do not backoff scale up on specified errors #7777

Open · wants to merge 1 commit into base: master
1 change: 1 addition & 0 deletions cluster-autoscaler/FAQ.md
@@ -1083,6 +1083,7 @@ The following startup parameters are supported for cluster autoscaler:
| `scale-down-unready-enabled` | Should CA scale down unready nodes of the cluster | true |
| `scale-down-unready-time` | How long an unready node should be unneeded before it is eligible for scale down | 20m0s |
| `scale-down-utilization-threshold` | The maximum value between the sum of cpu requests and sum of memory requests of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down | 0.5 |
| `scale-up-from-zero` | Should CA scale up when there are 0 ready nodes. | true |
| `scale-up-ignore-backoff-errors` | Cloud provider error message parts that should not trigger a scale-up backoff | |
| `scan-interval` | How often cluster is reevaluated for scale up or down | 10s |
| `scheduler-config-file` | scheduler-config allows changing configuration of in-tree scheduler plugins acting on PreFilter and Filter extension points | |
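With this flag set, an operator could run the autoscaler with, for example, `--scale-up-ignore-backoff-errors=QuotaExceeded` (a hypothetical message fragment) so that provider quota errors do not put the node group into backoff; since the flag can be used multiple times, several fragments can be supplied.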
2 changes: 2 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -144,6 +144,8 @@ type AutoscalingOptions struct {
	OkTotalUnreadyCount int
	// ScaleUpFromZero defines if CA should scale up when there are 0 ready nodes.
	ScaleUpFromZero bool
	// ScaleUpIgnoreBackoffErrors defines cloud provider error message parts that should not trigger a node group scale-up backoff.
	ScaleUpIgnoreBackoffErrors []string
	// ParallelScaleUp defines whether CA can scale up node groups in parallel.
	ParallelScaleUp bool
	// CloudConfig is the path to the cloud provider configuration file. Empty string for no configuration file.
19 changes: 16 additions & 3 deletions cluster-autoscaler/core/scaleup/orchestrator/executor.go
@@ -18,6 +18,7 @@ package orchestrator

import (
	"fmt"
	"strings"
	"sync"
	"time"

@@ -166,10 +167,22 @@ func (e *scaleUpExecutor) executeScaleUp(
"Scale-up: setting group %s size to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
increase := info.NewSize - info.CurrentSize
if err := e.increaseSize(info.Group, increase, atomic); err != nil {
e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
aerr := errors.ToAutoscalerError(errors.CloudProviderError, err).AddPrefix("failed to increase node group size: ")
e.scaleStateNotifier.RegisterFailedScaleUp(info.Group, string(aerr.Type()), aerr.Error(), gpuResourceName, gpuType, now)
return aerr

backoff := true
for _, part := range e.autoscalingContext.AutoscalingOptions.ScaleUpIgnoreBackoffErrors {
if strings.Contains(err.Error(), part) {
e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "ScaledUpGroup", "Scale-up: retriable error %s", aerr.Error())
Review comment (Contributor): would fit better into the else of `if backoff`.

				backoff = false
				break
			}
		}

		if backoff {
			e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
			e.scaleStateNotifier.RegisterFailedScaleUp(info.Group, string(aerr.Type()), aerr.Error(), gpuResourceName, gpuType, now)
Review comment (Contributor): FYI, we might want to still call this to get these bumped:

	csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime})
	metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType)

but not call csr.backoffNodeGroup. RegisterFailedScaleUp says "It will mark this group as not safe to autoscale", so maybe avoiding it completely is the right call, but I'm not sure how the metrics and failures are used.
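// A sketch of that suggestion with a hypothetical helper (nothing below
// exists in the PR): keep the failure record and metrics bump, but skip
// csr.backoffNodeGroup for ignored errors.
//
//	if backoff {
//		e.scaleStateNotifier.RegisterFailedScaleUp(info.Group, string(aerr.Type()), aerr.Error(), gpuResourceName, gpuType, now)
//	} else {
//		// hypothetical: records the ScaleUpFailure and calls metrics.RegisterFailedScaleUp,
//		// but never marks the group as unsafe to autoscale
//		e.scaleStateNotifier.RegisterRetriableScaleUpFailure(info.Group, string(aerr.Type()), aerr.Error(), gpuResourceName, gpuType, now)
//	}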

			return aerr
		}
	}
	if increase < 0 {
		return errors.NewAutoscalerError(errors.InternalError, fmt.Sprintf("increase in number of nodes cannot be negative, got: %v", increase))
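In short, the new handling still builds the AutoscalerError, but when the provider's error message contains any configured fragment it only emits a retriable-error event and skips RegisterFailedScaleUp, so the node group is not backed off. A minimal, self-contained sketch of that matching decision (the helper name and the fragments are illustrative, not the PR's API):

```go
package main

import (
	"errors"
	"fmt"
	"strings"
)

// shouldBackoff mirrors the decision added in executeScaleUp: back off
// unless the error message contains one of the configured fragments.
func shouldBackoff(err error, ignoreParts []string) bool {
	for _, part := range ignoreParts {
		if strings.Contains(err.Error(), part) {
			return false // treated as retriable: skip the backoff
		}
	}
	return true
}

func main() {
	// Hypothetical fragment, as if passed via --scale-up-ignore-backoff-errors.
	ignore := []string{"QuotaExceeded"}
	quotaErr := errors.New("failed to increase node group size: QuotaExceeded for cpus")
	otherErr := errors.New("failed to increase node group size: InternalError")
	fmt.Println(shouldBackoff(quotaErr, ignore)) // false: matching errors do not back off
	fmt.Println(shouldBackoff(otherErr, ignore)) // true: everything else backs off as before
}
```

Since the match is a plain strings.Contains, fragments should be specific enough not to swallow unrelated provider errors.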
18 changes: 11 additions & 7 deletions cluster-autoscaler/main.go
@@ -174,13 +174,16 @@ var (
	maxEmptyBulkDeleteFlag     = flag.Int("max-empty-bulk-delete", 10, "Maximum number of empty nodes that can be deleted at the same time. DEPRECATED: Use --max-scale-down-parallelism instead.")
	maxGracefulTerminationFlag = flag.Int("max-graceful-termination-sec", 10*60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node. "+
		"This flag is mutually exclusive with drain-priority-config flag which allows more configuration options.")
	maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
Review comment (Contributor): the whitespace change here is a bit sus and could result in merge conflicts; see if that can be avoided.

Reply (Author): weird, it doesn't show up on my local; will check it out later.

	okTotalUnreadyCount        = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
	scaleUpFromZero            = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
	scaleUpIgnoreBackoffErrors = multiStringFlag(
		"scale-up-ignore-backoff-errors",
		"Cloud provider error message parts that should not trigger a scale-up backoff. Can be used multiple times.")
	parallelScaleUp            = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
	maxNodeProvisionTime       = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
	maxPodEvictionTime         = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
	nodeGroupsFlag             = multiStringFlag(
		"nodes",
		"sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<other...>")
	nodeGroupAutoDiscoveryFlag = multiStringFlag(
@@ -367,6 +370,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
		MaxTotalUnreadyPercentage:  *maxTotalUnreadyPercentage,
		OkTotalUnreadyCount:        *okTotalUnreadyCount,
		ScaleUpFromZero:            *scaleUpFromZero,
		ScaleUpIgnoreBackoffErrors: *scaleUpIgnoreBackoffErrors,
		ParallelScaleUp:            *parallelScaleUp,
		EstimatorName:              *estimatorFlag,
		ExpanderNames:              *expanderFlag,
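The flag itself reuses CA's existing multiStringFlag helper, so it can be passed multiple times and each occurrence appends one fragment. A rough sketch of how such a repeatable flag behaves with the standard library (an assumed shape, not the actual helper from main.go):

```go
package main

import (
	"flag"
	"fmt"
)

// multiString is a flag.Value that accumulates repeated occurrences of a flag.
type multiString []string

func (m *multiString) String() string { return fmt.Sprint(*m) }
func (m *multiString) Set(v string) error {
	*m = append(*m, v) // each --scale-up-ignore-backoff-errors=... appends one fragment
	return nil
}

func main() {
	var ignore multiString
	flag.Var(&ignore, "scale-up-ignore-backoff-errors",
		"Cloud provider error message parts that should not trigger a scale-up backoff. Can be used multiple times.")
	flag.Parse()
	// e.g. go run . --scale-up-ignore-backoff-errors=QuotaExceeded --scale-up-ignore-backoff-errors=OutOfCapacity
	fmt.Println(ignore)
}
```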