Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Garbage collect NEGs in to-be-deleted state. #2750

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 49 additions & 11 deletions pkg/neg/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -548,16 +548,21 @@ func (manager *syncerManager) garbageCollectNEG() error {
return nil
}

type negDeletionCandidate struct {
svcNegCR *negv1beta1.ServiceNetworkEndpointGroup
isPartialDelete bool
}

// garbageCollectNEGWithCRD uses the NEG CRs and the svcPortMap to determine which NEGs
// need to be garbage collected. Neg CRs that do not have a configuration in the svcPortMap will deleted
// along with all corresponding NEGs in the CR's list of NetworkEndpointGroups. If NEG deletion fails in
// the cloud, the corresponding Neg CR will not be deleted
func (manager *syncerManager) garbageCollectNEGWithCRD() error {
deletionCandidates := map[string]*negv1beta1.ServiceNetworkEndpointGroup{}
deletionCandidates := make(map[string]negDeletionCandidate)
negCRs := manager.svcNegLister.List()
for _, obj := range negCRs {
neg := obj.(*negv1beta1.ServiceNetworkEndpointGroup)
deletionCandidates[neg.Name] = neg
deletionCandidates[neg.Name] = negDeletionCandidate{svcNegCR: neg, isPartialDelete: false}
}

func() {
Expand All @@ -572,6 +577,13 @@ func (manager *syncerManager) garbageCollectNEGWithCRD() error {
// neither the neg nor CR will not be deleted. In the situation a neg config is not in the svcPortMap,
// but the CR does not have a deletion timestamp, both CR and neg will be deleted.
if _, ok := deletionCandidates[portInfo.NegName]; ok {
// When the service still exists, we won't do GC unless it contains
// to-be-deleted NEGs.
svcNegCR := deletionCandidates[portInfo.NegName].svcNegCR
if flags.F.EnableMultiSubnetClusterPhase1 && containsTBDNeg(svcNegCR) {
deletionCandidates[portInfo.NegName] = negDeletionCandidate{svcNegCR: svcNegCR, isPartialDelete: true}
continue
}
delete(deletionCandidates, portInfo.NegName)
}
}
Expand All @@ -595,7 +607,7 @@ func (manager *syncerManager) garbageCollectNEGWithCRD() error {
errList = append(errList, fmt.Errorf("failed to get zones during garbage collection: %w", err))
}

deletionCandidatesChan := make(chan *negv1beta1.ServiceNetworkEndpointGroup, len(deletionCandidates))
deletionCandidatesChan := make(chan negDeletionCandidate, len(deletionCandidates))
for _, dc := range deletionCandidates {
deletionCandidatesChan <- dc
}
Expand All @@ -605,8 +617,8 @@ func (manager *syncerManager) garbageCollectNEGWithCRD() error {
wg.Add(len(deletionCandidates))
for i := 0; i < manager.numGCWorkers; i++ {
go func() {
for svcNegCR := range deletionCandidatesChan {
errs := manager.processNEGDeletionCandidate(svcNegCR, zones)
for negToDelete := range deletionCandidatesChan {
errs := manager.processNEGDeletionCandidate(negToDelete, zones)

errListMutex.Lock()
errList = append(errList, errs...)
Expand All @@ -621,11 +633,23 @@ func (manager *syncerManager) garbageCollectNEGWithCRD() error {
return utilerrors.NewAggregate(errList)
}

// containsTBDNeg returns if the given NEG CR contains NEG refs in
// to-be-deleted state.
func containsTBDNeg(svcNegCR *negv1beta1.ServiceNetworkEndpointGroup) bool {
contains := false
for _, negRef := range svcNegCR.Status.NetworkEndpointGroups {
contains = contains || negRef.State == negv1beta1.ToBeDeletedState
}
return contains
}

// processNEGDeletionCandidate attempts to delete `svcNegCR` and all NEGs
// associated with it. In case when `svcNegCR` does not have ample information
// about the zones associated with this NEG, it will attempt to delete the NEG
// from all zones specified through the `zones` slice.
func (manager *syncerManager) processNEGDeletionCandidate(svcNegCR *negv1beta1.ServiceNetworkEndpointGroup, zones []string) []error {
func (manager *syncerManager) processNEGDeletionCandidate(negToDelete negDeletionCandidate, zones []string) []error {
svcNegCR := negToDelete.svcNegCR

manager.logger.V(2).Info("Count of NEGs referenced by SvcNegCR", "svcneg", klog.KObj(svcNegCR), "count", len(svcNegCR.Status.NetworkEndpointGroups))
var errList []error
shouldDeleteNegCR := true
Expand All @@ -635,6 +659,11 @@ func (manager *syncerManager) processNEGDeletionCandidate(svcNegCR *negv1beta1.S
deletedNegs := make(map[negtypes.NegInfo]struct{})

for _, negRef := range svcNegCR.Status.NetworkEndpointGroups {
if flags.F.EnableMultiSubnetClusterPhase1 {
if negToDelete.isPartialDelete && negRef.State != negv1beta1.ToBeDeletedState {
continue
}
}
resourceID, err := cloud.ParseResourceURL(negRef.SelfLink)
if err != nil {
errList = append(errList, fmt.Errorf("failed to parse selflink for neg cr %s/%s: %s", svcNegCR.Namespace, svcNegCR.Name, err))
Expand All @@ -648,14 +677,23 @@ func (manager *syncerManager) processNEGDeletionCandidate(svcNegCR *negv1beta1.S
shouldDeleteNegCR = shouldDeleteNegCR && negDeleted
}

// If there is no NEG ref in the NEG CR, our best attempt is to get all existing subnets
// and delete NEGs in these subnets in a full service deletion.
// We will skip in the case of partial deletion. We cannot reconstruct the name of
// the NEG that needs to be deleted since that subnet is no longer available in
// Node Topology CR.
if deleteByZone {
manager.logger.V(2).Info("Deletion candidate has 0 NEG reference", "svcneg", klog.KObj(svcNegCR), "svcNegCR", svcNegCR)
for _, zone := range zones {
negDeleted := manager.deleteNegOrReportErr(svcNegCR.Name, zone, svcNegCR, &errList)
if negDeleted {
deletedNegs[negtypes.NegInfo{Name: svcNegCR.Name, Zone: zone}] = struct{}{}
// When EnableMultiSubnetClusterPhase1=false, we should always process deletion.
// Otherwise, we should only process full deletion.
if !flags.F.EnableMultiSubnetClusterPhase1 || !negToDelete.isPartialDelete {
for _, zone := range zones {
negDeleted := manager.deleteNegOrReportErr(svcNegCR.Name, zone, svcNegCR, &errList)
if negDeleted {
deletedNegs[negtypes.NegInfo{Name: svcNegCR.Name, Zone: zone}] = struct{}{}
}
shouldDeleteNegCR = shouldDeleteNegCR && negDeleted
}
shouldDeleteNegCR = shouldDeleteNegCR && negDeleted
}
}
// Since no more NEG deletion will be happening at this point, and NEG
Expand Down
Loading