
Commit 9659769

Fixing memory leak in KDD watchers
When we create a cluster without our CRDs, or remove one from a running cluster, the syncer starts retrying the watch on that non-existent resource. Each pass through that retry loop triggers a "resync", which destroys and then recreates all of the old watchers. That churn kicks off the leak, which stems from somewhere in client-go and could be caused by memory fragmentation from rapidly creating and destroying the watches and their underlying channels. We now only close out the watchers that actually needed a resync, which prevents us from restarting watches that never needed to be stopped.
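The heart of the change shows up in the syncer.go diff below: instead of calling syn.closeAllWatchers() on every resync, the loop now stops only the watchers whose resource is flagged in the needsResync map, and it clears each flag when the corresponding watcher is recreated. The following is a minimal, self-contained Go sketch of that pattern, not the real syncer code: fakeWatcher, resyncLoopOnce, and the plain string keys are illustrative stand-ins for the client-go watch.Interface values the syncer keeps keyed by its KEY_* constants.

// Minimal sketch of the selective-close pattern; all names here are
// illustrative stand-ins, not the real kubeSyncer types.
package main

import "fmt"

type fakeWatcher struct {
	key     string
	stopped bool
}

func (w *fakeWatcher) Stop() { w.stopped = true }

// resyncLoopOnce models one pass of the read loop: stop only the watchers
// whose resource was flagged for resync, then (re)create any watcher that is
// missing and clear its flag.
func resyncLoopOnce(open map[string]*fakeWatcher, needsResync map[string]bool, keys []string) {
	// Find out if anything at all needs a resync.
	needSync := false
	for _, resync := range needsResync {
		if resync {
			needSync = true
			break
		}
	}

	if needSync {
		// Close out only the watches that needed resync; healthy watchers
		// are left running, which is the point of the fix.
		for k, resync := range needsResync {
			if w, exists := open[k]; exists && resync {
				w.Stop()
				delete(open, k)
			}
		}
	}

	// Recreate whatever is not currently open and clear its resync flag.
	for _, k := range keys {
		if _, exists := open[k]; !exists {
			open[k] = &fakeWatcher{key: k}
			needsResync[k] = false
		}
	}
}

func main() {
	keys := []string{"namespace", "pod", "networkpolicy"}
	open := map[string]*fakeWatcher{}
	needsResync := map[string]bool{}

	// First pass creates every watcher.
	resyncLoopOnce(open, needsResync, keys)

	// Simulate a watch error on the pod resource only.
	oldPodWatcher := open["pod"]
	needsResync["pod"] = true
	resyncLoopOnce(open, needsResync, keys)

	fmt.Println("old pod watcher stopped:", oldPodWatcher.stopped) // true
	fmt.Println("watchers still open:", len(open))                 // 3
}

Running the sketch stops only the old pod watcher while the other watchers stay open, mirroring the updated test expectation of one extra watch call per simulated error.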
1 parent 2fe92ae commit 9659769

2 files changed: 45 additions & 18 deletions

lib/backend/k8s/syncer.go

Lines changed: 26 additions & 8 deletions
@@ -308,8 +308,19 @@ func (syn *kubeSyncer) readFromKubernetesAPI() {
 
 	log.Info("Starting Kubernetes API read loop")
 	for {
+		needSync := false
+
+		// Find out if we need to resync.
+		for _, resync := range syn.needsResync {
+			// We found something that needs resync, we can stop and move on.
+			if resync {
+				needSync = true
+				break
+			}
+		}
+
 		// If we need to resync, do so.
-		if len(syn.needsResync) != 0 {
+		if needSync {
 			// Set status to ResyncInProgress.
 			log.Debugf("Resync required - latest versions: %+v", latestVersions)
 			syn.callbacks.OnStatusUpdated(api.ResyncInProgress)
@@ -336,9 +347,12 @@ func (syn *kubeSyncer) readFromKubernetesAPI() {
 				return
 			}
 
-			// Close the previous crop of watchers to avoid leaking resources when we
-			// recreate them below.
-			syn.closeAllWatchers()
+			// Close out any watches that needed resync.
+			for k, resync := range syn.needsResync {
+				if _, exists := syn.openWatchers[k]; exists && resync {
+					syn.closeWatcher(k)
+				}
+			}
 		}
 
 		// Create the Kubernetes API watchers.
@@ -353,6 +367,7 @@ func (syn *kubeSyncer) readFromKubernetesAPI() {
 			}
 			syn.openWatchers[KEY_NS] = nsWatch
 			nsChan = nsWatch.ResultChan()
+			syn.needsResync[KEY_NS] = false
 		}
 
 		if _, exists := syn.openWatchers[KEY_PO]; !exists {
@@ -366,6 +381,7 @@ func (syn *kubeSyncer) readFromKubernetesAPI() {
 			}
 			syn.openWatchers[KEY_PO] = poWatch
 			poChan = poWatch.ResultChan()
+			syn.needsResync[KEY_PO] = false
 		}
 
 		if _, exists := syn.openWatchers[KEY_NP]; !exists {
@@ -380,6 +396,7 @@ func (syn *kubeSyncer) readFromKubernetesAPI() {
 			}
 			syn.openWatchers[KEY_NP] = npWatch
 			npChan = npWatch.ResultChan()
+			syn.needsResync[KEY_NP] = false
 		}
 
 		if _, exists := syn.openWatchers[KEY_GNP]; !exists {
@@ -394,6 +411,7 @@ func (syn *kubeSyncer) readFromKubernetesAPI() {
 			}
 			syn.openWatchers[KEY_GNP] = gnpWatch
 			gnpChan = gnpWatch.ResultChan()
+			syn.needsResync[KEY_GNP] = false
 		}
 
 		if _, exists := syn.openWatchers[KEY_GC]; !exists {
@@ -408,6 +426,7 @@ func (syn *kubeSyncer) readFromKubernetesAPI() {
 			}
 			syn.openWatchers[KEY_GC] = globalFelixConfigWatch
 			gcChan = globalFelixConfigWatch.ResultChan()
+			syn.needsResync[KEY_GC] = false
 		}
 
 		if _, exists := syn.openWatchers[KEY_IP]; !exists {
@@ -422,6 +441,7 @@ func (syn *kubeSyncer) readFromKubernetesAPI() {
 			}
 			syn.openWatchers[KEY_IP] = ipPoolWatch
 			poolChan = ipPoolWatch.ResultChan()
+			syn.needsResync[KEY_IP] = false
 		}
 
 		if _, exists := syn.openWatchers[KEY_NO]; !exists && !syn.disableNodePoll {
@@ -436,12 +456,10 @@ func (syn *kubeSyncer) readFromKubernetesAPI() {
 			}
 			syn.openWatchers[KEY_NO] = nodeWatch
 			noChan = nodeWatch.ResultChan()
+			syn.needsResync[KEY_NO] = false
+			syn.needsResync[KEY_HC] = false
 		}
 
-		// We resynced if we needed to, and have a complete set of watchers, so reset the
-		// needsResync flag.
-		syn.needsResync = map[string]bool{}
-
 		// Select on the various watch channels.
 		select {
 		case <-syn.stopChan:

lib/backend/k8s/syncer_test.go

Lines changed: 19 additions & 10 deletions
@@ -323,19 +323,19 @@ var _ = Describe("Test Syncer", func() {
 			// Simulate error on pod watch.
 			tc.podC <- watch.Event{Type: watch.Error, Object: nil}
 			// Expect a single new list call, but that each watcher is restarted.
-			Eventually(tc.getNumWatchCalls).Should(BeNumerically("==", WATCH_CALLS+7))
+			Eventually(tc.getNumWatchCalls).Should(BeNumerically("==", WATCH_CALLS+1))
 			Expect(tc.getNumListCalls()).To(BeNumerically("==", LIST_CALLS+1))
 
 			// Simulate error on IP Pool watch.
 			tc.poolC <- watch.Event{Type: watch.Error, Object: nil}
 			// Expect a single new list call, but that each watcher is restarted.
-			Eventually(tc.getNumWatchCalls).Should(BeNumerically("==", WATCH_CALLS+14))
+			Eventually(tc.getNumWatchCalls).Should(BeNumerically("==", WATCH_CALLS+2))
 			Expect(tc.getNumListCalls()).To(BeNumerically("==", LIST_CALLS+2))
 
 			// Simulate empty event on IP Pool watch (resourceVersion too old for TPRs)
 			tc.poolC <- watch.Event{Object: nil}
 			// Expect a single new list call, but that each watcher is restarted.
-			Eventually(tc.getNumWatchCalls).Should(BeNumerically("==", WATCH_CALLS+21))
+			Eventually(tc.getNumWatchCalls).Should(BeNumerically("==", WATCH_CALLS+3))
 			Expect(tc.getNumListCalls()).To(BeNumerically("==", LIST_CALLS+3))
 		})
 
@@ -383,15 +383,24 @@ var _ = Describe("Test Syncer", func() {
 			// Check that, after the resync, the old watchers are stopped.
 			tc.stateMutex.Lock()
 			defer tc.stateMutex.Unlock()
-			// We expect 7 old watchers and 7 new. If that changes, we'll assert here
+			// We expect 7 old watchers and 1 new. If that changes, we'll assert here
 			// so the maintainer can re-check the test still matches the logic.
-			Expect(tc.openWatchers).To(HaveLen(14))
-			for _, w := range tc.openWatchers[:len(tc.openWatchers)/2] {
-				w.stopMutex.Lock()
-				stopped := w.stopped
-				w.stopMutex.Unlock()
-				Expect(stopped).To(BeTrue())
+			Expect(tc.openWatchers).To(HaveLen(8))
+
+			// Check and verify the old pod watcher was closed, make sure we ignore the
+			// newest pod watch that was added by only iterating over the old watchers.
+			closed := false
+			for _, w := range tc.openWatchers[:len(tc.openWatchers)-1] {
+				if w.name == "pod" {
+					w.stopMutex.Lock()
+					stopped := w.stopped
+					w.stopMutex.Unlock()
+					Expect(stopped).To(BeTrue())
+					closed = true
+				}
 			}
+			// If for some reason we never found a pod watch we should fail.
+			Expect(closed).To(BeTrue())
 		})
 	})
 })
