Skip to content

Commit 0499944

Browse files
JustinKuliopenshift-merge-robot
authored andcommitted
Update API discovery cache even when incomplete
Previously, if there were any APIs having issues on the cluster, no new API information could be used by the controller. Now, partial results can still be updated. Additionally, API discovery errors during startup will not prevent the controller from functioning at all. Refs: - https://issues.redhat.com/browse/ACM-5491 Signed-off-by: Justin Kulikauskas <[email protected]>
1 parent 9b8a98f commit 0499944

File tree

3 files changed

+126
-68
lines changed

3 files changed

+126
-68
lines changed

controllers/configurationpolicy_controller.go

Lines changed: 53 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -178,13 +178,16 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(
178178
}
179179

180180
start := time.Now()
181-
policiesList := policyv1.ConfigurationPolicyList{}
182181

183182
var skipLoop bool
184-
var discoveryErr error
185183

186184
if len(r.apiResourceList) == 0 || len(r.apiGroups) == 0 {
187-
discoveryErr = r.refreshDiscoveryInfo()
185+
discoveryErr := r.refreshDiscoveryInfo()
186+
187+
// If there was an error and no API information was received, then skip the loop.
188+
if discoveryErr != nil && (len(r.apiResourceList) == 0 || len(r.apiGroups) == 0) {
189+
skipLoop = true
190+
}
188191
}
189192

190193
// If it's been more than 10 minutes since the last refresh, then refresh the discovery info, but ignore
@@ -202,49 +205,45 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(
202205
skipLoop = true
203206
}
204207

205-
if !skipLoop && (discoveryErr == nil || cleanupImmediately) {
208+
if cleanupImmediately || !skipLoop {
209+
policiesList := policyv1.ConfigurationPolicyList{}
210+
206211
// This retrieves the policies from the controller-runtime cache populated by the watch.
207212
err := r.List(context.TODO(), &policiesList)
208213
if err != nil {
209214
log.Error(err, "Failed to list the ConfigurationPolicy objects to evaluate")
215+
} else {
216+
// This is done every loop cycle since the channel needs to be variable in size to
217+
// account for the number of policies changing.
218+
policyQueue := make(chan *policyv1.ConfigurationPolicy, len(policiesList.Items))
219+
var wg sync.WaitGroup
210220

211-
skipLoop = true
212-
}
213-
} else {
214-
skipLoop = true
215-
}
216-
217-
// This is done every loop cycle since the channel needs to be variable in size to account for the number of
218-
// policies changing.
219-
policyQueue := make(chan *policyv1.ConfigurationPolicy, len(policiesList.Items))
220-
var wg sync.WaitGroup
221+
log.Info("Processing the policies", "count", len(policiesList.Items))
221222

222-
if !skipLoop {
223-
log.Info("Processing the policies", "count", len(policiesList.Items))
223+
// Initialize the related object map
224+
policyRelatedObjectMap = sync.Map{}
224225

225-
// Initialize the related object map
226-
policyRelatedObjectMap = sync.Map{}
226+
for i := 0; i < int(r.EvaluationConcurrency); i++ {
227+
wg.Add(1)
227228

228-
for i := 0; i < int(r.EvaluationConcurrency); i++ {
229-
wg.Add(1)
229+
go r.handlePolicyWorker(policyQueue, &wg)
230+
}
230231

231-
go r.handlePolicyWorker(policyQueue, &wg)
232-
}
232+
for i := range policiesList.Items {
233+
policy := policiesList.Items[i]
234+
if !shouldEvaluatePolicy(&policy, cleanupImmediately) {
235+
continue
236+
}
233237

234-
for i := range policiesList.Items {
235-
policy := policiesList.Items[i]
236-
if !shouldEvaluatePolicy(&policy, cleanupImmediately) {
237-
continue
238+
// handle each template in each policy
239+
policyQueue <- &policy
238240
}
239241

240-
// handle each template in each policy
241-
policyQueue <- &policy
242+
close(policyQueue)
243+
wg.Wait()
242244
}
243245
}
244246

245-
close(policyQueue)
246-
wg.Wait()
247-
248247
// Update the related object metric after policy processing
249248
if r.EnableMetrics {
250249
updateRelatedObjectMetric()
@@ -288,35 +287,45 @@ func (r *ConfigurationPolicyReconciler) handlePolicyWorker(
288287
}
289288
}
290289

290+
// refreshDiscoveryInfo tries to discover all the available APIs on the cluster, and update the
291+
// cached information. If it encounters an error, it may update the cache with partial results,
292+
// if those seem "better" than what's in the current cache.
291293
func (r *ConfigurationPolicyReconciler) refreshDiscoveryInfo() error {
292294
log.V(2).Info("Refreshing the discovery info")
293295
r.lock.Lock()
294296
defer func() { r.lock.Unlock() }()
295297

296298
dd := r.TargetK8sClient.Discovery()
297299

298-
_, apiResourceList, err := dd.ServerGroupsAndResources()
299-
if err != nil {
300-
log.Error(err, "Could not get the API resource list")
301-
302-
return err
300+
_, apiResourceList, resourceErr := dd.ServerGroupsAndResources()
301+
if resourceErr != nil {
302+
log.Error(resourceErr, "Could not get the full API resource list")
303303
}
304304

305-
r.apiResourceList = apiResourceList
305+
if resourceErr == nil || (len(apiResourceList) > len(r.discoveryInfo.apiResourceList)) {
306+
// update the list if it's complete, or if it's "better" than the old one
307+
r.discoveryInfo.apiResourceList = apiResourceList
308+
}
306309

307-
apiGroups, err := restmapper.GetAPIGroupResources(dd)
308-
if err != nil {
309-
log.Error(err, "Could not get the API groups list")
310+
apiGroups, groupErr := restmapper.GetAPIGroupResources(dd)
311+
if groupErr != nil {
312+
log.Error(groupErr, "Could not get the full API groups list")
313+
}
310314

311-
return err
315+
if (resourceErr == nil && groupErr == nil) || (len(apiGroups) > len(r.discoveryInfo.apiGroups)) {
316+
// update the list if it's complete, or if it's "better" than the old one
317+
r.discoveryInfo.apiGroups = apiGroups
312318
}
313319

314-
r.apiGroups = apiGroups
315320
// Reset the OpenAPI cache in case the CRDs were updated since the last fetch.
316321
r.openAPIParser = openapi.NewOpenAPIParser(dd)
317-
r.discoveryLastRefreshed = time.Now().UTC()
322+
r.discoveryInfo.discoveryLastRefreshed = time.Now().UTC()
318323

319-
return nil
324+
if resourceErr != nil {
325+
return resourceErr
326+
}
327+
328+
return groupErr // can be nil
320329
}
321330

322331
// shouldEvaluatePolicy will determine if the policy is ready for evaluation by examining the

test/e2e/case18_discovery_refresh_test.go

Lines changed: 60 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,39 +11,41 @@ import (
1111
. "github.com/onsi/gomega"
1212
k8serrors "k8s.io/apimachinery/pkg/api/errors"
1313
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
"k8s.io/apimachinery/pkg/runtime/schema"
1415

1516
"open-cluster-management.io/config-policy-controller/test/utils"
1617
)
1718

18-
const (
19-
case18PolicyName = "policy-c18"
20-
case18Policy = "../resources/case18_discovery_refresh/policy.yaml"
21-
case18PolicyTemplateName = "policy-c18-template"
22-
case18PolicyTemplatePreReqs = "../resources/case18_discovery_refresh/prereqs-for-template-policy.yaml"
23-
case18PolicyTemplate = "../resources/case18_discovery_refresh/policy-template.yaml"
24-
case18ConfigMapName = "c18-configmap"
25-
)
19+
var _ = Describe("Test discovery info refresh", Ordered, func() {
20+
const (
21+
policyName = "policy-c18"
22+
policyYaml = "../resources/case18_discovery_refresh/policy.yaml"
23+
policyTemplateName = "policy-c18-template"
24+
policyTemplatePreReqs = "../resources/case18_discovery_refresh/prereqs-for-template-policy.yaml"
25+
policyTemplateYaml = "../resources/case18_discovery_refresh/policy-template.yaml"
26+
configMapName = "c18-configmap"
27+
badAPIServiceYaml = "../resources/case18_discovery_refresh/bad-apiservice.yaml"
28+
)
2629

27-
var _ = Describe("Test discovery info refresh", func() {
2830
It("Verifies that the discovery info is refreshed after a CRD is installed", func() {
29-
By("Creating " + case18PolicyName + " on managed")
30-
utils.Kubectl("apply", "-f", case18Policy, "-n", testNamespace)
31+
By("Creating " + policyName + " on managed")
32+
utils.Kubectl("apply", "-f", policyYaml, "-n", testNamespace)
3133
policy := utils.GetWithTimeout(
3234
clientManagedDynamic,
3335
gvrConfigPolicy,
34-
case18PolicyName,
36+
policyName,
3537
testNamespace,
3638
true,
3739
defaultTimeoutSeconds,
3840
)
3941
Expect(policy).NotTo(BeNil())
4042

41-
By("Verifying " + case18PolicyName + " becomes compliant")
43+
By("Verifying " + policyName + " becomes compliant")
4244
Eventually(func() interface{} {
4345
policy := utils.GetWithTimeout(
4446
clientManagedDynamic,
4547
gvrConfigPolicy,
46-
case18PolicyName,
48+
policyName,
4749
testNamespace,
4850
true,
4951
defaultTimeoutSeconds,
@@ -54,12 +56,33 @@ var _ = Describe("Test discovery info refresh", func() {
5456
})
5557

5658
It("Verifies that the discovery info is refreshed when a template references a new object kind", func() {
59+
By("Adding a non-functional API service")
60+
utils.Kubectl("apply", "-f", badAPIServiceYaml)
61+
62+
By("Checking that the API causes an error during API discovery")
63+
Eventually(
64+
func() error {
65+
cmd := exec.Command("kubectl", "api-resources")
66+
67+
err := cmd.Start()
68+
if err != nil {
69+
return nil // Just retry. If this consistently has an error, the test should fail.
70+
}
71+
72+
err = cmd.Wait()
73+
74+
return err
75+
},
76+
defaultTimeoutSeconds,
77+
1,
78+
).ShouldNot(BeNil())
79+
5780
By("Creating the prerequisites on managed")
5881
// This needs to be wrapped in an eventually since the object can't be created immediately after the CRD
5982
// is created.
6083
Eventually(
6184
func() interface{} {
62-
cmd := exec.Command("kubectl", "apply", "-f", case18PolicyTemplatePreReqs)
85+
cmd := exec.Command("kubectl", "apply", "-f", policyTemplatePreReqs)
6386

6487
err := cmd.Start()
6588
if err != nil {
@@ -74,24 +97,24 @@ var _ = Describe("Test discovery info refresh", func() {
7497
1,
7598
).Should(BeNil())
7699

77-
By("Creating " + case18PolicyTemplateName + " on managed")
78-
utils.Kubectl("apply", "-f", case18PolicyTemplate, "-n", testNamespace)
100+
By("Creating " + policyTemplateName + " on managed")
101+
utils.Kubectl("apply", "-f", policyTemplateYaml, "-n", testNamespace)
79102
policy := utils.GetWithTimeout(
80103
clientManagedDynamic,
81104
gvrConfigPolicy,
82-
case18PolicyTemplateName,
105+
policyTemplateName,
83106
testNamespace,
84107
true,
85108
defaultTimeoutSeconds,
86109
)
87110
Expect(policy).NotTo(BeNil())
88111

89-
By("Verifying " + case18PolicyTemplateName + " becomes compliant")
112+
By("Verifying " + policyTemplateName + " becomes compliant")
90113
Eventually(func() interface{} {
91114
policy := utils.GetWithTimeout(
92115
clientManagedDynamic,
93116
gvrConfigPolicy,
94-
case18PolicyTemplateName,
117+
policyTemplateName,
95118
testNamespace,
96119
true,
97120
defaultTimeoutSeconds,
@@ -101,9 +124,9 @@ var _ = Describe("Test discovery info refresh", func() {
101124
}, defaultTimeoutSeconds, 1).Should(Equal("Compliant"))
102125
})
103126

104-
It("Cleans up", func() {
127+
AfterAll(func() {
105128
err := clientManagedDynamic.Resource(gvrConfigPolicy).Namespace(testNamespace).Delete(
106-
context.TODO(), case18PolicyName, metav1.DeleteOptions{},
129+
context.TODO(), policyName, metav1.DeleteOptions{},
107130
)
108131
if !k8serrors.IsNotFound(err) {
109132
Expect(err).ToNot(HaveOccurred())
@@ -117,7 +140,7 @@ var _ = Describe("Test discovery info refresh", func() {
117140
}
118141

119142
err = clientManagedDynamic.Resource(gvrConfigPolicy).Namespace(testNamespace).Delete(
120-
context.TODO(), case18PolicyTemplateName, metav1.DeleteOptions{},
143+
context.TODO(), policyTemplateName, metav1.DeleteOptions{},
121144
)
122145
if !k8serrors.IsNotFound(err) {
123146
Expect(err).ToNot(HaveOccurred())
@@ -131,7 +154,20 @@ var _ = Describe("Test discovery info refresh", func() {
131154
}
132155

133156
err = clientManaged.CoreV1().ConfigMaps("default").Delete(
134-
context.TODO(), case18ConfigMapName, metav1.DeleteOptions{},
157+
context.TODO(), configMapName, metav1.DeleteOptions{},
158+
)
159+
if !k8serrors.IsNotFound(err) {
160+
Expect(err).ToNot(HaveOccurred())
161+
}
162+
163+
gvrAPIService := schema.GroupVersionResource{
164+
Group: "apiregistration.k8s.io",
165+
Version: "v1",
166+
Resource: "apiservices",
167+
}
168+
169+
err = clientManagedDynamic.Resource(gvrAPIService).Delete(
170+
context.TODO(), "v1beta1.pizza.example.com", metav1.DeleteOptions{},
135171
)
136172
if !k8serrors.IsNotFound(err) {
137173
Expect(err).ToNot(HaveOccurred())
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
apiVersion: apiregistration.k8s.io/v1
2+
kind: APIService
3+
metadata:
4+
name: v1beta1.pizza.example.com
5+
spec:
6+
group: pizza.example.com
7+
groupPriorityMinimum: 100
8+
insecureSkipTLSVerify: true
9+
service:
10+
name: pizza-server
11+
namespace: kube-system
12+
version: v1beta1
13+
versionPriority: 100

0 commit comments

Comments
 (0)