Skip to content
Closed
Show file tree
Hide file tree
Changes from 184 commits
Commits
Show all changes
202 commits
Select commit Hold shift + click to select a range
07e9f65
Implemented the alpha version of the deletion integration solution.
Singularity23x0 Oct 23, 2025
6728e2b
Bump github.com/cert-manager/cert-manager from 1.19.0 to 1.19.1 (#7321)
dependabot[bot] Oct 20, 2025
71b8511
Deflake test (#7325)
pajakd Oct 20, 2025
636383f
[Bugfix] Allow to set ClusterName with ElasticJob (#7278)
mszadkow Oct 20, 2025
b9466d5
Bump kueueviz frontend dependencies. (#7335)
mbobrovskyi Oct 21, 2025
5d3187f
Bump cypress in /test/e2e/kueueviz in the all group (#7239)
dependabot[bot] Oct 21, 2025
0b08d3d
Bump node from 24-alpine to 25-alpine in /hack/depcheck (#7323)
dependabot[bot] Oct 21, 2025
9a97939
Bump node from 24-slim to 25-slim in /cmd/kueueviz/frontend (#7324)
dependabot[bot] Oct 21, 2025
d7bedee
E2e test for Node HotSwap in TAS with slices (#7142)
pajakd Oct 22, 2025
02c34b8
Enable cache in pod integration tests to fix failure with ManagedJob …
kannon92 Oct 23, 2025
b231d05
Fix MultiKueue workload re-evaluation bug (#6732)
ravisantoshgudimetla Oct 23, 2025
e05dd5e
Remove unnecessary error check. (#7352)
mbobrovskyi Oct 23, 2025
122318e
Use default cluster names. (#7353)
mbobrovskyi Oct 23, 2025
93e6b50
Enable conversion webhooks for v1beta2: LocalQueue, ClusterQueue, Wor…
mimowo Oct 23, 2025
5dc4f0a
Extend immutable error messages. (#7354)
mbobrovskyi Oct 23, 2025
4f9e8e4
chore: Use utiltesting context in DRA UTs (#7356)
tenzen-y Oct 23, 2025
26d469b
Update main after 0.13.7 (#7360)
mimowo Oct 23, 2025
071dd51
Update main with the latest v0.14.2 (#7359)
tenzen-y Oct 23, 2025
db6df11
Deprecate LocalQueueFlavorStatus for v1beta1 and v1beta2 (#7337)
iomarsayed Oct 24, 2025
5909cb3
Add TAS support to the Kubeflow Trainer integration (#7249)
kaisoz Oct 24, 2025
a416c2e
Add validation for unsupported DRA features (#7226)
harche Oct 24, 2025
adca57b
hotswap reschedule evicted (#7376)
pajakd Oct 24, 2025
ab74ebe
v1beta2: graduate Config API (#7375)
mbobrovskyi Oct 24, 2025
3e01953
Align imports for Kueue (#7378)
mimowo Oct 24, 2025
e0a733c
Remove workers from Pytorch e2e test. (#7381)
mbobrovskyi Oct 24, 2025
02af80c
Fix Should run a kubeflow PyTorchJob on worker if admitted e2e test. …
mbobrovskyi Oct 24, 2025
aa299d3
[Trainer] Use podset label to identify Kueue injected config (#7389)
kaisoz Oct 24, 2025
ab990ab
Expose contextualized fair sharing weights for cluster queues as metr…
j-skiba Oct 27, 2025
9a4452a
Bump e2e-test-images/agnhost from 2.57 to 2.59 in /hack/agnhost (#7399)
dependabot[bot] Oct 27, 2025
33ceb56
Use clock on preemption. (#7395)
mbobrovskyi Oct 27, 2025
c7ef18f
v1beta2-convert-logic-and-tests (#7369)
mimowo Oct 27, 2025
4d627e6
Split preemptions unit tests. (#7403)
mbobrovskyi Oct 27, 2025
72f0e69
Bump cypress/base from 22.20.0 to 22.21.0 in /hack/cypress (#7402)
dependabot[bot] Oct 27, 2025
ecc5785
update documentation to use v1beta2 (#7409)
kannon92 Oct 27, 2025
1b6910c
Helm: request conversion webhooks only for types requiring it (#7410)
mimowo Oct 27, 2025
b4973f7
replace cohort with cohortName for v1beta2 docs (#7412)
kannon92 Oct 27, 2025
7c1f588
add v1beta2 api gen docs (#7414)
kannon92 Oct 27, 2025
7b78828
formatting issue: add space after comments for apigeneration tags (#7…
kannon92 Oct 27, 2025
361f523
Replace preemption stub with interceptor function in TestPreemption. (…
mbobrovskyi Oct 28, 2025
1de8dba
Bump the all group in /cmd/kueueviz/frontend with 2 updates (#7405)
mbobrovskyi Oct 28, 2025
54091d1
Bump github.com/onsi/ginkgo/v2 from 2.26.0 to 2.27.1 (#7397)
dependabot[bot] Oct 28, 2025
87d985b
Deprecate QueueVisibility for v1beta2 (#7319)
bobsongplus Oct 28, 2025
5062544
doc: add Kueue configuration v1beta2 API document (#7417)
bobsongplus Oct 28, 2025
7491d35
Support mutating workload priority class. (#7289)
mbobrovskyi Oct 28, 2025
430f1db
enable ssa tags for kubernetes api linter (#7339)
kannon92 Oct 28, 2025
17a7ec3
increase topology limits to 16 to match topology updates (#7423)
kannon92 Oct 29, 2025
fddf1b7
Bump github.com/onsi/ginkgo/v2 in /hack/internal/tools (#7401)
dependabot[bot] Oct 29, 2025
b177dcc
Simplify JobSet ReclaimablePods integration (#7420)
PBundyra Oct 29, 2025
cfe148a
promote MultiKueueBatchJobWithManagedBy to beta (#7341)
kannon92 Oct 29, 2025
9bba991
Remove duplicate env variables in podSet template. (#7425)
mbobrovskyi Oct 30, 2025
a24556e
Fix multikueue/provisioning indexer conflict setup (#7432)
IrvingMg Oct 30, 2025
5891828
Fix SanitizePodSets feature gate version. (#7444)
mbobrovskyi Oct 30, 2025
d0338b2
MultiKueue remote client kubeconfig validation (#7439)
mszadkow Oct 30, 2025
d81e75b
services: update app.kuberntes.io/component for services (#7371)
rphillips Oct 30, 2025
7c4de98
Add License prefix for helm templates. (#7438)
mbobrovskyi Oct 30, 2025
cbc270b
Self-nominate IrvingMg as reviewer for internal tool yaml-processor (…
IrvingMg Oct 30, 2025
56bdbcc
Update main with the latest v0.14.3 (#7455)
mimowo Oct 30, 2025
b4edcb9
Add v0.13.8 Release note to CHANGELOG (#7458)
tenzen-y Oct 30, 2025
ab8b367
Replace preemption stub with interceptor function in TestHierarchicalP…
mbobrovskyi Oct 30, 2025
4f8172d
Drop graduated ManagedJobsNamespaceSelector feature gate. (#7466)
mbobrovskyi Oct 31, 2025
76a89e0
v1beta2: graduate the visibility API. (#7411)
mbobrovskyi Oct 31, 2025
8e4abe2
add support for maxlength linter command for kubernetes-api-linter (#…
kannon92 Oct 31, 2025
fc3c1c4
Fix feature gates tables. (#7467)
mbobrovskyi Oct 31, 2025
4ce6906
Sync feature gate tables. (#7475)
mbobrovskyi Oct 31, 2025
6121826
Drop graduated ProvisioningACC feature gate. (#7465)
mbobrovskyi Oct 31, 2025
0423898
docs(kep): Create delayed admission check retries KEP (#6210)
dhenkel92 Oct 31, 2025
8c973f5
v1beta2: drop types related to QueueVisibility (#7447)
mbobrovskyi Oct 31, 2025
83effda
v1beta2: Remove deprecated retryDelayMinutes field and fix conversion…
nerdeveloper Oct 31, 2025
8970bdb
v1beta2: drop deprecated Flavors field from LocalQueueStatus (#7449)
mbobrovskyi Oct 31, 2025
825c4e3
v1beta2: remove all unnecessary wrappers for v1beta1 (#7481)
mbobrovskyi Oct 31, 2025
5225f29
Replace preemption stub with interceptor function in TestSchedule. (#…
mbobrovskyi Oct 31, 2025
f483b4f
Extend kubeconfig validation tests (#7483)
mszadkow Oct 31, 2025
0827c45
Prevent StatefulSet scale-up while workload is being deleted (#7479)
IrvingMg Nov 3, 2025
d96094c
Promote AdmissionFairSharing to beta (#7463)
kannon92 Nov 3, 2025
cedc241
Replace preemption stub with interceptor function in TestLastScheduli…
mbobrovskyi Nov 3, 2025
ef12e72
Enable nomaps and nobools kube api linter (#7489)
kannon92 Nov 3, 2025
4aa6d05
Replace preemption stub with interceptor function in scheduler TAS un…
mbobrovskyi Nov 3, 2025
c7d7ef3
Completed the initial draft of Delete event refactor.
Singularity23x0 Nov 4, 2025
1cef4e8
Finished deletion refactor. Added unit tests.
Singularity23x0 Nov 5, 2025
c20b6ac
Remove remote client of insecurely setup cluster (#7486)
mszadkow Nov 3, 2025
5a83c45
Remove applyPreemption stub. (#7507)
mbobrovskyi Nov 3, 2025
07e442e
v1beta2: Remove deprecated PodIntegrationOptions API (#7406)
nerdeveloper Nov 4, 2025
f134700
Switch Default TAS Placement Algorithm from BestFit to Mixed. (#7416)
iomarsayed Nov 4, 2025
a1fb5ae
Cleanup jobframework log (#7426)
PBundyra Nov 4, 2025
05ba58b
Wrap with Eventually to avoid flake (#7523)
mszadkow Nov 4, 2025
b205f5d
Flaky sticky workload - fix (#7528)
mimowo Nov 4, 2025
08319fa
Use Equal instead of Equivalent for asserting Suspend (#7526)
mszadkow Nov 4, 2025
933a228
v1beta2: In FlavorFungibility API migrate Preempt/Borrow to MayStopSe…
mbobrovskyi Nov 5, 2025
490d40d
Graduate ManagedJobsNamespaceSelectorAlwaysRespected feature to Beta …
PannagaRao Nov 5, 2025
ff7236f
Add Multikueue and ProvReq integration test (#7505)
IrvingMg Nov 5, 2025
a806a48
Add feature gate for reclaimable Pods (#7525)
PBundyra Nov 5, 2025
0e7a10d
Cleanup preemption message generation (#7541)
mszadkow Nov 5, 2025
bd6e6fb
Use wrappers in cluster_queue_test.go. (#7543)
mbobrovskyi Nov 5, 2025
e3fe657
Finalizer implementation finalized.
Singularity23x0 Nov 7, 2025
a77639e
enable optional, required and optionalandrequired linter checks (#7488)
kannon92 Nov 5, 2025
1edfeaf
Add preemptor and preemptee path to the Preemption message (#7522)
mszadkow Nov 5, 2025
2544a73
Refactor DRA validation to use field.ErrorList (#7529)
harche Nov 5, 2025
daaa5c2
Remove redundant type conversions. (#7545)
mbobrovskyi Nov 6, 2025
2c0904f
Ensure roundtrip success for Quantities (#7430)
brejman Nov 6, 2025
62db2e8
Remove deprecated AdmissionChecks field from v1beta2 ClusterQueue API…
nerdeveloper Nov 6, 2025
50c1fb0
Use GomegaMatcher instead of OmegaMatcher. (#7552)
mbobrovskyi Nov 6, 2025
c759957
Use ExpectWorkloadsWithWorkloadPriority and ExpectWorkloadsWithPodPri…
mbobrovskyi Nov 6, 2025
f8df1e4
Fix test case to check creation workload with empty priorityClassName…
mbobrovskyi Nov 6, 2025
e1a8d18
Update wrappers to use utiltesting alias (#7561)
mszadkow Nov 6, 2025
fcf5ea3
Add CHANGELOG for v0.13.9 (#7562)
tenzen-y Nov 6, 2025
e8c47b8
Update integration tests to use utiltesting alias (#7563)
mszadkow Nov 6, 2025
22e249f
Update main with the latest v0.14.4 (#7559)
mimowo Nov 6, 2025
25f46f8
api/kueue/v1beta1: add unit tests for workload conversion (#7546)
sohankunkerkar Nov 6, 2025
1e9c2cd
test: Add conversion unit tests for LocalQueue and ClusterQueue (#7567)
sohankunkerkar Nov 6, 2025
89f6f27
Set default image in wrappers to agnhost (#7551)
mszadkow Nov 6, 2025
c00e728
Bump github.com/containerd/containerd in /hack/internal/tools (#7568)
dependabot[bot] Nov 6, 2025
ae76730
Use util.RealClock in tests. (#7574)
mbobrovskyi Nov 7, 2025
ec1b73d
enable linter via regular expressions (#7571)
kannon92 Nov 7, 2025
15a6257
[Cleanup] Update e2e tests to use utiltesting alias (#7564)
mszadkow Nov 7, 2025
3b81164
Fix - Workloads requesting TAS cannot run via MultiKueue (#5361)
IrvingMg Nov 7, 2025
e2ecd6a
Fixed DelayedTopologyRequestState enum validation. (#7573)
mbobrovskyi Nov 7, 2025
84b835c
Add documentation for Kubeflow Trainer v2 TrainJob integration with K…
NarayanaSabari Nov 7, 2025
71cd4e1
JobReconciler don't update PodsReady condition timely (#7364)
olderTaoist Nov 7, 2025
7d2f0ff
v1beta2: change the API for Workload's spec.priorityClassSource (#7540)
mbobrovskyi Nov 7, 2025
f513d78
Merge branch 'main' into 5310-reconciliation-logic
Singularity23x0 Nov 7, 2025
f8b1d9a
Update pkg/controller/core/workload_controller.go
Singularity23x0 Nov 17, 2025
d816f66
Update pkg/controller/core/workload_controller.go
Singularity23x0 Nov 17, 2025
cf8eebe
Update pkg/controller/core/workload_controller.go
Singularity23x0 Nov 17, 2025
488819b
Log levels cleanup.
Singularity23x0 Nov 17, 2025
61d5822
Refactor.
Singularity23x0 Nov 17, 2025
2759e65
Removed unused constant.
Singularity23x0 Nov 17, 2025
6e35455
Added value back for merge purposes.
Singularity23x0 Nov 17, 2025
d91bff0
Introduce workload.Finish helper function (#7582)
mszadkow Nov 7, 2025
9b7fa28
Bump Kubeflow Trainer to v2.1.0 (#7586)
IrvingMg Nov 7, 2025
c188f41
Bump cypress in /test/e2e/kueueviz in the all group (#7595)
dependabot[bot] Nov 10, 2025
c5ba474
Bump github.com/kubeflow/mpi-operator from 0.6.0 to 0.7.0 (#7593)
dependabot[bot] Nov 10, 2025
291ba2c
Bump cypress/base from 22.21.0 to 24.11.0 in /hack/cypress (#7596)
dependabot[bot] Nov 10, 2025
213b2d3
Bump github.com/onsi/ginkgo/v2 from 2.27.1 to 2.27.2 (#7590)
dependabot[bot] Nov 10, 2025
a2b16f1
Bump sigs.k8s.io/controller-runtime from 0.22.3 to 0.22.4 (#7591)
dependabot[bot] Nov 10, 2025
2201beb
Bump golang.org/x/sync from 0.17.0 to 0.18.0 (#7592)
dependabot[bot] Nov 10, 2025
6d01bac
Bump github.com/ray-project/kuberay/ray-operator from 1.4.2 to 1.5.0 …
dependabot[bot] Nov 10, 2025
abd37c9
Cleanup workload.Finish (#7588)
mszadkow Nov 12, 2025
a3bccf8
Bump the all group in /cmd/kueueviz/frontend with 4 updates (#7603)
mbobrovskyi Nov 12, 2025
5fd9a8f
Remove RuntimeInfo wrapper (#7607)
mszadkow Nov 12, 2025
0a438a9
Refactor Pending() and add PendingTotal(). (#7609)
mbobrovskyi Nov 12, 2025
d4c69c1
Merge PendingActiveInLocalQueue and PendingInadmissibleInLocalQueue. …
mbobrovskyi Nov 12, 2025
14772ff
Fix Scheduler when ClusterQueue head has inadmissible workload sticky…
mbobrovskyi Nov 12, 2025
0c08737
Remove support for Kubernetes v1.31. (#7623)
mbobrovskyi Nov 12, 2025
1888303
Cleanup logging for Job MultiKueue adapter (#7624)
mbobrovskyi Nov 12, 2025
76d8785
[TAS] Balanced placement (#6851)
pajakd Nov 13, 2025
6e502d4
Restrict logging of nominating with incremental dispatcher (#7619)
mszadkow Nov 13, 2025
1a59050
Make ExpectWorkloadsToBePreempted() more strict. (#7631)
mbobrovskyi Nov 13, 2025
8f3bd28
Add ginkgo.GinkgoHelper() where it was missed. (#7635)
mbobrovskyi Nov 13, 2025
cc0fff0
[KEP] FlavorFungability: replace FlavorFungibilityImplicitPreferenceD…
vladikkuzn Nov 13, 2025
780484e
Fix wait_for_images.sh for release candidates. (#7636)
mbobrovskyi Nov 13, 2025
88ff2d5
Add priorities to workload to make the test deterministic (#7630)
mszadkow Nov 13, 2025
05935b3
Remove offset if using ginkgo.GinkgoHelper(). (#7632)
mbobrovskyi Nov 13, 2025
1f9dde2
Use `constants.PodSetLabel` instead of `controllerconsts.PodSetLabel`…
kshalot Nov 13, 2025
a341567
fix: fix typo in docs (#7648)
kennygt51 Nov 13, 2025
3ff4e09
v1beta2: Delete .enable field from FairSharing API in config (#7583)
mbobrovskyi Nov 13, 2025
23b0ae6
Cleanup of Balanced TAS (#7645)
pajakd Nov 13, 2025
d3018e3
Log Sticky Workload Deletion Path (#7654)
gabesaba Nov 14, 2025
b542ac9
test: add TestCompareBool (#7651)
kennygt51 Nov 14, 2025
2f5dd3a
update to helm 4.0 (#7653)
kannon92 Nov 14, 2025
0666991
KEP changes for v1beta2 TopologyAssignment (#7419)
olekzabl Nov 14, 2025
6bdb2dc
TopologyAssignment v1beta2 (#7544)
olekzabl Nov 14, 2025
08014e4
Disable IncrementalDispatcher if not configured (#7638)
mszadkow Nov 14, 2025
44ff55e
Break looping when workload is already known to run on node (#7658)
olekzabl Nov 14, 2025
a04a945
Document changing featureGates with configMap (#7652)
MaysaMacedo Nov 14, 2025
5342857
v1beta2: Delete .enable field from WaitForPodsReady API in config (#7…
mbobrovskyi Nov 14, 2025
79846fc
[KEP] FlavorFungability: replace FlavorFungibilityImplicitPreferenceD…
vladikkuzn Nov 14, 2025
1473a73
Check Cq active before the test to avoid flakiness (#7672)
mszadkow Nov 14, 2025
0f8ca03
Fix the MultiKueue flake issue (#7666)
mimowo Nov 14, 2025
71524e3
Rename variable (#7668)
pajakd Nov 14, 2025
0c6635e
Set blockAdmission to false in workload retention docs (#7676)
kannon92 Nov 14, 2025
98c692b
Use job key instead of key. (#7681)
mbobrovskyi Nov 15, 2025
03fdaa0
Fix example links in website. (#7685)
mbobrovskyi Nov 17, 2025
7195107
chore: Use structured loggings for localqueue entry penalty (#7680)
tenzen-y Nov 17, 2025
b79f9fa
v1beta2: change default for waitForPodsReady.blockAdmission to false …
mbobrovskyi Nov 17, 2025
b9f23bb
KEP-2349: Move MultiKueue external custom Job support to Beta (#7669)
khrm Nov 17, 2025
a47c3a5
Bump the kubernetes group across 1 directory with 3 updates (#7694)
dependabot[bot] Nov 17, 2025
25f7e06
feat(KEP-3258): implement delayed admission check retries (#7620)
sohankunkerkar Nov 17, 2025
16c6638
Bump cypress/base from 24.11.0 to 24.11.1 in /hack/cypress (#7696)
dependabot[bot] Nov 17, 2025
0b5ed06
Change `common{Prefix,Suffix}` -> `{prefix,suffix}` (#7697)
olekzabl Nov 17, 2025
ab2ef7e
Fix inconsistency in the KEP/2349 README.md (#7698)
khrm Nov 17, 2025
db35b6f
Check Lq active before the test to avoid flakiness (#7699)
mszadkow Nov 17, 2025
ca7924f
Merge branch 'main' into 5310-reconciliation-logic
Singularity23x0 Nov 17, 2025
4a229aa
Post merge cleanup.
Singularity23x0 Nov 17, 2025
17fb70c
Code structuring - minor fix.
Singularity23x0 Nov 17, 2025
135a260
Applied review comments.
Singularity23x0 Nov 17, 2025
8b10c39
Added safety check with finalizers reconciliation.
Singularity23x0 Nov 17, 2025
1362d10
Fix AFS docs (#7705)
PBundyra Nov 17, 2025
57ba36e
Balanced refactor (#7700)
pajakd Nov 17, 2025
5d3c532
Optimize triggerDeactivation() logic. (#7711)
mbobrovskyi Nov 17, 2025
0d4b31e
Bump js-yaml from 3.14.1 to 3.14.2 in /test/e2e/kueueviz (#7717)
dependabot[bot] Nov 18, 2025
aca2250
Bump js-yaml from 3.14.1 to 3.14.2 in /cmd/kueueviz/frontend (#7718)
dependabot[bot] Nov 18, 2025
0f7a062
Wait for quota reservation before admission in Should readmit preempt…
mbobrovskyi Nov 18, 2025
4943614
add modernize check (#7704)
dongjiang1989 Nov 18, 2025
b72dd61
Use UpdateFunc type. (#7719)
mbobrovskyi Nov 18, 2025
af15273
Add MultiKueue with Topology-Aware Scheduling setup guide for Kind (#…
IrvingMg Nov 18, 2025
8296466
Use pointer for PatchOptions. (#7721)
mbobrovskyi Nov 18, 2025
000ea24
docs: Add feature gate documentation for MultiKueueAdaptersForCustomJ…
khrm Nov 18, 2025
2a901de
Format fix.
Singularity23x0 Nov 18, 2025
510f516
Merge branch 'kubernetes-sigs:main' into 5310-reconciliation-logic
Singularity23x0 Nov 18, 2025
476cc52
fix wl controller test
Singularity23x0 Nov 18, 2025
7dc5491
Workload controller tests fix.
Singularity23x0 Nov 18, 2025
82491dd
Unit tests update.
Singularity23x0 Nov 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apis/kueue/v1beta1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package v1beta1

const (
ResourceInUseFinalizerName = "kueue.x-k8s.io/resource-in-use"
SafeDeleteFinalizerName = "kueue.x-k8s.io/delete-safeguard"
DefaultPodSetName PodSetReference = "main"

// ElasticJobSchedulingGate is the name of the scheduling gate applied to Pods
Expand Down
1 change: 1 addition & 0 deletions apis/kueue/v1beta2/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package v1beta2

const (
ResourceInUseFinalizerName = "kueue.x-k8s.io/resource-in-use"
SafeDeleteFinalizerName = "kueue.x-k8s.io/delete-safeguard"
DefaultPodSetName PodSetReference = "main"

// ElasticJobSchedulingGate is the name of the scheduling gate applied to Pods
Expand Down
4 changes: 2 additions & 2 deletions pkg/cache/queue/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ func (m *Manager) GetClusterQueueNames() []kueue.ClusterQueueReference {
return m.hm.ClusterQueuesNames()
}

func (m *Manager) getClusterQueue(cqName kueue.ClusterQueueReference) *ClusterQueue {
func (m *Manager) GetClusterQueue(cqName kueue.ClusterQueueReference) *ClusterQueue {
m.RLock()
defer m.RUnlock()
return m.getClusterQueueLockless(cqName)
Expand All @@ -736,7 +736,7 @@ func (m *Manager) getClusterQueueLockless(cqName kueue.ClusterQueueReference) *C
}

func (m *Manager) PendingWorkloadsInfo(cqName kueue.ClusterQueueReference) []*workload.Info {
cq := m.getClusterQueue(cqName)
cq := m.GetClusterQueue(cqName)
if cq == nil {
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cache/queue/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ func TestUpdateClusterQueue(t *testing.T) {
t.Fatalf("Failed adding clusterQueue %s: %v", cq.Name, err)
}
// Increase the popCycle to ensure that the workload will be added as inadmissible.
manager.getClusterQueue(kueue.ClusterQueueReference(cq.Name)).popCycle++
manager.GetClusterQueue(kueue.ClusterQueueReference(cq.Name)).popCycle++
}
for _, q := range queues {
if err := manager.AddLocalQueue(ctx, q); err != nil {
Expand Down
4 changes: 4 additions & 0 deletions pkg/cache/scheduler/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,10 @@ func (c *Cache) GetCacheLocalQueue(cqName kueue.ClusterQueueReference, lq *kueue
return nil, errQNotFound
}

// GetClusterQueue looks up cqName among the ClusterQueues held by the
// cache's hierarchy manager and returns the matching entry.
//
// NOTE(review): this body does not acquire any lock itself; confirm that
// callers synchronize access to the cache before relying on the result.
func (c *Cache) GetClusterQueue(cqName kueue.ClusterQueueReference) *clusterQueue {
	clusterQueues := c.hm.ClusterQueues()
	return clusterQueues[cqName]
}

func (c *Cache) UpdateLocalQueue(oldQ, newQ *kueue.LocalQueue) error {
if oldQ.Spec.ClusterQueue == newQ.Spec.ClusterQueue {
return nil
Expand Down
93 changes: 59 additions & 34 deletions pkg/controller/core/workload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/predicate"
Expand Down Expand Up @@ -158,18 +159,33 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
return ctrl.Result{}, client.IgnoreNotFound(err)
}

log := ctrl.LoggerFrom(ctx)
status := workload.Status(&wl)
log := ctrl.LoggerFrom(ctx).WithValues("workload", klog.KObj(&wl), "queue", wl.Spec.QueueName, "status", status)
log.V(2).Info("Reconcile Workload")

if len(wl.OwnerReferences) == 0 && !wl.DeletionTimestamp.IsZero() {
// manual deletion triggered by the user
err := workload.RemoveFinalizer(ctx, r.client, &wl)
return ctrl.Result{}, client.IgnoreNotFound(err)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old code wrapped err with IgnoreNotFound - IIUC, to silence errors in the case where the workload has been deleted in the meantime.
Have you dropped that wrapping in this PR? If so, why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I did. The idea is that we do not want the workload deleted outside of the controlled environment safeguarded by the deletion finalizer. As such, I advocate for the not-found error to be explicit here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, from our onboarding I recall that, strictly speaking, there's no guarantee that a particular resource change (in this case - deletion) will be processed by Reconcile at most once.

Though this was theory, and in practice - IDK if it's better to throw on duplicated reconciliations (because they're so rare in practice) or to swallow potential mishandling errors as you described.

So for me it looks like a non-obvious tradeoff.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay makes sense, I'll amend the logic.

if !wl.DeletionTimestamp.IsZero() {
log = log.WithValues("deletionTimestamp", wl.DeletionTimestamp)
log.V(2).Info("Attempting to finalize workload.")

switch {
case controllerutil.ContainsFinalizer(&wl, kueue.ResourceInUseFinalizerName):
log.V(2).Info("Manual deletion by a user detected.")
if len(wl.OwnerReferences) == 0 {
return ctrl.Result{}, r.finalize(ctx, &wl, log)
} else {
log.V(3).Info("Unable to finalize: workload still has owners. Proceeding with reconcile.", "owners", wl.OwnerReferences)
}
case controllerutil.ContainsFinalizer(&wl, kueue.SafeDeleteFinalizerName):
return ctrl.Result{}, r.finalize(ctx, &wl, log)
default:
log.V(3).Info("Unknown finalizer(s) present. Proceeding with reconcile.")
}
}

finishedCond := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadFinished)
if finishedCond != nil && finishedCond.Status == metav1.ConditionTrue {
if !features.Enabled(features.ObjectRetentionPolicies) || r.workloadRetention == nil || r.workloadRetention.afterFinished == nil {
log.Info("Unable to determine workload retention scheme.")
return ctrl.Result{}, nil
}

Expand Down Expand Up @@ -521,6 +537,39 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
return ctrl.Result{}, nil
}

// finalize drops the workload from the scheduler cache and the queues, then
// removes Kueue's finalizers (ResourceInUseFinalizerName and
// SafeDeleteFinalizerName) from the object and persists that with an Update.
//
// Watchers are notified on every exit path via the deferred notifyWatchers
// call. A cache deletion failure is returned only when the workload holds a
// quota reservation; otherwise it is logged and finalization continues.
//
// A NotFound error from the Update is treated as success: the same deletion
// may be reconciled more than once, and by then the object can already be
// gone. In that case no "Finalized" event is recorded.
func (r *WorkloadReconciler) finalize(ctx context.Context, wl *kueue.Workload, log logr.Logger) error {
	log.V(2).Info("Finalizing workload.")
	defer r.notifyWatchers(wl, nil)

	hasQuota := workload.HasQuotaReservation(wl)

	// The cache deletion runs inside the callback handed to
	// QueueAssociatedInadmissibleWorkloadsAfter; the error is captured here so
	// it can be handled once the call returns.
	var cacheErr error
	r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
		cacheErr = r.cache.DeleteWorkload(log, wl)
	})
	if cacheErr != nil {
		if hasQuota {
			log.Error(cacheErr, "Failed to delete workload from cache.")
			return cacheErr
		}
		// Without a quota reservation the failure is only reported.
		log.Info("Failed to delete workload from cache.", "Error", cacheErr, "Note", "this may be intended behavior")
	}

	r.queues.DeleteWorkload(log, wl)

	controllerutil.RemoveFinalizer(wl, kueue.ResourceInUseFinalizerName)
	controllerutil.RemoveFinalizer(wl, kueue.SafeDeleteFinalizerName)
	if err := r.client.Update(ctx, wl); err != nil {
		// The object may already be gone if this deletion is reconciled again.
		return client.IgnoreNotFound(err)
	}

	r.recorder.Eventf(wl, corev1.EventTypeNormal, "Finalized", "Workload %s has been finalized", workload.Key(wl))
	return nil
}

// isDisabledRequeuedByClusterQueueStopped returns true if the workload is unset requeued by cluster queue stopped.
func isDisabledRequeuedByClusterQueueStopped(w *kueue.Workload) bool {
return isDisabledRequeuedByReason(w, kueue.WorkloadEvictedByClusterQueueStopped)
Expand Down Expand Up @@ -806,36 +855,12 @@ func (r *WorkloadReconciler) Create(e event.TypedCreateEvent[*kueue.Workload]) b
}

func (r *WorkloadReconciler) Delete(e event.TypedDeleteEvent[*kueue.Workload]) bool {
defer r.notifyWatchers(e.Object, nil)
status := "unknown"
if !e.DeleteStateUnknown {
status = workload.Status(e.Object)
}
log := r.log.WithValues("workload", klog.KObj(e.Object), "queue", e.Object.Spec.QueueName, "status", status)
log.V(2).Info("Workload delete event")
ctx := ctrl.LoggerInto(context.Background(), log)

// When assigning a clusterQueue to a workload, we assume it in the cache. If
// the state is unknown, the workload could have been assumed, and we need
// to clear it from the cache.
if workload.HasQuotaReservation(e.Object) || e.DeleteStateUnknown {
// trigger the move of associated inadmissibleWorkloads if required.
r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, e.Object, func() {
// Delete the workload from cache while holding the queues lock
// to guarantee that requeued workloads are taken into account before
// the next scheduling cycle.
if err := r.cache.DeleteWorkload(log, e.Object); err != nil {
if !e.DeleteStateUnknown {
log.Error(err, "Failed to delete workload from cache")
}
}
})
log := r.log.WithValues("workload", klog.KObj(e.Object), "queue", e.Object.Spec.QueueName, "status", workload.Status(e.Object))
if e.DeleteStateUnknown {
log.V(2).Info("Workload delete event; delete status unknown")
} else {
log.V(2).Info("Workload delete event")
}

// Even if the state is unknown, the last cached state tells us whether the
// workload was in the queues and should be cleared from them.
r.queues.DeleteWorkload(log, e.Object)

return true
}

Expand Down
Loading