Skip to content

Commit 7d99392

Browse files
authored
Add drive repair support (#880)
Signed-off-by: Bala.FA <[email protected]>
1 parent a3471d6 commit 7d99392

File tree

25 files changed

+1025
-83
lines changed

25 files changed

+1025
-83
lines changed

.golangci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
linters-settings:
2-
gofumpt:
3-
lang-version: "1.22"
2+
run:
3+
go: "1.22"
44

55
misspell:
66
locale: US

cmd/directpv/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ func init() {
128128
mainCmd.AddCommand(legacyControllerCmd)
129129
mainCmd.AddCommand(legacyNodeServerCmd)
130130
mainCmd.AddCommand(nodeControllerCmd)
131+
mainCmd.AddCommand(repairCmd)
131132
}
132133

133134
func main() {

cmd/directpv/repair.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// This file is part of MinIO DirectPV
2+
// Copyright (c) 2024 MinIO, Inc.
3+
//
4+
// This program is free software: you can redistribute it and/or modify
5+
// it under the terms of the GNU Affero General Public License as published by
6+
// the Free Software Foundation, either version 3 of the License, or
7+
// (at your option) any later version.
8+
//
9+
// This program is distributed in the hope that it will be useful,
10+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
// GNU Affero General Public License for more details.
13+
//
14+
// You should have received a copy of the GNU Affero General Public License
15+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
17+
package main
18+
19+
import (
20+
"context"
21+
"errors"
22+
23+
directpvtypes "github.com/minio/directpv/pkg/apis/directpv.min.io/types"
24+
"github.com/minio/directpv/pkg/client"
25+
drivepkg "github.com/minio/directpv/pkg/drive"
26+
"github.com/minio/directpv/pkg/types"
27+
"github.com/spf13/cobra"
28+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
)
30+
31+
// Options of the repair command. Each one is bound to a command-line flag
// and is off unless that flag is given.
var (
	// forceFlag backs --force ("Force log zeroing").
	forceFlag bool
	// disablePrefetchFlag backs --disable-prefetch.
	disablePrefetchFlag bool
	// dryRunFlag backs --dry-run ("No modify mode").
	dryRunFlag bool
)
36+
37+
var repairCmd = &cobra.Command{
38+
Use: "repair <DRIVE-ID>",
39+
Short: "Start drive repair.",
40+
SilenceUsage: true,
41+
SilenceErrors: true,
42+
RunE: func(c *cobra.Command, args []string) error {
43+
switch len(args) {
44+
case 0:
45+
return errors.New("DRIVE-ID must be provided")
46+
case 1:
47+
default:
48+
return errors.New("only one DRIVE-ID must be provided")
49+
}
50+
return startRepair(c.Context(), args[0])
51+
},
52+
}
53+
54+
func init() {
55+
repairCmd.PersistentFlags().BoolVar(&forceFlag, "force", forceFlag, "Force log zeroing")
56+
repairCmd.PersistentFlags().BoolVar(&disablePrefetchFlag, "disable-prefetch", disablePrefetchFlag, "Disable prefetching of inode and directory blocks")
57+
repairCmd.PersistentFlags().BoolVar(&dryRunFlag, "dry-run", dryRunFlag, "No modify mode")
58+
}
59+
60+
func startRepair(ctx context.Context, driveID string) error {
61+
var cancel context.CancelFunc
62+
ctx, cancel = context.WithCancel(ctx)
63+
defer cancel()
64+
65+
drive, err := client.DriveClient().Get(ctx, driveID, metav1.GetOptions{})
66+
if err != nil {
67+
return err
68+
}
69+
70+
if drive.Status.Status != directpvtypes.DriveStatusRepairing {
71+
drive.Status.Status = directpvtypes.DriveStatusRepairing
72+
}
73+
74+
updatedDrive, err := client.DriveClient().Update(ctx, drive, metav1.UpdateOptions{TypeMeta: types.NewDriveTypeMeta()})
75+
if err != nil {
76+
return err
77+
}
78+
79+
return drivepkg.Repair(ctx, updatedDrive, forceFlag, disablePrefetchFlag, dryRunFlag)
80+
}

cmd/kubectl-directpv/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ Use "{{.CommandPath}} [command] --help" for more information about this command.
165165
mainCmd.AddCommand(cleanCmd)
166166
mainCmd.AddCommand(suspendCmd)
167167
mainCmd.AddCommand(resumeCmd)
168+
mainCmd.AddCommand(repairCmd)
168169
mainCmd.AddCommand(removeCmd)
169170
mainCmd.AddCommand(uninstallCmd)
170171
mainCmd.SetHelpCommand(&cobra.Command{

cmd/kubectl-directpv/repair.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// This file is part of MinIO DirectPV
2+
// Copyright (c) 2024 MinIO, Inc.
3+
//
4+
// This program is free software: you can redistribute it and/or modify
5+
// it under the terms of the GNU Affero General Public License as published by
6+
// the Free Software Foundation, either version 3 of the License, or
7+
// (at your option) any later version.
8+
//
9+
// This program is distributed in the hope that it will be useful,
10+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
// GNU Affero General Public License for more details.
13+
//
14+
// You should have received a copy of the GNU Affero General Public License
15+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
17+
package main
18+
19+
import (
20+
"context"
21+
"errors"
22+
"os"
23+
"strings"
24+
25+
"github.com/minio/directpv/pkg/admin"
26+
"github.com/minio/directpv/pkg/consts"
27+
"github.com/spf13/cobra"
28+
)
29+
30+
// Options specific to the repair command. Each one is bound to a
// command-line flag and is off unless that flag is given.
var (
	// forceFlag backs --force ("Force log zeroing").
	forceFlag bool
	// disablePrefetchFlag backs --disable-prefetch.
	disablePrefetchFlag bool
)
34+
35+
var repairCmd = &cobra.Command{
36+
Use: "repair DRIVE ...",
37+
Short: "Repair filesystem of drives",
38+
SilenceUsage: true,
39+
SilenceErrors: true,
40+
Example: strings.ReplaceAll(
41+
`1. Repair drives
42+
$ kubectl {PLUGIN_NAME} repair 3b562992-f752-4a41-8be4-4e688ae8cd4c`,
43+
`{PLUGIN_NAME}`,
44+
consts.AppName,
45+
),
46+
Run: func(c *cobra.Command, args []string) {
47+
driveIDArgs = args
48+
if err := validateRepairCmd(); err != nil {
49+
eprintf(true, "%v\n", err)
50+
os.Exit(-1)
51+
}
52+
53+
repairMain(c.Context())
54+
},
55+
}
56+
57+
func init() {
58+
setFlagOpts(repairCmd)
59+
60+
addDryRunFlag(repairCmd, "Repair drives with no modify mode")
61+
repairCmd.PersistentFlags().BoolVar(&forceFlag, "force", forceFlag, "Force log zeroing")
62+
repairCmd.PersistentFlags().BoolVar(&disablePrefetchFlag, "disable-prefetch", disablePrefetchFlag, "Disable prefetching of inode and directory blocks")
63+
}
64+
65+
func validateRepairCmd() error {
66+
if err := validateDriveIDArgs(); err != nil {
67+
return err
68+
}
69+
70+
if len(driveIDArgs) == 0 {
71+
return errors.New("no drive provided to repair")
72+
}
73+
74+
return nil
75+
}
76+
77+
func repairMain(ctx context.Context) {
78+
_, err := adminClient.Repair(
79+
ctx,
80+
admin.RepairArgs{
81+
DriveIDs: driveIDSelectors,
82+
DryRun: dryRunFlag,
83+
ForceFlag: forceFlag,
84+
DisablePrefetchFlag: disablePrefetchFlag,
85+
},
86+
logFunc,
87+
)
88+
if err != nil {
89+
eprintf(!errors.Is(err, admin.ErrNoMatchingResourcesFound), "%v\n", err)
90+
os.Exit(1)
91+
}
92+
}

docs/command-reference.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,28 @@ EXAMPLES:
722722
$ kubectl directpv resume volumes pvc-0700b8c7-85b2-4894-b83a-274484f220d0
723723
```
724724

725+
## `repair` command
726+
```
727+
Repair filesystem of drives
728+
729+
USAGE:
730+
directpv repair DRIVE ... [flags]
731+
732+
FLAGS:
733+
--dry-run Repair drives with no modify mode
734+
--force Force log zeroing
735+
--disable-prefetch Disable prefetching of inode and directory blocks
736+
-h, --help help for repair
737+
738+
GLOBAL FLAGS:
739+
--kubeconfig string Path to the kubeconfig file to use for CLI requests
740+
--quiet Suppress printing error messages
741+
742+
EXAMPLES:
743+
1. Repair drives
744+
$ kubectl directpv repair 3b562992-f752-4a41-8be4-4e688ae8cd4c
745+
```
746+
725747
## `remove` command
726748
```
727749
Remove unused drives from DirectPV

docs/drive-management.md

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,11 +118,27 @@ Refer [remove command](./command-reference.md#remove-command) for more informati
118118
By Kubernetes design, a [StatefulSet](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) workload is active only if all of its pods are in running state. Any faulty drive(s) will prevent the statefulset from starting up. DirectPV provides a workaround to suspend failed drives, which will mount the respective volumes on an empty `/var/lib/directpv/tmp` directory with read-only access. This can be done by executing the `suspend drives` command. Below is an example:
119119

120120
```sh
121-
> kubectl directpv suspend drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
121+
$ kubectl directpv suspend drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
122122
```
123123

124124
Suspended drives can be resumed once they are fixed. Upon resuming, the corresponding volumes will resume using the respective allocated drives. This can be done by using the `resume drives` command. Below is an example:
125125

126126
```sh
127-
> kubectl directpv resume drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
127+
$ kubectl directpv resume drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
128+
```
129+
130+
## Repair drives
131+
132+
***CAUTION: THIS IS A DANGEROUS OPERATION WHICH LEADS TO DATA LOSS***
133+
134+
In rare situations, the filesystem on faulty drives can be repaired to make them usable. As a first step, the faulty drives must be suspended; then the `repair` command should be run on them. The `repair` command creates a one-time Kubernetes `Job` with the pod name `repair-<DRIVE-ID>`, and each job is automatically removed five minutes after its completion. Progress and status of the drive repair can be viewed using the `kubectl logs` command. Below is an example:
135+
136+
```sh
137+
# Suspend faulty drives
138+
$ kubectl directpv suspend drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
139+
140+
# Restart volume consumer pods and make sure associated volumes are unbound
141+
142+
# Run repair command on suspended drives
143+
$ kubectl directpv repair af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
128144
```

0 commit comments

Comments
 (0)