just like: #5517
Bug Report
What version of Kubernetes are you using?
What version of TiDB Operator are you using?
1.4.7; the issue is also present in the master branch.
What storage classes exist in the Kubernetes cluster and what are used for PD/TiKV pods?
What's the status of the TiDB cluster pods?
What did you do?
- Run snapshot backup via backup schedule, with maxReservedTime set
- Some backup failed
What did you expect to see?
Newly failed backup can be retained for a while
What did you see instead?
The failed backup was GC'ed prematurely.
ref by code:
```go
func (bm *backupScheduleManager) backupGC(bs *v1alpha1.BackupSchedule) {
	ns := bs.GetNamespace()
	bsName := bs.GetName()

	// if MaxBackups and MaxReservedTime are set at the same time, MaxReservedTime is preferred.
	if bs.Spec.MaxReservedTime != nil {
		bm.backupGCByMaxReservedTime(bs)
		bm.compactGCByMaxReservedTime(bs)
		return
	}

	if bs.Spec.MaxBackups != nil && *bs.Spec.MaxBackups > 0 {
		bm.backupGCByMaxBackups(bs)
		bm.compactGCByMaxBackups(bs)
		return
	}
	// TODO: When the backup schedule gc policy is not set, we should set a default backup gc policy.
	klog.Warningf("backup schedule %s/%s does not set backup gc policy", ns, bsName)
}
```
Backups in the invalid/completed/failed phase are sorted by creationTimestamp in ascending order, so the earliest-created backup comes first:
```go
	ascBackups, logBackup := separateSnapshotBackupsAndLogBackup(backupsList)
	if len(ascBackups) == 0 {
		return
	}
```
```go
// separateSnapshotBackupsAndLogBackup return snapshot backups order by create time asc and log backup
func separateSnapshotBackupsAndLogBackup(backupsList []*v1alpha1.Backup) ([]*v1alpha1.Backup, *v1alpha1.Backup) {
	var (
		ascBackupList = make([]*v1alpha1.Backup, 0)
		logBackup     *v1alpha1.Backup
	)

	for _, backup := range backupsList {
		if backup.Spec.Mode == v1alpha1.BackupModeLog {
			logBackup = backup
			continue
		}
		// Completed, failed or invalid backups will be GC'ed
		if !(v1alpha1.IsBackupFailed(backup) || v1alpha1.IsBackupComplete(backup) || v1alpha1.IsBackupInvalid(backup)) {
			continue
		}
		ascBackupList = append(ascBackupList, backup)
	}

	sort.Slice(ascBackupList, func(i, j int) bool {
		return ascBackupList[i].CreationTimestamp.Unix() < ascBackupList[j].CreationTimestamp.Unix()
	})
	return ascBackupList, logBackup
}
```
If no log backup exists, execution enters func 'calculateExpiredBackups':
```go
		expiredBackups, err = calculateExpiredBackups(ascBackups, reservedTime)
		if err != nil {
			klog.Errorf("caculate expired backups without log backup, err: %s", err)
			return
		}
	}
```
The loop traverses the sorted backups from beginning to end until it finds one whose commitTs is later than the expiration threshold; everything before that point is treated as expired.
However, a failed backup never records a commitTs, so func 'ParseTSString' returns 0 for it — which is always earlier than the threshold.
```go
func calculateExpiredBackups(backupsList []*v1alpha1.Backup, reservedTime time.Duration) ([]*v1alpha1.Backup, error) {
	expiredTS := config.TSToTSO(time.Now().Add(-1 * reservedTime).Unix())
	i := 0
	for ; i < len(backupsList); i++ {
		startTS, err := config.ParseTSString(backupsList[i].Status.CommitTs)
		if err != nil {
			return nil, perrors.Annotatef(err, "parse start tso: %s", backupsList[i].Status.CommitTs)
		}
		if startTS >= expiredTS {
			break
		}
	}
	return backupsList[:i], nil
}
```
```go
// ParseTSString supports TSO or datetime, e.g. '400036290571534337', '2006-01-02 15:04:05'
func ParseTSString(ts string) (uint64, error) {
	if len(ts) == 0 {
		return 0, nil
	}
	if tso, err := strconv.ParseUint(ts, 10, 64); err == nil {
		return tso, nil
	}
	t, err := time.ParseInLocation("2006-01-02 15:04:05", ts, time.Local)
	if err != nil {
		t, err = time.Parse(time.RFC3339, ts)
		if err != nil {
			return 0, fmt.Errorf("cannot parse ts string %s, err: %v", ts, err)
		}
	}
	return GoTimeToTS(t), nil
}
```
There are two cases that demonstrate this:
Case 1:
Backups in creation order: failed1, complete2, failed3 (all within reservedTime)
Retained: complete2, failed3 — the loop stops at complete2 (its commitTs is inside reservedTime), but failed1 is deleted because its empty commitTs parses to 0, which is before the expiration threshold.
Case 2:
Backups: failed1 (within reservedTime)
Retained: none — failed1 is deleted immediately for the same reason.
This behavior is not expected.
Failed backups should also be retained as expected.
Bug Report
What version of Kubernetes are you using?
What version of TiDB Operator are you using?
1.4.7; the issue is also present in the master branch.
What storage classes exist in the Kubernetes cluster and what are used for PD/TiKV pods?
What's the status of the TiDB cluster pods?
What did you do?
What did you expect to see?
Newly failed backup can be retained for a while
What did you see instead?
The failed backup was GC'ed prematurely.
ref by code:
tidb-operator/pkg/backup/backupschedule/backup_schedule_manager.go
Lines 629 to 647 in fe85d86
Backups in the invalid/completed/failed phase are sorted by creationTimestamp in ascending order, so the earliest-created backup comes first
tidb-operator/pkg/backup/backupschedule/backup_schedule_manager.go
Lines 665 to 668 in fe85d86
tidb-operator/pkg/backup/backupschedule/backup_schedule_manager.go
Lines 766 to 789 in fe85d86
If no log backup exists, execution enters func 'calculateExpiredBackups'
tidb-operator/pkg/backup/backupschedule/backup_schedule_manager.go
Lines 682 to 687 in fe85d86
The loop traverses the sorted backups from beginning to end until it finds one whose commitTs is later than the expiration threshold; everything before that point is treated as expired.
However, a failed backup never records a commitTs, so func 'ParseTSString' returns 0 for it — which is always earlier than the threshold.
tidb-operator/pkg/backup/backupschedule/backup_schedule_manager.go
Lines 863 to 876 in fe85d86
tidb-operator/pkg/apis/util/config/config.go
Lines 396 to 412 in fe85d86
There are two cases that demonstrate this.
This behavior is not expected.
Failed backups should also be retained as expected.