From eb7563f9526e8ab7945e40f2bb6aea2f97416fdd Mon Sep 17 00:00:00 2001 From: Lou <luogj@cn.ibm.com> Date: Fri, 17 Jan 2020 17:56:23 +0800 Subject: [PATCH 1/5] retry xfs_repair if failed; replay dirty logs Signed-off-by: Lou <luogj@cn.ibm.com> --- mount/mount_linux.go | 70 ++++++++++++++++++++++++++--- mount/safe_format_and_mount_test.go | 69 +++++++++++++++++++++++++++- 2 files changed, 132 insertions(+), 7 deletions(-) diff --git a/mount/mount_linux.go b/mount/mount_linux.go index 4dc696d6..4973c011 100644 --- a/mount/mount_linux.go +++ b/mount/mount_linux.go @@ -20,6 +20,7 @@ package mount import ( "fmt" + "io/ioutil" "os" "os/exec" "path/filepath" @@ -43,6 +44,10 @@ const ( fsckErrorsCorrected = 1 // 'fsck' found errors but exited without correcting them fsckErrorsUncorrected = 4 + // 'xfs_repair' found errors but exited without correcting them + xfsRepairErrorsUncorrected = 1 + // 'xfs_repair' was unable to proceed due to a dirty log + xfsRepairErrorsDirtyLogs = 2 ) // Mounter provides the default implementation of mount.Interface @@ -292,18 +297,71 @@ func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) er return nil } else { klog.Warningf("Filesystem corruption was detected for %s, running xfs_repair to repair", source) - out, err := mounter.Exec.Command("xfs_repair", args...).CombinedOutput() - if err != nil { - return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, out) - } else { - klog.Infof("Device %s has errors which were corrected by xfs_repair.", source) - return nil + var uncorrectedErr error + + // If xfs_repair fails to repair the file system successfully, try giving the same xfs_repair + // command twice more; xfs_repair may be able to make more repairs on successive runs. + for i := 0; i < 3; i++ { + out, err := mounter.Exec.Command("xfs_repair", args...).CombinedOutput() + if err != nil { + if e, isExitError := err.(utilexec.ExitError); isExitError { + switch e.ExitStatus() { + case xfsRepairErrorsUncorrected: + uncorrectedErr = NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) + // If the exit status is 2, do not retry, replay the dirty logs instead. + case xfsRepairErrorsDirtyLogs: + return mounter.replayXfsDirtyLogs(source) + default: + return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) + } + } else { + return NewMountError(HasFilesystemErrors, "failed to run 'xfs_repair' on device %s: %v\n", source, err) + } + } else { + klog.Infof("Device %s has errors which were corrected by xfs_repair.", source) + return nil + } + } + if uncorrectedErr != nil { + return uncorrectedErr } } } return nil } +// replayXfsDirtyLogs tries to replay dirty logs by by mounting and +// immediately unmounting the filesystem on the same class of machine +// that crashed. +func (mounter *SafeFormatAndMount) replayXfsDirtyLogs(source string) error { + klog.V(4).Infof("Attempting to replay the dirty logs on device %s", source) + msg := fmt.Sprintf("failed to replay dirty logs on device %s", source) + target, err := ioutil.TempDir("", "") + if err != nil { + return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) + } + defer os.RemoveAll(target) + + klog.V(4).Infof("Attempting to mount disk %s at %s", source, target) + if err := mounter.Interface.Mount(source, target, "", []string{"defaults"}); err != nil { + return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) + } + + klog.V(4).Infof("Unmounting %s", target) + if err := mounter.Interface.Unmount(target); err != nil { + return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) + } + + klog.V(4).Infof("Checking for issues with xfs_repair on disk %s again", source) + out, err := mounter.Exec.Command("xfs_repair", source).CombinedOutput() + if err != nil { + return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, out) + } else { + klog.Infof("Device %s has errors which were corrected by xfs_repair.", source) + return nil + } +} + // formatAndMount uses unix utils to format and mount the given disk func (mounter *SafeFormatAndMount) formatAndMount(source string, target string, fstype string, options []string) error { readOnly := false diff --git a/mount/safe_format_and_mount_test.go b/mount/safe_format_and_mount_test.go index a0869293..45567ff9 100644 --- a/mount/safe_format_and_mount_test.go +++ b/mount/safe_format_and_mount_test.go @@ -225,12 +225,79 @@ func TestSafeFormatAndMount(t *testing.T) { expErrorType: UnknownMountError, }, { - description: "Test that 'xfs_repair' is called twice but could not repair the filesystem", + description: "Test that 'xfs_repair' is called three times and repair the filesystem", fstype: "xfs", execScripts: []ExecArgs{ {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil}, + }, + expectedError: nil, + }, + { + description: "Test that 'xfs_repair' is called four times and repair the filesystem", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil}, + }, + expectedError: nil, + }, + { + description: "Test that 'xfs_repair' is called twice but exit with bad code and could not repair the filesystem", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 10}}, + }, + expectedError: fmt.Errorf("'xfs_repair' found errors on device %s but could not correct them: %v", "/dev/foo", "\nAn error occurred\n"), + }, + { + description: "Test that 'xfs_repair' is called twice but exit with bad error and could not repair the filesystem", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", fmt.Errorf("An error occurred")}, + }, + expectedError: fmt.Errorf("failed to run 'xfs_repair' on device %s: %v\n", "/dev/foo", "An error occurred"), + }, + { + description: "Test that 'xfs_repair' is called three times and replay dirty logs", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}}, + {"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil}, + }, + expectedError: nil, + }, + { + description: "Test that 'xfs_repair' is called three times and replay dirty logs, but final 'xfs_repair' is failed", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + }, + expectedError: fmt.Errorf("'xfs_repair' found errors on device %s but could not correct them: %v", "/dev/foo", "\nAn error occurred\n"), + }, + { + description: "Test that 'xfs_repair' is called four times but could not repair the filesystem", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, }, expErrorType: HasFilesystemErrors, }, From cc7c8fa9d694442e591d77f29316515ab744a419 Mon Sep 17 00:00:00 2001 From: Lou <luogj@cn.ibm.com> Date: Wed, 5 Feb 2020 21:02:50 +0800 Subject: [PATCH 2/5] update after review Signed-off-by: Lou <luogj@cn.ibm.com> --- mount/mount_linux.go | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/mount/mount_linux.go b/mount/mount_linux.go index 4973c011..cfc22e39 100644 --- a/mount/mount_linux.go +++ b/mount/mount_linux.go @@ -48,6 +48,8 @@ const ( xfsRepairErrorsUncorrected = 1 // 'xfs_repair' was unable to proceed due to a dirty log xfsRepairErrorsDirtyLogs = 2 + // Retry 'xfs_repair' three times on failure to make more repairs on successive runs. + xfsRepairRetries = 3 ) // Mounter provides the default implementation of mount.Interface @@ -301,26 +303,26 @@ func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) er // If xfs_repair fails to repair the file system successfully, try giving the same xfs_repair // command twice more; xfs_repair may be able to make more repairs on successive runs. - for i := 0; i < 3; i++ { + for i := 0; i < xfsRepairRetries; i++ { out, err := mounter.Exec.Command("xfs_repair", args...).CombinedOutput() - if err != nil { - if e, isExitError := err.(utilexec.ExitError); isExitError { - switch e.ExitStatus() { - case xfsRepairErrorsUncorrected: - uncorrectedErr = NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) - // If the exit status is 2, do not retry, replay the dirty logs instead. - case xfsRepairErrorsDirtyLogs: - return mounter.replayXfsDirtyLogs(source) - default: - return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) - } - } else { - return NewMountError(HasFilesystemErrors, "failed to run 'xfs_repair' on device %s: %v\n", source, err) - } - } else { + if err == nil { klog.Infof("Device %s has errors which were corrected by xfs_repair.", source) return nil } + e, isExitError := err.(utilexec.ExitError) + if !isExitError { + return NewMountError(HasFilesystemErrors, "failed to run 'xfs_repair' on device %s: %v\n", source, err) + + } + switch e.ExitStatus() { + case xfsRepairErrorsUncorrected: + uncorrectedErr = NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) + // If the exit status is 2, do not retry, replay the dirty logs instead. + case xfsRepairErrorsDirtyLogs: + return mounter.replayXfsDirtyLogs(source) + default: + return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) + } } if uncorrectedErr != nil { return uncorrectedErr From 0a3bc691ba7faf07aa0090125d861a5adf328c08 Mon Sep 17 00:00:00 2001 From: Lou <luogj@cn.ibm.com> Date: Wed, 5 Feb 2020 22:05:39 +0800 Subject: [PATCH 3/5] rebase and fix ut Signed-off-by: Lou <luogj@cn.ibm.com> --- mount/safe_format_and_mount_test.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mount/safe_format_and_mount_test.go b/mount/safe_format_and_mount_test.go index 45567ff9..d8aeda2b 100644 --- a/mount/safe_format_and_mount_test.go +++ b/mount/safe_format_and_mount_test.go @@ -233,7 +233,6 @@ func TestSafeFormatAndMount(t *testing.T) { {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, {"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil}, }, - expectedError: nil, }, { description: "Test that 'xfs_repair' is called four times and repair the filesystem", @@ -245,7 +244,6 @@ func TestSafeFormatAndMount(t *testing.T) { {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, {"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil}, }, - expectedError: nil, }, { description: "Test that 'xfs_repair' is called twice but exit with bad code and could not repair the filesystem", @@ -255,7 +253,7 @@ func TestSafeFormatAndMount(t *testing.T) { {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 10}}, }, - expectedError: fmt.Errorf("'xfs_repair' found errors on device %s but could not correct them: %v", "/dev/foo", "\nAn error occurred\n"), + expErrorType: HasFilesystemErrors, }, { description: "Test that 'xfs_repair' is called twice but exit with bad error and could not repair the filesystem", @@ -265,7 +263,7 @@ func TestSafeFormatAndMount(t *testing.T) { {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", fmt.Errorf("An error occurred")}, }, - expectedError: fmt.Errorf("failed to run 'xfs_repair' on device %s: %v\n", "/dev/foo", "An error occurred"), + expErrorType: HasFilesystemErrors, }, { description: "Test that 'xfs_repair' is called three times and replay dirty logs", @@ -276,7 +274,6 @@ func TestSafeFormatAndMount(t *testing.T) { {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}}, {"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil}, }, - expectedError: nil, }, { description: "Test that 'xfs_repair' is called three times and replay dirty logs, but final 'xfs_repair' is failed", @@ -287,7 +284,7 @@ func TestSafeFormatAndMount(t *testing.T) { {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}}, {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, }, - expectedError: fmt.Errorf("'xfs_repair' found errors on device %s but could not correct them: %v", "/dev/foo", "\nAn error occurred\n"), + expErrorType: HasFilesystemErrors, }, { description: "Test that 'xfs_repair' is called four times but could not repair the filesystem", From f7bc9d17f71afb8c113693582e8ba8b5040e4e1b Mon Sep 17 00:00:00 2001 From: Lou <luogj@cn.ibm.com> Date: Wed, 5 Feb 2020 23:24:30 +0800 Subject: [PATCH 4/5] replay xfs dirty logs on target mount point instead of a temp dir Signed-off-by: Lou <luogj@cn.ibm.com> --- mount/mount_linux.go | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mount/mount_linux.go b/mount/mount_linux.go index cfc22e39..5a1be607 100644 --- a/mount/mount_linux.go +++ b/mount/mount_linux.go @@ -20,7 +20,6 @@ package mount import ( "fmt" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -264,7 +263,7 @@ func (mounter *Mounter) GetMountRefs(pathname string) ([]string, error) { } // checkAndRepairFileSystem checks and repairs filesystems using command fsck. -func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source string) error { +func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source, target string) error { klog.V(4).Infof("Checking for issues with fsck on disk: %s", source) args := []string{"-a", source} out, err := mounter.Exec.Command("fsck", args...).CombinedOutput() @@ -285,7 +284,7 @@ func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source string) error } // checkAndRepairXfsFilesystem checks and repairs xfs filesystem using command xfs_repair. -func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) error { +func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source, target string) error { klog.V(4).Infof("Checking for issues with xfs_repair on disk: %s", source) args := []string{source} @@ -319,7 +318,7 @@ func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) er uncorrectedErr = NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) // If the exit status is 2, do not retry, replay the dirty logs instead. case xfsRepairErrorsDirtyLogs: - return mounter.replayXfsDirtyLogs(source) + return mounter.replayXfsDirtyLogs(source, target) default: return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) } @@ -335,14 +334,9 @@ func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) er // replayXfsDirtyLogs tries to replay dirty logs by by mounting and // immediately unmounting the filesystem on the same class of machine // that crashed. -func (mounter *SafeFormatAndMount) replayXfsDirtyLogs(source string) error { +func (mounter *SafeFormatAndMount) replayXfsDirtyLogs(source, target string) error { klog.V(4).Infof("Attempting to replay the dirty logs on device %s", source) msg := fmt.Sprintf("failed to replay dirty logs on device %s", source) - target, err := ioutil.TempDir("", "") - if err != nil { - return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) - } - defer os.RemoveAll(target) klog.V(4).Infof("Attempting to mount disk %s at %s", source, target) if err := mounter.Interface.Mount(source, target, "", []string{"defaults"}); err != nil { @@ -425,9 +419,9 @@ func (mounter *SafeFormatAndMount) formatAndMount(source string, target string, var err error switch existingFormat { case "xfs": - err = mounter.checkAndRepairXfsFilesystem(source) + err = mounter.checkAndRepairXfsFilesystem(source, target) default: - err = mounter.checkAndRepairFilesystem(source) + err = mounter.checkAndRepairFilesystem(source, target) } if err != nil { From 49780f3be274de1c7d47074ad0b5a57fa2861c34 Mon Sep 17 00:00:00 2001 From: Lou <luogj@cn.ibm.com> Date: Fri, 7 Feb 2020 14:03:45 +0800 Subject: [PATCH 5/5] always unmount after mount in replayXfsDirtyLogs Signed-off-by: Lou <luogj@cn.ibm.com> --- mount/mount_linux.go | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/mount/mount_linux.go b/mount/mount_linux.go index 5a1be607..f0757989 100644 --- a/mount/mount_linux.go +++ b/mount/mount_linux.go @@ -338,14 +338,22 @@ func (mounter *SafeFormatAndMount) replayXfsDirtyLogs(source, target string) err klog.V(4).Infof("Attempting to replay the dirty logs on device %s", source) msg := fmt.Sprintf("failed to replay dirty logs on device %s", source) - klog.V(4).Infof("Attempting to mount disk %s at %s", source, target) - if err := mounter.Interface.Mount(source, target, "", []string{"defaults"}); err != nil { - return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) - } - - klog.V(4).Infof("Unmounting %s", target) - if err := mounter.Interface.Unmount(target); err != nil { - return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) + // make sure unmount is always called after mount. + err := func() (returnErr error) { + defer func() { + klog.V(4).Infof("Unmounting %s", target) + if err := mounter.Interface.Unmount(target); err != nil { + returnErr = NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) + } + }() + klog.V(4).Infof("Attempting to mount disk %s at %s", source, target) + if err := mounter.Interface.Mount(source, target, "", []string{"defaults"}); err != nil { + return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) + } + return nil + }() + if err != nil { + return err } klog.V(4).Infof("Checking for issues with xfs_repair on disk %s again", source)