From 5d83d8960977fa358c0509befe728f0277321492 Mon Sep 17 00:00:00 2001 From: Xiaomou Date: Tue, 6 Aug 2024 02:56:22 -0700 Subject: [PATCH 1/8] [PD] add option to wait a certain time before start pd (#5696) --- docs/api-references/docs.md | 12 ++++++++++ manifests/crd.yaml | 3 +++ .../crd/v1/pingcap.com_tidbclusters.yaml | 3 +++ .../pingcap/v1alpha1/openapi_generated.go | 7 ++++++ pkg/apis/pingcap/v1alpha1/tidbcluster.go | 8 +++++++ pkg/apis/pingcap/v1alpha1/types.go | 5 ++++ .../member/startscript/v2/pd_start_script.go | 8 +++++++ ...tart_script_with_wait_for_ip_match_test.go | 24 +++++++++++++++++++ 8 files changed, 70 insertions(+) diff --git a/docs/api-references/docs.md b/docs/api-references/docs.md index f1cad339202..7bb15cf2e1d 100644 --- a/docs/api-references/docs.md +++ b/docs/api-references/docs.md @@ -12321,6 +12321,18 @@ int +initWaitTime
+<em>
+int
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>Wait time before PD is started. This wait time is to allow the new DNS record to propagate,
+ensuring that the PD DNS resolves to the same IP address as the pod.</p>
+</td>
+</tr>
<tr>
<td>
<code>mode</code><br/>
<em>
string diff --git a/manifests/crd.yaml b/manifests/crd.yaml index 9ed708b41c4..5af06877636 100644 --- a/manifests/crd.yaml +++ b/manifests/crd.yaml @@ -25059,6 +25059,9 @@ spec: - name type: object type: array + initWaitTime: + default: 0 + type: integer labels: additionalProperties: type: string diff --git a/manifests/crd/v1/pingcap.com_tidbclusters.yaml b/manifests/crd/v1/pingcap.com_tidbclusters.yaml index 50e6f657fb8..bd231fd88be 100644 --- a/manifests/crd/v1/pingcap.com_tidbclusters.yaml +++ b/manifests/crd/v1/pingcap.com_tidbclusters.yaml @@ -5674,6 +5674,9 @@ spec: - name type: object type: array + initWaitTime: + default: 0 + type: integer labels: additionalProperties: type: string diff --git a/pkg/apis/pingcap/v1alpha1/openapi_generated.go b/pkg/apis/pingcap/v1alpha1/openapi_generated.go index a5c71acef10..aadde10b971 100644 --- a/pkg/apis/pingcap/v1alpha1/openapi_generated.go +++ b/pkg/apis/pingcap/v1alpha1/openapi_generated.go @@ -6907,6 +6907,13 @@ func schema_pkg_apis_pingcap_v1alpha1_PDSpec(ref common.ReferenceCallback) commo Format: "int32", }, }, + "initWaitTime": { + SchemaProps: spec.SchemaProps{ + Description: "Wait time before pd get started. This wait time is to allow the new DNS record to propagate, ensuring that the PD DNS resolves to the same IP address as the pod.", + Type: []string{"integer"}, + Format: "int32", + }, + }, "mode": { SchemaProps: spec.SchemaProps{ Description: "Mode is the mode of PD cluster", diff --git a/pkg/apis/pingcap/v1alpha1/tidbcluster.go b/pkg/apis/pingcap/v1alpha1/tidbcluster.go index d9c10d39b34..fb0b7f75a5b 100644 --- a/pkg/apis/pingcap/v1alpha1/tidbcluster.go +++ b/pkg/apis/pingcap/v1alpha1/tidbcluster.go @@ -46,6 +46,7 @@ const ( // shutdown a TiCDC pod. defaultTiCDCGracefulShutdownTimeout = 10 * time.Minute defaultPDStartTimeout = 30 + defaultPDInitWaitTime = 0 // the latest version versionLatest = "latest" @@ -1386,3 +1387,10 @@ func (tc *TidbCluster) PDStartTimeout() int { } return defaultPDStartTimeout } + +func (tc *TidbCluster) PDInitWaitTime() int { + if tc.Spec.PD != nil && tc.Spec.PD.InitWaitTime != 0 { + return tc.Spec.PD.InitWaitTime + } + return defaultPDInitWaitTime +} diff --git a/pkg/apis/pingcap/v1alpha1/types.go b/pkg/apis/pingcap/v1alpha1/types.go index 291f785fd62..7f92b46dcb7 100644 --- a/pkg/apis/pingcap/v1alpha1/types.go +++ b/pkg/apis/pingcap/v1alpha1/types.go @@ -572,6 +572,11 @@ type PDSpec struct { // +kubebuilder:default=30 StartTimeout int `json:"startTimeout,omitempty"` + // Wait time before pd get started. This wait time is to allow the new DNS record to propagate, + // ensuring that the PD DNS resolves to the same IP address as the pod. 
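+	// The value is in seconds: the start script sleeps for this duration before launching pd-server, so 0 (the default) means no extra wait.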
+ // +kubebuilder:default=0 + InitWaitTime int `json:"initWaitTime,omitempty"` + // Mode is the mode of PD cluster // +optional // +kubebuilder:validation:Enum:="";"ms" diff --git a/pkg/manager/member/startscript/v2/pd_start_script.go b/pkg/manager/member/startscript/v2/pd_start_script.go index 4b93488fa13..0a7eac84e1a 100644 --- a/pkg/manager/member/startscript/v2/pd_start_script.go +++ b/pkg/manager/member/startscript/v2/pd_start_script.go @@ -38,11 +38,13 @@ type PDStartScriptModel struct { ExtraArgs string PDAddresses string PDStartTimeout int + PDInitWaitTime int } // PDMSStartScriptModel contain fields for rendering PD Micro Service start script type PDMSStartScriptModel struct { PDStartTimeout int + PDInitWaitTime int PDAddresses string PDMSDomain string @@ -90,6 +92,8 @@ func RenderPDStartScript(tc *v1alpha1.TidbCluster) (string, error) { m.PDStartTimeout = tc.PDStartTimeout() + m.PDInitWaitTime = tc.PDInitWaitTime() + waitForDnsNameIpMatchOnStartup := slices.Contains( tc.Spec.StartScriptV2FeatureFlags, v1alpha1.StartScriptV2FeatureFlagWaitForDnsNameIpMatch) @@ -134,6 +138,8 @@ func renderPDMSStartScript(tc *v1alpha1.TidbCluster, name string) (string, error m.PDStartTimeout = tc.PDStartTimeout() + m.PDInitWaitTime = tc.PDInitWaitTime() + preferPDAddressesOverDiscovery := slices.Contains( tc.Spec.StartScriptV2FeatureFlags, v1alpha1.StartScriptV2FeatureFlagPreferPDAddressesOverDiscovery) if preferPDAddressesOverDiscovery { @@ -181,6 +187,8 @@ const ( pdWaitForDnsIpMatchSubScript = ` componentDomain=${PD_DOMAIN} waitThreshold={{ .PDStartTimeout }} +initWaitTime={{ .PDInitWaitTime }} +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" ` + componentCommonWaitForDnsIpMatchScript diff --git a/pkg/manager/member/startscript/v2/pd_start_script_with_wait_for_ip_match_test.go b/pkg/manager/member/startscript/v2/pd_start_script_with_wait_for_ip_match_test.go index e055ffa0a07..1f81ca9ceac 100644 --- a/pkg/manager/member/startscript/v2/pd_start_script_with_wait_for_ip_match_test.go +++ b/pkg/manager/member/startscript/v2/pd_start_script_with_wait_for_ip_match_test.go @@ -58,6 +58,8 @@ PD_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PD_POD_NAME}.start-script-test-pd-peer.start-script-test-ns.svc componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -172,6 +174,8 @@ PD_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PD_POD_NAME}.start-script-test-pd-peer.start-script-test-ns.svc componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -286,6 +290,8 @@ PD_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PD_POD_NAME}.start-script-test-pd-peer.start-script-test-ns.svc componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -400,6 +406,8 @@ PD_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PD_POD_NAME}.start-script-test-pd-peer.start-script-test-ns.svc.cluster-1.com componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -515,6 +523,8 @@ PD_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PD_POD_NAME}.start-script-test-pd-peer.start-script-test-ns.svc componentDomain=${PD_DOMAIN} waitThreshold=30 
+initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -630,6 +640,8 @@ PD_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PD_POD_NAME}.start-script-test-pd-peer.start-script-test-ns.svc.cluster-1.com componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -781,6 +793,8 @@ PDMS_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PDMS_POD_NAME}.start-script-test-tso-peer.start-script-test-ns.svc componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -880,6 +894,8 @@ PDMS_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PDMS_POD_NAME}.start-script-test-tso-peer.start-script-test-ns.svc componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -979,6 +995,8 @@ PDMS_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PDMS_POD_NAME}.start-script-test-tso-peer.start-script-test-ns.svc componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -1078,6 +1096,8 @@ PDMS_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PDMS_POD_NAME}.start-script-test-tso-peer.start-script-test-ns.svc.cluster-1.com componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -1178,6 +1198,8 @@ PDMS_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PDMS_POD_NAME}.start-script-test-tso-peer.start-script-test-ns.svc componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 @@ -1285,6 +1307,8 @@ PDMS_POD_NAME=${POD_NAME:-$HOSTNAME} PD_DOMAIN=${PDMS_POD_NAME}.start-script-test-tso-peer.start-script-test-ns.svc.cluster-1.com componentDomain=${PD_DOMAIN} waitThreshold=30 +initWaitTime=0 +sleep initWaitTime nsLookupCmd="dig ${componentDomain} A ${componentDomain} AAAA +search +short" elapseTime=0 From c329624a4a3108e21b045f1ee0770a8c6b1dd732 Mon Sep 17 00:00:00 2001 From: Hu# Date: Tue, 6 Aug 2024 18:09:09 +0800 Subject: [PATCH 2/8] pdms: Add the name field to the startup parameters (#5698) Signed-off-by: husharp Signed-off-by: husharp Co-authored-by: Xuecheng Zhang --- examples/basic/pd-micro-service-cluster.yaml | 15 +-- pkg/controller/pd_control.go | 15 ++- pkg/manager/member/pd_ms_member_manager.go | 10 -- pkg/manager/member/pd_ms_upgrader.go | 33 +++-- pkg/manager/member/pd_ms_upgrader_test.go | 1 - .../member/startscript/v2/pd_start_script.go | 28 ++++- .../startscript/v2/pd_start_script_test.go | 116 ++++++++++++++++++ pkg/manager/member/tikv_member_manager.go | 4 +- pkg/pdapi/pdapi.go | 2 +- pkg/pdapi/pdms_api.go | 2 + 10 files changed, 178 insertions(+), 48 deletions(-) diff --git a/examples/basic/pd-micro-service-cluster.yaml b/examples/basic/pd-micro-service-cluster.yaml index 355831f56ea..6f6a294bc5a 100644 --- a/examples/basic/pd-micro-service-cluster.yaml +++ b/examples/basic/pd-micro-service-cluster.yaml @@ -16,8 +16,9 @@ spec: helper: image: alpine:3.16.0 pd: - baseImage: pingcap/pd - version: v8.1.0 + # TODO: replaced v8.3.0 after v8.3.0 released + baseImage: 
hub.pingcap.net/devbuild/pd + version: v8.3.0-5427 maxFailoverCount: 0 replicas: 1 # if storageClassName is not set, the default Storage Class of the Kubernetes cluster will be used @@ -28,13 +29,13 @@ spec: mode: "ms" pdms: - name: "tso" - baseImage: pingcap/pd - version: v8.1.0 + baseImage: hub.pingcap.net/devbuild/pd + version: v8.3.0-5427 replicas: 2 - name: "scheduling" - baseImage: pingcap/pd - version: v8.1.0 - replicas: 1 + baseImage: hub.pingcap.net/devbuild/pd + version: v8.3.0-5427 + replicas: 2 tikv: baseImage: pingcap/tikv version: v8.1.0 diff --git a/pkg/controller/pd_control.go b/pkg/controller/pd_control.go index c56aeae8377..53418aa39bb 100644 --- a/pkg/controller/pd_control.go +++ b/pkg/controller/pd_control.go @@ -74,26 +74,29 @@ func GetPDClient(pdControl pdapi.PDControlInterface, tc *v1alpha1.TidbCluster) p } // GetPDMSClient tries to return an available PDMSClient -func GetPDMSClient(pdControl pdapi.PDControlInterface, tc *v1alpha1.TidbCluster, serviceName string) error { +func GetPDMSClient(pdControl pdapi.PDControlInterface, tc *v1alpha1.TidbCluster, serviceName string) pdapi.PDMSClient { pdMSClient := getPDMSClientFromService(pdControl, tc, serviceName) err := pdMSClient.GetHealth() if err == nil { - return nil + return pdMSClient } for _, service := range tc.Status.PDMS { + if service.Name != serviceName { + continue + } for _, pdMember := range service.Members { - pdPeerClient := pdControl.GetPDMSClient(pdapi.Namespace(tc.GetNamespace()), tc.GetName(), serviceName, + pdMSPeerClient := pdControl.GetPDMSClient(pdapi.Namespace(tc.GetNamespace()), tc.GetName(), serviceName, tc.IsTLSClusterEnabled(), pdapi.SpecifyClient(pdMember, pdMember)) - err = pdPeerClient.GetHealth() + err = pdMSPeerClient.GetHealth() if err == nil { - return nil + return pdMSPeerClient } } } - return err + return nil } // NewFakePDClient creates a fake pdclient that is set as the pd client diff --git a/pkg/manager/member/pd_ms_member_manager.go b/pkg/manager/member/pd_ms_member_manager.go index 7ad6189cbdb..9db67ca571a 100644 --- a/pkg/manager/member/pd_ms_member_manager.go +++ b/pkg/manager/member/pd_ms_member_manager.go @@ -18,7 +18,6 @@ import ( "path" "strings" - "github.com/Masterminds/semver" "github.com/pingcap/tidb-operator/pkg/apis/label" "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" "github.com/pingcap/tidb-operator/pkg/controller" @@ -775,15 +774,6 @@ func (m *pdMSMemberManager) getPDMSConfigMap(tc *v1alpha1.TidbCluster, curSpec * return cm, nil } -// PDMSSupportMicroServices returns true if the given version of PDMS supports microservices. 
-func PDMSSupportMicroServices(version string) (bool, error) { - v, err := semver.NewVersion(version) - if err != nil { - return true, err - } - return v.Major() >= 7 && v.Minor() >= 2 && v.Patch() >= 0, nil -} - type FakePDMSMemberManager struct { err error } diff --git a/pkg/manager/member/pd_ms_upgrader.go b/pkg/manager/member/pd_ms_upgrader.go index b42ed0a95dd..a00653f5dea 100644 --- a/pkg/manager/member/pd_ms_upgrader.go +++ b/pkg/manager/member/pd_ms_upgrader.go @@ -48,23 +48,23 @@ func (u *pdMSUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.St return fmt.Errorf("tidbcluster: [%s/%s]'s pdMS status is nil, can not to be upgraded", ns, tcName) } - componentName := controller.PDMSTrimName(newSet.Name) - klog.Infof("gracefulUpgrade pdMS trim name, componentName: %s", componentName) - if tc.Status.PDMS[componentName] == nil { - tc.Status.PDMS[componentName] = &v1alpha1.PDMSStatus{Name: componentName} - return fmt.Errorf("tidbcluster: [%s/%s]'s pdMS component is nil, can not to be upgraded, component: %s", ns, tcName, componentName) + curService := controller.PDMSTrimName(newSet.Name) + klog.Infof("TidbCluster: [%s/%s]' gracefulUpgrade pdMS trim name, componentName: %s", ns, tcName, curService) + if tc.Status.PDMS[curService] == nil { + tc.Status.PDMS[curService] = &v1alpha1.PDMSStatus{Name: curService} + return fmt.Errorf("tidbcluster: [%s/%s]'s pdMS component is nil, can not to be upgraded, component: %s", ns, tcName, curService) } - if !tc.Status.PDMS[componentName].Synced { - return fmt.Errorf("tidbcluster: [%s/%s]'s pdMS status sync failed, can not to be upgraded, component: %s", ns, tcName, componentName) + if !tc.Status.PDMS[curService].Synced { + return fmt.Errorf("tidbcluster: [%s/%s]'s pdMS status sync failed, can not to be upgraded, component: %s", ns, tcName, curService) } oldTrimName := controller.PDMSTrimName(oldSet.Name) - if oldTrimName != componentName { - return fmt.Errorf("tidbcluster: [%s/%s]'s pdMS oldTrimName is %s, not equal to componentName: %s", ns, tcName, oldTrimName, componentName) + if oldTrimName != curService { + return fmt.Errorf("tidbcluster: [%s/%s]'s pdMS oldTrimName is %s, not equal to componentName: %s", ns, tcName, oldTrimName, curService) } - klog.Infof("gracefulUpgrade pdMS trim name, oldTrimName: %s", oldTrimName) + klog.Infof("TidbCluster: [%s/%s]' gracefulUpgrade pdMS trim name, oldTrimName: %s", ns, tcName, oldTrimName) if tc.PDMSScaling(oldTrimName) { klog.Infof("TidbCluster: [%s/%s]'s pdMS status is %v, can not upgrade pdMS", - ns, tcName, tc.Status.PDMS[componentName].Phase) + ns, tcName, tc.Status.PDMS[curService].Phase) _, podSpec, err := GetLastAppliedConfig(oldSet) if err != nil { return err @@ -73,7 +73,7 @@ func (u *pdMSUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.St return nil } - tc.Status.PDMS[componentName].Phase = v1alpha1.UpgradePhase + tc.Status.PDMS[curService].Phase = v1alpha1.UpgradePhase if !templateEqual(newSet, oldSet) { return nil } @@ -84,7 +84,7 @@ func (u *pdMSUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.St // If we encounter this situation, we will let the native statefulset controller do the upgrade completely, which may be unsafe for upgrading pdMS. // Therefore, in the production environment, we should try to avoid modifying the pd statefulset update strategy directly. 
newSet.Spec.UpdateStrategy = oldSet.Spec.UpdateStrategy - klog.Warningf("tidbcluster: [%s/%s] pdMS statefulset %s UpdateStrategy has been modified manually, componentName: %s", ns, tcName, oldSet.GetName(), componentName) + klog.Warningf("Tidbcluster: [%s/%s] pdMS statefulset %s UpdateStrategy has been modified manually, componentName: %s", ns, tcName, oldSet.GetName(), curService) return nil } @@ -103,27 +103,26 @@ func (u *pdMSUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.St return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pdMS pod: [%s] has no label: %s", ns, tcName, podName, apps.ControllerRevisionHashLabelKey) } - if revision == tc.Status.PDMS[componentName].StatefulSet.UpdateRevision { + if revision == tc.Status.PDMS[curService].StatefulSet.UpdateRevision { if !k8s.IsPodReady(pod) { return controller.RequeueErrorf("tidbcluster: [%s/%s]'s upgraded pdMS pod: [%s] is not ready", ns, tcName, podName) } var exist bool - for _, member := range tc.Status.PDMS[componentName].Members { + for _, member := range tc.Status.PDMS[curService].Members { if strings.Contains(member, podName) { exist = true } } if !exist { return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pdMS upgraded pod: [%s] is not exist, all members: %v", - ns, tcName, podName, tc.Status.PDMS[componentName].Members) + ns, tcName, podName, tc.Status.PDMS[curService].Members) } continue } mngerutils.SetUpgradePartition(newSet, i) return nil } - return nil } diff --git a/pkg/manager/member/pd_ms_upgrader_test.go b/pkg/manager/member/pd_ms_upgrader_test.go index 9059b4c58dd..19152d69b37 100644 --- a/pkg/manager/member/pd_ms_upgrader_test.go +++ b/pkg/manager/member/pd_ms_upgrader_test.go @@ -216,7 +216,6 @@ func TestPDMSUpgraderUpgrade(t *testing.T) { for i := range tests { testFn(&tests[i]) } - } func newPDMSUpgrader() (Upgrader, podinformers.PodInformer) { diff --git a/pkg/manager/member/startscript/v2/pd_start_script.go b/pkg/manager/member/startscript/v2/pd_start_script.go index 0a7eac84e1a..8cfe4bb2dcd 100644 --- a/pkg/manager/member/startscript/v2/pd_start_script.go +++ b/pkg/manager/member/startscript/v2/pd_start_script.go @@ -23,6 +23,7 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" "github.com/pingcap/tidb-operator/pkg/controller" "github.com/pingcap/tidb-operator/pkg/manager/member/constants" + "github.com/pingcap/tidb-operator/pkg/util/cmpver" ) // PDStartScriptModel contain fields for rendering PD start script @@ -47,6 +48,7 @@ type PDMSStartScriptModel struct { PDInitWaitTime int PDAddresses string + PDMSName string PDMSDomain string ListenAddr string AdvertiseListenAddr string @@ -100,7 +102,7 @@ func RenderPDStartScript(tc *v1alpha1.TidbCluster) (string, error) { mode := "" if tc.Spec.PD.Mode == "ms" && tc.Spec.PDMS != nil { mode = "api" - // default enbled the dns detection + // default enabled the dns detection waitForDnsNameIpMatchOnStartup = true } pdStartScriptTpl := template.Must( @@ -136,6 +138,14 @@ func renderPDMSStartScript(tc *v1alpha1.TidbCluster, name string) (string, error m.PDMSDomain = m.PDMSDomain + "." 
+ tc.Spec.ClusterDomain } + if check, err := pdMSSupportMicroServicesWithName.Check(tc.PDMSVersion(name)); check && err == nil { + m.PDMSName = "${PDMS_POD_NAME}" + if tc.Spec.ClusterDomain != "" { + m.PDMSName = m.PDMSDomain + } + name = fmt.Sprintf("%s --name=%s", name, m.PDMSName) + } + m.PDStartTimeout = tc.PDStartTimeout() m.PDInitWaitTime = tc.PDInitWaitTime() @@ -315,10 +325,20 @@ func replacePdStartScriptDnsAwaitPart(withLocalIpMatch bool, startScript string) } } -func enableMicroServiceModeDynamic(ms string, startScript string) string { - if ms != "" { - return strings.ReplaceAll(startScript, pdEnableMicroService, fmt.Sprintf(" %s %s ", pdEnableMicroServiceSubScript, ms)) +// startParams has different values for different PD related service: +// - for original `PD`, startParams should be empty. +// - for `PD API` service, startParams should be `api` +// - for `TSO` and `Scheduling`, startParams should be `tso` and `scheduling` respectively. +// NOTICE: in `8.3.0` we have supported `name` start parameter, so we will pass `tso name=${PDMS_POD_NAME}` to startParams. +func enableMicroServiceModeDynamic(startParams string, startScript string) string { + if startParams != "" { + return strings.ReplaceAll(startScript, pdEnableMicroService, fmt.Sprintf(" %s %s ", pdEnableMicroServiceSubScript, startParams)) } else { + // for original `PD`, should be empty. return strings.ReplaceAll(startScript, pdEnableMicroService, "") } } + +// PDMSSupportMicroServicesWithName returns true if the given version of PDMS supports microservices with name. +// related https://github.com/tikv/pd/pull/8461. +var pdMSSupportMicroServicesWithName, _ = cmpver.NewConstraint(cmpver.GreaterOrEqual, "v8.3.0") diff --git a/pkg/manager/member/startscript/v2/pd_start_script_test.go b/pkg/manager/member/startscript/v2/pd_start_script_test.go index 8f07e70405b..703cd994eda 100644 --- a/pkg/manager/member/startscript/v2/pd_start_script_test.go +++ b/pkg/manager/member/startscript/v2/pd_start_script_test.go @@ -19,6 +19,8 @@ import ( "github.com/google/go-cmp/cmp" "github.com/onsi/gomega" "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" + "github.com/stretchr/testify/require" + "k8s.io/utils/pointer" ) func TestRenderPDStartScript(t *testing.T) { @@ -793,6 +795,83 @@ ARGS=" services tso --listen-addr=http://0.0.0.0:2379 \ --config=/etc/pd/pd.toml \ " +echo "starting pd-server ..." +sleep $((RANDOM % 10)) +echo "/pd-server ${ARGS}" +exec /pd-server ${ARGS} +exit 0 +`, + }, + { + name: "pdms with name(>= 8.3.0)", + modifyTC: func(tc *v1alpha1.TidbCluster) { + tc.Spec.ClusterDomain = "cluster-1.com" + tc.Spec.PDMS = []*v1alpha1.PDMSSpec{ + { + ComponentSpec: v1alpha1.ComponentSpec{ + Image: "pingcap/pd:v8.3.0", + }, + Name: "tso", + }, + } + }, + expectScript: `#!/bin/sh + +set -uo pipefail + +ANNOTATIONS="/etc/podinfo/annotations" +if [[ ! -f "${ANNOTATIONS}" ]] +then + echo "${ANNOTATIONS} does't exist, exiting." + exit 1 +fi +source ${ANNOTATIONS} 2>/dev/null + +runmode=${runmode:-normal} +if [[ X${runmode} == Xdebug ]] +then + echo "entering debug mode." + tail -f /dev/null +fi + +PDMS_POD_NAME=${POD_NAME:-$HOSTNAME} +PD_DOMAIN=${PDMS_POD_NAME}.start-script-test-tso-peer.start-script-test-ns.svc.cluster-1.com + +elapseTime=0 +period=1 +threshold=30 +while true; do + sleep ${period} + elapseTime=$(( elapseTime+period )) + + if [[ ${elapseTime} -ge ${threshold} ]]; then + echo "waiting for pd cluster ready timeout" >&2 + exit 1 + fi + + digRes=$(dig ${PD_DOMAIN} A ${PD_DOMAIN} AAAA +search +short) + if [ $? 
-ne 0 ]; then + echo "domain resolve ${PD_DOMAIN} failed" + echo "$digRes" + continue + fi + + if [ -z "${digRes}" ] + then + echo "domain resolve ${PD_DOMAIN} no record return" + else + echo "domain resolve ${PD_DOMAIN} success" + echo "$digRes" + break + fi +done + +ARGS=" services tso --name=${PDMS_POD_NAME}.start-script-test-tso-peer.start-script-test-ns.svc.cluster-1.com --listen-addr=http://0.0.0.0:2379 \ +--advertise-listen-addr=http://${PD_DOMAIN}:2379 \ +--backend-endpoints=http://start-script-test-pd:2379 \ +--config=/etc/pd/pd.toml \ +" + echo "starting pd-server ..." sleep $((RANDOM % 10)) echo "/pd-server ${ARGS}" @@ -822,3 +901,40 @@ exit 0 g.Expect(validateScript(script)).Should(gomega.Succeed()) } } + +func TestPDMSWithName(t *testing.T) { + re := require.New(t) + tc := &v1alpha1.TidbCluster{ + Spec: v1alpha1.TidbClusterSpec{ + PDMS: []*v1alpha1.PDMSSpec{ + { + Name: "tso", + ComponentSpec: v1alpha1.ComponentSpec{ + Image: "pd-test-image", + }, + Replicas: 3, + StorageClassName: pointer.StringPtr("my-storage-class"), + }, + }, + }, + } + + for _, spec := range tc.Spec.PDMS { + spec.Image = "pingcap/pd:v8.2.0" + } + check, err := pdMSSupportMicroServicesWithName.Check(tc.PDMSVersion("tso")) + re.Nil(err) + re.False(check) + for _, spec := range tc.Spec.PDMS { + spec.Image = "pingcap/pd:v8.3.0" + } + check, err = pdMSSupportMicroServicesWithName.Check(tc.PDMSVersion("tso")) + re.Nil(err) + re.True(check) + for _, spec := range tc.Spec.PDMS { + spec.Image = "pingcap/pd:v9.1.0" + } + check, err = pdMSSupportMicroServicesWithName.Check(tc.PDMSVersion("tso")) + re.Nil(err) + re.True(check) +} diff --git a/pkg/manager/member/tikv_member_manager.go b/pkg/manager/member/tikv_member_manager.go index 5da033d984b..58e20aaafbd 100644 --- a/pkg/manager/member/tikv_member_manager.go +++ b/pkg/manager/member/tikv_member_manager.go @@ -115,9 +115,9 @@ func (m *tikvMemberManager) Sync(tc *v1alpha1.TidbCluster) error { // Check if all PD Micro Services are available if tc.Spec.PDMS != nil && (tc.Spec.PD != nil && tc.Spec.PD.Mode == "ms") { for _, pdms := range tc.Spec.PDMS { - if err = controller.GetPDMSClient(m.deps.PDControl, tc, pdms.Name); err != nil { + if cli := controller.GetPDMSClient(m.deps.PDControl, tc, pdms.Name); cli == nil { return controller.RequeueErrorf("PDMS component %s for TidbCluster: [%s/%s], "+ - "waiting for PD micro service cluster running, error: %v", pdms.Name, ns, tcName, err) + "waiting for PD micro service cluster running", pdms.Name, ns, tcName) } } } diff --git a/pkg/pdapi/pdapi.go b/pkg/pdapi/pdapi.go index bc8d867e0cc..5f2fe17f38e 100644 --- a/pkg/pdapi/pdapi.go +++ b/pkg/pdapi/pdapi.go @@ -94,7 +94,7 @@ type PDClient interface { GetAutoscalingPlans(strategy Strategy) ([]Plan, error) // GetRecoveringMark return the pd recovering mark GetRecoveringMark() (bool, error) - // GetMSMembers returns all PD members service-addr from cluster by specific Micro Service + // GetMSMembers returns all PDMS members service-addr from cluster by specific Micro Service GetMSMembers(service string) ([]string, error) } diff --git a/pkg/pdapi/pdms_api.go b/pkg/pdapi/pdms_api.go index 3e53d7558f1..1a948e0692d 100644 --- a/pkg/pdapi/pdms_api.go +++ b/pkg/pdapi/pdms_api.go @@ -20,6 +20,7 @@ import ( "time" httputil "github.com/pingcap/tidb-operator/pkg/util/http" + "k8s.io/klog/v2" ) // PDMSClient provides pd MS server's api @@ -58,6 +59,7 @@ func NewPDMSClient(serviceName, url string, timeout time.Duration, tlsConfig *tl func (c *pdMSClient) GetHealth() error { // only support TSO 
service if c.serviceName != TSOServiceName { + klog.Errorf("only support TSO service, but got %s", c.serviceName) return nil } apiURL := fmt.Sprintf("%s/%s/%s", c.url, c.serviceName, pdMSHealthPrefix) From 545685a65d02612fd4d57c5bcbe405a32b0c4276 Mon Sep 17 00:00:00 2001 From: wuhuizuo Date: Thu, 8 Aug 2024 13:06:40 +0800 Subject: [PATCH 3/8] fix(deps): update docker image of pingcap base to v1.9.1 (#5705) --- cmd/http-service/Dockerfile | 2 +- images/br-federation-manager/Dockerfile | 2 +- images/br-federation-manager/Dockerfile.e2e | 2 +- images/tidb-backup-manager/Dockerfile | 2 +- images/tidb-backup-manager/Dockerfile.e2e | 2 +- images/tidb-operator/Dockerfile | 2 +- images/tidb-operator/Dockerfile.e2e | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cmd/http-service/Dockerfile b/cmd/http-service/Dockerfile index b7221fdb5a7..ba4af321248 100644 --- a/cmd/http-service/Dockerfile +++ b/cmd/http-service/Dockerfile @@ -1,4 +1,4 @@ -FROM pingcap/pingcap-base:v1 +FROM ghcr.io/pingcap-qe/bases/pingcap-base:v1.9.1 ARG TARGETARCH RUN dnf install -y tzdata bind-utils && dnf clean all diff --git a/images/br-federation-manager/Dockerfile b/images/br-federation-manager/Dockerfile index 7f65cb87ee2..13ee5acb6e3 100644 --- a/images/br-federation-manager/Dockerfile +++ b/images/br-federation-manager/Dockerfile @@ -1,4 +1,4 @@ -FROM pingcap/pingcap-base:v1 +FROM ghcr.io/pingcap-qe/bases/pingcap-base:v1.9.1 ARG TARGETARCH RUN dnf install -y bind-utils tzdata && dnf clean all ADD bin/${TARGETARCH}/br-federation-manager /usr/local/bin/br-federation-manager diff --git a/images/br-federation-manager/Dockerfile.e2e b/images/br-federation-manager/Dockerfile.e2e index 626b24ef04d..1dddaa24091 100644 --- a/images/br-federation-manager/Dockerfile.e2e +++ b/images/br-federation-manager/Dockerfile.e2e @@ -1,4 +1,4 @@ -FROM pingcap/pingcap-base:v1 +FROM ghcr.io/pingcap-qe/bases/pingcap-base:v1.9.1 ARG TARGETARCH RUN dnf install -y tzdata bind-utils && dnf clean all diff --git a/images/tidb-backup-manager/Dockerfile b/images/tidb-backup-manager/Dockerfile index 96d8df4a73f..79b5b324d49 100644 --- a/images/tidb-backup-manager/Dockerfile +++ b/images/tidb-backup-manager/Dockerfile @@ -1,4 +1,4 @@ -FROM pingcap/pingcap-base:v1 +FROM ghcr.io/pingcap-qe/bases/pingcap-base:v1.9.1 ARG TARGETARCH ARG RCLONE_VERSION=v1.57.0 ARG SHUSH_VERSION=v1.4.0 diff --git a/images/tidb-backup-manager/Dockerfile.e2e b/images/tidb-backup-manager/Dockerfile.e2e index c77c3fccbaa..04a9c1bf29e 100644 --- a/images/tidb-backup-manager/Dockerfile.e2e +++ b/images/tidb-backup-manager/Dockerfile.e2e @@ -1,4 +1,4 @@ -FROM pingcap/pingcap-base:v1 +FROM ghcr.io/pingcap-qe/bases/pingcap-base:v1.9.1 ARG TARGETARCH=amd64 ARG RCLONE_VERSION=v1.57.0 ARG SHUSH_VERSION=v1.4.0 diff --git a/images/tidb-operator/Dockerfile b/images/tidb-operator/Dockerfile index d4b0495c65e..5aed5287131 100644 --- a/images/tidb-operator/Dockerfile +++ b/images/tidb-operator/Dockerfile @@ -1,4 +1,4 @@ -FROM pingcap/pingcap-base:v1 +FROM ghcr.io/pingcap-qe/bases/pingcap-base:v1.9.1 ARG TARGETARCH RUN dnf install -y tzdata bind-utils && dnf clean all diff --git a/images/tidb-operator/Dockerfile.e2e b/images/tidb-operator/Dockerfile.e2e index 7d1b9a7938f..a700aa92136 100644 --- a/images/tidb-operator/Dockerfile.e2e +++ b/images/tidb-operator/Dockerfile.e2e @@ -1,4 +1,4 @@ -FROM pingcap/pingcap-base:v1 +FROM ghcr.io/pingcap-qe/bases/pingcap-base:v1.9.1 RUN dnf install -y tzdata bash bind-utils && dnf clean all From dbe75c02a4214cc2c4a02e8d1b846816a224f835 Mon 
Sep 17 00:00:00 2001 From: Hu# Date: Wed, 14 Aug 2024 09:46:36 +0800 Subject: [PATCH 4/8] pdms: Choose a suitable pdms to transfer primary when upgrade (#5643) --- pkg/controller/pd_control.go | 14 ++++ pkg/manager/member/pd_ms_upgrader.go | 88 ++++++++++++++++++++++- pkg/manager/member/pd_ms_upgrader_test.go | 20 +++++- pkg/manager/member/utils.go | 15 ++++ pkg/pdapi/fake_pdapi.go | 48 +++++++++++++ pkg/pdapi/pd_control.go | 14 +++- pkg/pdapi/pdapi.go | 17 +++++ pkg/pdapi/pdms_api.go | 25 ++++++- 8 files changed, 234 insertions(+), 7 deletions(-) diff --git a/pkg/controller/pd_control.go b/pkg/controller/pd_control.go index 53418aa39bb..b42f88bb0a5 100644 --- a/pkg/controller/pd_control.go +++ b/pkg/controller/pd_control.go @@ -113,6 +113,20 @@ func NewFakePDClient(pdControl *pdapi.FakePDControl, tc *v1alpha1.TidbCluster) * return pdClient } +// NewFakePDMSClient creates a fake pdmsclient that is set as the pdms client +func NewFakePDMSClient(pdControl *pdapi.FakePDControl, tc *v1alpha1.TidbCluster, curService string) *pdapi.FakePDMSClient { + pdmsClient := pdapi.NewFakePDMSClient() + if tc.Spec.Cluster != nil { + pdControl.SetPDMSClientWithClusterDomain(pdapi.Namespace(tc.Spec.Cluster.Namespace), tc.Spec.Cluster.Name, tc.Spec.Cluster.ClusterDomain, curService, pdmsClient) + } + if tc.Spec.ClusterDomain != "" { + pdControl.SetPDMSClientWithClusterDomain(pdapi.Namespace(tc.GetNamespace()), tc.GetName(), tc.Spec.ClusterDomain, curService, pdmsClient) + } + pdControl.SetPDMSClient(pdapi.Namespace(tc.GetNamespace()), tc.GetName(), curService, pdmsClient) + + return pdmsClient +} + // NewFakePDClientWithAddress creates a fake pdclient that is set as the pd client func NewFakePDClientWithAddress(pdControl *pdapi.FakePDControl, peerURL string) *pdapi.FakePDClient { pdClient := pdapi.NewFakePDClient() diff --git a/pkg/manager/member/pd_ms_upgrader.go b/pkg/manager/member/pd_ms_upgrader.go index a00653f5dea..3c99ad64999 100644 --- a/pkg/manager/member/pd_ms_upgrader.go +++ b/pkg/manager/member/pd_ms_upgrader.go @@ -22,6 +22,7 @@ import ( "github.com/pingcap/tidb-operator/pkg/controller" mngerutils "github.com/pingcap/tidb-operator/pkg/manager/utils" "github.com/pingcap/tidb-operator/pkg/third_party/k8s" + "github.com/pingcap/tidb-operator/pkg/util/cmpver" apps "k8s.io/api/apps/v1" "k8s.io/klog/v2" ) @@ -120,12 +121,95 @@ func (u *pdMSUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.St } continue } - mngerutils.SetUpgradePartition(newSet, i) - return nil + + return u.upgradePDMSPod(tc, i, newSet, curService) } return nil } +func (u *pdMSUpgrader) upgradePDMSPod(tc *v1alpha1.TidbCluster, ordinal int32, newSet *apps.StatefulSet, curService string) error { + // Only support after `8.3.0` to keep compatibility. 
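+	// Earlier releases do not start PDMS members with the `--name` parameter, so the primary returned by PD could not be matched against the member/pod names below.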
+ if check, err := pdMSSupportMicroServicesWithName.Check(tc.PDMSVersion(curService)); check && err == nil { + ns := tc.GetNamespace() + tcName := tc.GetName() + upgradePDMSName := PDMSName(tcName, ordinal, tc.Namespace, tc.Spec.ClusterDomain, tc.Spec.AcrossK8s, curService) + upgradePodName := PDMSPodName(tcName, ordinal, curService) + + pdClient := controller.GetPDClient(u.deps.PDControl, tc) + primary, err := pdClient.GetMSPrimary(curService) + if err != nil { + return err + } + + klog.Infof("TidbCluster: [%s/%s]' pdms upgrader: check primary: %s, upgradePDMSName: %s, upgradePodName: %s", ns, tcName, + primary, upgradePDMSName, upgradePodName) + // If current pdms is primary, transfer primary to other pdms pod + if strings.Contains(primary, upgradePodName) || strings.Contains(primary, upgradePDMSName) { + targetName := "" + + if tc.PDMSStsActualReplicas(curService) > 1 { + targetName = choosePDMSToTransferFromMembers(tc, newSet, ordinal) + } + + if targetName != "" { + klog.Infof("TidbCluster: [%s/%s]' pdms upgrader: transfer pdms primary to: %s", ns, tcName, targetName) + err := controller.GetPDMSClient(u.deps.PDControl, tc, curService).TransferPrimary(targetName) + if err != nil { + klog.Errorf("TidbCluster: [%s/%s]' pdms upgrader: failed to transfer pdms primary to: %s, %v", ns, tcName, targetName, err) + return err + } + klog.Infof("TidbCluster: [%s/%s]' pdms upgrader: transfer pdms primary to: %s successfully", ns, tcName, targetName) + } else { + klog.Warningf("TidbCluster: [%s/%s]' pdms upgrader: skip to transfer pdms primary, because can not find a suitable pd", ns, tcName) + } + } + } + + mngerutils.SetUpgradePartition(newSet, ordinal) + return nil +} + +// choosePDMSToTransferFromMembers choose a pdms to transfer primary from members +// +// Assume that current primary ordinal is x, and range is [0, n] +// 1. Find the max suitable ordinal in (x, n], because they have been upgraded +// 2. If no suitable ordinal, find the min suitable ordinal in [0, x) to reduce the count of transfer +func choosePDMSToTransferFromMembers(tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet, ordinal int32) string { + ns := tc.GetNamespace() + tcName := tc.GetName() + klog.Infof("Tidbcluster: [%s/%s]' pdms upgrader: start to choose pdms to transfer primary from members", ns, tcName) + ordinals := helper.GetPodOrdinals(*newSet.Spec.Replicas, newSet) + + // set ordinal to max ordinal if ordinal isn't exist + if !ordinals.Has(ordinal) { + ordinal = helper.GetMaxPodOrdinal(*newSet.Spec.Replicas, newSet) + } + + targetName := "" + list := ordinals.List() + if len(list) == 0 { + return "" + } + + // just using pods index for now. TODO: add healthy checker for pdms. + // find the maximum ordinal which is larger than ordinal + if len(list) > int(ordinal)+1 { + targetName = PDMSPodName(tcName, list[len(list)-1], controller.PDMSTrimName(newSet.Name)) + } + + if targetName == "" && ordinal != 0 { + // find the minimum ordinal which is less than ordinal + targetName = PDMSPodName(tcName, list[0], controller.PDMSTrimName(newSet.Name)) + } + + klog.Infof("Tidbcluster: [%s/%s]' pdms upgrader: choose pdms to transfer primary from members, targetName: %s", ns, tcName, targetName) + return targetName +} + +// PDMSSupportMicroServicesWithName returns true if the given version of PDMS supports microservices with name. +// related https://github.com/tikv/pd/pull/8157. 
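+// Note: an equivalent constraint is declared in startscript/v2 to decide whether `--name` is rendered; keep the two definitions in sync.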
+var pdMSSupportMicroServicesWithName, _ = cmpver.NewConstraint(cmpver.GreaterOrEqual, "v8.3.0") + type fakePDMSUpgrader struct{} // NewFakePDMSUpgrader returns a fakePDUpgrader diff --git a/pkg/manager/member/pd_ms_upgrader_test.go b/pkg/manager/member/pd_ms_upgrader_test.go index 19152d69b37..b6b0f7adfb3 100644 --- a/pkg/manager/member/pd_ms_upgrader_test.go +++ b/pkg/manager/member/pd_ms_upgrader_test.go @@ -21,6 +21,7 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" "github.com/pingcap/tidb-operator/pkg/controller" mngerutils "github.com/pingcap/tidb-operator/pkg/manager/utils" + "github.com/pingcap/tidb-operator/pkg/pdapi" apps "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -44,8 +45,20 @@ func TestPDMSUpgraderUpgrade(t *testing.T) { testFn := func(test *testcase) { t.Log(test.name) - upgrader, podInformer := newPDMSUpgrader() + upgrader, pdControl, podInformer := newPDMSUpgrader() tc := newTidbClusterForPDMSUpgrader() + pdClient := controller.NewFakePDClient(pdControl, tc) + pdMSClient := controller.NewFakePDMSClient(pdControl, tc, "tso") + + pdClient.AddReaction(pdapi.GetPDMSPrimaryActionType, func(action *pdapi.Action) (interface{}, error) { + return "upgrader-tso-1", nil + }) + pdMSClient.AddReaction(pdapi.GetHealthActionType, func(action *pdapi.Action) (interface{}, error) { + return nil, nil + }) + pdMSClient.AddReaction(pdapi.PDMSTransferPrimaryActionType, func(action *pdapi.Action) (interface{}, error) { + return nil, nil + }) if test.changeFn != nil { test.changeFn(tc) @@ -218,11 +231,12 @@ func TestPDMSUpgraderUpgrade(t *testing.T) { } } -func newPDMSUpgrader() (Upgrader, podinformers.PodInformer) { +func newPDMSUpgrader() (Upgrader, *pdapi.FakePDControl, podinformers.PodInformer) { fakeDeps := controller.NewFakeDependencies() pdMSUpgrader := &pdMSUpgrader{deps: fakeDeps} podInformer := fakeDeps.KubeInformerFactory.Core().V1().Pods() - return pdMSUpgrader, podInformer + pdControl := fakeDeps.PDControl.(*pdapi.FakePDControl) + return pdMSUpgrader, pdControl, podInformer } func newStatefulSetForPDMSUpgrader() *apps.StatefulSet { diff --git a/pkg/manager/member/utils.go b/pkg/manager/member/utils.go index 898ee77ec29..c7cda0ba535 100644 --- a/pkg/manager/member/utils.go +++ b/pkg/manager/member/utils.go @@ -160,6 +160,21 @@ func PdName(tcName string, ordinal int32, namespace string, clusterDomain string return PdPodName(tcName, ordinal) } +// PDMSName should match the start arg `--name` of pd-server +// See the start script of PDMS in pkg/manager/member/startscript/v2.renderPDMSStartScript +func PDMSName(tcName string, ordinal int32, namespace, clusterDomain string, acrossK8s bool, component string) string { + if len(clusterDomain) > 0 { + return fmt.Sprintf("%s.%s-%s-peer.%s.svc.%s", PDMSPodName(tcName, ordinal, component), component, tcName, namespace, clusterDomain) + } + + // clusterDomain is not set + if acrossK8s { + return fmt.Sprintf("%s.%s-%s-peer.%s.svc", PDMSPodName(tcName, ordinal, component), component, tcName, namespace) + } + + return PDMSPodName(tcName, ordinal, component) +} + // NeedForceUpgrade check if force upgrade is necessary func NeedForceUpgrade(ann map[string]string) bool { // Check if annotation 'pingcap.com/force-upgrade: "true"' is set diff --git a/pkg/pdapi/fake_pdapi.go b/pkg/pdapi/fake_pdapi.go index e451b910ed5..80138fe4a5c 100644 --- a/pkg/pdapi/fake_pdapi.go +++ b/pkg/pdapi/fake_pdapi.go @@ -28,6 +28,7 @@ const ( GetClusterActionType ActionType = "GetCluster" 
GetMembersActionType ActionType = "GetMembers" GetPDMSMembersActionType ActionType = "GetPDMSMembers" + GetPDMSPrimaryActionType ActionType = "GetPDMSPrimary" GetStoresActionType ActionType = "GetStores" GetTombStoneStoresActionType ActionType = "GetTombStoneStores" GetStoreActionType ActionType = "GetStore" @@ -45,6 +46,7 @@ const ( TransferPDLeaderActionType ActionType = "TransferPDLeader" GetAutoscalingPlansActionType ActionType = "GetAutoscalingPlans" GetRecoveringMarkActionType ActionType = "GetRecoveringMark" + PDMSTransferPrimaryActionType ActionType = "PDMSTransferPrimary" ) type NotFoundReaction struct { @@ -78,6 +80,15 @@ func (c *FakePDClient) GetMSMembers(_ string) ([]string, error) { return result.([]string), nil } +func (c *FakePDClient) GetMSPrimary(_ string) (string, error) { + action := &Action{} + result, err := c.fakeAPI(GetPDMSPrimaryActionType, action) + if err != nil { + return "", err + } + return result.(string), nil +} + func NewFakePDClient() *FakePDClient { return &FakePDClient{reactions: map[ActionType]Reaction{}} } @@ -291,3 +302,40 @@ func (c *FakePDClient) GetRecoveringMark() (bool, error) { return true, nil } + +// FakePDMSClient implements a fake version of PDMSClient. +type FakePDMSClient struct { + reactions map[ActionType]Reaction +} + +func NewFakePDMSClient() *FakePDMSClient { + return &FakePDMSClient{reactions: map[ActionType]Reaction{}} +} + +func (c *FakePDMSClient) AddReaction(actionType ActionType, reaction Reaction) { + c.reactions[actionType] = reaction +} + +// fakeAPI is a small helper for fake API calls +func (c *FakePDMSClient) fakeAPI(actionType ActionType, action *Action) (interface{}, error) { + if reaction, ok := c.reactions[actionType]; ok { + result, err := reaction(action) + if err != nil { + return nil, err + } + return result, nil + } + return nil, &NotFoundReaction{actionType} +} + +func (c *FakePDMSClient) GetHealth() error { + action := &Action{} + _, err := c.fakeAPI(GetHealthActionType, action) + return err +} + +func (c *FakePDMSClient) TransferPrimary(newPrimary string) error { + action := &Action{Name: newPrimary} + _, err := c.fakeAPI(PDMSTransferPrimaryActionType, action) + return err +} diff --git a/pkg/pdapi/pd_control.go b/pkg/pdapi/pd_control.go index 667300b8e8a..811f7d6c01b 100644 --- a/pkg/pdapi/pd_control.go +++ b/pkg/pdapi/pd_control.go @@ -337,7 +337,7 @@ type FakePDControl struct { func NewFakePDControl(secretLister corelisterv1.SecretLister) *FakePDControl { return &FakePDControl{ - defaultPDControl{secretLister: secretLister, pdClients: map[string]PDClient{}}, + defaultPDControl{secretLister: secretLister, pdClients: map[string]PDClient{}, pdMSClients: map[string]PDMSClient{}}, } } @@ -352,3 +352,15 @@ func (fpc *FakePDControl) SetPDClientWithClusterDomain(namespace Namespace, tcNa func (fpc *FakePDControl) SetPDClientWithAddress(peerURL string, pdclient PDClient) { fpc.defaultPDControl.pdClients[peerURL] = pdclient } + +func (fpc *FakePDControl) SetPDMSClient(namespace Namespace, tcName, curService string, pdmsclient PDMSClient) { + fpc.defaultPDControl.pdMSClients[genClientUrl(namespace, tcName, "http", "", curService, false)] = pdmsclient +} + +func (fpc *FakePDControl) SetPDMSClientWithClusterDomain(namespace Namespace, tcName, tcClusterDomain, curService string, pdmsclient PDMSClient) { + fpc.defaultPDControl.pdMSClients[genClientUrl(namespace, tcName, "http", tcClusterDomain, curService, false)] = pdmsclient +} + +func (fpc *FakePDControl) SetPDMSClientWithAddress(peerURL string, pdmsclient PDMSClient) { 
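+	// mirror SetPDClientWithAddress: register the fake PDMS client under its raw peer URL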
+ fpc.defaultPDControl.pdMSClients[peerURL] = pdmsclient +} diff --git a/pkg/pdapi/pdapi.go b/pkg/pdapi/pdapi.go index 5f2fe17f38e..4a633d6906d 100644 --- a/pkg/pdapi/pdapi.go +++ b/pkg/pdapi/pdapi.go @@ -96,6 +96,8 @@ type PDClient interface { GetRecoveringMark() (bool, error) // GetMSMembers returns all PDMS members service-addr from cluster by specific Micro Service GetMSMembers(service string) ([]string, error) + // GetMSPrimary returns the primary PDMS member service-addr from cluster by specific Micro Service + GetMSPrimary(service string) (string, error) } var ( @@ -341,6 +343,21 @@ func (c *pdClient) GetMSMembers(service string) ([]string, error) { return addrs, nil } +func (c *pdClient) GetMSPrimary(service string) (string, error) { + apiURL := fmt.Sprintf("%s/%s/primary/%s", c.url, MicroServicePrefix, service) + body, err := httputil.GetBodyOK(c.httpClient, apiURL) + if err != nil { + return "", err + } + var primary string + err = json.Unmarshal(body, &primary) + if err != nil { + return "", err + } + + return primary, nil +} + func (c *pdClient) getStores(apiURL string) (*StoresInfo, error) { body, err := httputil.GetBodyOK(c.httpClient, apiURL) if err != nil { diff --git a/pkg/pdapi/pdms_api.go b/pkg/pdapi/pdms_api.go index 1a948e0692d..b8c8c77482c 100644 --- a/pkg/pdapi/pdms_api.go +++ b/pkg/pdapi/pdms_api.go @@ -14,7 +14,9 @@ package pdapi import ( + "bytes" "crypto/tls" + "encoding/json" "fmt" "net/http" "time" @@ -27,10 +29,13 @@ import ( type PDMSClient interface { // GetHealth returns ping result GetHealth() error + // TransferPrimary transfers the primary to the newPrimary + TransferPrimary(newPrimary string) error } var ( - pdMSHealthPrefix = "api/v1/health" + pdMSHealthPrefix = "api/v1/health" + pdMSPrimaryTransferPrefix = "api/v1/primary/transfer" ) // pdMSClient is default implementation of PDClient @@ -69,3 +74,21 @@ func (c *pdMSClient) GetHealth() error { } return nil } + +func (c *pdMSClient) TransferPrimary(newPrimary string) error { + apiURL := fmt.Sprintf("%s/%s/%s", c.url, c.serviceName, pdMSPrimaryTransferPrefix) + data, err := json.Marshal(struct { + NewPrimary string `json:"new_primary"` + }{ + NewPrimary: newPrimary, + }) + if err != nil { + return err + } + _, err = httputil.PostBodyOK(c.httpClient, apiURL, bytes.NewBuffer(data)) + if err != nil { + return err + } + + return nil +} From df24c478e8554f3ad40fe5497967dfc96da41997 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 23 Aug 2024 16:04:20 +0800 Subject: [PATCH 5/8] br: add some log to record the CR we are processing (#5711) (#5715) Co-authored-by: csuzhangxc --- cmd/backup-manager/app/backup/manager.go | 11 +++++++++++ cmd/backup-manager/app/export/manager.go | 8 ++++++++ cmd/backup-manager/app/import/manager.go | 8 ++++++++ cmd/backup-manager/app/restore/manager.go | 11 +++++++++++ 4 files changed, 38 insertions(+) diff --git a/cmd/backup-manager/app/backup/manager.go b/cmd/backup-manager/app/backup/manager.go index f5d30dcdb89..b0d3b712da0 100644 --- a/cmd/backup-manager/app/backup/manager.go +++ b/cmd/backup-manager/app/backup/manager.go @@ -35,6 +35,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" errorutils "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/json" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" ) @@ -101,6 +102,13 @@ func (bm *Manager) ProcessBackup() error { return errorutils.NewAggregate(errs) } + crData, err := json.Marshal(backup) + if err != nil { + klog.Errorf("failed to marshal backup %v to json, err: 
%v", backup, err) + } else { + klog.Infof("start to process backup: %s", string(crData)) + } + // we treat snapshot backup as restarted if its status is not scheduled when backup pod just start to run // we will clean backup data before run br command if backup.Spec.Mode == v1alpha1.BackupModeSnapshot && (backup.Status.Phase != v1alpha1.BackupScheduled || v1alpha1.IsBackupRestart(backup)) { @@ -132,6 +140,9 @@ func (bm *Manager) ProcessBackup() error { return bm.performBackup(ctx, backup.DeepCopy(), nil) } + klog.Infof("start to connect to tidb server (%s:%d) as the .spec.from field is specified", + backup.Spec.From.Host, backup.Spec.From.Port) + // validate and create from db var db *sql.DB db, err = bm.validateAndCreateFromDB(ctx, backup.DeepCopy()) diff --git a/cmd/backup-manager/app/export/manager.go b/cmd/backup-manager/app/export/manager.go index 6bedab45178..49edfa18683 100644 --- a/cmd/backup-manager/app/export/manager.go +++ b/cmd/backup-manager/app/export/manager.go @@ -32,6 +32,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" errorutils "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/json" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" ) @@ -99,6 +100,13 @@ func (bm *BackupManager) ProcessBackup() error { return errorutils.NewAggregate(errs) } + crData, err := json.Marshal(backup) + if err != nil { + klog.Errorf("failed to marshal backup %v to json, err: %v", backup, err) + } else { + klog.Infof("start to process backup: %s", string(crData)) + } + reason, err := bm.setOptions(backup) if err != nil { errs = append(errs, err) diff --git a/cmd/backup-manager/app/import/manager.go b/cmd/backup-manager/app/import/manager.go index 36e4db7e37c..cdeb15737a4 100644 --- a/cmd/backup-manager/app/import/manager.go +++ b/cmd/backup-manager/app/import/manager.go @@ -27,6 +27,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" errorutils "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/json" "k8s.io/klog/v2" ) @@ -87,6 +88,13 @@ func (rm *RestoreManager) ProcessRestore() error { return errorutils.NewAggregate(errs) } + crData, err := json.Marshal(restore) + if err != nil { + klog.Errorf("failed to marshal restore %v to json, err: %s", restore, err) + } else { + klog.Infof("start to process restore: %s", string(crData)) + } + rm.setOptions(restore) return rm.performRestore(ctx, restore.DeepCopy()) diff --git a/cmd/backup-manager/app/restore/manager.go b/cmd/backup-manager/app/restore/manager.go index 5ba497f7bae..da9bf95c620 100644 --- a/cmd/backup-manager/app/restore/manager.go +++ b/cmd/backup-manager/app/restore/manager.go @@ -30,6 +30,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" errorutils "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/json" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" ) @@ -96,12 +97,22 @@ func (rm *Manager) ProcessRestore() error { return fmt.Errorf("no br config in %s", rm) } + crData, err := json.Marshal(restore) + if err != nil { + klog.Errorf("failed to marshal restore %v to json, err: %s", restore, err) + } else { + klog.Infof("start to process restore: %s", string(crData)) + } + if restore.Spec.To == nil { return rm.performRestore(ctx, restore.DeepCopy(), nil) } rm.setOptions(restore) + klog.Infof("start to connect to tidb server (%s:%d) as the .spec.to field is specified", + restore.Spec.To.Host, restore.Spec.To.Port) + var db *sql.DB var dsn string err = 
wait.PollImmediate(constants.PollInterval, constants.CheckTimeout, func() (done bool, err error) { From bdd43958651e9cd493adc9a71bc2e4f67182ff83 Mon Sep 17 00:00:00 2001 From: Xuecheng Zhang Date: Mon, 26 Aug 2024 15:56:06 +0800 Subject: [PATCH 6/8] chore: remove spec.from for Backup and spec.to for Restore in example (#5717) --- examples/backup/backup-azblob.yaml | 6 ------ examples/backup/backup-ebs-local.yaml | 6 ------ examples/backup/backup-ebs-minio.yaml | 6 ------ examples/backup/backup-local.yaml | 6 ------ examples/backup/backup-nfs.yaml | 6 ------ examples/backup/backup-schedule-azblob.yaml | 6 ------ examples/backup/backup-schedule-nfs.yaml | 6 ------ examples/backup/restore-azblob.yaml | 6 ------ examples/backup/restore-ebs-minio.yaml | 6 ------ examples/backup/restore-local.yaml | 6 ------ examples/backup/restore-nfs.yaml | 6 ------ manifests/backup/backup-aws-s3-br.yaml | 6 ------ manifests/backup/backup-gcs-br.yaml | 6 ------ manifests/backup/backup-s3-br.yaml | 6 ------ manifests/backup/backup-schedule-aws-s3-br.yaml | 6 ------ manifests/backup/backup-schedule-gcs-br.yaml | 6 ------ manifests/backup/backup-schedule-s3-br.yaml | 6 ------ manifests/backup/restore-aws-s3-br.yaml | 6 ------ manifests/backup/restore-gcs-br.yaml | 6 ------ manifests/backup/restore-s3-br.yaml | 6 ------ 20 files changed, 120 deletions(-) diff --git a/examples/backup/backup-azblob.yaml b/examples/backup/backup-azblob.yaml index 03637ebe3d0..e211bf12f55 100644 --- a/examples/backup/backup-azblob.yaml +++ b/examples/backup/backup-azblob.yaml @@ -6,12 +6,6 @@ metadata: spec: cleanPolicy: Delete # backupType: full - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # from: - # host: ${tidb-host} - # port: ${tidb-port} - # user: ${tidb-user} - # secretName: backup-basic-tidb-secret br: cluster: basic clusterNamespace: default diff --git a/examples/backup/backup-ebs-local.yaml b/examples/backup/backup-ebs-local.yaml index a6d6de03c18..8bc38985b30 100644 --- a/examples/backup/backup-ebs-local.yaml +++ b/examples/backup/backup-ebs-local.yaml @@ -7,12 +7,6 @@ spec: cleanPolicy: Delete backupType: full backupMode: volume-snapshot - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # from: - # host: ${tidb-host} - # port: ${tidb-port} - # user: ${tidb-user} - # secretName: backup-basic-tidb-secret toolImage: localhost:5000/pingcap/br:latest br: cluster: basic diff --git a/examples/backup/backup-ebs-minio.yaml b/examples/backup/backup-ebs-minio.yaml index 0ea3bb37d86..db55b0084b8 100644 --- a/examples/backup/backup-ebs-minio.yaml +++ b/examples/backup/backup-ebs-minio.yaml @@ -7,12 +7,6 @@ spec: cleanPolicy: Delete backupType: full backupMode: volume-snapshot - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # from: - # host: ${tidb-host} - # port: ${tidb-port} - # user: ${tidb-user} - # secretName: backup-basic-tidb-secret toolImage: localhost:5000/pingcap/br:latest br: cluster: basic diff --git a/examples/backup/backup-local.yaml b/examples/backup/backup-local.yaml index 0e2022c73af..5889bdc51bc 100644 --- a/examples/backup/backup-local.yaml +++ b/examples/backup/backup-local.yaml @@ -6,12 +6,6 @@ metadata: spec: cleanPolicy: Delete # backupType: full - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # from: - # host: ${tidb-host} - # port: ${tidb-port} - # user: ${tidb-user} - # secretName: backup-basic-tidb-secret br: cluster: basic clusterNamespace: default diff --git a/examples/backup/backup-nfs.yaml b/examples/backup/backup-nfs.yaml index 
6d83f184a9d..0c58383cbc3 100644 --- a/examples/backup/backup-nfs.yaml +++ b/examples/backup/backup-nfs.yaml @@ -6,12 +6,6 @@ metadata: spec: cleanPolicy: Delete # backupType: full - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # from: - # host: ${tidb-host} - # port: ${tidb-port} - # user: ${tidb-user} - # secretName: backup-basic-tidb-secret br: cluster: basic clusterNamespace: default diff --git a/examples/backup/backup-schedule-azblob.yaml b/examples/backup/backup-schedule-azblob.yaml index 734df81fa01..4cf78540159 100644 --- a/examples/backup/backup-schedule-azblob.yaml +++ b/examples/backup/backup-schedule-azblob.yaml @@ -9,12 +9,6 @@ spec: # maxReservedTime: "2m" schedule: "*/1 * * * *" backupTemplate: - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # from: - # host: ${tidb_host} - # port: ${tidb_port} - # user: ${tidb_user} - # secretName: backup-demo1-tidb-secret cleanPolicy: Delete br: cluster: basic diff --git a/examples/backup/backup-schedule-nfs.yaml b/examples/backup/backup-schedule-nfs.yaml index 7175b75921d..def3ead4600 100644 --- a/examples/backup/backup-schedule-nfs.yaml +++ b/examples/backup/backup-schedule-nfs.yaml @@ -9,12 +9,6 @@ spec: # maxReservedTime: "2m" schedule: "*/1 * * * *" backupTemplate: - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # from: - # host: ${tidb_host} - # port: ${tidb_port} - # user: ${tidb_user} - # secretName: backup-demo1-tidb-secret cleanPolicy: Delete br: cluster: basic diff --git a/examples/backup/restore-azblob.yaml b/examples/backup/restore-azblob.yaml index 500ccf20986..34e5a2127b2 100644 --- a/examples/backup/restore-azblob.yaml +++ b/examples/backup/restore-azblob.yaml @@ -5,12 +5,6 @@ metadata: namespace: default spec: # backupType: full - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # to: - # host: ${tidb_host} - # port: ${tidb_port} - # user: ${tidb_user} - # secretName: restore-demo2-tidb-secret br: cluster: basic clusterNamespace: default diff --git a/examples/backup/restore-ebs-minio.yaml b/examples/backup/restore-ebs-minio.yaml index a6164bb59d8..3f208930e8b 100644 --- a/examples/backup/restore-ebs-minio.yaml +++ b/examples/backup/restore-ebs-minio.yaml @@ -6,12 +6,6 @@ metadata: spec: backupType: full backupMode: volume-snapshot - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # from: - # host: ${tidb-host} - # port: ${tidb-port} - # user: ${tidb-user} - # secretName: backup-basic-tidb-secret toolImage: localhost:5000/pingcap/br:latest br: cluster: basic diff --git a/examples/backup/restore-local.yaml b/examples/backup/restore-local.yaml index 011cefb671d..d4b1e058b67 100644 --- a/examples/backup/restore-local.yaml +++ b/examples/backup/restore-local.yaml @@ -5,12 +5,6 @@ metadata: namespace: default spec: # backupType: full - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # from: - # host: ${tidb-host} - # port: ${tidb-port} - # user: ${tidb-user} - # secretName: backup-basic-tidb-secret br: cluster: basic clusterNamespace: default diff --git a/examples/backup/restore-nfs.yaml b/examples/backup/restore-nfs.yaml index d5dc6fad924..c8da926fb18 100644 --- a/examples/backup/restore-nfs.yaml +++ b/examples/backup/restore-nfs.yaml @@ -5,12 +5,6 @@ metadata: namespace: default spec: # backupType: full - # Only needed for TiDB Operator < v1.1.7 or TiDB < v4.0.8 - # to: - # host: ${tidb_host} - # port: ${tidb_port} - # user: ${tidb_user} - # secretName: restore-demo2-tidb-secret br: cluster: basic clusterNamespace: default diff --git 
a/manifests/backup/backup-aws-s3-br.yaml b/manifests/backup/backup-aws-s3-br.yaml index dfafde44495..72b04eb9a32 100644 --- a/manifests/backup/backup-aws-s3-br.yaml +++ b/manifests/backup/backup-aws-s3-br.yaml @@ -21,12 +21,6 @@ spec: # timeAgo:
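For reference, after this cleanup a minimal BR-based Backup CR only needs the br section plus a storage backend. The sketch below is illustrative (cluster, bucket, and secret names are placeholders) and assumes TiDB Operator >= v1.1.7 with TiDB >= v4.0.8, so no spec.from credentials are required:

apiVersion: pingcap.com/v1alpha1
kind: Backup
metadata:
  name: demo1-backup-s3   # placeholder name
  namespace: default
spec:
  cleanPolicy: Delete
  br:
    cluster: basic        # name of the TidbCluster to back up
    clusterNamespace: default
  s3:                     # illustrative storage backend
    provider: aws
    region: us-west-2
    bucket: my-bucket
    prefix: my-backup-folder
    secretName: s3-secret # Secret holding the storage credentials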