From 6c3410ddc136608b577cbbf804e81434d06f6a79 Mon Sep 17 00:00:00 2001 From: Sherif Akoush Date: Fri, 4 Oct 2024 18:26:25 +0100 Subject: [PATCH] fix(controller): Adjust controller -> scheduler state recreation upon scheduler disconnect. (#5944) * move initial sync to scheduler to a separate logic (servers) * add coverage for model statues * add control plane rpc (scheduler<->controller) * add control plane stream code and test * add stop control plane stream * add control plane client handling * add handle state reconstruction to the control plane * adjust name of the logger func * block streams connecting until scheduler is ready * adjust simple schedule code * adjust simple sync (2) * add exec with timeout for the control plane ops (for state) * do not block Load/Unload models on scheduling * add test for subscribe control plane stream * extend control plane tests * fix lint * PR comments --- apis/go/mlops/scheduler/scheduler.pb.go | 427 ++++++++++++------- apis/go/mlops/scheduler/scheduler_grpc.pb.go | 67 +++ apis/mlops/scheduler/scheduler.proto | 11 + operator/scheduler/client.go | 55 ++- operator/scheduler/control_plane.go | 83 ++++ operator/scheduler/control_plane_test.go | 287 +++++++++++++ operator/scheduler/experiment.go | 22 +- operator/scheduler/experiment_test.go | 6 +- operator/scheduler/model.go | 5 - operator/scheduler/model_test.go | 256 +++++++++++ operator/scheduler/pipeline.go | 14 - operator/scheduler/pipeline_test.go | 6 +- operator/scheduler/server.go | 4 +- operator/scheduler/server_test.go | 2 +- operator/scheduler/utils.go | 164 +++++-- operator/scheduler/utils_test.go | 86 +++- scheduler/cmd/scheduler/main.go | 1 + scheduler/pkg/server/control_plane.go | 71 +++ scheduler/pkg/server/control_plane_test.go | 73 ++++ scheduler/pkg/server/experiment_status.go | 2 + scheduler/pkg/server/pipeline_status.go | 2 + scheduler/pkg/server/server.go | 37 +- scheduler/pkg/server/server_status.go | 3 +- scheduler/pkg/server/server_test.go | 121 ++++-- scheduler/pkg/synchroniser/servers_sync.go | 6 + scheduler/pkg/synchroniser/sync.go | 19 +- scheduler/pkg/synchroniser/sync_test.go | 28 +- 27 files changed, 1554 insertions(+), 304 deletions(-) create mode 100644 operator/scheduler/control_plane.go create mode 100644 operator/scheduler/control_plane_test.go create mode 100644 operator/scheduler/model_test.go create mode 100644 scheduler/pkg/server/control_plane.go create mode 100644 scheduler/pkg/server/control_plane_test.go diff --git a/apis/go/mlops/scheduler/scheduler.pb.go b/apis/go/mlops/scheduler/scheduler.pb.go index a71317d2d3..54bc9aa90d 100644 --- a/apis/go/mlops/scheduler/scheduler.pb.go +++ b/apis/go/mlops/scheduler/scheduler.pb.go @@ -3712,6 +3712,91 @@ func (x *SchedulerStatusResponse) GetApplicationVersion() string { return "" } +type ControlPlaneSubscriptionRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + SubscriberName string `protobuf:"bytes,1,opt,name=subscriberName,proto3" json:"subscriberName,omitempty"` //Name of the subscription caller +} + +func (x *ControlPlaneSubscriptionRequest) Reset() { + *x = ControlPlaneSubscriptionRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_scheduler_proto_msgTypes[54] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ControlPlaneSubscriptionRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ControlPlaneSubscriptionRequest) ProtoMessage() {} + +func (x 
*ControlPlaneSubscriptionRequest) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_scheduler_proto_msgTypes[54] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ControlPlaneSubscriptionRequest.ProtoReflect.Descriptor instead. +func (*ControlPlaneSubscriptionRequest) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_scheduler_proto_rawDescGZIP(), []int{54} +} + +func (x *ControlPlaneSubscriptionRequest) GetSubscriberName() string { + if x != nil { + return x.SubscriberName + } + return "" +} + +type ControlPlaneResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields +} + +func (x *ControlPlaneResponse) Reset() { + *x = ControlPlaneResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_scheduler_proto_msgTypes[55] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ControlPlaneResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ControlPlaneResponse) ProtoMessage() {} + +func (x *ControlPlaneResponse) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_scheduler_proto_msgTypes[55] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ControlPlaneResponse.ProtoReflect.Descriptor instead. +func (*ControlPlaneResponse) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_scheduler_proto_rawDescGZIP(), []int{55} +} + var File_mlops_scheduler_scheduler_proto protoreflect.FileDescriptor var file_mlops_scheduler_scheduler_proto_rawDesc = []byte{ @@ -4319,132 +4404,146 @@ var file_mlops_scheduler_scheduler_proto_rawDesc = []byte{ 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x2e, 0x0a, 0x12, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x12, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x2a, 0x27, 0x0a, 0x0c, - 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x54, 0x79, 0x70, 0x65, 0x12, 0x09, 0x0a, 0x05, - 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x50, 0x49, 0x50, 0x45, 0x4c, - 0x49, 0x4e, 0x45, 0x10, 0x01, 0x32, 0xd9, 0x0e, 0x0a, 0x09, 0x53, 0x63, 0x68, 0x65, 0x64, 0x75, - 0x6c, 0x65, 0x72, 0x12, 0x6b, 0x0a, 0x0c, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x4e, 0x6f, 0x74, - 0x69, 0x66, 0x79, 0x12, 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0x49, 0x0a, 0x1f, + 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x50, 0x6c, 0x61, 0x6e, 0x65, 0x53, 0x75, 0x62, 0x73, + 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, + 0x26, 0x0a, 0x0e, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x72, 0x4e, 0x61, 0x6d, + 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0e, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, + 0x62, 0x65, 0x72, 0x4e, 0x61, 0x6d, 0x65, 0x22, 0x16, 0x0a, 0x14, 0x43, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x50, 0x6c, 0x61, 0x6e, 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, 
0x6e, 0x73, 0x65, 0x2a, + 0x27, 0x0a, 0x0c, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x54, 0x79, 0x70, 0x65, 0x12, + 0x09, 0x0a, 0x05, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x50, 0x49, + 0x50, 0x45, 0x4c, 0x49, 0x4e, 0x45, 0x10, 0x01, 0x32, 0xde, 0x0f, 0x0a, 0x09, 0x53, 0x63, 0x68, + 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x12, 0x6b, 0x0a, 0x0c, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, + 0x4e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x12, 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, + 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, + 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x4e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x52, 0x65, 0x71, 0x75, + 0x65, 0x73, 0x74, 0x1a, 0x2c, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x65, 0x72, - 0x76, 0x65, 0x72, 0x4e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, - 0x1a, 0x2c, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, - 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, - 0x4e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, - 0x12, 0x62, 0x0a, 0x09, 0x4c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x28, 0x2e, - 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, - 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, - 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x29, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, - 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, - 0x2e, 0x4c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, - 0x73, 0x65, 0x22, 0x00, 0x12, 0x68, 0x0a, 0x0b, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, - 0x64, 0x65, 0x6c, 0x12, 0x2a, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, - 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x55, 0x6e, 0x6c, - 0x6f, 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, - 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, - 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x4d, - 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x6b, - 0x0a, 0x0c, 0x4c, 0x6f, 0x61, 0x64, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x12, 0x2b, + 0x76, 0x65, 0x72, 0x4e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, + 0x65, 0x22, 0x00, 0x12, 0x62, 0x0a, 0x09, 0x4c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, + 0x12, 0x28, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, + 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, + 0x64, 0x65, 0x6c, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x29, 0x2e, 0x73, 0x65, 0x6c, + 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, + 0x6c, 0x65, 0x72, 0x2e, 0x4c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x68, 0x0a, 0x0b, 0x55, 0x6e, 0x6c, 0x6f, 0x61, + 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x2a, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, + 
0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, + 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x71, 0x75, 0x65, + 0x73, 0x74, 0x1a, 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, + 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x55, 0x6e, 0x6c, 0x6f, + 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, + 0x00, 0x12, 0x6b, 0x0a, 0x0c, 0x4c, 0x6f, 0x61, 0x64, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, + 0x65, 0x12, 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, + 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4c, 0x6f, 0x61, 0x64, 0x50, + 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2c, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4c, 0x6f, 0x61, 0x64, 0x50, 0x69, 0x70, 0x65, - 0x6c, 0x69, 0x6e, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2c, 0x2e, 0x73, 0x65, - 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, - 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4c, 0x6f, 0x61, 0x64, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, - 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x71, 0x0a, 0x0e, 0x55, - 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x12, 0x2d, 0x2e, - 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, - 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x50, 0x69, 0x70, - 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2e, 0x2e, 0x73, - 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, - 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x50, 0x69, 0x70, 0x65, - 0x6c, 0x69, 0x6e, 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x74, - 0x0a, 0x0f, 0x53, 0x74, 0x61, 0x72, 0x74, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, - 0x74, 0x12, 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, - 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x74, 0x61, 0x72, 0x74, - 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x1a, 0x2f, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, - 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x74, 0x61, 0x72, 0x74, - 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, - 0x73, 0x65, 0x22, 0x00, 0x12, 0x71, 0x0a, 0x0e, 0x53, 0x74, 0x6f, 0x70, 0x45, 0x78, 0x70, 0x65, - 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x2d, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, - 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, - 0x53, 0x74, 0x6f, 0x70, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, - 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, - 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, - 0x74, 0x6f, 0x70, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x73, - 0x70, 0x6f, 0x6e, 0x73, 0x65, 
0x22, 0x00, 0x12, 0x6d, 0x0a, 0x0c, 0x53, 0x65, 0x72, 0x76, 0x65, - 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, + 0x6c, 0x69, 0x6e, 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x71, + 0x0a, 0x0e, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, + 0x12, 0x2d, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, + 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, + 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, + 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x50, + 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, + 0x00, 0x12, 0x74, 0x0a, 0x0f, 0x53, 0x74, 0x61, 0x72, 0x74, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, + 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, + 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x74, + 0x61, 0x72, 0x74, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x71, + 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2f, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, + 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x74, + 0x61, 0x72, 0x74, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x71, 0x0a, 0x0e, 0x53, 0x74, 0x6f, 0x70, 0x45, + 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x2d, 0x2e, 0x73, 0x65, 0x6c, 0x64, + 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, + 0x65, 0x72, 0x2e, 0x53, 0x74, 0x6f, 0x70, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, + 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, + 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2e, 0x53, 0x74, 0x6f, 0x70, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, + 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x6d, 0x0a, 0x0c, 0x53, 0x65, + 0x72, 0x76, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2b, 0x2e, 0x73, 0x65, 0x6c, + 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, + 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2c, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, + 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, + 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, 0x6a, 0x0a, 0x0b, 0x4d, 0x6f, 0x64, + 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2a, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, + 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x71, + 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, + 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 
0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4d, 0x6f, + 0x64, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, + 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, 0x73, 0x0a, 0x0e, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, + 0x65, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2d, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, - 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x71, - 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2c, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, + 0x2e, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, + 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, + 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, + 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, 0x79, 0x0a, 0x10, 0x45, 0x78, + 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2f, + 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, + 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, + 0x6e, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x30, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, + 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, + 0x65, 0x6e, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, + 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, 0x74, 0x0a, 0x0f, 0x53, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, + 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, + 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2e, 0x53, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2f, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, + 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2e, 0x53, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x7c, 0x0a, 0x15, 0x53, + 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x53, 0x74, + 0x61, 0x74, 0x75, 0x73, 0x12, 0x31, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x65, - 0x72, 0x76, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, - 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, 0x6a, 0x0a, 0x0b, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2a, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, - 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4d, - 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x1a, 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, + 0x72, 0x76, 0x65, 0x72, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 
0x6f, 0x6e, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2c, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, + 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, + 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, 0x79, 0x0a, 0x14, 0x53, 0x75, 0x62, + 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x12, 0x30, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, - 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, - 0x30, 0x01, 0x12, 0x73, 0x0a, 0x0e, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x74, - 0x61, 0x74, 0x75, 0x73, 0x12, 0x2d, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, - 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x50, 0x69, - 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x71, 0x75, - 0x65, 0x73, 0x74, 0x1a, 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, - 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x50, 0x69, 0x70, - 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, - 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, 0x79, 0x0a, 0x10, 0x45, 0x78, 0x70, 0x65, 0x72, - 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2f, 0x2e, 0x73, 0x65, + 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x71, 0x75, + 0x65, 0x73, 0x74, 0x1a, 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, + 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4d, 0x6f, 0x64, + 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, + 0x22, 0x00, 0x30, 0x01, 0x12, 0x88, 0x01, 0x0a, 0x19, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, + 0x62, 0x65, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x74, 0x61, 0x74, + 0x75, 0x73, 0x12, 0x35, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, + 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x45, 0x78, 0x70, 0x65, + 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, + 0x6f, 0x6e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x30, 0x2e, 0x73, 0x65, 0x6c, 0x64, + 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, + 0x65, 0x72, 0x2e, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x74, 0x61, + 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, + 0x82, 0x01, 0x0a, 0x17, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x50, 0x69, 0x70, + 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x33, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, - 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x53, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x30, 0x2e, 0x73, - 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, - 0x64, 0x75, 
0x6c, 0x65, 0x72, 0x2e, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, - 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, - 0x30, 0x01, 0x12, 0x74, 0x0a, 0x0f, 0x53, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x53, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, - 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, - 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, - 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2f, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, - 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, - 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, - 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x7c, 0x0a, 0x15, 0x53, 0x75, 0x62, 0x73, - 0x63, 0x72, 0x69, 0x62, 0x65, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, - 0x73, 0x12, 0x31, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, - 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, - 0x72, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x71, - 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2c, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, - 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x53, 0x65, - 0x72, 0x76, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, - 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, 0x79, 0x0a, 0x14, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, - 0x69, 0x62, 0x65, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x30, - 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, - 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x75, 0x62, + 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, - 0x1a, 0x2b, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, - 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, - 0x01, 0x12, 0x88, 0x01, 0x0a, 0x19, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x45, - 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, - 0x35, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, - 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, - 0x65, 0x6e, 0x74, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x52, - 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x30, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, - 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, - 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x12, 0x82, 0x01, 0x0a, - 0x17, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, - 0x6e, 0x65, 0x53, 0x74, 0x61, 0x74, 
0x75, 0x73, 0x12, 0x33, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, - 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, - 0x72, 0x2e, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, - 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2e, 0x2e, - 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, - 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, - 0x01, 0x42, 0x3c, 0x5a, 0x3a, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, - 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x69, 0x6f, 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2d, - 0x63, 0x6f, 0x72, 0x65, 0x2f, 0x61, 0x70, 0x69, 0x73, 0x2f, 0x67, 0x6f, 0x2f, 0x76, 0x32, 0x2f, - 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x62, - 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x1a, 0x2e, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, + 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, + 0x6e, 0x65, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, + 0x22, 0x00, 0x30, 0x01, 0x12, 0x82, 0x01, 0x0a, 0x15, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, + 0x62, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x50, 0x6c, 0x61, 0x6e, 0x65, 0x12, 0x37, + 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, + 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x50, + 0x6c, 0x61, 0x6e, 0x65, 0x53, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2c, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, + 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, + 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x50, 0x6c, 0x61, 0x6e, 0x65, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x42, 0x3c, 0x5a, 0x3a, 0x67, 0x69, 0x74, + 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x69, 0x6f, + 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x2f, 0x61, 0x70, 0x69, + 0x73, 0x2f, 0x67, 0x6f, 0x2f, 0x76, 0x32, 0x2f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, + 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( @@ -4460,7 +4559,7 @@ func file_mlops_scheduler_scheduler_proto_rawDescGZIP() []byte { } var file_mlops_scheduler_scheduler_proto_enumTypes = make([]protoimpl.EnumInfo, 7) -var file_mlops_scheduler_scheduler_proto_msgTypes = make([]protoimpl.MessageInfo, 58) +var file_mlops_scheduler_scheduler_proto_msgTypes = make([]protoimpl.MessageInfo, 60) var file_mlops_scheduler_scheduler_proto_goTypes = []any{ (ResourceType)(0), // 0: seldon.mlops.scheduler.ResourceType (ModelStatus_ModelState)(0), // 1: seldon.mlops.scheduler.ModelStatus.ModelState @@ -4523,11 +4622,13 @@ var file_mlops_scheduler_scheduler_proto_goTypes = []any{ (*PipelineVersionState)(nil), // 58: seldon.mlops.scheduler.PipelineVersionState (*SchedulerStatusRequest)(nil), // 59: seldon.mlops.scheduler.SchedulerStatusRequest (*SchedulerStatusResponse)(nil), // 60: seldon.mlops.scheduler.SchedulerStatusResponse - nil, // 61: 
seldon.mlops.scheduler.ModelVersionStatus.ModelReplicaStateEntry - nil, // 62: seldon.mlops.scheduler.PipelineStep.TensorMapEntry - nil, // 63: seldon.mlops.scheduler.PipelineInput.TensorMapEntry - nil, // 64: seldon.mlops.scheduler.PipelineOutput.TensorMapEntry - (*timestamppb.Timestamp)(nil), // 65: google.protobuf.Timestamp + (*ControlPlaneSubscriptionRequest)(nil), // 61: seldon.mlops.scheduler.ControlPlaneSubscriptionRequest + (*ControlPlaneResponse)(nil), // 62: seldon.mlops.scheduler.ControlPlaneResponse + nil, // 63: seldon.mlops.scheduler.ModelVersionStatus.ModelReplicaStateEntry + nil, // 64: seldon.mlops.scheduler.PipelineStep.TensorMapEntry + nil, // 65: seldon.mlops.scheduler.PipelineInput.TensorMapEntry + nil, // 66: seldon.mlops.scheduler.PipelineOutput.TensorMapEntry + (*timestamppb.Timestamp)(nil), // 67: google.protobuf.Timestamp } var file_mlops_scheduler_scheduler_proto_depIdxs = []int32{ 8, // 0: seldon.mlops.scheduler.LoadModelRequest.model:type_name -> seldon.mlops.scheduler.Model @@ -4543,13 +4644,13 @@ var file_mlops_scheduler_scheduler_proto_depIdxs = []int32{ 14, // 10: seldon.mlops.scheduler.UnloadModelRequest.kubernetesMeta:type_name -> seldon.mlops.scheduler.KubernetesMeta 22, // 11: seldon.mlops.scheduler.ModelStatusResponse.versions:type_name -> seldon.mlops.scheduler.ModelVersionStatus 14, // 12: seldon.mlops.scheduler.ModelVersionStatus.kubernetesMeta:type_name -> seldon.mlops.scheduler.KubernetesMeta - 61, // 13: seldon.mlops.scheduler.ModelVersionStatus.modelReplicaState:type_name -> seldon.mlops.scheduler.ModelVersionStatus.ModelReplicaStateEntry + 63, // 13: seldon.mlops.scheduler.ModelVersionStatus.modelReplicaState:type_name -> seldon.mlops.scheduler.ModelVersionStatus.ModelReplicaStateEntry 23, // 14: seldon.mlops.scheduler.ModelVersionStatus.state:type_name -> seldon.mlops.scheduler.ModelStatus 8, // 15: seldon.mlops.scheduler.ModelVersionStatus.modelDefn:type_name -> seldon.mlops.scheduler.Model 1, // 16: seldon.mlops.scheduler.ModelStatus.state:type_name -> seldon.mlops.scheduler.ModelStatus.ModelState - 65, // 17: seldon.mlops.scheduler.ModelStatus.lastChangeTimestamp:type_name -> google.protobuf.Timestamp + 67, // 17: seldon.mlops.scheduler.ModelStatus.lastChangeTimestamp:type_name -> google.protobuf.Timestamp 2, // 18: seldon.mlops.scheduler.ModelReplicaStatus.state:type_name -> seldon.mlops.scheduler.ModelReplicaStatus.ModelReplicaState - 65, // 19: seldon.mlops.scheduler.ModelReplicaStatus.lastChangeTimestamp:type_name -> google.protobuf.Timestamp + 67, // 19: seldon.mlops.scheduler.ModelReplicaStatus.lastChangeTimestamp:type_name -> google.protobuf.Timestamp 27, // 20: seldon.mlops.scheduler.ServerStatusResponse.resources:type_name -> seldon.mlops.scheduler.ServerReplicaResources 14, // 21: seldon.mlops.scheduler.ServerStatusResponse.kubernetesMeta:type_name -> seldon.mlops.scheduler.KubernetesMeta 18, // 22: seldon.mlops.scheduler.ModelStatusRequest.model:type_name -> seldon.mlops.scheduler.ModelReference @@ -4567,20 +4668,20 @@ var file_mlops_scheduler_scheduler_proto_depIdxs = []int32{ 50, // 34: seldon.mlops.scheduler.Pipeline.output:type_name -> seldon.mlops.scheduler.PipelineOutput 14, // 35: seldon.mlops.scheduler.Pipeline.kubernetesMeta:type_name -> seldon.mlops.scheduler.KubernetesMeta 49, // 36: seldon.mlops.scheduler.Pipeline.input:type_name -> seldon.mlops.scheduler.PipelineInput - 62, // 37: seldon.mlops.scheduler.PipelineStep.tensorMap:type_name -> seldon.mlops.scheduler.PipelineStep.TensorMapEntry + 64, // 37: 
seldon.mlops.scheduler.PipelineStep.tensorMap:type_name -> seldon.mlops.scheduler.PipelineStep.TensorMapEntry 3, // 38: seldon.mlops.scheduler.PipelineStep.inputsJoin:type_name -> seldon.mlops.scheduler.PipelineStep.JoinOp 3, // 39: seldon.mlops.scheduler.PipelineStep.triggersJoin:type_name -> seldon.mlops.scheduler.PipelineStep.JoinOp 48, // 40: seldon.mlops.scheduler.PipelineStep.batch:type_name -> seldon.mlops.scheduler.Batch 4, // 41: seldon.mlops.scheduler.PipelineInput.joinType:type_name -> seldon.mlops.scheduler.PipelineInput.JoinOp 4, // 42: seldon.mlops.scheduler.PipelineInput.triggersJoin:type_name -> seldon.mlops.scheduler.PipelineInput.JoinOp - 63, // 43: seldon.mlops.scheduler.PipelineInput.tensorMap:type_name -> seldon.mlops.scheduler.PipelineInput.TensorMapEntry + 65, // 43: seldon.mlops.scheduler.PipelineInput.tensorMap:type_name -> seldon.mlops.scheduler.PipelineInput.TensorMapEntry 5, // 44: seldon.mlops.scheduler.PipelineOutput.stepsJoin:type_name -> seldon.mlops.scheduler.PipelineOutput.JoinOp - 64, // 45: seldon.mlops.scheduler.PipelineOutput.tensorMap:type_name -> seldon.mlops.scheduler.PipelineOutput.TensorMapEntry + 66, // 45: seldon.mlops.scheduler.PipelineOutput.tensorMap:type_name -> seldon.mlops.scheduler.PipelineOutput.TensorMapEntry 57, // 46: seldon.mlops.scheduler.PipelineStatusResponse.versions:type_name -> seldon.mlops.scheduler.PipelineWithState 46, // 47: seldon.mlops.scheduler.PipelineWithState.pipeline:type_name -> seldon.mlops.scheduler.Pipeline 58, // 48: seldon.mlops.scheduler.PipelineWithState.state:type_name -> seldon.mlops.scheduler.PipelineVersionState 6, // 49: seldon.mlops.scheduler.PipelineVersionState.status:type_name -> seldon.mlops.scheduler.PipelineVersionState.PipelineStatus - 65, // 50: seldon.mlops.scheduler.PipelineVersionState.lastChangeTimestamp:type_name -> google.protobuf.Timestamp + 67, // 50: seldon.mlops.scheduler.PipelineVersionState.lastChangeTimestamp:type_name -> google.protobuf.Timestamp 24, // 51: seldon.mlops.scheduler.ModelVersionStatus.ModelReplicaStateEntry.value:type_name -> seldon.mlops.scheduler.ModelReplicaStatus 30, // 52: seldon.mlops.scheduler.Scheduler.ServerNotify:input_type -> seldon.mlops.scheduler.ServerNotifyRequest 7, // 53: seldon.mlops.scheduler.Scheduler.LoadModel:input_type -> seldon.mlops.scheduler.LoadModelRequest @@ -4598,24 +4699,26 @@ var file_mlops_scheduler_scheduler_proto_depIdxs = []int32{ 28, // 65: seldon.mlops.scheduler.Scheduler.SubscribeModelStatus:input_type -> seldon.mlops.scheduler.ModelSubscriptionRequest 42, // 66: seldon.mlops.scheduler.Scheduler.SubscribeExperimentStatus:input_type -> seldon.mlops.scheduler.ExperimentSubscriptionRequest 55, // 67: seldon.mlops.scheduler.Scheduler.SubscribePipelineStatus:input_type -> seldon.mlops.scheduler.PipelineSubscriptionRequest - 32, // 68: seldon.mlops.scheduler.Scheduler.ServerNotify:output_type -> seldon.mlops.scheduler.ServerNotifyResponse - 17, // 69: seldon.mlops.scheduler.Scheduler.LoadModel:output_type -> seldon.mlops.scheduler.LoadModelResponse - 20, // 70: seldon.mlops.scheduler.Scheduler.UnloadModel:output_type -> seldon.mlops.scheduler.UnloadModelResponse - 51, // 71: seldon.mlops.scheduler.Scheduler.LoadPipeline:output_type -> seldon.mlops.scheduler.LoadPipelineResponse - 53, // 72: seldon.mlops.scheduler.Scheduler.UnloadPipeline:output_type -> seldon.mlops.scheduler.UnloadPipelineResponse - 39, // 73: seldon.mlops.scheduler.Scheduler.StartExperiment:output_type -> seldon.mlops.scheduler.StartExperimentResponse - 41, // 74: 
seldon.mlops.scheduler.Scheduler.StopExperiment:output_type -> seldon.mlops.scheduler.StopExperimentResponse - 26, // 75: seldon.mlops.scheduler.Scheduler.ServerStatus:output_type -> seldon.mlops.scheduler.ServerStatusResponse - 21, // 76: seldon.mlops.scheduler.Scheduler.ModelStatus:output_type -> seldon.mlops.scheduler.ModelStatusResponse - 56, // 77: seldon.mlops.scheduler.Scheduler.PipelineStatus:output_type -> seldon.mlops.scheduler.PipelineStatusResponse - 43, // 78: seldon.mlops.scheduler.Scheduler.ExperimentStatus:output_type -> seldon.mlops.scheduler.ExperimentStatusResponse - 60, // 79: seldon.mlops.scheduler.Scheduler.SchedulerStatus:output_type -> seldon.mlops.scheduler.SchedulerStatusResponse - 26, // 80: seldon.mlops.scheduler.Scheduler.SubscribeServerStatus:output_type -> seldon.mlops.scheduler.ServerStatusResponse - 21, // 81: seldon.mlops.scheduler.Scheduler.SubscribeModelStatus:output_type -> seldon.mlops.scheduler.ModelStatusResponse - 43, // 82: seldon.mlops.scheduler.Scheduler.SubscribeExperimentStatus:output_type -> seldon.mlops.scheduler.ExperimentStatusResponse - 56, // 83: seldon.mlops.scheduler.Scheduler.SubscribePipelineStatus:output_type -> seldon.mlops.scheduler.PipelineStatusResponse - 68, // [68:84] is the sub-list for method output_type - 52, // [52:68] is the sub-list for method input_type + 61, // 68: seldon.mlops.scheduler.Scheduler.SubscribeControlPlane:input_type -> seldon.mlops.scheduler.ControlPlaneSubscriptionRequest + 32, // 69: seldon.mlops.scheduler.Scheduler.ServerNotify:output_type -> seldon.mlops.scheduler.ServerNotifyResponse + 17, // 70: seldon.mlops.scheduler.Scheduler.LoadModel:output_type -> seldon.mlops.scheduler.LoadModelResponse + 20, // 71: seldon.mlops.scheduler.Scheduler.UnloadModel:output_type -> seldon.mlops.scheduler.UnloadModelResponse + 51, // 72: seldon.mlops.scheduler.Scheduler.LoadPipeline:output_type -> seldon.mlops.scheduler.LoadPipelineResponse + 53, // 73: seldon.mlops.scheduler.Scheduler.UnloadPipeline:output_type -> seldon.mlops.scheduler.UnloadPipelineResponse + 39, // 74: seldon.mlops.scheduler.Scheduler.StartExperiment:output_type -> seldon.mlops.scheduler.StartExperimentResponse + 41, // 75: seldon.mlops.scheduler.Scheduler.StopExperiment:output_type -> seldon.mlops.scheduler.StopExperimentResponse + 26, // 76: seldon.mlops.scheduler.Scheduler.ServerStatus:output_type -> seldon.mlops.scheduler.ServerStatusResponse + 21, // 77: seldon.mlops.scheduler.Scheduler.ModelStatus:output_type -> seldon.mlops.scheduler.ModelStatusResponse + 56, // 78: seldon.mlops.scheduler.Scheduler.PipelineStatus:output_type -> seldon.mlops.scheduler.PipelineStatusResponse + 43, // 79: seldon.mlops.scheduler.Scheduler.ExperimentStatus:output_type -> seldon.mlops.scheduler.ExperimentStatusResponse + 60, // 80: seldon.mlops.scheduler.Scheduler.SchedulerStatus:output_type -> seldon.mlops.scheduler.SchedulerStatusResponse + 26, // 81: seldon.mlops.scheduler.Scheduler.SubscribeServerStatus:output_type -> seldon.mlops.scheduler.ServerStatusResponse + 21, // 82: seldon.mlops.scheduler.Scheduler.SubscribeModelStatus:output_type -> seldon.mlops.scheduler.ModelStatusResponse + 43, // 83: seldon.mlops.scheduler.Scheduler.SubscribeExperimentStatus:output_type -> seldon.mlops.scheduler.ExperimentStatusResponse + 56, // 84: seldon.mlops.scheduler.Scheduler.SubscribePipelineStatus:output_type -> seldon.mlops.scheduler.PipelineStatusResponse + 62, // 85: seldon.mlops.scheduler.Scheduler.SubscribeControlPlane:output_type -> 
seldon.mlops.scheduler.ControlPlaneResponse + 69, // [69:86] is the sub-list for method output_type + 52, // [52:69] is the sub-list for method input_type 52, // [52:52] is the sub-list for extension type_name 52, // [52:52] is the sub-list for extension extendee 0, // [0:52] is the sub-list for field type_name @@ -5275,6 +5378,30 @@ func file_mlops_scheduler_scheduler_proto_init() { return nil } } + file_mlops_scheduler_scheduler_proto_msgTypes[54].Exporter = func(v any, i int) any { + switch v := v.(*ControlPlaneSubscriptionRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_scheduler_proto_msgTypes[55].Exporter = func(v any, i int) any { + switch v := v.(*ControlPlaneResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } } file_mlops_scheduler_scheduler_proto_msgTypes[2].OneofWrappers = []any{} file_mlops_scheduler_scheduler_proto_msgTypes[4].OneofWrappers = []any{} @@ -5304,7 +5431,7 @@ func file_mlops_scheduler_scheduler_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_mlops_scheduler_scheduler_proto_rawDesc, NumEnums: 7, - NumMessages: 58, + NumMessages: 60, NumExtensions: 0, NumServices: 1, }, diff --git a/apis/go/mlops/scheduler/scheduler_grpc.pb.go b/apis/go/mlops/scheduler/scheduler_grpc.pb.go index a125fb62d1..d5e4bd65e8 100644 --- a/apis/go/mlops/scheduler/scheduler_grpc.pb.go +++ b/apis/go/mlops/scheduler/scheduler_grpc.pb.go @@ -44,6 +44,7 @@ const ( Scheduler_SubscribeModelStatus_FullMethodName = "/seldon.mlops.scheduler.Scheduler/SubscribeModelStatus" Scheduler_SubscribeExperimentStatus_FullMethodName = "/seldon.mlops.scheduler.Scheduler/SubscribeExperimentStatus" Scheduler_SubscribePipelineStatus_FullMethodName = "/seldon.mlops.scheduler.Scheduler/SubscribePipelineStatus" + Scheduler_SubscribeControlPlane_FullMethodName = "/seldon.mlops.scheduler.Scheduler/SubscribeControlPlane" ) // SchedulerClient is the client API for Scheduler service. @@ -66,6 +67,8 @@ type SchedulerClient interface { SubscribeModelStatus(ctx context.Context, in *ModelSubscriptionRequest, opts ...grpc.CallOption) (Scheduler_SubscribeModelStatusClient, error) SubscribeExperimentStatus(ctx context.Context, in *ExperimentSubscriptionRequest, opts ...grpc.CallOption) (Scheduler_SubscribeExperimentStatusClient, error) SubscribePipelineStatus(ctx context.Context, in *PipelineSubscriptionRequest, opts ...grpc.CallOption) (Scheduler_SubscribePipelineStatusClient, error) + // control plane stream with controller + SubscribeControlPlane(ctx context.Context, in *ControlPlaneSubscriptionRequest, opts ...grpc.CallOption) (Scheduler_SubscribeControlPlaneClient, error) } type schedulerClient struct { @@ -420,6 +423,39 @@ func (x *schedulerSubscribePipelineStatusClient) Recv() (*PipelineStatusResponse return m, nil } +func (c *schedulerClient) SubscribeControlPlane(ctx context.Context, in *ControlPlaneSubscriptionRequest, opts ...grpc.CallOption) (Scheduler_SubscribeControlPlaneClient, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + stream, err := c.cc.NewStream(ctx, &Scheduler_ServiceDesc.Streams[8], Scheduler_SubscribeControlPlane_FullMethodName, cOpts...) 
+ if err != nil { + return nil, err + } + x := &schedulerSubscribeControlPlaneClient{ClientStream: stream} + if err := x.ClientStream.SendMsg(in); err != nil { + return nil, err + } + if err := x.ClientStream.CloseSend(); err != nil { + return nil, err + } + return x, nil +} + +type Scheduler_SubscribeControlPlaneClient interface { + Recv() (*ControlPlaneResponse, error) + grpc.ClientStream +} + +type schedulerSubscribeControlPlaneClient struct { + grpc.ClientStream +} + +func (x *schedulerSubscribeControlPlaneClient) Recv() (*ControlPlaneResponse, error) { + m := new(ControlPlaneResponse) + if err := x.ClientStream.RecvMsg(m); err != nil { + return nil, err + } + return m, nil +} + // SchedulerServer is the server API for Scheduler service. // All implementations must embed UnimplementedSchedulerServer // for forward compatibility @@ -440,6 +476,8 @@ type SchedulerServer interface { SubscribeModelStatus(*ModelSubscriptionRequest, Scheduler_SubscribeModelStatusServer) error SubscribeExperimentStatus(*ExperimentSubscriptionRequest, Scheduler_SubscribeExperimentStatusServer) error SubscribePipelineStatus(*PipelineSubscriptionRequest, Scheduler_SubscribePipelineStatusServer) error + // control plane stream with controller + SubscribeControlPlane(*ControlPlaneSubscriptionRequest, Scheduler_SubscribeControlPlaneServer) error mustEmbedUnimplementedSchedulerServer() } @@ -495,6 +533,9 @@ func (UnimplementedSchedulerServer) SubscribeExperimentStatus(*ExperimentSubscri func (UnimplementedSchedulerServer) SubscribePipelineStatus(*PipelineSubscriptionRequest, Scheduler_SubscribePipelineStatusServer) error { return status.Errorf(codes.Unimplemented, "method SubscribePipelineStatus not implemented") } +func (UnimplementedSchedulerServer) SubscribeControlPlane(*ControlPlaneSubscriptionRequest, Scheduler_SubscribeControlPlaneServer) error { + return status.Errorf(codes.Unimplemented, "method SubscribeControlPlane not implemented") +} func (UnimplementedSchedulerServer) mustEmbedUnimplementedSchedulerServer() {} // UnsafeSchedulerServer may be embedded to opt out of forward compatibility for this service. @@ -820,6 +861,27 @@ func (x *schedulerSubscribePipelineStatusServer) Send(m *PipelineStatusResponse) return x.ServerStream.SendMsg(m) } +func _Scheduler_SubscribeControlPlane_Handler(srv interface{}, stream grpc.ServerStream) error { + m := new(ControlPlaneSubscriptionRequest) + if err := stream.RecvMsg(m); err != nil { + return err + } + return srv.(SchedulerServer).SubscribeControlPlane(m, &schedulerSubscribeControlPlaneServer{ServerStream: stream}) +} + +type Scheduler_SubscribeControlPlaneServer interface { + Send(*ControlPlaneResponse) error + grpc.ServerStream +} + +type schedulerSubscribeControlPlaneServer struct { + grpc.ServerStream +} + +func (x *schedulerSubscribeControlPlaneServer) Send(m *ControlPlaneResponse) error { + return x.ServerStream.SendMsg(m) +} + // Scheduler_ServiceDesc is the grpc.ServiceDesc for Scheduler service. 
// It's only intended for direct use with grpc.RegisterService, // and not to be introspected or modified (even as a copy) @@ -901,6 +963,11 @@ var Scheduler_ServiceDesc = grpc.ServiceDesc{ Handler: _Scheduler_SubscribePipelineStatus_Handler, ServerStreams: true, }, + { + StreamName: "SubscribeControlPlane", + Handler: _Scheduler_SubscribeControlPlane_Handler, + ServerStreams: true, + }, }, Metadata: "mlops/scheduler/scheduler.proto", } diff --git a/apis/mlops/scheduler/scheduler.proto b/apis/mlops/scheduler/scheduler.proto index d8d6463b5a..163afdfd5d 100644 --- a/apis/mlops/scheduler/scheduler.proto +++ b/apis/mlops/scheduler/scheduler.proto @@ -385,6 +385,14 @@ message SchedulerStatusResponse { string applicationVersion = 1; } +message ControlPlaneSubscriptionRequest { + string subscriberName = 1; //Name of the subscription caller +} + +message ControlPlaneResponse { + +} + // [END Messages] @@ -412,6 +420,9 @@ service Scheduler { rpc SubscribeModelStatus(ModelSubscriptionRequest) returns (stream ModelStatusResponse) {}; rpc SubscribeExperimentStatus(ExperimentSubscriptionRequest) returns (stream ExperimentStatusResponse) {}; rpc SubscribePipelineStatus(PipelineSubscriptionRequest) returns (stream PipelineStatusResponse) {}; + + // control plane stream with controller + rpc SubscribeControlPlane(ControlPlaneSubscriptionRequest) returns (stream ControlPlaneResponse) {}; } // [END Services] diff --git a/operator/scheduler/client.go b/operator/scheduler/client.go index aacc9a54c8..ce6990493b 100644 --- a/operator/scheduler/client.go +++ b/operator/scheduler/client.go @@ -73,12 +73,17 @@ func getSchedulerHost(namespace string) string { // we also add a retry mechanism to reconnect if the connection is lost, this can happen if the scheduler is restarted // or if the network connection is lost. We use an exponential backoff to retry the connection. // note that when the scheduler is completely dead we will be not be able to reconnect and these go routines will retry forever +// on reconnect we send the state of the different resources to the scheduler, this is to make sure that the scheduler has the correct state // TODO: add a max retry count and report back to the caller. 
+// TODO: add a done channel for graceful shutdown, otherwise these go routines will run forever
+// TODO: tidy up ctx handling in the different handlers; currently they all use context.Background()
 func (s *SchedulerClient) startEventHanders(namespace string, conn *grpc.ClientConn) {
+
+	s.logger.Info("Starting event handling", "namespace", namespace)
+
 	// Subscribe the event streams from scheduler
 	go func() {
 		for {
-			err := retryFn(s.SubscribeModelEvents, conn, namespace, s.logger)
+			err := retryFn(s.SubscribeModelEvents, conn, namespace, s.logger.WithName("SubscribeModelEvents"))
 			if err != nil {
 				s.logger.Error(err, "Subscribe ended for model events", "namespace", namespace)
 			} else {
@@ -88,7 +93,7 @@ func (s *SchedulerClient) startEventHanders(namespace string, conn *grpc.ClientC
 	}()
 	go func() {
 		for {
-			err := retryFn(s.SubscribeServerEvents, conn, namespace, s.logger)
+			err := retryFn(s.SubscribeServerEvents, conn, namespace, s.logger.WithName("SubscribeServerEvents"))
 			if err != nil {
 				s.logger.Error(err, "Subscribe ended for server events", "namespace", namespace)
 			} else {
@@ -98,7 +103,7 @@ func (s *SchedulerClient) startEventHanders(namespace string, conn *grpc.ClientC
 	}()
 	go func() {
 		for {
-			err := retryFn(s.SubscribePipelineEvents, conn, namespace, s.logger)
+			err := retryFn(s.SubscribePipelineEvents, conn, namespace, s.logger.WithName("SubscribePipelineEvents"))
 			if err != nil {
 				s.logger.Error(err, "Subscribe ended for pipeline events", "namespace", namespace)
 			} else {
@@ -108,7 +113,7 @@ func (s *SchedulerClient) startEventHanders(namespace string, conn *grpc.ClientC
 	}()
 	go func() {
 		for {
-			err := retryFn(s.SubscribeExperimentEvents, conn, namespace, s.logger)
+			err := retryFn(s.SubscribeExperimentEvents, conn, namespace, s.logger.WithName("SubscribeExperimentEvents"))
 			if err != nil {
 				s.logger.Error(err, "Subscribe ended for experiment events", "namespace", namespace)
 			} else {
@@ -116,6 +121,46 @@ func (s *SchedulerClient) startEventHanders(namespace string, conn *grpc.ClientC
 			}
 		}
 	}()
+	go func() {
+		for {
+			err := retryFn(s.SubscribeControlPlaneEvents, conn, namespace, s.logger.WithName("SubscribeControlPlaneEvents"))
+			if err != nil {
+				s.logger.Error(err, "Subscribe ended for control plane events", "namespace", namespace)
+			} else {
+				s.logger.Info("Subscribe ended for control plane events", "namespace", namespace)
+			}
+		}
+	}()
+}
+
+func (s *SchedulerClient) handleStateOnReconnect(context context.Context, grpcClient scheduler.SchedulerClient, namespace string) error {
+	// on reconnect we first send the list of registered servers to the scheduler
+	err := s.handleRegisteredServers(context, grpcClient, namespace)
+	if err != nil {
+		s.logger.Error(err, "Failed to send registered server to scheduler")
+	}
+
+	if err == nil {
+		err = s.handleExperiments(context, grpcClient, namespace)
+		if err != nil {
+			s.logger.Error(err, "Failed to send experiments to scheduler")
+		}
+	}
+
+	if err == nil {
+		err = s.handlePipelines(context, grpcClient, namespace)
+		if err != nil {
+			s.logger.Error(err, "Failed to send pipelines to scheduler")
+		}
+	}
+
+	if err == nil {
+		err = s.handleModels(context, grpcClient, namespace)
+		if err != nil {
+			s.logger.Error(err, "Failed to send models to scheduler")
+		}
+	}
+	return err
 }

 func (s *SchedulerClient) RemoveConnection(namespace string) {
@@ -253,7 +298,7 @@ func retryFn(
 	fn func(context context.Context, grpcClient scheduler.SchedulerClient, namespace string) error,
 	conn *grpc.ClientConn, namespace string, logger logr.Logger,
 ) error {
-	logger.Info("RetryFn", "namespace",
namespace) + logger.Info("Retrying to connect", "namespace", namespace) logFailure := func(err error, delay time.Duration) { logger.Error(err, "Scheduler not ready") } diff --git a/operator/scheduler/control_plane.go b/operator/scheduler/control_plane.go new file mode 100644 index 0000000000..9dbf18a590 --- /dev/null +++ b/operator/scheduler/control_plane.go @@ -0,0 +1,83 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed by +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + +package scheduler + +import ( + "context" + "io" + "time" + + grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" +) + +const ( + execTimeOut = 5 * time.Minute +) + +func (s *SchedulerClient) SubscribeControlPlaneEvents(ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { + logger := s.logger.WithName("SubscribeControlPlaneEvents") + + stream, err := grpcClient.SubscribeControlPlane( + ctx, + &scheduler.ControlPlaneSubscriptionRequest{SubscriberName: "seldon manager"}, + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + ) + if err != nil { + return err + } + + for { + event, err := stream.Recv() + if err != nil { + if err == io.EOF { + break + } + logger.Error(err, "event recv failed") + return err + } + logger.Info("Received event to handle state", "event", event) + + fn := func() error { + return s.handleStateOnReconnect(ctx, grpcClient, namespace) + } + _, err = execWithTimeout(fn, execTimeOut) + if err != nil { + logger.Error(err, "Failed to handle state on reconnect") + return err + } + + logger.Info("Handled state on reconnect") + + } + return nil +} + +func execWithTimeout(f func() error, d time.Duration) (bool, error) { + errChan := make(chan error, 1) + go func() { + errChan <- f() + close(errChan) + }() + t := time.NewTimer(d) + select { + case <-t.C: + return true, status.Errorf(codes.DeadlineExceeded, "Failed to send event within timeout") + case err := <-errChan: + if !t.Stop() { + <-t.C + } + return false, err + } +} diff --git a/operator/scheduler/control_plane_test.go b/operator/scheduler/control_plane_test.go new file mode 100644 index 0000000000..3bdd535f55 --- /dev/null +++ b/operator/scheduler/control_plane_test.go @@ -0,0 +1,287 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed by +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + +package scheduler + +import ( + "context" + "fmt" + "testing" + "time" + + . 
"github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + + mlopsv1alpha1 "github.com/seldonio/seldon-core/operator/v2/apis/mlops/v1alpha1" + "github.com/seldonio/seldon-core/operator/v2/pkg/constants" +) + +func TestSendWithTimeout(t *testing.T) { + g := NewGomegaWithT(t) + + type test struct { + name string + sleepTime time.Duration + err error + isErr bool + isExpired bool + } + + fn := func(err error) error { + time.Sleep(5 * time.Millisecond) + return err + } + + tests := []test{ + { + name: "simple", + sleepTime: 10 * time.Millisecond, + err: nil, + isErr: false, + isExpired: false, + }, + { + name: "timeout", + sleepTime: 1 * time.Millisecond, + err: nil, + isErr: true, + isExpired: true, + }, + { + name: "error", + sleepTime: 10 * time.Millisecond, + err: fmt.Errorf("error"), + isErr: true, + isExpired: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + hasExpired, err := execWithTimeout(func() error { + return fn(test.err) + }, test.sleepTime) + g.Expect(hasExpired).To(Equal(test.isExpired)) + if test.isErr { + g.Expect(err).ToNot(BeNil()) + } else { + g.Expect(err).To(BeNil()) + } + }) + } +} + +func TestControlPlaneEvents(t *testing.T) { + g := NewGomegaWithT(t) + + type test struct { + name string + existing_resources []client.Object + expected_requests_pipelines []*scheduler.LoadPipelineRequest + expected_requests_models []*scheduler.LoadModelRequest + expected_requests_models_unload []*scheduler.UnloadModelRequest + expected_requests_servers []*scheduler.ServerNotify + expected_requests_experiments []*scheduler.StartExperimentRequest + } + now := metav1.Now() + + tests := []test{ + { + name: "with no deleted resources", + existing_resources: []client.Object{ + &mlopsv1alpha1.Pipeline{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + Namespace: "default", + Generation: 1, + }, + Spec: mlopsv1alpha1.PipelineSpec{}, + }, + &mlopsv1alpha1.Pipeline{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bar", + Namespace: "default", + Generation: 1, + DeletionTimestamp: &now, + Finalizers: []string{constants.PipelineFinalizerName}, + }, + Spec: mlopsv1alpha1.PipelineSpec{}, + }, + &mlopsv1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + Namespace: "default", + Generation: 1, + }, + Spec: mlopsv1alpha1.ModelSpec{}, + }, + &mlopsv1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bar", + Namespace: "default", + Generation: 1, + DeletionTimestamp: &now, + Finalizers: []string{constants.ModelFinalizerName}, + }, + Spec: mlopsv1alpha1.ModelSpec{}, + }, + &mlopsv1alpha1.Experiment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + Namespace: "default", + Generation: 1, + }, + Spec: mlopsv1alpha1.ExperimentSpec{}, + }, + &mlopsv1alpha1.Experiment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bar", + Namespace: "default", + Generation: 1, + DeletionTimestamp: &now, + Finalizers: []string{constants.ExperimentFinalizerName}, + }, + Spec: mlopsv1alpha1.ExperimentSpec{}, + }, + &mlopsv1alpha1.Server{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + Namespace: "default", + Generation: 1, + }, + Spec: mlopsv1alpha1.ServerSpec{}, + }, + }, + expected_requests_pipelines: []*scheduler.LoadPipelineRequest{ + { + Pipeline: &scheduler.Pipeline{ + KubernetesMeta: &scheduler.KubernetesMeta{ + Namespace: "default", + Generation: 1, + }, + Name: "foo", + }, + }, + }, + expected_requests_models: []*scheduler.LoadModelRequest{ + { + 
Model: &scheduler.Model{ + Meta: &scheduler.MetaData{ + Name: "foo", + KubernetesMeta: &scheduler.KubernetesMeta{ + Namespace: "default", + Generation: 1, + }, + }, + ModelSpec: &scheduler.ModelSpec{}, + DeploymentSpec: &scheduler.DeploymentSpec{ + Replicas: 1, + }, + }, + }, + }, + expected_requests_experiments: []*scheduler.StartExperimentRequest{ + { + Experiment: &scheduler.Experiment{ + KubernetesMeta: &scheduler.KubernetesMeta{ + Namespace: "default", + Generation: 1, + }, + Name: "foo", + }, + }, + }, + expected_requests_models_unload: []*scheduler.UnloadModelRequest{ + { + Model: &scheduler.ModelReference{ + Name: "bar", + }, + KubernetesMeta: &scheduler.KubernetesMeta{ + Namespace: "default", + Generation: 1, + }, + }, + }, + expected_requests_servers: []*scheduler.ServerNotify{ + { + Name: "foo", + KubernetesMeta: &scheduler.KubernetesMeta{ + Namespace: "default", + Generation: 1, + }, + ExpectedReplicas: 1, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + grpcClient := mockSchedulerGrpcClient{} + + controller := newMockControllerClient(test.existing_resources...) + + err := controller.SubscribeControlPlaneEvents(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) + + // check state is correct for each resource + for _, r := range test.expected_requests_pipelines { + g.Expect(grpcClient.requests_pipelines).To(ContainElement(r)) + } + g.Expect(len(grpcClient.requests_pipelines)).To(Equal(len(test.expected_requests_pipelines))) + for _, r := range test.expected_requests_experiments { + g.Expect(grpcClient.requests_experiments).To(ContainElement(r)) + } + g.Expect(len(grpcClient.requests_experiments)).To(Equal(len(test.expected_requests_experiments))) + for _, r := range test.expected_requests_models_unload { + g.Expect(grpcClient.requests_models_unload).To(ContainElement(r)) + } + g.Expect(len(grpcClient.requests_models_unload)).To(Equal(len(test.expected_requests_models_unload))) + for _, r := range test.expected_requests_models { + g.Expect(grpcClient.requests_models).To(ContainElement(r)) + } + g.Expect(len(grpcClient.requests_models)).To(Equal(len(test.expected_requests_models))) + for _, r := range test.expected_requests_servers { + g.Expect(grpcClient.requests_servers).To(ContainElement(r)) + } + g.Expect(len(grpcClient.requests_servers)).To(Equal(len(test.expected_requests_servers))) + + // should have no pipelines or experiments as they are just removed from k8s + g.Expect(len(grpcClient.requests_pipelines_unload)).To(Equal(0)) + g.Expect(len(grpcClient.requests_experiments_unload)).To(Equal(0)) + + // we should have removed the pipeline and experiment from the controller + experiment := &mlopsv1alpha1.Experiment{} + err = controller.Get( + context.Background(), + client.ObjectKey{ + Name: "bar", + Namespace: "default", + }, + experiment, + ) + g.Expect(err).ToNot(BeNil()) + + pipeline := &mlopsv1alpha1.Pipeline{} + err = controller.Get( + context.Background(), + client.ObjectKey{ + Name: "bar", + Namespace: "default", + }, + pipeline, + ) + g.Expect(err).ToNot(BeNil()) + + }) + } +} diff --git a/operator/scheduler/experiment.go b/operator/scheduler/experiment.go index 4bdd6fecab..c2b4275800 100644 --- a/operator/scheduler/experiment.go +++ b/operator/scheduler/experiment.go @@ -76,25 +76,15 @@ func (s *SchedulerClient) StopExperiment(ctx context.Context, experiment *v1alph func (s *SchedulerClient) SubscribeExperimentEvents(ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { logger 
:= s.logger.WithName("SubscribeExperimentEvents") - stream, err := grpcClient.SubscribeExperimentStatus(ctx, &scheduler.ExperimentSubscriptionRequest{SubscriberName: "seldon manager"}, grpc_retry.WithMax(1)) - if err != nil { - return err - } - - // get experiments from the scheduler - // if there are no experiments in the scheduler state then we need to create them - // this is likely because of a restart of the scheduler that migrated the state - // to v2 (where we delete the experiments from the scheduler state) - numExperimentsFromScheduler, err := getNumExperimentsFromScheduler(ctx, grpcClient) + stream, err := grpcClient.SubscribeExperimentStatus( + ctx, + &scheduler.ExperimentSubscriptionRequest{SubscriberName: "seldon manager"}, + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + ) if err != nil { return err } - // if there are no experiments in the scheduler state then we need to create them if they exist in k8s - // also remove finalizers from experiments that are being deleted - if numExperimentsFromScheduler == 0 { - handleLoadedExperiments(ctx, namespace, s, grpcClient) - handlePendingDeleteExperiments(ctx, namespace, s) - } for { event, err := stream.Recv() diff --git a/operator/scheduler/experiment_test.go b/operator/scheduler/experiment_test.go index 9874d84da7..60185e61c4 100644 --- a/operator/scheduler/experiment_test.go +++ b/operator/scheduler/experiment_test.go @@ -34,7 +34,7 @@ func TestSubscribeExperimentsEvents(t *testing.T) { } now := metav1.Now() - // note expected state is derived in the test, maybe we should be explictl about it in the future + // note expected state is derived in the test, maybe we should be explicitly about it in the future tests := []test{ { name: "experiment ready", @@ -322,7 +322,9 @@ func TestSubscribeExperimentsEvents(t *testing.T) { } } controller := newMockControllerClient(test.existing_resources...) - err := controller.SubscribeExperimentEvents(context.Background(), &grpcClient, "") + err := controller.handleExperiments(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) + err = controller.SubscribeExperimentEvents(context.Background(), &grpcClient, "") g.Expect(err).To(BeNil()) isBeingDeleted := map[string]bool{} diff --git a/operator/scheduler/model.go b/operator/scheduler/model.go index 72c4ba935d..767aabd84f 100644 --- a/operator/scheduler/model.go +++ b/operator/scheduler/model.go @@ -124,11 +124,6 @@ func (s *SchedulerClient) SubscribeModelEvents(ctx context.Context, grpcClient s return err } - // on new reconnects check if we have models that are stuck in deletion and therefore we need to reconcile their states - go handlePendingDeleteModels(ctx, namespace, s, grpcClient) - // on new reconnects we reload the models that are marked as loaded in k8s as the scheduler might have lost the state - go handleLoadedModels(ctx, namespace, s, grpcClient) - for { event, err := stream.Recv() if err != nil { diff --git a/operator/scheduler/model_test.go b/operator/scheduler/model_test.go new file mode 100644 index 0000000000..9cbf4c7d86 --- /dev/null +++ b/operator/scheduler/model_test.go @@ -0,0 +1,256 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. 
+*/ + +package scheduler + +import ( + "context" + "testing" + + . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + + mlopsv1alpha1 "github.com/seldonio/seldon-core/operator/v2/apis/mlops/v1alpha1" + "github.com/seldonio/seldon-core/operator/v2/pkg/constants" +) + +func TestSubscribeModelEvents(t *testing.T) { + g := NewGomegaWithT(t) + + type test struct { + name string + existing_resources []client.Object + results []*scheduler.ModelStatusResponse + noSchedulerState bool + } + now := metav1.Now() + + // note expected state is derived in the test, maybe we should be explicitly about it in the future + tests := []test{ + { + name: "model available", + existing_resources: []client.Object{ + &mlopsv1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + Namespace: "default", + Generation: 1, + }, + Status: mlopsv1alpha1.ModelStatus{ + Replicas: 1, + }, + }, + }, + results: []*scheduler.ModelStatusResponse{ + { + ModelName: "foo", + Versions: []*scheduler.ModelVersionStatus{ + { + KubernetesMeta: &scheduler.KubernetesMeta{ + Namespace: "default", + Generation: 1, + }, + State: &scheduler.ModelStatus{ + State: scheduler.ModelStatus_ModelAvailable, + AvailableReplicas: 1, + }, + }, + }, + }, + }, + }, + { + name: "model not available", + existing_resources: []client.Object{ + &mlopsv1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + Namespace: "default", + Generation: 1, + }, + Status: mlopsv1alpha1.ModelStatus{ + Replicas: 1, + }, + }, + }, + results: []*scheduler.ModelStatusResponse{ + { + ModelName: "foo", + Versions: []*scheduler.ModelVersionStatus{ + { + KubernetesMeta: &scheduler.KubernetesMeta{ + Namespace: "default", + Generation: 1, + }, + State: &scheduler.ModelStatus{ + State: scheduler.ModelStatus_ModelProgressing, + AvailableReplicas: 0, + UnavailableReplicas: 1, + }, + }, + }, + }, + }, + }, + { + name: "model being removed", + existing_resources: []client.Object{ + &mlopsv1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + Namespace: "default", + Generation: 1, + DeletionTimestamp: &now, + Finalizers: []string{constants.ModelFinalizerName}, + }, + Status: mlopsv1alpha1.ModelStatus{ + Replicas: 1, + }, + }, + }, + results: []*scheduler.ModelStatusResponse{ + { + ModelName: "foo", + Versions: []*scheduler.ModelVersionStatus{ + { + KubernetesMeta: &scheduler.KubernetesMeta{ + Namespace: "default", + Generation: 1, + }, + State: &scheduler.ModelStatus{ + State: scheduler.ModelStatus_ModelTerminated, + }, + }, + }, + }, + }, + }, + { + name: "model not removed", + existing_resources: []client.Object{ + &mlopsv1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + Namespace: "default", + Generation: 1, + DeletionTimestamp: &now, + Finalizers: []string{constants.ModelFinalizerName}, + }, + Status: mlopsv1alpha1.ModelStatus{ + Replicas: 1, + }, + }, + }, + results: []*scheduler.ModelStatusResponse{ + { + ModelName: "foo", + Versions: []*scheduler.ModelVersionStatus{ + { + KubernetesMeta: &scheduler.KubernetesMeta{ + Namespace: "default", + Generation: 1, + }, + State: &scheduler.ModelStatus{ + State: scheduler.ModelStatus_ModelTerminating, + AvailableReplicas: 1, + }, + }, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // note that if responses_pipelines is nil -> scheduler state is not existing + var grpcClient mockSchedulerGrpcClient + if 
!test.noSchedulerState { + grpcClient = mockSchedulerGrpcClient{ + responses_subscribe_models: test.results, + responses_models: test.results, + } + } else { + grpcClient = mockSchedulerGrpcClient{ + responses_subscribe_models: test.results, + } + } + controller := newMockControllerClient(test.existing_resources...) + err := controller.handleModels(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) + err = controller.SubscribeModelEvents(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) + + isBeingDeleted := map[string]bool{} + for _, req := range test.existing_resources { + if !req.GetDeletionTimestamp().IsZero() { + isBeingDeleted[req.GetName()] = true + } else { + isBeingDeleted[req.GetName()] = false + } + } + + // for model resources that are not deleted we reload them + // this is not necessary check but added for sanity + activeResources := 0 + for idx, req := range test.existing_resources { + if req.GetDeletionTimestamp().IsZero() { + g.Expect(req.GetName()).To(Equal(grpcClient.requests_models[idx].Model.GetMeta().GetName())) + activeResources++ + } + } + g.Expect(len(grpcClient.requests_models)).To(Equal(activeResources)) + + // check state is correct for each model + for _, r := range test.results { + if r.Versions[0].State.GetState() != scheduler.ModelStatus_ModelTerminated { + model := &mlopsv1alpha1.Model{} + err := controller.Get( + context.Background(), + client.ObjectKey{ + Name: r.GetModelName(), + Namespace: r.Versions[0].KubernetesMeta.Namespace, + }, + model, + ) + // we check if the model is not in k8s (existing_resources) then we should not act on it + if _, ok := isBeingDeleted[r.GetModelName()]; !ok { + g.Expect(err).ToNot(BeNil()) + } else { + g.Expect(err).To(BeNil()) + } + if r.Versions[0].State.GetState() == scheduler.ModelStatus_ModelAvailable { + g.Expect(model.Status.IsReady()).To(BeTrueBecause("Model state is ModelAvailable")) + } else { + g.Expect(model.Status.IsReady()).To(BeFalseBecause("Model state is not ModelAvailable")) + } + + g.Expect(uint32(model.Status.Replicas)).To(Equal(r.Versions[0].State.GetAvailableReplicas() + r.Versions[0].State.GetUnavailableReplicas())) + g.Expect(model.Status.Selector).To(Equal("server=" + r.Versions[0].ServerName)) + } else { + model := &mlopsv1alpha1.Model{} + err := controller.Get( + context.Background(), + client.ObjectKey{ + Name: r.GetModelName(), + Namespace: r.Versions[0].KubernetesMeta.Namespace, + }, + model, + ) + g.Expect(err).ToNot(BeNil()) + + } + } + + }) + } +} diff --git a/operator/scheduler/pipeline.go b/operator/scheduler/pipeline.go index 07883d6aa7..586e84103d 100644 --- a/operator/scheduler/pipeline.go +++ b/operator/scheduler/pipeline.go @@ -92,20 +92,6 @@ func (s *SchedulerClient) SubscribePipelineEvents(ctx context.Context, grpcClien return err } - // get pipelines from the scheduler - // if there are no pipelines in the scheduler state then we need to create them - // this is likely because of the scheduler state got deleted - numPipelinesFromScheduler, err := getNumPipelinesFromScheduler(ctx, grpcClient) - if err != nil { - return err - } - // if there are no pipelines in the scheduler state then we need to create them if they exist in k8s - // also remove finalizers from pipelines that are being deleted - if numPipelinesFromScheduler == 0 { - handleLoadedPipelines(ctx, namespace, s, grpcClient) - handlePendingDeletePipelines(ctx, namespace, s) - } - for { event, err := stream.Recv() if err != nil { diff --git a/operator/scheduler/pipeline_test.go 
b/operator/scheduler/pipeline_test.go index 59426d088e..0ad7d8d635 100644 --- a/operator/scheduler/pipeline_test.go +++ b/operator/scheduler/pipeline_test.go @@ -34,7 +34,7 @@ func TestSubscribePipelineEvents(t *testing.T) { } now := metav1.Now() - // note expected state is derived in the test, maybe we should be explictl about it in the future + // note expected state is derived in the test, maybe we should be explicitly about it in the future tests := []test{ { name: "model and pipeline ready - no scheduler state", @@ -376,7 +376,9 @@ func TestSubscribePipelineEvents(t *testing.T) { } } controller := newMockControllerClient(test.existing_resources...) - err := controller.SubscribePipelineEvents(context.Background(), &grpcClient, "") + err := controller.handlePipelines(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) + err = controller.SubscribePipelineEvents(context.Background(), &grpcClient, "") g.Expect(err).To(BeNil()) isBeingDeleted := map[string]bool{} diff --git a/operator/scheduler/server.go b/operator/scheduler/server.go index fa3144719f..553aca22d7 100644 --- a/operator/scheduler/server.go +++ b/operator/scheduler/server.go @@ -91,9 +91,6 @@ func (s *SchedulerClient) SubscribeServerEvents(ctx context.Context, grpcClient return err } - // on new reconnects we send a list of servers to the schedule - go handleRegisteredServers(ctx, namespace, s, grpcClient) - for { event, err := stream.Recv() if err != nil { @@ -105,6 +102,7 @@ func (s *SchedulerClient) SubscribeServerEvents(ctx context.Context, grpcClient } logger.Info("Received event", "server", event.ServerName) + if event.GetKubernetesMeta() == nil { logger.Info("Received server event with no k8s metadata so ignoring", "server", event.ServerName) continue diff --git a/operator/scheduler/server_test.go b/operator/scheduler/server_test.go index 20ab372744..0f624dc967 100644 --- a/operator/scheduler/server_test.go +++ b/operator/scheduler/server_test.go @@ -185,7 +185,7 @@ func TestSubscribeServerEvents(t *testing.T) { noSchedulerState bool } - // note expected state is derived in the test, maybe we should be explictl about it in the future + // note expected state is derived in the test, maybe we should be explicit about it in the future tests := []test{ { // no scheduler state means lost servers metadata diff --git a/operator/scheduler/utils.go b/operator/scheduler/utils.go index 000c8be4f5..2bdeda38b1 100644 --- a/operator/scheduler/utils.go +++ b/operator/scheduler/utils.go @@ -25,8 +25,8 @@ import ( // TODO: unify these helper functions as they do more or less the same thing -func handleLoadedExperiments( - ctx context.Context, namespace string, s *SchedulerClient, grpcClient scheduler.SchedulerClient) { +func (s *SchedulerClient) handleLoadedExperiments( + ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { experimentList := &v1alpha1.ExperimentList{} // Get all experiments in the namespace err := s.List( @@ -35,34 +35,42 @@ func handleLoadedExperiments( client.InNamespace(namespace), ) if err != nil { - return + return err } for _, experiment := range experimentList.Items { // experiments that are not in the process of being deleted has DeletionTimestamp as zero if experiment.ObjectMeta.DeletionTimestamp.IsZero() { s.logger.V(1).Info("Calling start experiment (on reconnect)", "experiment", experiment.Name) - if _, err := s.StartExperiment(ctx, &experiment, grpcClient); err != nil { - // if this is a retryable error, we will retry on the next connection reconnect + if 
retryable, err := s.StartExperiment(ctx, &experiment, grpcClient); err != nil { s.logger.Error(err, "Failed to call start experiment", "experiment", experiment.Name) + if retryable { + // if this is a retryable error, we break (caller will also stop sync process and force a reconnect to the scheduler) + return err + } else { + // if it is not retryable then we continue to the next experiment + continue + } } else { s.logger.V(1).Info("Start experiment called successfully", "experiment", experiment.Name) } } } + + return nil } -func handlePendingDeleteExperiments( - ctx context.Context, namespace string, s *SchedulerClient) { +func (s *SchedulerClient) handlePendingDeleteExperiments( + ctx context.Context, namespace string) error { experimentList := &v1alpha1.ExperimentList{} - // Get all models in the namespace + // Get all experiments in the namespace err := s.List( ctx, experimentList, client.InNamespace(namespace), ) if err != nil { - return + return err } // Check if any experiments are being deleted @@ -80,15 +88,18 @@ func handlePendingDeleteExperiments( }) if retryErr != nil { s.logger.Error(err, "Failed to remove finalizer after retries", "experiment", experiment.Name) + return retryErr } } } + + return nil } // when need to reload the models that are marked in k8s as loaded, this is because there could be a // case where the scheduler has load the models state (if the scheduler and the model server restart at the same time) -func handleLoadedModels( - ctx context.Context, namespace string, s *SchedulerClient, grpcClient scheduler.SchedulerClient) { +func (s *SchedulerClient) handleLoadedModels( + ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { modelList := &v1alpha1.ModelList{} // Get all models in the namespace err := s.List( @@ -97,27 +108,35 @@ func handleLoadedModels( client.InNamespace(namespace), ) if err != nil { - return + return err } for _, model := range modelList.Items { // models that are not in the process of being deleted has DeletionTimestamp as zero if model.ObjectMeta.DeletionTimestamp.IsZero() { s.logger.V(1).Info("Calling Load model (on reconnect)", "model", model.Name) - if _, err := s.LoadModel(ctx, &model, grpcClient); err != nil { - // if this is a retryable error, we will retry on the next connection reconnect + if retryable, err := s.LoadModel(ctx, &model, grpcClient); err != nil { s.logger.Error(err, "Failed to call load model", "model", model.Name) + if retryable { + // if this is a retryable error, we break (caller will also stop sync process and force a reconnect to the scheduler) + return err + } else { + // if it is not retryable then we continue to the next model + continue + } } else { s.logger.V(1).Info("Load model called successfully", "model", model.Name) } } else { - s.logger.V(1).Info("Model is being deleted, not loading", "model", model.Name) + s.logger.V(1).Info("Model is being deleted, skip loading", "model", model.Name) } } + + return nil } -func handleRegisteredServers( - ctx context.Context, namespace string, s *SchedulerClient, grpcClient scheduler.SchedulerClient) { +func (s *SchedulerClient) handleRegisteredServers( + ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { serverList := &v1alpha1.ServerList{} // Get all servers in the namespace err := s.List( @@ -126,16 +145,20 @@ func handleRegisteredServers( client.InNamespace(namespace), ) if err != nil { - return + s.logger.Error(err, "Failed to list servers", "namespace", namespace) + return err } if err := 
s.ServerNotify(ctx, grpcClient, serverList.Items, true); err != nil { s.logger.Error(err, "Failed to notify servers", "servers", serverList.Items) + return err } + + return nil } -func handlePendingDeleteModels( - ctx context.Context, namespace string, s *SchedulerClient, grpcClient scheduler.SchedulerClient) { +func (s *SchedulerClient) handlePendingDeleteModels( + ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { modelList := &v1alpha1.ModelList{} // Get all models in the namespace err := s.List( @@ -144,7 +167,7 @@ func handlePendingDeleteModels( client.InNamespace(namespace), ) if err != nil { - return + return err } // Check if any models are being deleted @@ -155,10 +178,10 @@ func handlePendingDeleteModels( if retryUnload { // caller will retry as this method is called on connection reconnect s.logger.Error(err, "Failed to call unload model", "model", model.Name) - continue + return err } else { // this is essentially a failed pre-condition (model does not exist in scheduler) - // we can remove + // we can remove the finalizer // note that there is still the chance the model is not updated from the different model servers // upon reconnection of the scheduler retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error { @@ -172,6 +195,7 @@ func handlePendingDeleteModels( }) if retryErr != nil { s.logger.Error(err, "Failed to remove finalizer after retries", "model", model.Name) + return retryErr } } } else { @@ -179,13 +203,15 @@ func handlePendingDeleteModels( s.logger.Info("Unload model called successfully, not removing finalizer", "model", model.Name) } } else { - s.logger.V(1).Info("Model is not being deleted, not unloading", "model", model.Name) + s.logger.V(1).Info("Model is not being deleted, skip unloading", "model", model.Name) } } + + return nil } -func handleLoadedPipelines( - ctx context.Context, namespace string, s *SchedulerClient, grpcClient scheduler.SchedulerClient) { +func (s *SchedulerClient) handleLoadedPipelines( + ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { pipelineList := &v1alpha1.PipelineList{} // Get all pipelines in the namespace err := s.List( @@ -194,25 +220,33 @@ func handleLoadedPipelines( client.InNamespace(namespace), ) if err != nil { - return + return err } for _, pipeline := range pipelineList.Items { // pipelines that are not in the process of being deleted has DeletionTimestamp as zero if pipeline.ObjectMeta.DeletionTimestamp.IsZero() { s.logger.V(1).Info("Calling load pipeline (on reconnect)", "pipeline", pipeline.Name) - if _, err := s.LoadPipeline(ctx, &pipeline, grpcClient); err != nil { - // if this is a retryable error, we will retry on the next connection reconnect + if retryable, err := s.LoadPipeline(ctx, &pipeline, grpcClient); err != nil { s.logger.Error(err, "Failed to call load pipeline", "pipeline", pipeline.Name) + if retryable { + // if this is a retryable error, we break (caller will also stop sync process and force a reconnect to the scheduler) + return err + } else { + // if it is not retryable then we continue to the next pipeline + continue + } } else { s.logger.V(1).Info("Load pipeline called successfully", "pipeline", pipeline.Name) } } } + + return nil } -func handlePendingDeletePipelines( - ctx context.Context, namespace string, s *SchedulerClient) { +func (s *SchedulerClient) handlePendingDeletePipelines( + ctx context.Context, namespace string) error { pipelineList := &v1alpha1.PipelineList{} // Get all models in the namespace err := 
s.List( @@ -221,7 +255,7 @@ func handlePendingDeletePipelines( client.InNamespace(namespace), ) if err != nil { - return + return err } // Check if any pipelines are being deleted @@ -239,9 +273,12 @@ func handlePendingDeletePipelines( }) if retryErr != nil { s.logger.Error(err, "Failed to remove finalizer after retries", "pipeline", pipeline.Name) + return retryErr } } } + + return nil } func getNumExperimentsFromScheduler(ctx context.Context, grpcClient scheduler.SchedulerClient) (int, error) { @@ -291,3 +328,66 @@ func getNumPipelinesFromScheduler(ctx context.Context, grpcClient scheduler.Sche } return numPipelinesFromScheduler, nil } + +func (s *SchedulerClient) handleExperiments( + ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { + // get experiments from the scheduler + // if there are no experiments in the scheduler state then we need to create them + // this is likely because of a restart of the scheduler that migrated the state + // to v2 (where we delete the experiments from the scheduler state) + numExperimentsFromScheduler, err := getNumExperimentsFromScheduler(ctx, grpcClient) + if err != nil { + return err + } + // if there are no experiments in the scheduler state then we need to create them if they exist in k8s + // also remove finalizers from experiments that are being deleted + if numExperimentsFromScheduler == 0 { + if err := s.handleLoadedExperiments(ctx, grpcClient, namespace); err != nil { + return err + } + if err := s.handlePendingDeleteExperiments(ctx, namespace); err != nil { + return err + } + } + + return nil +} + +func (s *SchedulerClient) handlePipelines( + ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { + // get pipelines from the scheduler + // if there are no pipelines in the scheduler state then we need to create them + // this is likely because of a restart of the scheduler that migrated the state + // to v2 (where we delete the pipelines from the scheduler state) + numPipelinesFromScheduler, err := getNumPipelinesFromScheduler(ctx, grpcClient) + if err != nil { + return err + } + // if there are no pipelines in the scheduler state then we need to create them if they exist in k8s + // also remove finalizers from pipelines that are being deleted + if numPipelinesFromScheduler == 0 { + if err := s.handleLoadedPipelines(ctx, grpcClient, namespace); err != nil { + return err + } + if err := s.handlePendingDeletePipelines(ctx, namespace); err != nil { + return err + } + } + + return nil +} + +func (s *SchedulerClient) handleModels( + ctx context.Context, grpcClient scheduler.SchedulerClient, namespace string) error { + + // on new reconnects check if we have models that are stuck in deletion and therefore we need to reconcile their states + if err := s.handlePendingDeleteModels(ctx, grpcClient, namespace); err != nil { + return err + } + // on new reconnects we reload the models that are marked as loaded in k8s as the scheduler might have lost the state + if err := s.handleLoadedModels(ctx, grpcClient, namespace); err != nil { + return err + } + + return nil +} diff --git a/operator/scheduler/utils_test.go b/operator/scheduler/utils_test.go index 644b280542..fc71c0b505 100644 --- a/operator/scheduler/utils_test.go +++ b/operator/scheduler/utils_test.go @@ -106,6 +106,27 @@ func (s *mockSchedulerServerSubscribeGrpcClient) Recv() (*scheduler.ServerStatus return nil, io.EOF } +// Control Plane subscribe mock grpc client + +type mockControlPlaneSubscribeGrpcClient struct { + sent bool + 
grpc.ClientStream +} + +var _ scheduler.Scheduler_SubscribeControlPlaneClient = (*mockControlPlaneSubscribeGrpcClient)(nil) + +func newMockControlPlaneSubscribeGrpcClient() *mockControlPlaneSubscribeGrpcClient { + return &mockControlPlaneSubscribeGrpcClient{} +} + +func (s *mockControlPlaneSubscribeGrpcClient) Recv() (*scheduler.ControlPlaneResponse, error) { + if !s.sent { + s.sent = true + return &scheduler.ControlPlaneResponse{}, nil + } + return nil, io.EOF +} + // Pipeline mock grpc client type mockSchedulerPipelineGrpcClient struct { @@ -156,6 +177,31 @@ func (s *mockSchedulerExperimentSubscribeGrpcClient) Recv() (*scheduler.Experime return nil, io.EOF } +// Model subscribe mock grpc client + +type mockSchedulerModelSubscribeGrpcClient struct { + counter int + results []*scheduler.ModelStatusResponse + grpc.ClientStream +} + +var _ scheduler.Scheduler_SubscribeModelStatusClient = (*mockSchedulerModelSubscribeGrpcClient)(nil) + +func newMockSchedulerModelSubscribeGrpcClient(results []*scheduler.ModelStatusResponse) *mockSchedulerModelSubscribeGrpcClient { + return &mockSchedulerModelSubscribeGrpcClient{ + results: results, + counter: 0, + } +} + +func (s *mockSchedulerModelSubscribeGrpcClient) Recv() (*scheduler.ModelStatusResponse, error) { + if s.counter < len(s.results) { + s.counter++ + return s.results[s.counter-1], nil + } + return nil, io.EOF +} + // Scheduler mock grpc client type mockSchedulerGrpcClient struct { @@ -165,9 +211,14 @@ type mockSchedulerGrpcClient struct { responses_subscribe_pipelines []*scheduler.PipelineStatusResponse responses_servers []*scheduler.ServerStatusResponse responses_subscribe_servers []*scheduler.ServerStatusResponse + responses_models []*scheduler.ModelStatusResponse + responses_subscribe_models []*scheduler.ModelStatusResponse requests_experiments []*scheduler.StartExperimentRequest + requests_experiments_unload []*scheduler.StopExperimentRequest requests_pipelines []*scheduler.LoadPipelineRequest + requests_pipelines_unload []*scheduler.UnloadPipelineRequest requests_models []*scheduler.LoadModelRequest + requests_models_unload []*scheduler.UnloadModelRequest requests_servers []*scheduler.ServerNotify errors map[string]error } @@ -192,6 +243,7 @@ func (s *mockSchedulerGrpcClient) UnloadModel(ctx context.Context, in *scheduler if ok { return nil, err } else { + s.requests_models_unload = append(s.requests_models_unload, in) return nil, nil } } @@ -200,6 +252,7 @@ func (s *mockSchedulerGrpcClient) LoadPipeline(ctx context.Context, in *schedule return nil, nil } func (s *mockSchedulerGrpcClient) UnloadPipeline(ctx context.Context, in *scheduler.UnloadPipelineRequest, opts ...grpc.CallOption) (*scheduler.UnloadPipelineResponse, error) { + s.requests_pipelines_unload = append(s.requests_pipelines_unload, in) return nil, nil } func (s *mockSchedulerGrpcClient) StartExperiment(ctx context.Context, in *scheduler.StartExperimentRequest, opts ...grpc.CallOption) (*scheduler.StartExperimentResponse, error) { @@ -207,6 +260,7 @@ func (s *mockSchedulerGrpcClient) StartExperiment(ctx context.Context, in *sched return nil, nil } func (s *mockSchedulerGrpcClient) StopExperiment(ctx context.Context, in *scheduler.StopExperimentRequest, opts ...grpc.CallOption) (*scheduler.StopExperimentResponse, error) { + s.requests_experiments_unload = append(s.requests_experiments_unload, in) return nil, nil } func (s *mockSchedulerGrpcClient) ServerStatus(ctx context.Context, in *scheduler.ServerStatusRequest, opts ...grpc.CallOption) 
(scheduler.Scheduler_ServerStatusClient, error) { @@ -226,7 +280,7 @@ func (s *mockSchedulerGrpcClient) SubscribeServerStatus(ctx context.Context, in return newMockSchedulerServerSubscribeGrpcClient(s.responses_subscribe_servers), nil } func (s *mockSchedulerGrpcClient) SubscribeModelStatus(ctx context.Context, in *scheduler.ModelSubscriptionRequest, opts ...grpc.CallOption) (scheduler.Scheduler_SubscribeModelStatusClient, error) { - return nil, nil + return newMockSchedulerModelSubscribeGrpcClient(s.responses_subscribe_models), nil } func (s *mockSchedulerGrpcClient) SubscribeExperimentStatus(ctx context.Context, in *scheduler.ExperimentSubscriptionRequest, opts ...grpc.CallOption) (scheduler.Scheduler_SubscribeExperimentStatusClient, error) { return newMockSchedulerExperimentSubscribeGrpcClient(s.responses_subscribe_experiments), nil @@ -234,6 +288,9 @@ func (s *mockSchedulerGrpcClient) SubscribeExperimentStatus(ctx context.Context, func (s *mockSchedulerGrpcClient) SubscribePipelineStatus(ctx context.Context, in *scheduler.PipelineSubscriptionRequest, opts ...grpc.CallOption) (scheduler.Scheduler_SubscribePipelineStatusClient, error) { return newMockSchedulerPipelineSubscribeGrpcClient(s.responses_subscribe_pipelines), nil } +func (s *mockSchedulerGrpcClient) SubscribeControlPlane(ctx context.Context, in *scheduler.ControlPlaneSubscriptionRequest, opts ...grpc.CallOption) (scheduler.Scheduler_SubscribeControlPlaneClient, error) { + return newMockControlPlaneSubscribeGrpcClient(), nil +} // new mockSchedulerClient (not grpc) func newMockControllerClient(objs ...client.Object) *SchedulerClient { @@ -305,7 +362,8 @@ func TestHandleLoadedExperiments(t *testing.T) { t.Run(test.name, func(t *testing.T) { grpcClient := mockSchedulerGrpcClient{} client := newMockControllerClient(test.resources...) - handleLoadedExperiments(context.Background(), "", client, &grpcClient) + err := client.handleLoadedExperiments(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) activeResources := 0 // TODO check the entire object for idx, req := range test.resources { @@ -375,7 +433,8 @@ func TestHandleLoadedModels(t *testing.T) { t.Run(test.name, func(t *testing.T) { grpcClient := mockSchedulerGrpcClient{} client := newMockControllerClient(test.resources...) - handleLoadedModels(context.Background(), "", client, &grpcClient) + err := client.handleLoadedModels(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) activeResources := 0 // TODO check the entire object for idx, req := range test.resources { @@ -445,7 +504,8 @@ func TestHandleLoadedPipelines(t *testing.T) { t.Run(test.name, func(t *testing.T) { grpcClient := mockSchedulerGrpcClient{} client := newMockControllerClient(test.resources...) - handleLoadedPipelines(context.Background(), "", client, &grpcClient) + err := client.handleLoadedPipelines(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) activeResources := 0 // TODO check the entire object for idx, req := range test.resources { @@ -514,11 +574,12 @@ func TestHandleDeletedExperiments(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := newMockControllerClient(test.resources...) 
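[Editor's note — illustrative sketch, not part of the patch] The refactor above turns the package-level handle* helpers into SchedulerClient methods that return errors, all following one contract: each resource yields a (retryable, err) pair, where a non-retryable failure skips only that resource while a retryable one aborts the whole sync so the reconnect loop can retry later. A self-contained Go sketch of that contract; loadOne is a hypothetical stand-in for the operator's LoadModel/LoadPipeline/StartExperiment wrappers:

package main

import (
	"errors"
	"fmt"
)

var errSchedulerUnavailable = errors.New("scheduler unavailable")

// loadOne is hypothetical: it mimics the (retryable, error) return shape of the
// operator's load/start helpers used by the handlers above.
func loadOne(name string) (bool, error) {
	switch name {
	case "bad-spec":
		return false, errors.New("invalid spec") // permanent: skip this resource
	case "flaky":
		return true, errSchedulerUnavailable // transient: abort, retry on reconnect
	default:
		return false, nil
	}
}

// syncAll mirrors the control flow of the refactored handleLoaded* helpers.
func syncAll(names []string) error {
	for _, n := range names {
		retryable, err := loadOne(n)
		if err != nil {
			if retryable {
				return err // caller forces a reconnect and re-runs the sync
			}
			continue // skip the bad resource, keep syncing the rest
		}
		fmt.Println("synced", n)
	}
	return nil
}

func main() {
	fmt.Println(syncAll([]string{"m1", "bad-spec", "m2"})) // skips bad-spec, returns <nil>
	fmt.Println(syncAll([]string{"m1", "flaky", "m2"}))    // stops early, returns the error
}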
- handlePendingDeleteExperiments(context.Background(), "", s) + err := s.handlePendingDeleteExperiments(context.Background(), "") + g.Expect(err).To(BeNil()) actualResourcesList := &mlopsv1alpha1.ExperimentList{} // Get all experiments in the namespace - err := s.List( + err = s.List( context.Background(), actualResourcesList, client.InNamespace(""), @@ -592,11 +653,12 @@ func TestHandleDeletedPipelines(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := newMockControllerClient(test.resources...) - handlePendingDeletePipelines(context.Background(), "", s) + err := s.handlePendingDeletePipelines(context.Background(), "") + g.Expect(err).To(BeNil()) actualResourcesList := &mlopsv1alpha1.PipelineList{} // Get all pipelines in the namespace - err := s.List( + err = s.List( context.Background(), actualResourcesList, client.InNamespace(""), @@ -675,11 +737,12 @@ func TestHandleDeletedModels(t *testing.T) { }, } s := newMockControllerClient(test.resources...) - handlePendingDeleteModels(context.Background(), "", s, &grpcClient) + err := s.handlePendingDeleteModels(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) actualResourcesList := &mlopsv1alpha1.ModelList{} // Get all models in the namespace - err := s.List( + err = s.List( context.Background(), actualResourcesList, client.InNamespace(""), @@ -777,7 +840,8 @@ func TestHandleRegisteredServers(t *testing.T) { t.Run(test.name, func(t *testing.T) { grpcClient := mockSchedulerGrpcClient{} client := newMockControllerClient(test.resources...) - handleRegisteredServers(context.Background(), "", client, &grpcClient) + err := client.handleRegisteredServers(context.Background(), &grpcClient, "") + g.Expect(err).To(BeNil()) g.Expect(grpcClient.requests_servers).To(Equal(test.expected)) }) } diff --git a/scheduler/cmd/scheduler/main.go b/scheduler/cmd/scheduler/main.go index 1163376186..132025bcea 100644 --- a/scheduler/cmd/scheduler/main.go +++ b/scheduler/cmd/scheduler/main.go @@ -283,6 +283,7 @@ func main() { s.StopSendServerEvents() s.StopSendExperimentEvents() s.StopSendPipelineEvents() + s.StopSendControlPlaneEvents() cs.StopSendPipelineEvents() as.StopAgentStreams() diff --git a/scheduler/pkg/server/control_plane.go b/scheduler/pkg/server/control_plane.go new file mode 100644 index 0000000000..a3031d6df8 --- /dev/null +++ b/scheduler/pkg/server/control_plane.go @@ -0,0 +1,71 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed by +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. 
+*/ + +package server + +import ( + pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" +) + +func (s *SchedulerServer) SubscribeControlPlane(req *pb.ControlPlaneSubscriptionRequest, stream pb.Scheduler_SubscribeControlPlaneServer) error { + logger := s.logger.WithField("func", "SubscribeControlPlane") + logger.Infof("Received subscribe request from %s", req.GetSubscriberName()) + + err := s.sendStartServerStreamMarker(stream) + if err != nil { + logger.WithError(err).Errorf("Failed to send start marker to %s", req.GetSubscriberName()) + return err + } + + fin := make(chan bool) + + s.controlPlaneStream.mu.Lock() + s.controlPlaneStream.streams[stream] = &ControlPlaneSubsription{ + name: req.GetSubscriberName(), + stream: stream, + fin: fin, + } + s.controlPlaneStream.mu.Unlock() + + ctx := stream.Context() + // Keep this scope alive because once this scope exits - the stream is closed + for { + select { + case <-fin: + logger.Infof("Closing stream for %s", req.GetSubscriberName()) + return nil + case <-ctx.Done(): + logger.Infof("Stream disconnected %s", req.GetSubscriberName()) + s.controlPlaneStream.mu.Lock() + delete(s.controlPlaneStream.streams, stream) + s.controlPlaneStream.mu.Unlock() + return nil + } + } +} + +func (s *SchedulerServer) StopSendControlPlaneEvents() { + s.controlPlaneStream.mu.Lock() + defer s.controlPlaneStream.mu.Unlock() + for _, subscription := range s.controlPlaneStream.streams { + close(subscription.fin) + } +} + +// this is to mark the initial start of a new stream (at application level) +// as otherwise the other side sometimes doesnt know if the scheduler has established a new stream explicitly +func (s *SchedulerServer) sendStartServerStreamMarker(stream pb.Scheduler_SubscribeControlPlaneServer) error { + ssr := &pb.ControlPlaneResponse{} + _, err := sendWithTimeout(func() error { return stream.Send(ssr) }, s.timeout) + if err != nil { + return err + } + + return nil +} diff --git a/scheduler/pkg/server/control_plane_test.go b/scheduler/pkg/server/control_plane_test.go new file mode 100644 index 0000000000..b7d0eb5dfe --- /dev/null +++ b/scheduler/pkg/server/control_plane_test.go @@ -0,0 +1,73 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed by +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + +package server + +import ( + "testing" + "time" + + . 
"github.com/onsi/gomega" + log "github.com/sirupsen/logrus" + + pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" +) + +func TestStartServerStream(t *testing.T) { + g := NewGomegaWithT(t) + type test struct { + name string + server *SchedulerServer + err bool + } + + tests := []test{ + { + name: "ok", + server: &SchedulerServer{ + modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + logger: log.New(), + timeout: 10 * time.Millisecond, + }, + }, + { + name: "timeout", + server: &SchedulerServer{ + modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + logger: log.New(), + timeout: 1 * time.Millisecond, + }, + err: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + stream := newStubControlPlaneServer(1, 5*time.Millisecond) + err := test.server.sendStartServerStreamMarker(stream) + if test.err { + g.Expect(err).ToNot(BeNil()) + } else { + g.Expect(err).To(BeNil()) + + var msr *pb.ControlPlaneResponse + select { + case next := <-stream.msgs: + msr = next + default: + t.Fail() + } + + g.Expect(msr).ToNot(BeNil()) + } + }) + } +} diff --git a/scheduler/pkg/server/experiment_status.go b/scheduler/pkg/server/experiment_status.go index 847acacb44..de0988d241 100644 --- a/scheduler/pkg/server/experiment_status.go +++ b/scheduler/pkg/server/experiment_status.go @@ -23,6 +23,8 @@ func (s *SchedulerServer) SubscribeExperimentStatus(req *pb.ExperimentSubscripti logger := s.logger.WithField("func", "SubscribeExperimentStatus") logger.Infof("Received subscribe request from %s", req.GetSubscriberName()) + s.synchroniser.WaitReady() + err := s.sendCurrentExperimentStatuses(stream) if err != nil { logger.WithError(err).Errorf("Failed to send current experiment statuses to %s", req.GetSubscriberName()) diff --git a/scheduler/pkg/server/pipeline_status.go b/scheduler/pkg/server/pipeline_status.go index 4581a437a9..852d09d506 100644 --- a/scheduler/pkg/server/pipeline_status.go +++ b/scheduler/pkg/server/pipeline_status.go @@ -23,6 +23,8 @@ func (s *SchedulerServer) SubscribePipelineStatus(req *pb.PipelineSubscriptionRe logger := s.logger.WithField("func", "SubscribePipelineStatus") logger.Infof("Received subscribe request from %s", req.GetSubscriberName()) + s.synchroniser.WaitReady() + err := s.sendCurrentPipelineStatuses(stream, false) if err != nil { return err diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index 64e436be91..3f74c40da1 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -60,6 +60,7 @@ type SchedulerServer struct { serverEventStream ServerEventStream experimentEventStream ExperimentEventStream pipelineEventStream PipelineEventStream + controlPlaneStream ControlPlaneStream certificateStore *seldontls.CertificateStore timeout time.Duration synchroniser synchroniser.Synchroniser @@ -89,6 +90,11 @@ type PipelineEventStream struct { streams map[pb.Scheduler_SubscribePipelineStatusServer]*PipelineSubscription } +type ControlPlaneStream struct { + mu sync.Mutex + streams map[pb.Scheduler_SubscribeControlPlaneServer]*ControlPlaneSubsription +} + type ModelSubscription struct { name string stream pb.Scheduler_SubscribeModelStatusServer @@ -113,6 +119,12 @@ type PipelineSubscription struct { fin chan bool } +type ControlPlaneSubsription struct { + name string + stream pb.Scheduler_SubscribeControlPlaneServer + fin chan bool +} + func (s *SchedulerServer) 
startServer(port uint, secure bool) error { logger := s.logger.WithField("func", "startServer") lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) @@ -198,6 +210,9 @@ func NewSchedulerServer( experimentEventStream: ExperimentEventStream{ streams: make(map[pb.Scheduler_SubscribeExperimentStatusServer]*ExperimentSubscription), }, + controlPlaneStream: ControlPlaneStream{ + streams: make(map[pb.Scheduler_SubscribeControlPlaneServer]*ControlPlaneSubsription), + }, timeout: sendTimeout, synchroniser: synchroniser, } @@ -245,7 +260,7 @@ func (s *SchedulerServer) ServerNotify(ctx context.Context, req *pb.ServerNotify } numExpectedReplicas += uint(server.ExpectedReplicas) } - if req.IsFirstSync { + if req.IsFirstSync && !s.synchroniser.IsReady() { s.synchroniser.Signals(numExpectedReplicas) logger.Infof("Signalling synchroniser with %d expected server agents to connect", numExpectedReplicas) } @@ -280,10 +295,12 @@ func (s *SchedulerServer) LoadModel(ctx context.Context, req *pb.LoadModelReques if err != nil { return nil, status.Errorf(codes.FailedPrecondition, err.Error()) } - err = s.scheduler.Schedule(req.GetModel().GetMeta().GetName()) - if err != nil { - return nil, status.Errorf(codes.FailedPrecondition, err.Error()) - } + go func() { + err := s.scheduler.Schedule(req.GetModel().GetMeta().GetName()) + if err != nil { + logger.WithError(err).Warnf("Failed to schedule model %s", req.GetModel().GetMeta().GetName()) + } + }() return &pb.LoadModelResponse{}, nil } @@ -294,10 +311,12 @@ func (s *SchedulerServer) UnloadModel(ctx context.Context, req *pb.UnloadModelRe if err != nil { return nil, status.Errorf(codes.FailedPrecondition, err.Error()) } - err = s.scheduler.Schedule(req.GetModel().Name) - if err != nil { - return nil, status.Errorf(codes.FailedPrecondition, err.Error()) - } + go func() { + err := s.scheduler.Schedule(req.GetModel().Name) + if err != nil { + logger.WithError(err).Warnf("Failed to schedule model %s (for unload)", req.GetModel().GetName()) + } + }() return &pb.UnloadModelResponse{}, nil } diff --git a/scheduler/pkg/server/server_status.go b/scheduler/pkg/server/server_status.go index 3eb1750e9b..118fda22ec 100644 --- a/scheduler/pkg/server/server_status.go +++ b/scheduler/pkg/server/server_status.go @@ -21,6 +21,8 @@ func (s *SchedulerServer) SubscribeModelStatus(req *pb.ModelSubscriptionRequest, logger := s.logger.WithField("func", "SubscribeModelStatus") logger.Infof("Received subscribe request from %s", req.GetSubscriberName()) + s.synchroniser.WaitReady() + err := s.sendCurrentModelStatuses(stream) if err != nil { logger.WithError(err).Errorf("Failed to send current model statuses to %s", req.GetSubscriberName()) @@ -128,7 +130,6 @@ func (s *SchedulerServer) SubscribeServerStatus(req *pb.ServerSubscriptionReques logger := s.logger.WithField("func", "SubscribeServerStatus") logger.Infof("Received subscribe request from %s", req.GetSubscriberName()) - // on reconnect we send the current state of the servers to the subscriber (controller) as we may have missed events err := s.sendCurrentServerStatuses(stream) if err != nil { logger.WithError(err).Errorf("Failed to send current server statuses to %s", req.GetSubscriberName()) diff --git a/scheduler/pkg/server/server_test.go b/scheduler/pkg/server/server_test.go index fec0dbf726..6262b802e4 100644 --- a/scheduler/pkg/server/server_test.go +++ b/scheduler/pkg/server/server_test.go @@ -13,6 +13,7 @@ import ( "context" "fmt" "sort" + "sync/atomic" "testing" "time" @@ -48,7 +49,7 @@ func (m *mockAgentHandler) 
SendAgentSync(modelName string) { func TestLoadModel(t *testing.T) { g := NewGomegaWithT(t) - createTestScheduler := func() (*SchedulerServer, *mockAgentHandler) { + createTestScheduler := func() (*SchedulerServer, *mockAgentHandler, *coordinator.EventHub) { logger := log.New() logger.SetLevel(log.WarnLevel) @@ -69,17 +70,17 @@ func TestLoadModel(t *testing.T) { sync.Signals(1) mockAgent := &mockAgentHandler{} - return s, mockAgent + return s, mockAgent, eventHub } smallMemory := uint64(100) largeMemory := uint64(2000) type test struct { - name string - req []*pba.AgentSubscribeRequest - model *pb.Model - code codes.Code + name string + req []*pba.AgentSubscribeRequest + model *pb.Model + scheduleFailed bool } tests := []test{ @@ -108,7 +109,7 @@ func TestLoadModel(t *testing.T) { }, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}, }, - code: codes.OK, + scheduleFailed: false, }, { name: "TooManyReplicas", @@ -134,7 +135,7 @@ func TestLoadModel(t *testing.T) { }, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}, }, - code: codes.FailedPrecondition, + scheduleFailed: true, }, { name: "TooMuchMemory", @@ -161,7 +162,7 @@ func TestLoadModel(t *testing.T) { }, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}, }, - code: codes.FailedPrecondition, + scheduleFailed: true, }, { name: "FailedRequirements", @@ -188,7 +189,7 @@ func TestLoadModel(t *testing.T) { }, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}, }, - code: codes.FailedPrecondition, + scheduleFailed: true, }, { name: "MultipleRequirements", @@ -215,7 +216,7 @@ func TestLoadModel(t *testing.T) { }, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}, }, - code: codes.OK, + scheduleFailed: false, }, { name: "TwoReplicas", @@ -254,7 +255,7 @@ func TestLoadModel(t *testing.T) { }, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}, }, - code: codes.OK, + scheduleFailed: false, }, { name: "TwoReplicasFail", @@ -293,34 +294,55 @@ func TestLoadModel(t *testing.T) { }, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}, }, - code: codes.FailedPrecondition, + scheduleFailed: true, }, // schedule to 2 replicas but 1 fails } for _, test := range tests { t.Run(test.name, func(t *testing.T) { // Given - s, _ := createTestScheduler() + s, _, h := createTestScheduler() for _, repReq := range test.req { err := s.modelStore.AddServerReplica(repReq) g.Expect(err).To(BeNil()) } + scheduledFailed := atomic.Bool{} + + // Subscribe to model events + h.RegisterModelEventHandler( + "handler-model", + 10, + log.New(), + func(event coordinator.ModelEventMsg) { + if event.ModelName != test.model.Meta.Name { + return + } + model, _ := s.modelStore.GetModel(event.ModelName) + latest := model.GetLatest() + if latest.ModelState().State == store.ScheduleFailed { + scheduledFailed.Store(true) + } else { + scheduledFailed.Store(false) + } + }, + ) + // When lm := pb.LoadModelRequest{ Model: test.model, } r, err := s.LoadModel(context.Background(), &lm) + time.Sleep(100 * time.Millisecond) + // Then - if test.code != codes.OK { - g.Expect(err).ToNot(BeNil()) - e, ok := status.FromError(err) - g.Expect(ok).To(BeTrue()) - g.Expect(e.Code()).To(Equal(test.code)) + g.Expect(r).ToNot(BeNil()) + g.Expect(err).To(BeNil()) + if test.scheduleFailed { + g.Expect(scheduledFailed.Load()).To(BeTrueBecause("schedule failed")) } else { - g.Expect(err).To(BeNil()) - g.Expect(r).ToNot(BeNil()) + g.Expect(scheduledFailed.Load()).To(BeFalseBecause("schedule ok")) } }) } @@ -350,11 +372,11 @@ func TestUnloadModel(t *testing.T) { } type test struct { - name string - req []*pba.AgentSubscribeRequest - 
model *pb.Model - code codes.Code - modelReplicaStates map[int]store.ModelReplicaState + name string + req []*pba.AgentSubscribeRequest + model *pb.Model + code codes.Code + modelState store.ModelState } modelName := "model1" smallMemory := uint64(100) @@ -364,18 +386,18 @@ func TestUnloadModel(t *testing.T) { req: []*pba.AgentSubscribeRequest{ {ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}}}, - model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, - code: codes.OK, - modelReplicaStates: map[int]store.ModelReplicaState{0: store.UnloadEnvoyRequested}, + model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, + code: codes.OK, + modelState: store.ModelTerminated, }, { name: "Multiple", req: []*pba.AgentSubscribeRequest{ {ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn", "xgboost"}}}}, - model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn", "xgboost"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, - code: codes.OK, - modelReplicaStates: map[int]store.ModelReplicaState{0: store.UnloadEnvoyRequested}, + model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn", "xgboost"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, + code: codes.OK, + modelState: store.ModelTerminated, }, { name: "TwoReplicas", @@ -384,9 +406,9 @@ func TestUnloadModel(t *testing.T) { ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}}, {ServerName: "server1", ReplicaIdx: 1, Shared: true, AvailableMemoryBytes: 1000, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}}}, - model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - code: codes.OK, - modelReplicaStates: map[int]store.ModelReplicaState{0: store.UnloadEnvoyRequested, 1: store.UnloadEnvoyRequested}, + model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + code: codes.OK, + modelState: store.ModelTerminated, }, { name: "NotExist", @@ -417,6 +439,7 @@ func TestUnloadModel(t *testing.T) { } rm := &pb.UnloadModelRequest{Model: &pb.ModelReference{Name: modelName}} r, err := s.UnloadModel(context.Background(), rm) + if test.code != codes.OK { g.Expect(err).ToNot(BeNil()) e, ok := status.FromError(err) @@ -427,9 +450,8 @@ func TestUnloadModel(t *testing.T) { g.Expect(r).ToNot(BeNil()) ms, err := s.modelStore.GetModel(modelName) g.Expect(err).To(BeNil()) - for replicaIdx, state := range test.modelReplicaStates { - 
g.Expect(ms.GetLatest().GetModelReplicaState(replicaIdx)).To(Equal(state))
-			}
+			g.Expect(ms.GetLatest().ModelState().State).To(Equal(test.modelState))
+
 			}
 		})
 	}
@@ -763,7 +785,7 @@ func TestServerNotify(t *testing.T) {
 			})
 
 			g.Expect(actualServers).To(Equal(test.expectedServerStates))
-			g.Expect(sync.IsReady()).To(Equal(test.signalTriggered))
+			g.Expect(sync.IsTriggered()).To(Equal(test.signalTriggered))
 		})
 	}
 }
@@ -831,6 +853,27 @@ func (s *stubServerStatusServer) Send(r *pb.ServerStatusResponse) error {
 	return nil
 }
 
+type stubControlPlaneServer struct {
+	msgs      chan *pb.ControlPlaneResponse
+	sleepTime time.Duration
+	grpc.ServerStream
+}
+
+var _ pb.Scheduler_SubscribeControlPlaneServer = (*stubControlPlaneServer)(nil)
+
+func newStubControlPlaneServer(capacity int, sleepTime time.Duration) *stubControlPlaneServer {
+	return &stubControlPlaneServer{
+		msgs:      make(chan *pb.ControlPlaneResponse, capacity),
+		sleepTime: sleepTime,
+	}
+}
+
+func (s *stubControlPlaneServer) Send(r *pb.ControlPlaneResponse) error {
+	time.Sleep(s.sleepTime)
+	s.msgs <- r
+	return nil
+}
+
 type stubExperimentStatusServer struct {
 	msgs      chan *pb.ExperimentStatusResponse
 	sleepTime time.Duration
diff --git a/scheduler/pkg/synchroniser/servers_sync.go b/scheduler/pkg/synchroniser/servers_sync.go
index e17164c65f..c3e8d468cc 100644
--- a/scheduler/pkg/synchroniser/servers_sync.go
+++ b/scheduler/pkg/synchroniser/servers_sync.go
@@ -8,6 +8,8 @@ the Change License after the Change Date as each is defined in accordance with t
 */
 
 // This file includes the ServerBasedSynchroniser struct and its methods.
+// The main logic is to get from the controller the number of servers (x) it has in its etcd store; the scheduler then waits for x agent connections.
+// If not all servers connect in time, the synchroniser proceeds after a timeout duration.
 // The ServerBasedSynchroniser struct is responsible for synchronising the starting up of the different components of the "scheduler".
 // It ensures that the time between the scheduler starting and the different model servers connecting does not affect the data plane (inferences).
 // In general terms, the synchroniser waits for all servers to connect before proceeding with processing events, especially those that are related to the servers connecting (i.e model scheduling).
@@ -113,6 +115,10 @@ func (s *ServerBasedSynchroniser) Signals(numSignals uint) {
 	}
 }
 
+func (s *ServerBasedSynchroniser) IsTriggered() bool {
+	return s.triggered.Load()
+}
+
 func (s *ServerBasedSynchroniser) doneFn() {
 	if s.isReady.CompareAndSwap(false, true) {
 		s.doneWg.Done()
diff --git a/scheduler/pkg/synchroniser/sync.go b/scheduler/pkg/synchroniser/sync.go
index acc3ec25a3..a61bf38c30 100644
--- a/scheduler/pkg/synchroniser/sync.go
+++ b/scheduler/pkg/synchroniser/sync.go
@@ -19,6 +19,9 @@ import (
 )
 
 type Synchroniser interface {
+	// mainly for testing; this API should mean little in production, as callers should
+	// rely on the other methods to determine whether the synchroniser is ready.
+ IsTriggered() bool IsReady() bool WaitReady() Signals(uint) @@ -39,21 +42,21 @@ func NewSimpleSynchroniser(timeout time.Duration) *SimpleSynchroniser { } s.isReady.Store(false) s.triggered.Store(false) + s.wg.Add(1) + time.AfterFunc(s.timeout, s.done) return s } +func (s *SimpleSynchroniser) IsTriggered() bool { + return s.triggered.Load() +} + func (s *SimpleSynchroniser) IsReady() bool { return s.isReady.Load() } -func (s *SimpleSynchroniser) Signals(numSignals uint) { - if !s.IsReady() { - swapped := s.triggered.CompareAndSwap(false, true) // make sure we run only once - if swapped { - s.wg.Add(int(numSignals)) - time.AfterFunc(s.timeout, s.done) - } - } +func (s *SimpleSynchroniser) Signals(_ uint) { + s.triggered.Store(true) } func (s *SimpleSynchroniser) WaitReady() { diff --git a/scheduler/pkg/synchroniser/sync_test.go b/scheduler/pkg/synchroniser/sync_test.go index af33585ee8..73ef5b266b 100644 --- a/scheduler/pkg/synchroniser/sync_test.go +++ b/scheduler/pkg/synchroniser/sync_test.go @@ -25,32 +25,48 @@ func TestSimpleSynchroniser(t *testing.T) { type test struct { name string timeout time.Duration - signals uint + signal bool } tests := []test{ { name: "Simple", timeout: 100 * time.Millisecond, - signals: 1, + signal: true, + }, + { + name: "Longer timeout", + timeout: 500 * time.Millisecond, + signal: true, }, { name: "No timer", timeout: 0 * time.Millisecond, - signals: 1, + signal: true, + }, + { + name: "No signal", + timeout: 100 * time.Millisecond, + signal: false, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := NewSimpleSynchroniser(test.timeout) + startTime := time.Now() g.Expect(s.IsReady()).To(BeFalse()) - s.Signals(test.signals) - // this should have no effect - s.Signals(100000) + if test.signal { + s.Signals(1) + } s.WaitReady() + elapsed := time.Since(startTime) g.Expect(s.IsReady()).To(BeTrue()) + g.Expect(elapsed).To(BeNumerically(">", test.timeout)) + // this should have no effect + s.Signals(100000) + g.Expect(s.IsReady()).To(BeTrue()) // make sure we are graceful after this point s.Signals(10) g.Expect(s.IsReady()).To(BeTrue())
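The SimpleSynchroniser change above inverts the earlier behaviour: the wait group and the timeout timer are now armed once in NewSimpleSynchroniser, while Signals only records that the synchroniser was triggered, exposed through the new IsTriggered accessor used by the updated test assertions. The following is a minimal usage sketch of these semantics, not part of this patch; the import path and the print statements are assumptions for illustration only.

package main

import (
	"fmt"
	"time"

	"github.com/seldonio/seldon-core/scheduler/v2/pkg/synchroniser"
)

func main() {
	// The readiness timer starts here, at construction time, not on the first Signals call.
	s := synchroniser.NewSimpleSynchroniser(100 * time.Millisecond)

	// Signals no longer arms the timer or grows the wait group; it only flips the triggered flag.
	s.Signals(1)
	fmt.Println("triggered:", s.IsTriggered()) // true immediately after Signals
	fmt.Println("ready:", s.IsReady())         // typically false until the timeout elapses

	// WaitReady blocks until the construction-time timeout has fired,
	// whether or not Signals was ever called.
	s.WaitReady()
	fmt.Println("ready:", s.IsReady()) // true

	// Late Signals calls are harmless no-ops once the synchroniser is ready.
	s.Signals(10)
}

This mirrors the updated TestSimpleSynchroniser table, where readiness is asserted to take at least the configured timeout regardless of whether a signal was sent.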