diff --git a/.github/.golangci.yml b/.github/.golangci.yml index b4c89281f0..abf9a0dd8e 100644 --- a/.github/.golangci.yml +++ b/.github/.golangci.yml @@ -69,6 +69,9 @@ linters: # internal server pbs have their own suffix to avoid naming conflicts - pkg: go.temporal.io/server/api/(\w+)/v1 alias: ${1}spb + testifylint: + disable: + - suite-method-signature # parallelsuite.Run supports extra args passed to Test* methods exhaustive: # Presence of "default" case in switch statements satisfies exhaustiveness, # even if all enum members are not listed. diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index d54f4052c7..8bbb22cbc4 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -19,7 +19,7 @@ Apply these patterns when reviewing PRs or suggesting code changes. - Avoid stuttering: don't use `ActivityStatus` in package `activity`, just `Status` - Use `ok` boolean pattern instead of nil checks where idiomatic -## 3. Testify Suite Correctness +## 3. Testify Suite Correctness and Reliability - Never use `s.T()` in subtests - use the subtest's `t` parameter - Never use suite assertion methods (`s.NoError`, `s.Equal`) from goroutines - causes panics @@ -27,6 +27,15 @@ Apply these patterns when reviewing PRs or suggesting code changes. - Use `require.ErrorAs(t, err, &specificErr)` for specific error type checks - Prefer `require` over `assert` - it's rarely useful to continue a test after a failed assertion - Add comments explaining why `Eventually` is needed (e.g., eventual consistency) +- Do not use single-value type assertions on errors (`err.(*T)`); this panics instead of failing the test when the type doesn't match. Use `errors.As` with a guarded return. +- When launching a goroutine to maintain a precondition for later assertions (e.g., keeping pollers active so a deployment version gets registered), loop until context cancellation rather than running once. 
A single attempt that times out exits silently, leaving downstream Eventually/propagation waits to hang until their own deadline. +- Never call testify assertions (`s.NoError`, `s.Equal`, `require.NoError`, even `assert.NoError`) inside a `go func()` — if the goroutine outlives the test, the assertion panics the binary with `panic: Fail in goroutine after TestXxx has completed`. Move assertions to the test goroutine or use a buffered error channel. +- Any `<-ch` that isn't inside a `select` with `ctx.Done()` will hang indefinitely if the sender never sends. Always provide a context cancellation fallback. +- Never write to package-level or global variables in tests — parallel tests share the same process; thread values through function parameters instead. +- Never use `time.Sleep` or `time.Since(start) > threshold` to enforce ordering — use channels, `sync.WaitGroup`, or `EventuallyWithT` instead. +- When using `EventuallyWithT` (or similar) to wait for a condition driven by a background goroutine, ensure the goroutine's timeout is longer than the `EventuallyWithT` deadline — if the background op times out first, the condition will never be satisfied and the wait will hang until its own deadline. +- Do not silently discard errors from precondition operations with `_, _ = f()` — if `f()` failing invalidates the rest of the test, surface the error or loop until it succeeds. +- Be suspicious of `go s.someHelper(ctx, ...)` calls where the goroutine runs exactly once and the test then immediately waits for something that helper was supposed to cause. If the operation can fail transiently (network, tight deadline, busy CI), the single attempt may fail silently and the wait will never succeed. Either loop the goroutine until `ctx.Done()`, or check that the operation succeeded before proceeding. ## 4. Inline Code / Avoid Abstractions @@ -70,3 +79,4 @@ Apply these patterns when reviewing PRs or suggesting code changes. 
- Prefer `sync.Mutex` over `sync.RWMutex` almost always, except when reads are much more common than writes (>1000×) or readers hold the lock for significant time - Don't do IO while holding locks - use side effect tasks - Clone data before releasing locks if it might be modified +- Proto message fields accessed outside the workflow lock must be cloned, not aliased: use `common.CloneProto(...)` rather than returning the pointer directly. diff --git a/.github/workflows/ci-success-report.yml b/.github/workflows/ci-success-report.yml index d6d4db394f..1538c4eadb 100644 --- a/.github/workflows/ci-success-report.yml +++ b/.github/workflows/ci-success-report.yml @@ -32,7 +32,7 @@ jobs: steps: - name: Generate token id: generate_token - uses: actions/create-github-app-token@v1 + uses: actions/create-github-app-token@v2 with: app-id: ${{ secrets.TEMPORAL_CICD_APP_ID }} private-key: ${{ secrets.TEMPORAL_CICD_PRIVATE_KEY }} @@ -45,7 +45,7 @@ jobs: token: ${{ steps.generate_token.outputs.token }} - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version-file: 'go.mod' diff --git a/.github/workflows/features-integration.yml b/.github/workflows/features-integration.yml index 83cdef55be..217d31c0d7 100644 --- a/.github/workflows/features-integration.yml +++ b/.github/workflows/features-integration.yml @@ -56,7 +56,7 @@ jobs: cp ./develop/docker-compose/docker-compose.yml /tmp/docker-compose.yml - name: Upload Docker artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: temporal-server-docker path: | diff --git a/.github/workflows/flaky-tests-report.yml b/.github/workflows/flaky-tests-report.yml index f12fab257f..aeccc794cc 100644 --- a/.github/workflows/flaky-tests-report.yml +++ b/.github/workflows/flaky-tests-report.yml @@ -27,7 +27,7 @@ jobs: steps: - name: Generate token id: generate_token - uses: actions/create-github-app-token@v1 + uses: actions/create-github-app-token@v2 with: app-id: ${{ 
secrets.TEMPORAL_CICD_APP_ID }} private-key: ${{ secrets.TEMPORAL_CICD_PRIVATE_KEY }} @@ -41,7 +41,7 @@ jobs: fetch-depth: 0 - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version-file: "go.mod" @@ -68,7 +68,7 @@ jobs: --sha "$SHA" - name: Upload generated reports - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 if: steps.process-flaky-tests.outcome == 'success' with: name: flaky-tests-reports-${{ github.run_number }} diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml index 6314afd69c..a3110beb51 100644 --- a/.github/workflows/linters.yml +++ b/.github/workflows/linters.yml @@ -99,7 +99,7 @@ jobs: check-latest: true cache: true - - name: format golang import statements + - name: apply formatters run: | make fmt diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 9564e93703..6422c701f5 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -228,7 +228,7 @@ jobs: - name: Restore dependencies id: restore-deps - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/go/pkg/mod key: go-${{ runner.os }}${{ runner.arch }}-${{ hashFiles('go.mod') }}-deps-${{ hashFiles('go.sum') }} @@ -236,14 +236,14 @@ jobs: - run: make pre-build-functional-test-coverage - name: Save dependencies - uses: actions/cache/save@v4 + uses: actions/cache/save@v5 if: ${{ steps.restore-deps.outputs.cache-hit != 'true' }} with: path: ~/go/pkg/mod key: ${{ steps.restore-deps.outputs.cache-primary-key }} - name: Save build outputs - uses: actions/cache/save@v4 + uses: actions/cache/save@v5 with: path: ~/.cache/go-build key: go-${{ runner.os }}${{ runner.arch }}-build-${{ env.COMMIT }} @@ -266,13 +266,13 @@ jobs: cache: false # do our own caching - name: Restore dependencies - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/go/pkg/mod key: go-${{ runner.os }}${{ runner.arch }}-${{ hashFiles('go.mod') 
}}-deps-${{ hashFiles('go.sum') }} - name: Restore build outputs - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/.cache/go-build key: go-${{ runner.os }}${{ runner.arch }}-build-${{ env.COMMIT }} @@ -303,13 +303,13 @@ jobs: cache: false # do our own caching - name: Restore dependencies - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/go/pkg/mod key: go-${{ runner.os }}${{ runner.arch }}-${{ hashFiles('go.mod') }}-deps-${{ hashFiles('go.sum') }} - name: Restore build outputs - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/.cache/go-build key: go-${{ runner.os }}${{ runner.arch }}-build-${{ env.COMMIT }} @@ -329,8 +329,8 @@ jobs: CRASH_REPORT_NAME="$GITHUB_JOB" make report-test-crash - name: Generate test summary - uses: mikepenz/action-junit-report@v5.0.0-rc01 - if: failure() + uses: mikepenz/action-junit-report@v6 + if: ${{ !cancelled() }} with: report_paths: ./.testoutput/junit.*.xml detailed_summary: true @@ -363,7 +363,7 @@ jobs: - name: Upload test results to GitHub # Can't pin to major because the action linter doesn't recognize the include-hidden-files flag. 
- uses: actions/upload-artifact@v4.4.3 + uses: actions/upload-artifact@v6 if: ${{ !cancelled() }} with: name: junit-xml--${{ github.run_id }}--${{ steps.get_job_id.outputs.job_id }}--${{ github.run_attempt }}--unit-test @@ -397,13 +397,13 @@ jobs: cache: false # do our own caching - name: Restore dependencies - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/go/pkg/mod key: go-${{ runner.os }}${{ runner.arch }}-${{ hashFiles('go.mod') }}-deps-${{ hashFiles('go.sum') }} - name: Restore build outputs - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/.cache/go-build key: go-${{ runner.os }}${{ runner.arch }}-build-${{ env.COMMIT }} @@ -429,8 +429,8 @@ jobs: CRASH_REPORT_NAME="$GITHUB_JOB" make report-test-crash - name: Generate test summary - uses: mikepenz/action-junit-report@v5.0.0-rc01 - if: failure() + uses: mikepenz/action-junit-report@v6 + if: ${{ !cancelled() }} with: report_paths: ./.testoutput/junit.*.xml detailed_summary: true @@ -463,7 +463,7 @@ jobs: - name: Upload test results to GitHub # Can't pin to major because the action linter doesn't recognize the include-hidden-files flag. 
- uses: actions/upload-artifact@v4.4.3 + uses: actions/upload-artifact@v6 if: ${{ !cancelled() }} with: name: junit-xml--${{ github.run_id }}--${{ steps.get_job_id.outputs.job_id }}--${{ github.run_attempt }}--integration-test @@ -515,13 +515,13 @@ jobs: cache: false # do our own caching - name: Restore dependencies - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/go/pkg/mod key: go-${{ runner.os }}${{ runner.arch }}-${{ hashFiles('go.mod') }}-deps-${{ hashFiles('go.sum') }} - name: Restore build outputs - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/.cache/go-build key: go-${{ runner.os }}${{ runner.arch }}-build-${{ env.COMMIT }} @@ -568,8 +568,8 @@ jobs: CRASH_REPORT_NAME="$GITHUB_JOB" make report-test-crash - name: Generate test summary - uses: mikepenz/action-junit-report@v5.0.0-rc01 - if: failure() + uses: mikepenz/action-junit-report@v6 + if: ${{ !cancelled() }} with: report_paths: ./.testoutput/junit.*.xml detailed_summary: true @@ -595,7 +595,7 @@ jobs: - name: Upload test results to GitHub # Can't pin to major because the action linter doesn't recognize the include-hidden-files flag. 
- uses: actions/upload-artifact@v4.4.3 + uses: actions/upload-artifact@v6 if: ${{ !cancelled() }} with: name: junit-xml--${{ github.run_id }}--${{ steps.get_job_id.outputs.job_id }}--${{ github.run_attempt }}--${{ matrix.name }}--${{ matrix.display_name }}--functional-test @@ -604,7 +604,7 @@ jobs: retention-days: 28 - name: Upload debug logs - uses: actions/upload-artifact@v4.4.3 + uses: actions/upload-artifact@v6 if: ${{ !cancelled() }} with: name: debug-logs--${{ github.run_id }}--${{ steps.get_job_id.outputs.job_id }}--${{ github.run_attempt }}--${{ matrix.name }}--${{ matrix.display_name }}--functional-test @@ -635,13 +635,13 @@ jobs: cache: false - name: Restore dependencies - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/go/pkg/mod key: go-${{ runner.os }}${{ runner.arch }}-${{ hashFiles('go.mod') }}-deps-${{ hashFiles('go.sum') }} - name: Restore build outputs - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: ~/.cache/go-build key: go-${{ runner.os }}${{ runner.arch }}-build-${{ env.COMMIT }} @@ -705,10 +705,10 @@ jobs: actions: read steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version-file: "go.mod" cache: true diff --git a/.github/workflows/trigger-version-info-service.yml b/.github/workflows/trigger-version-info-service.yml index caa615bef5..3058242ede 100644 --- a/.github/workflows/trigger-version-info-service.yml +++ b/.github/workflows/trigger-version-info-service.yml @@ -19,7 +19,7 @@ jobs: steps: - name: Generate token id: generate_token - uses: actions/create-github-app-token@v1 + uses: actions/create-github-app-token@v2 with: app-id: ${{ secrets.TEMPORAL_CICD_APP_ID }} private-key: ${{ secrets.TEMPORAL_CICD_PRIVATE_KEY }} diff --git a/AGENTS.md b/AGENTS.md index dbbba33a4c..a4807125da 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -76,6 +76,8 @@ Before starting the 
implementation of any request, you MUST REVIEW the following - Run tests after altering code or tests - Start with unit tests for fastest feedback - Prefer `require` over `assert`, avoid testify suites in unit tests (functional tests require suites for test cluster setup), use `require.Eventually` instead of `time.Sleep` (forbidden by linter) +- For float comparisons in tests, use `InDelta` or `InEpsilon` instead of `Equal` (enforced by `testifylint`) +- For error assertions in testify suites, use `s.Require().NoError(err)` instead of `s.NoError(err)` (enforced by `testifylint`) # Primary Workflows ## Software Engineering Tasks diff --git a/Makefile b/Makefile index d361fc027a..893c5bf571 100644 --- a/Makefile +++ b/Makefile @@ -409,7 +409,7 @@ lint-protos: $(BUF) $(INTERNAL_BINPB) $(CHASM_BINPB) @$(BUF) lint $(INTERNAL_BINPB) @$(BUF) lint --config chasm/lib/buf.yaml $(CHASM_BINPB) -fmt: fmt-gofix fmt-imports fmt-yaml +fmt: fmt-gofix fmt-imports fmt-protos fmt-yaml # Some fixes enable others (e.g. rangeint may expose minmax opportunities), # so - as recommended by the Go team - we run go fix in a loop until it reaches @@ -438,6 +438,11 @@ parallelize-tests: @printf $(COLOR) "Add t.Parallel() to tests..." @go run ./cmd/tools/parallelize $(INTEGRATION_TEST_DIRS) +fmt-protos: $(BUF) + @printf $(COLOR) "Formatting proto files..." + @$(BUF) format -w $(PROTO_ROOT)/internal + @$(BUF) format -w --config chasm/lib/buf.yaml chasm/lib + fmt-yaml: $(YAMLFMT) @printf $(COLOR) "Formatting YAML files..." @$(YAMLFMT) -conf .github/.yamlfmt . 
diff --git a/api/adminservice/v1/request_response.pb.go b/api/adminservice/v1/request_response.pb.go index f5226be1f4..f65c46e7e4 100644 --- a/api/adminservice/v1/request_response.pb.go +++ b/api/adminservice/v1/request_response.pb.go @@ -5790,7 +5790,7 @@ var File_temporal_server_api_adminservice_v1_request_response_proto protoreflect const file_temporal_server_api_adminservice_v1_request_response_proto_rawDesc = "" + "\n" + - ":temporal/server/api/adminservice/v1/request_response.proto\x12#temporal.server.api.adminservice.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\"temporal/api/enums/v1/common.proto\x1a&temporal/api/enums/v1/task_queue.proto\x1a$temporal/api/common/v1/message.proto\x1a%temporal/api/version/v1/message.proto\x1a&temporal/api/workflow/v1/message.proto\x1a'temporal/api/namespace/v1/message.proto\x1a)temporal/api/replication/v1/message.proto\x1a'temporal/api/taskqueue/v1/message.proto\x1a,temporal/server/api/cluster/v1/message.proto\x1a'temporal/server/api/common/v1/dlq.proto\x1a)temporal/server/api/enums/v1/common.proto\x1a*temporal/server/api/enums/v1/cluster.proto\x1a'temporal/server/api/enums/v1/task.proto\x1a&temporal/server/api/enums/v1/dlq.proto\x1a,temporal/server/api/history/v1/message.proto\x1a.temporal/server/api/namespace/v1/message.proto\x1a0temporal/server/api/replication/v1/message.proto\x1a9temporal/server/api/persistence/v1/cluster_metadata.proto\x1a3temporal/server/api/persistence/v1/executions.proto\x1a?temporal/server/api/persistence/v1/workflow_mutable_state.proto\x1a.temporal/server/api/persistence/v1/tasks.proto\x1a,temporal/server/api/persistence/v1/hsm.proto\x1a.temporal/server/api/taskqueue/v1/message.proto\x1a+temporal/server/api/health/v1/message.proto\"\x83\x01\n" + + 
":temporal/server/api/adminservice/v1/request_response.proto\x12#temporal.server.api.adminservice.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a\"temporal/api/enums/v1/common.proto\x1a&temporal/api/enums/v1/task_queue.proto\x1a'temporal/api/namespace/v1/message.proto\x1a)temporal/api/replication/v1/message.proto\x1a'temporal/api/taskqueue/v1/message.proto\x1a%temporal/api/version/v1/message.proto\x1a&temporal/api/workflow/v1/message.proto\x1a,temporal/server/api/cluster/v1/message.proto\x1a'temporal/server/api/common/v1/dlq.proto\x1a*temporal/server/api/enums/v1/cluster.proto\x1a)temporal/server/api/enums/v1/common.proto\x1a&temporal/server/api/enums/v1/dlq.proto\x1a'temporal/server/api/enums/v1/task.proto\x1a+temporal/server/api/health/v1/message.proto\x1a,temporal/server/api/history/v1/message.proto\x1a.temporal/server/api/namespace/v1/message.proto\x1a9temporal/server/api/persistence/v1/cluster_metadata.proto\x1a3temporal/server/api/persistence/v1/executions.proto\x1a,temporal/server/api/persistence/v1/hsm.proto\x1a.temporal/server/api/persistence/v1/tasks.proto\x1a?temporal/server/api/persistence/v1/workflow_mutable_state.proto\x1a0temporal/server/api/replication/v1/message.proto\x1a.temporal/server/api/taskqueue/v1/message.proto\"\x83\x01\n" + "\x1aRebuildMutableStateRequest\x12\x1c\n" + "\tnamespace\x18\x01 \x01(\tR\tnamespace\x12G\n" + "\texecution\x18\x02 \x01(\v2).temporal.api.common.v1.WorkflowExecutionR\texecution\"\x1d\n" + diff --git a/api/archiver/v1/message.pb.go b/api/archiver/v1/message.pb.go index cfac4f407b..8bd8356939 100644 --- a/api/archiver/v1/message.pb.go +++ b/api/archiver/v1/message.pb.go @@ -348,7 +348,7 @@ var File_temporal_server_api_archiver_v1_message_proto protoreflect.FileDescript const file_temporal_server_api_archiver_v1_message_proto_rawDesc = "" + "\n" + - 
"-temporal/server/api/archiver/v1/message.proto\x12\x1ftemporal.server.api.archiver.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a%temporal/api/history/v1/message.proto\x1a$temporal/api/enums/v1/workflow.proto\"\xfa\x02\n" + + "-temporal/server/api/archiver/v1/message.proto\x12\x1ftemporal.server.api.archiver.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a$temporal/api/enums/v1/workflow.proto\x1a%temporal/api/history/v1/message.proto\"\xfa\x02\n" + "\x11HistoryBlobHeader\x12\x1c\n" + "\tnamespace\x18\x01 \x01(\tR\tnamespace\x12!\n" + "\fnamespace_id\x18\x02 \x01(\tR\vnamespaceId\x12\x1f\n" + diff --git a/api/batch/v1/request_response.pb.go b/api/batch/v1/request_response.pb.go index d936bc4ee6..8856ee4691 100644 --- a/api/batch/v1/request_response.pb.go +++ b/api/batch/v1/request_response.pb.go @@ -135,7 +135,7 @@ var File_temporal_server_api_batch_v1_request_response_proto protoreflect.FileDe const file_temporal_server_api_batch_v1_request_response_proto_rawDesc = "" + "\n" + - "3temporal/server/api/batch/v1/request_response.proto\x12\x1ctemporal.server.api.batch.v1\x1a6temporal/api/workflowservice/v1/request_response.proto\x1a+temporal/api/enums/v1/batch_operation.proto\x1a:temporal/server/api/adminservice/v1/request_response.proto\x1a\x1egoogle/protobuf/duration.proto\"\xb0\x04\n" + + "3temporal/server/api/batch/v1/request_response.proto\x12\x1ctemporal.server.api.batch.v1\x1a\x1egoogle/protobuf/duration.proto\x1a+temporal/api/enums/v1/batch_operation.proto\x1a6temporal/api/workflowservice/v1/request_response.proto\x1a:temporal/server/api/adminservice/v1/request_response.proto\"\xb0\x04\n" + "\x13BatchOperationInput\x12!\n" + "\fnamespace_id\x18\x01 \x01(\tR\vnamespaceId\x12 \n" + "\vconcurrency\x18\x02 \x01(\x03R\vconcurrency\x12=\n" + diff --git a/api/checksum/v1/message.pb.go b/api/checksum/v1/message.pb.go 
index deda4f5b76..64fd75c37d 100644 --- a/api/checksum/v1/message.pb.go +++ b/api/checksum/v1/message.pb.go @@ -273,7 +273,7 @@ var File_temporal_server_api_checksum_v1_message_proto protoreflect.FileDescript const file_temporal_server_api_checksum_v1_message_proto_rawDesc = "" + "\n" + - "-temporal/server/api/checksum/v1/message.proto\x12\x1ftemporal.server.api.checksum.v1\x1a$temporal/api/enums/v1/workflow.proto\x1a,temporal/server/api/history/v1/message.proto\x1a+temporal/server/api/enums/v1/workflow.proto\"\xa2\f\n" + + "-temporal/server/api/checksum/v1/message.proto\x12\x1ftemporal.server.api.checksum.v1\x1a$temporal/api/enums/v1/workflow.proto\x1a+temporal/server/api/enums/v1/workflow.proto\x1a,temporal/server/api/history/v1/message.proto\"\xa2\f\n" + "\x1bMutableStateChecksumPayload\x12)\n" + "\x10cancel_requested\x18\x01 \x01(\bR\x0fcancelRequested\x12J\n" + "\x05state\x18\x02 \x01(\x0e24.temporal.server.api.enums.v1.WorkflowExecutionStateR\x05state\x12F\n" + diff --git a/api/deployment/v1/message.pb.go b/api/deployment/v1/message.pb.go index 9003ec3d7f..7485dc1f67 100644 --- a/api/deployment/v1/message.pb.go +++ b/api/deployment/v1/message.pb.go @@ -200,6 +200,8 @@ type WorkerDeploymentVersionData struct { // immediately delete the version data from task queues. instead, we mark them as deleted while // keeping the revision number. // Old enough deleted versions are GCed based on update_time. + // Deprecated. This mechanism is not safe against reactivation of versions after delete. + // Use forget_version flag for synchronous deletion of the version data from TQ. 
Deleted bool `protobuf:"varint,3,opt,name=deleted,proto3" json:"deleted,omitempty"` Status v1.WorkerDeploymentVersionStatus `protobuf:"varint,6,opt,name=status,proto3,enum=temporal.api.enums.v1.WorkerDeploymentVersionStatus" json:"status,omitempty"` unknownFields protoimpl.UnknownFields @@ -3416,7 +3418,7 @@ var File_temporal_server_api_deployment_v1_message_proto protoreflect.FileDescri const file_temporal_server_api_deployment_v1_message_proto_rawDesc = "" + "\n" + - "/temporal/server/api/deployment/v1/message.proto\x12!temporal.server.api.deployment.v1\x1a&temporal/api/enums/v1/task_queue.proto\x1a&temporal/api/enums/v1/deployment.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a(temporal/api/deployment/v1/message.proto\x1a$temporal/api/common/v1/message.proto\"]\n" + + "/temporal/server/api/deployment/v1/message.proto\x12!temporal.server.api.deployment.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a(temporal/api/deployment/v1/message.proto\x1a&temporal/api/enums/v1/deployment.proto\x1a&temporal/api/enums/v1/task_queue.proto\"]\n" + "\x17WorkerDeploymentVersion\x12'\n" + "\x0fdeployment_name\x18\x01 \x01(\tR\x0edeploymentName\x12\x19\n" + "\bbuild_id\x18\x02 \x01(\tR\abuildId\"\xc4\x03\n" + diff --git a/api/historyservice/v1/request_response.pb.go b/api/historyservice/v1/request_response.pb.go index 20f71ae83a..e4f8c7b9e1 100644 --- a/api/historyservice/v1/request_response.pb.go +++ b/api/historyservice/v1/request_response.pb.go @@ -10408,7 +10408,7 @@ var File_temporal_server_api_historyservice_v1_request_response_proto protorefle const file_temporal_server_api_historyservice_v1_request_response_proto_rawDesc = "" + "\n" + - ".temporal.server.api.historyservice.v1.GetMutableStateResponse\"\x06\x8a\xb5\x18\x02\b\x01\x12\x9b\x01\n" + diff --git a/api/matchingservice/v1/request_response.pb.go b/api/matchingservice/v1/request_response.pb.go index 3dfab1b319..924fc90ee2 100644 --- a/api/matchingservice/v1/request_response.pb.go 
+++ b/api/matchingservice/v1/request_response.pb.go @@ -2819,7 +2819,6 @@ type SyncDeploymentUserDataRequest struct { // aip.dev/not-precedent: Not following Google API format --) UpsertVersionsData map[string]*v110.WorkerDeploymentVersionData `protobuf:"bytes,11,rep,name=upsert_versions_data,json=upsertVersionsData,proto3" json:"upsert_versions_data,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` // List of build ids to forget from task queue. - // Deprecated. Use upsert_versions_data with deleted=true. ForgetVersions []string `protobuf:"bytes,12,rep,name=forget_versions,json=forgetVersions,proto3" json:"forget_versions,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache @@ -5742,7 +5741,7 @@ var File_temporal_server_api_matchingservice_v1_request_response_proto protorefl const file_temporal_server_api_matchingservice_v1_request_response_proto_rawDesc = "" + "\n" + - "=temporal/server/api/matchingservice/v1/request_response.proto\x12&temporal.server.api.matchingservice.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a(temporal/api/deployment/v1/message.proto\x1a&temporal/api/enums/v1/task_queue.proto\x1a%temporal/api/failure/v1/message.proto\x1a%temporal/api/history/v1/message.proto\x1a'temporal/api/taskqueue/v1/message.proto\x1a#temporal/api/query/v1/message.proto\x1a&temporal/api/protocol/v1/message.proto\x1a*temporal/server/api/clock/v1/message.proto\x1a/temporal/server/api/deployment/v1/message.proto\x1a,temporal/server/api/history/v1/message.proto\x1a.temporal/server/api/persistence/v1/nexus.proto\x1a4temporal/server/api/persistence/v1/task_queues.proto\x1a.temporal/server/api/taskqueue/v1/message.proto\x1a1temporal/server/api/enums/v1/fairness_state.proto\x1a6temporal/api/workflowservice/v1/request_response.proto\x1a#temporal/api/nexus/v1/message.proto\x1a$temporal/api/worker/v1/message.proto\"\xc3\x02\n" + + 
"=temporal/server/api/matchingservice/v1/request_response.proto\x12&temporal.server.api.matchingservice.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a(temporal/api/deployment/v1/message.proto\x1a&temporal/api/enums/v1/task_queue.proto\x1a%temporal/api/failure/v1/message.proto\x1a%temporal/api/history/v1/message.proto\x1a#temporal/api/nexus/v1/message.proto\x1a&temporal/api/protocol/v1/message.proto\x1a#temporal/api/query/v1/message.proto\x1a'temporal/api/taskqueue/v1/message.proto\x1a$temporal/api/worker/v1/message.proto\x1a6temporal/api/workflowservice/v1/request_response.proto\x1a*temporal/server/api/clock/v1/message.proto\x1a/temporal/server/api/deployment/v1/message.proto\x1a1temporal/server/api/enums/v1/fairness_state.proto\x1a,temporal/server/api/history/v1/message.proto\x1a.temporal/server/api/persistence/v1/nexus.proto\x1a4temporal/server/api/persistence/v1/task_queues.proto\x1a.temporal/server/api/taskqueue/v1/message.proto\"\xc3\x02\n" + "\x1cPollWorkflowTaskQueueRequest\x12!\n" + "\fnamespace_id\x18\x01 \x01(\tR\vnamespaceId\x12\x1b\n" + "\tpoller_id\x18\x02 \x01(\tR\bpollerId\x12`\n" + diff --git a/api/matchingservice/v1/service.pb.go b/api/matchingservice/v1/service.pb.go index be923b4f33..10479d1ca5 100644 --- a/api/matchingservice/v1/service.pb.go +++ b/api/matchingservice/v1/service.pb.go @@ -26,7 +26,7 @@ var File_temporal_server_api_matchingservice_v1_service_proto protoreflect.FileD const file_temporal_server_api_matchingservice_v1_service_proto_rawDesc = "" + "\n" + - "4temporal/server/api/matchingservice/v1/service.proto\x12&temporal.server.api.matchingservice.v1\x1a=temporal/server/api/matchingservice/v1/request_response.proto\x1a0temporal/server/api/common/v1/api_category.proto2\xa38\n" + + 
"4temporal/server/api/matchingservice/v1/service.proto\x12&temporal.server.api.matchingservice.v1\x1a0temporal/server/api/common/v1/api_category.proto\x1a=temporal/server/api/matchingservice/v1/request_response.proto2\xa38\n" + "\x0fMatchingService\x12\xac\x01\n" + "\x15PollWorkflowTaskQueue\x12D.temporal.server.api.matchingservice.v1.PollWorkflowTaskQueueRequest\x1aE.temporal.server.api.matchingservice.v1.PollWorkflowTaskQueueResponse\"\x06\x8a\xb5\x18\x02\b\x02\x12\xac\x01\n" + "\x15PollActivityTaskQueue\x12D.temporal.server.api.matchingservice.v1.PollActivityTaskQueueRequest\x1aE.temporal.server.api.matchingservice.v1.PollActivityTaskQueueResponse\"\x06\x8a\xb5\x18\x02\b\x02\x12\x9a\x01\n" + diff --git a/api/persistence/v1/executions.pb.go b/api/persistence/v1/executions.pb.go index 36c3afdba8..913f3a211a 100644 --- a/api/persistence/v1/executions.pb.go +++ b/api/persistence/v1/executions.pb.go @@ -4643,7 +4643,7 @@ var File_temporal_server_api_persistence_v1_executions_proto protoreflect.FileDe const file_temporal_server_api_persistence_v1_executions_proto_rawDesc = "" + "\n" + - 
"3temporal/server/api/persistence/v1/executions.proto\x12\"temporal.server.api.persistence.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a\"temporal/api/enums/v1/common.proto\x1a&temporal/api/enums/v1/event_type.proto\x1a(temporal/api/enums/v1/failed_cause.proto\x1a$temporal/api/enums/v1/workflow.proto\x1a%temporal/api/failure/v1/message.proto\x1a&temporal/api/workflow/v1/message.proto\x1a%temporal/api/history/v1/message.proto\x1a(temporal/api/deployment/v1/message.proto\x1a*temporal/server/api/clock/v1/message.proto\x1a)temporal/server/api/enums/v1/common.proto\x1a(temporal/server/api/enums/v1/nexus.proto\x1a+temporal/server/api/enums/v1/workflow.proto\x1a'temporal/server/api/enums/v1/task.proto\x1a5temporal/server/api/enums/v1/workflow_task_type.proto\x1a,temporal/server/api/history/v1/message.proto\x1a.temporal/server/api/persistence/v1/chasm.proto\x1a/temporal/server/api/persistence/v1/queues.proto\x1a,temporal/server/api/persistence/v1/hsm.proto\x1a/temporal/server/api/persistence/v1/update.proto\x1a-temporal/server/api/workflow/v1/message.proto\"\xa3\x05\n" + + 
"3temporal/server/api/persistence/v1/executions.proto\x12\"temporal.server.api.persistence.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a(temporal/api/deployment/v1/message.proto\x1a\"temporal/api/enums/v1/common.proto\x1a&temporal/api/enums/v1/event_type.proto\x1a(temporal/api/enums/v1/failed_cause.proto\x1a$temporal/api/enums/v1/workflow.proto\x1a%temporal/api/failure/v1/message.proto\x1a%temporal/api/history/v1/message.proto\x1a&temporal/api/workflow/v1/message.proto\x1a*temporal/server/api/clock/v1/message.proto\x1a)temporal/server/api/enums/v1/common.proto\x1a(temporal/server/api/enums/v1/nexus.proto\x1a'temporal/server/api/enums/v1/task.proto\x1a+temporal/server/api/enums/v1/workflow.proto\x1a5temporal/server/api/enums/v1/workflow_task_type.proto\x1a,temporal/server/api/history/v1/message.proto\x1a.temporal/server/api/persistence/v1/chasm.proto\x1a,temporal/server/api/persistence/v1/hsm.proto\x1a/temporal/server/api/persistence/v1/queues.proto\x1a/temporal/server/api/persistence/v1/update.proto\x1a-temporal/server/api/workflow/v1/message.proto\"\xa3\x05\n" + "\tShardInfo\x12\x19\n" + "\bshard_id\x18\x01 \x01(\x05R\ashardId\x12\x19\n" + "\brange_id\x18\x02 \x01(\x03R\arangeId\x12\x14\n" + @@ -5377,8 +5377,8 @@ func file_temporal_server_api_persistence_v1_executions_proto_init() { return } file_temporal_server_api_persistence_v1_chasm_proto_init() - file_temporal_server_api_persistence_v1_queues_proto_init() file_temporal_server_api_persistence_v1_hsm_proto_init() + file_temporal_server_api_persistence_v1_queues_proto_init() file_temporal_server_api_persistence_v1_update_proto_init() file_temporal_server_api_persistence_v1_executions_proto_msgTypes[1].OneofWrappers = []any{ (*WorkflowExecutionInfo_LastWorkflowTaskFailureCause)(nil), diff --git a/api/persistence/v1/workflow_mutable_state.pb.go b/api/persistence/v1/workflow_mutable_state.pb.go index 85f96a97ce..f099d8fb53 100644 --- 
a/api/persistence/v1/workflow_mutable_state.pb.go +++ b/api/persistence/v1/workflow_mutable_state.pb.go @@ -364,7 +364,7 @@ var File_temporal_server_api_persistence_v1_workflow_mutable_state_proto protore const file_temporal_server_api_persistence_v1_workflow_mutable_state_proto_rawDesc = "" + "\n" + - "?temporal/server/api/persistence/v1/workflow_mutable_state.proto\x12\"temporal.server.api.persistence.v1\x1a%temporal/api/history/v1/message.proto\x1a3temporal/server/api/persistence/v1/executions.proto\x1a,temporal/server/api/persistence/v1/hsm.proto\x1a.temporal/server/api/persistence/v1/chasm.proto\x1a/temporal/server/api/persistence/v1/update.proto\"\xd0\x0e\n" + + "?temporal/server/api/persistence/v1/workflow_mutable_state.proto\x12\"temporal.server.api.persistence.v1\x1a%temporal/api/history/v1/message.proto\x1a.temporal/server/api/persistence/v1/chasm.proto\x1a3temporal/server/api/persistence/v1/executions.proto\x1a,temporal/server/api/persistence/v1/hsm.proto\x1a/temporal/server/api/persistence/v1/update.proto\"\xd0\x0e\n" + "\x14WorkflowMutableState\x12r\n" + "\x0eactivity_infos\x18\x01 \x03(\v2K.temporal.server.api.persistence.v1.WorkflowMutableState.ActivityInfosEntryR\ractivityInfos\x12i\n" + "\vtimer_infos\x18\x02 \x03(\v2H.temporal.server.api.persistence.v1.WorkflowMutableState.TimerInfosEntryR\n" + @@ -536,9 +536,9 @@ func file_temporal_server_api_persistence_v1_workflow_mutable_state_proto_init() if File_temporal_server_api_persistence_v1_workflow_mutable_state_proto != nil { return } + file_temporal_server_api_persistence_v1_chasm_proto_init() file_temporal_server_api_persistence_v1_executions_proto_init() file_temporal_server_api_persistence_v1_hsm_proto_init() - file_temporal_server_api_persistence_v1_chasm_proto_init() file_temporal_server_api_persistence_v1_update_proto_init() type x struct{} out := protoimpl.TypeBuilder{ diff --git a/api/replication/v1/message.pb.go b/api/replication/v1/message.pb.go index a61557d751..1fdcca398b 100644 --- 
a/api/replication/v1/message.pb.go +++ b/api/replication/v1/message.pb.go @@ -2117,7 +2117,7 @@ var File_temporal_server_api_replication_v1_message_proto protoreflect.FileDescr const file_temporal_server_api_replication_v1_message_proto_rawDesc = "" + "\n" + - "0temporal/server/api/replication/v1/message.proto\x12\"temporal.server.api.replication.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a.temporal/server/api/enums/v1/replication.proto\x1a'temporal/server/api/enums/v1/task.proto\x1a,temporal/server/api/history/v1/message.proto\x1a3temporal/server/api/persistence/v1/executions.proto\x1a,temporal/server/api/persistence/v1/hsm.proto\x1a4temporal/server/api/persistence/v1/task_queues.proto\x1a?temporal/server/api/persistence/v1/workflow_mutable_state.proto\x1a$temporal/api/common/v1/message.proto\x1a'temporal/api/namespace/v1/message.proto\x1a)temporal/api/replication/v1/message.proto\x1a%temporal/api/failure/v1/message.proto\x1a-temporal/server/api/workflow/v1/message.proto\"\xa1\x0f\n" + + "0temporal/server/api/replication/v1/message.proto\x12\"temporal.server.api.replication.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a%temporal/api/failure/v1/message.proto\x1a'temporal/api/namespace/v1/message.proto\x1a)temporal/api/replication/v1/message.proto\x1a.temporal/server/api/enums/v1/replication.proto\x1a'temporal/server/api/enums/v1/task.proto\x1a,temporal/server/api/history/v1/message.proto\x1a3temporal/server/api/persistence/v1/executions.proto\x1a,temporal/server/api/persistence/v1/hsm.proto\x1a4temporal/server/api/persistence/v1/task_queues.proto\x1a?temporal/server/api/persistence/v1/workflow_mutable_state.proto\x1a-temporal/server/api/workflow/v1/message.proto\"\xa1\x0f\n" + "\x0fReplicationTask\x12N\n" + "\ttask_type\x18\x01 \x01(\x0e21.temporal.server.api.enums.v1.ReplicationTaskTypeR\btaskType\x12$\n" + "\x0esource_task_id\x18\x02 
\x01(\x03R\fsourceTaskId\x12y\n" + diff --git a/api/schedule/v1/message.pb.go b/api/schedule/v1/message.pb.go index 632899ca7e..513c6c1d61 100644 --- a/api/schedule/v1/message.pb.go +++ b/api/schedule/v1/message.pb.go @@ -1054,7 +1054,7 @@ var File_temporal_server_api_schedule_v1_message_proto protoreflect.FileDescript const file_temporal_server_api_schedule_v1_message_proto_rawDesc = "" + "\n" + - "-temporal/server/api/schedule/v1/message.proto\x12\x1ftemporal.server.api.schedule.v1\x1a$temporal/api/common/v1/message.proto\x1a$temporal/api/enums/v1/schedule.proto\x1a$temporal/api/enums/v1/workflow.proto\x1a%temporal/api/failure/v1/message.proto\x1a&temporal/api/schedule/v1/message.proto\x1a6temporal/api/workflowservice/v1/request_response.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\x95\x05\n" + + "-temporal/server/api/schedule/v1/message.proto\x12\x1ftemporal.server.api.schedule.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a$temporal/api/enums/v1/schedule.proto\x1a$temporal/api/enums/v1/workflow.proto\x1a%temporal/api/failure/v1/message.proto\x1a&temporal/api/schedule/v1/message.proto\x1a6temporal/api/workflowservice/v1/request_response.proto\"\x95\x05\n" + "\rBufferedStart\x12=\n" + "\fnominal_time\x18\x01 \x01(\v2\x1a.google.protobuf.TimestampR\vnominalTime\x12;\n" + "\vactual_time\x18\x02 \x01(\v2\x1a.google.protobuf.TimestampR\n" + diff --git a/api/taskqueue/v1/message.pb.go b/api/taskqueue/v1/message.pb.go index 3d60d23041..384c9be1a0 100644 --- a/api/taskqueue/v1/message.pb.go +++ b/api/taskqueue/v1/message.pb.go @@ -931,7 +931,7 @@ var File_temporal_server_api_taskqueue_v1_message_proto protoreflect.FileDescrip const file_temporal_server_api_taskqueue_v1_message_proto_rawDesc = "" + "\n" + - ".temporal/server/api/taskqueue/v1/message.proto\x12 
temporal.server.api.taskqueue.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a(temporal/api/deployment/v1/message.proto\x1a&temporal/api/enums/v1/task_queue.proto\x1a$temporal/api/enums/v1/workflow.proto\x1a'temporal/api/taskqueue/v1/message.proto\x1a'temporal/server/api/enums/v1/task.proto\x1a/temporal/server/api/deployment/v1/message.proto\"\xbf\x03\n" + + ".temporal/server/api/taskqueue/v1/message.proto\x12 temporal.server.api.taskqueue.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a(temporal/api/deployment/v1/message.proto\x1a&temporal/api/enums/v1/task_queue.proto\x1a$temporal/api/enums/v1/workflow.proto\x1a'temporal/api/taskqueue/v1/message.proto\x1a/temporal/server/api/deployment/v1/message.proto\x1a'temporal/server/api/enums/v1/task.proto\"\xbf\x03\n" + "\x14TaskVersionDirective\x12J\n" + "\x14use_assignment_rules\x18\x01 \x01(\v2\x16.google.protobuf.EmptyH\x00R\x12useAssignmentRules\x12,\n" + "\x11assigned_build_id\x18\x02 \x01(\tH\x00R\x0fassignedBuildId\x12E\n" + diff --git a/chasm/context.go b/chasm/context.go index 3c74979cbe..3df7ef0313 100644 --- a/chasm/context.go +++ b/chasm/context.go @@ -27,7 +27,7 @@ type Context interface { ExecutionCloseTime() time.Time // Logger returns a logger tagged with execution key and other chasm framework internal information. Logger() log.Logger - // MetricsHandler returns a metrics handler with bare minimum tags (no namespace tag). + // MetricsHandler returns a metrics handler with namespace tag. MetricsHandler() metrics.Handler // Value returns the value associated with this context for key. The behavior is the same as context.Context.Value(). // Use WithContextValues RegistrableComponentOption to set key values pair for a component upon registration. 
diff --git a/chasm/lib/activity/gen/activitypb/v1/service.pb.go b/chasm/lib/activity/gen/activitypb/v1/service.pb.go index 06e80b0010..c33425c639 100644 --- a/chasm/lib/activity/gen/activitypb/v1/service.pb.go +++ b/chasm/lib/activity/gen/activitypb/v1/service.pb.go @@ -27,7 +27,7 @@ var File_temporal_server_chasm_lib_activity_proto_v1_service_proto protoreflect. const file_temporal_server_chasm_lib_activity_proto_v1_service_proto_rawDesc = "" + "\n" + - "9temporal/server/chasm/lib/activity/proto/v1/service.proto\x12+temporal.server.chasm.lib.activity.proto.v1\x1aBtemporal/server/chasm/lib/activity/proto/v1/request_response.proto\x1a.temporal/server/api/routing/v1/extension.proto\x1a0temporal/server/api/common/v1/api_category.proto2\xf2\n" + + "9temporal/server/chasm/lib/activity/proto/v1/service.proto\x12+temporal.server.chasm.lib.activity.proto.v1\x1aBtemporal/server/chasm/lib/activity/proto/v1/request_response.proto\x1a0temporal/server/api/common/v1/api_category.proto\x1a.temporal/server/api/routing/v1/extension.proto2\xf2\n" + "\n" + "\x0fActivityService\x12\xdb\x01\n" + "\x16StartActivityExecution\x12J.temporal.server.chasm.lib.activity.proto.v1.StartActivityExecutionRequest\x1aK.temporal.server.chasm.lib.activity.proto.v1.StartActivityExecutionResponse\"(\x92\xc4\x03\x1e\x1a\x1cfrontend_request.activity_id\x8a\xb5\x18\x02\b\x01\x12\xe4\x01\n" + diff --git a/chasm/lib/activity/handler.go b/chasm/lib/activity/handler.go index cad5dd0ea3..f97a0f1a9a 100644 --- a/chasm/lib/activity/handler.go +++ b/chasm/lib/activity/handler.go @@ -119,6 +119,11 @@ func (h *handler) DescribeActivityExecution( RunID: req.GetFrontendRequest().GetRunId(), }) + token := req.GetFrontendRequest().GetLongPollToken() + if len(token) == 0 { + return chasm.ReadComponent(ctx, ref, (*Activity).buildDescribeActivityExecutionResponse, req) + } + // Below, we send an empty non-error response on context deadline expiry. 
Here we compute a // deadline that causes us to send that response before the caller's own deadline (see // chasm.activity.longPollBuffer). We also cap the caller's deadline at @@ -131,10 +136,6 @@ func (h *handler) DescribeActivityExecution( ) defer cancel() - token := req.GetFrontendRequest().GetLongPollToken() - if len(token) == 0 { - return chasm.ReadComponent(ctx, ref, (*Activity).buildDescribeActivityExecutionResponse, req) - } response, _, err = chasm.PollComponent(ctx, ref, func( a *Activity, ctx chasm.Context, diff --git a/chasm/lib/activity/proto/v1/activity_state.proto b/chasm/lib/activity/proto/v1/activity_state.proto index 85809f59b5..00ebee8b9e 100644 --- a/chasm/lib/activity/proto/v1/activity_state.proto +++ b/chasm/lib/activity/proto/v1/activity_state.proto @@ -2,8 +2,6 @@ syntax = "proto3"; package temporal.server.chasm.lib.activity.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/activity/gen/activitypb;activitypb"; - import "google/protobuf/duration.proto"; import "google/protobuf/timestamp.proto"; import "temporal/api/common/v1/message.proto"; @@ -12,91 +10,93 @@ import "temporal/api/failure/v1/message.proto"; import "temporal/api/sdk/v1/user_metadata.proto"; import "temporal/api/taskqueue/v1/message.proto"; +option go_package = "go.temporal.io/server/chasm/lib/activity/gen/activitypb;activitypb"; + enum ActivityExecutionStatus { - ACTIVITY_EXECUTION_STATUS_UNSPECIFIED = 0; - // The activity has been scheduled, but a worker has not accepted the task for the current - // attempt. The activity may be backing off between attempts or waiting for a worker to pick it - // up. - ACTIVITY_EXECUTION_STATUS_SCHEDULED = 1; - // A worker has accepted a task for the current attempt. - ACTIVITY_EXECUTION_STATUS_STARTED = 2; - // A caller has requested cancellation of the activity. - ACTIVITY_EXECUTION_STATUS_CANCEL_REQUESTED = 3; - // The activity completed successfully. 
- ACTIVITY_EXECUTION_STATUS_COMPLETED = 4; - // The activity completed with failure. - ACTIVITY_EXECUTION_STATUS_FAILED = 5; - // The activity completed as canceled. - // Requesting to cancel an activity does not automatically transition the activity to canceled status. If the worker - // responds to cancel the activity after requesting cancellation, the status will transition to cancelled. If the - // activity completes, fails, times out or terminates after cancel is requested and before the worker responds with - // cancelled. The activity will be stay in the terminal non-cancelled status. - ACTIVITY_EXECUTION_STATUS_CANCELED = 6; - // The activity was terminated. Termination does not reach the worker and the activity code cannot react to it. - // A terminated activity may have a running attempt and will be requested to be canceled by the server when it - // heartbeats. - ACTIVITY_EXECUTION_STATUS_TERMINATED = 7; - // The activity has timed out by reaching the specified schedule-to-start or schedule-to-close timeouts. - // Additionally, after all retries are exhausted for start-to-close or heartbeat timeouts, the activity will also - // transition to timed out status. - ACTIVITY_EXECUTION_STATUS_TIMED_OUT = 8; + ACTIVITY_EXECUTION_STATUS_UNSPECIFIED = 0; + // The activity has been scheduled, but a worker has not accepted the task for the current + // attempt. The activity may be backing off between attempts or waiting for a worker to pick it + // up. + ACTIVITY_EXECUTION_STATUS_SCHEDULED = 1; + // A worker has accepted a task for the current attempt. + ACTIVITY_EXECUTION_STATUS_STARTED = 2; + // A caller has requested cancellation of the activity. + ACTIVITY_EXECUTION_STATUS_CANCEL_REQUESTED = 3; + // The activity completed successfully. + ACTIVITY_EXECUTION_STATUS_COMPLETED = 4; + // The activity completed with failure. + ACTIVITY_EXECUTION_STATUS_FAILED = 5; + // The activity completed as canceled. 
+ // Requesting to cancel an activity does not automatically transition the activity to canceled status. If the worker + // responds to cancel the activity after requesting cancellation, the status will transition to cancelled. If the + // activity completes, fails, times out or terminates after cancel is requested and before the worker responds with + // cancelled. The activity will be stay in the terminal non-cancelled status. + ACTIVITY_EXECUTION_STATUS_CANCELED = 6; + // The activity was terminated. Termination does not reach the worker and the activity code cannot react to it. + // A terminated activity may have a running attempt and will be requested to be canceled by the server when it + // heartbeats. + ACTIVITY_EXECUTION_STATUS_TERMINATED = 7; + // The activity has timed out by reaching the specified schedule-to-start or schedule-to-close timeouts. + // Additionally, after all retries are exhausted for start-to-close or heartbeat timeouts, the activity will also + // transition to timed out status. + ACTIVITY_EXECUTION_STATUS_TIMED_OUT = 8; } message ActivityState { - // The type of the activity, a string that maps to a registered activity on a worker. - temporal.api.common.v1.ActivityType activity_type = 1; - - temporal.api.taskqueue.v1.TaskQueue task_queue = 2; - - // Indicates how long the caller is willing to wait for an activity completion. Limits how long - // retries will be attempted. Either this or `start_to_close_timeout` must be specified. - // - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration schedule_to_close_timeout = 3; - // Limits time an activity task can stay in a task queue before a worker picks it up. This - // timeout is always non retryable, as all a retry would achieve is to put it back into the same - // queue. Defaults to `schedule_to_close_timeout` or workflow execution timeout if not - // specified. 
- // - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration schedule_to_start_timeout = 4; - // Maximum time an activity is allowed to execute after being picked up by a worker. This - // timeout is always retryable. Either this or `schedule_to_close_timeout` must be - // specified. - // - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration start_to_close_timeout = 5; - // Maximum permitted time between successful worker heartbeats. - google.protobuf.Duration heartbeat_timeout = 6; - // The retry policy for the activity. Will never exceed `schedule_to_close_timeout`. - temporal.api.common.v1.RetryPolicy retry_policy = 7; - - // All of the possible activity statuses (covers both the public ActivityExecutionStatus and PendingActivityState). - // TODO: consider moving this into ActivityAttemptState and renaming that message. This could save mutating two - // components on each attempt transition. - ActivityExecutionStatus status = 8; - - // Time the activity was originally scheduled via a StartActivityExecution request. - google.protobuf.Timestamp schedule_time = 9; - - // Priority metadata. - temporal.api.common.v1.Priority priority = 10; - - // Set if activity cancellation was requested. - ActivityCancelState cancel_state = 11; - - // Set if the activity was terminated - ActivityTerminateState terminate_state = 12; + // The type of the activity, a string that maps to a registered activity on a worker. + temporal.api.common.v1.ActivityType activity_type = 1; + + temporal.api.taskqueue.v1.TaskQueue task_queue = 2; + + // Indicates how long the caller is willing to wait for an activity completion. Limits how long + // retries will be attempted. Either this or `start_to_close_timeout` must be specified. 
+ // + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration schedule_to_close_timeout = 3; + // Limits time an activity task can stay in a task queue before a worker picks it up. This + // timeout is always non retryable, as all a retry would achieve is to put it back into the same + // queue. Defaults to `schedule_to_close_timeout` or workflow execution timeout if not + // specified. + // + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration schedule_to_start_timeout = 4; + // Maximum time an activity is allowed to execute after being picked up by a worker. This + // timeout is always retryable. Either this or `schedule_to_close_timeout` must be + // specified. + // + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration start_to_close_timeout = 5; + // Maximum permitted time between successful worker heartbeats. + google.protobuf.Duration heartbeat_timeout = 6; + // The retry policy for the activity. Will never exceed `schedule_to_close_timeout`. + temporal.api.common.v1.RetryPolicy retry_policy = 7; + + // All of the possible activity statuses (covers both the public ActivityExecutionStatus and PendingActivityState). + // TODO: consider moving this into ActivityAttemptState and renaming that message. This could save mutating two + // components on each attempt transition. + ActivityExecutionStatus status = 8; + + // Time the activity was originally scheduled via a StartActivityExecution request. + google.protobuf.Timestamp schedule_time = 9; + + // Priority metadata. + temporal.api.common.v1.Priority priority = 10; + + // Set if activity cancellation was requested. 
+ ActivityCancelState cancel_state = 11; + + // Set if the activity was terminated + ActivityTerminateState terminate_state = 12; } message ActivityCancelState { - string request_id = 1; - google.protobuf.Timestamp request_time = 2; - string identity = 3; - string reason = 4; + string request_id = 1; + google.protobuf.Timestamp request_time = 2; + string identity = 3; + string reason = 4; } message ActivityTerminateState { @@ -104,84 +104,84 @@ message ActivityTerminateState { } message ActivityAttemptState { - // The attempt this activity is currently on. - // Incremented each time a new attempt is scheduled. A newly created activity will immediately be scheduled, and - // the count is set to 1. - int32 count = 1; - - // Time from the last attempt failure to the next activity retry. - // If the activity is currently running, this represents the next retry interval in case the attempt fails. - // If activity is currently backing off between attempt, this represents the current retry interval. - // If there is no next retry allowed, this field will be null. - // This interval is typically calculated from the specified retry policy, but may be modified if an activity fails - // with a retryable application failure specifying a retry delay. - google.protobuf.Duration current_retry_interval = 2; - - // Time the last attempt was started. - google.protobuf.Timestamp started_time = 3; - - // The time when the last activity attempt completed. If activity has not been completed yet, it will be null. - google.protobuf.Timestamp complete_time = 4; - - message LastFailureDetails { - // The last time the activity attempt failed. - google.protobuf.Timestamp time = 1; - - // Failure details from the last failed attempt. - temporal.api.failure.v1.Failure failure = 2; - } - - // Details about the last failure. This will only be updated when an activity attempt fails, - // including start-to-close timeout. 
Activity success, termination, schedule-to-start and schedule-to-close timeouts - // will not reset it. - LastFailureDetails last_failure_details = 5; - - // An incremental version number used to validate tasks. - // Initially this only verifies that a task belong to the current attempt. - // Later on this stamp will be used to also invalidate tasks when the activity is paused, reset, or has its options - // updated. - int32 stamp = 6; - - string last_worker_identity = 7; - - // The Worker Deployment Version this activity was dispatched to most recently. - // If nil, the activity has not yet been dispatched or was last dispatched to an unversioned worker. - temporal.api.deployment.v1.WorkerDeploymentVersion last_deployment_version = 8; - - // The request ID that came from matching's RecordActivityTaskStarted API call. Used to make this API idempotent in - // case of implicit retries. - string start_request_id = 9; + // The attempt this activity is currently on. + // Incremented each time a new attempt is scheduled. A newly created activity will immediately be scheduled, and + // the count is set to 1. + int32 count = 1; + + // Time from the last attempt failure to the next activity retry. + // If the activity is currently running, this represents the next retry interval in case the attempt fails. + // If activity is currently backing off between attempt, this represents the current retry interval. + // If there is no next retry allowed, this field will be null. + // This interval is typically calculated from the specified retry policy, but may be modified if an activity fails + // with a retryable application failure specifying a retry delay. + google.protobuf.Duration current_retry_interval = 2; + + // Time the last attempt was started. + google.protobuf.Timestamp started_time = 3; + + // The time when the last activity attempt completed. If activity has not been completed yet, it will be null. 
+ google.protobuf.Timestamp complete_time = 4; + + message LastFailureDetails { + // The last time the activity attempt failed. + google.protobuf.Timestamp time = 1; + + // Failure details from the last failed attempt. + temporal.api.failure.v1.Failure failure = 2; + } + + // Details about the last failure. This will only be updated when an activity attempt fails, + // including start-to-close timeout. Activity success, termination, schedule-to-start and schedule-to-close timeouts + // will not reset it. + LastFailureDetails last_failure_details = 5; + + // An incremental version number used to validate tasks. + // Initially this only verifies that a task belong to the current attempt. + // Later on this stamp will be used to also invalidate tasks when the activity is paused, reset, or has its options + // updated. + int32 stamp = 6; + + string last_worker_identity = 7; + + // The Worker Deployment Version this activity was dispatched to most recently. + // If nil, the activity has not yet been dispatched or was last dispatched to an unversioned worker. + temporal.api.deployment.v1.WorkerDeploymentVersion last_deployment_version = 8; + + // The request ID that came from matching's RecordActivityTaskStarted API call. Used to make this API idempotent in + // case of implicit retries. + string start_request_id = 9; } message ActivityHeartbeatState { - // Details provided in the last recorded activity heartbeat. - temporal.api.common.v1.Payloads details = 1; - // Time the last heartbeat was recorded. - google.protobuf.Timestamp recorded_time = 2; + // Details provided in the last recorded activity heartbeat. + temporal.api.common.v1.Payloads details = 1; + // Time the last heartbeat was recorded. + google.protobuf.Timestamp recorded_time = 2; } message ActivityRequestData { - // Serialized activity input, passed as arguments to the activity function. 
- temporal.api.common.v1.Payloads input = 1; - temporal.api.common.v1.Header header = 2; + // Serialized activity input, passed as arguments to the activity function. + temporal.api.common.v1.Payloads input = 1; + temporal.api.common.v1.Header header = 2; - // Metadata for use by user interfaces to display the fixed as-of-start summary and details of the activity. - temporal.api.sdk.v1.UserMetadata user_metadata = 3; + // Metadata for use by user interfaces to display the fixed as-of-start summary and details of the activity. + temporal.api.sdk.v1.UserMetadata user_metadata = 3; } message ActivityOutcome { - message Successful { - temporal.api.common.v1.Payloads output = 1; - } - - message Failed { - // Only filled on schedule-to-start timeouts, schedule-to-close timeouts or terminations. All other attempt - // failures will be recorded in ActivityAttemptState.last_failure_details. - temporal.api.failure.v1.Failure failure = 1; - } - - oneof variant { - Successful successful = 1; - Failed failed = 2; - } + message Successful { + temporal.api.common.v1.Payloads output = 1; + } + + message Failed { + // Only filled on schedule-to-start timeouts, schedule-to-close timeouts or terminations. All other attempt + // failures will be recorded in ActivityAttemptState.last_failure_details. 
+ temporal.api.failure.v1.Failure failure = 1; + } + + oneof variant { + Successful successful = 1; + Failed failed = 2; + } } diff --git a/chasm/lib/activity/proto/v1/request_response.proto b/chasm/lib/activity/proto/v1/request_response.proto index acdc9f7107..918c6f4de3 100644 --- a/chasm/lib/activity/proto/v1/request_response.proto +++ b/chasm/lib/activity/proto/v1/request_response.proto @@ -2,63 +2,60 @@ syntax = "proto3"; package temporal.server.chasm.lib.activity.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/activity/gen/activitypb;activitypb"; - import "temporal/api/workflowservice/v1/request_response.proto"; +option go_package = "go.temporal.io/server/chasm/lib/activity/gen/activitypb;activitypb"; + message StartActivityExecutionRequest { - string namespace_id = 1; + string namespace_id = 1; - temporal.api.workflowservice.v1.StartActivityExecutionRequest frontend_request = 2; + temporal.api.workflowservice.v1.StartActivityExecutionRequest frontend_request = 2; } message StartActivityExecutionResponse { - temporal.api.workflowservice.v1.StartActivityExecutionResponse frontend_response = 1; + temporal.api.workflowservice.v1.StartActivityExecutionResponse frontend_response = 1; } message DescribeActivityExecutionRequest { - string namespace_id = 1; + string namespace_id = 1; - temporal.api.workflowservice.v1.DescribeActivityExecutionRequest frontend_request = 2; + temporal.api.workflowservice.v1.DescribeActivityExecutionRequest frontend_request = 2; } message DescribeActivityExecutionResponse { - temporal.api.workflowservice.v1.DescribeActivityExecutionResponse frontend_response = 1; + temporal.api.workflowservice.v1.DescribeActivityExecutionResponse frontend_response = 1; } message PollActivityExecutionRequest { - string namespace_id = 1; + string namespace_id = 1; - temporal.api.workflowservice.v1.PollActivityExecutionRequest frontend_request = 2; + temporal.api.workflowservice.v1.PollActivityExecutionRequest frontend_request = 2; } message 
PollActivityExecutionResponse { - temporal.api.workflowservice.v1.PollActivityExecutionResponse frontend_response = 1; + temporal.api.workflowservice.v1.PollActivityExecutionResponse frontend_response = 1; } message TerminateActivityExecutionRequest { - string namespace_id = 1; + string namespace_id = 1; - temporal.api.workflowservice.v1.TerminateActivityExecutionRequest frontend_request = 2; + temporal.api.workflowservice.v1.TerminateActivityExecutionRequest frontend_request = 2; } -message TerminateActivityExecutionResponse { -} +message TerminateActivityExecutionResponse {} message RequestCancelActivityExecutionRequest { - string namespace_id = 1; + string namespace_id = 1; - temporal.api.workflowservice.v1.RequestCancelActivityExecutionRequest frontend_request = 2; + temporal.api.workflowservice.v1.RequestCancelActivityExecutionRequest frontend_request = 2; } -message RequestCancelActivityExecutionResponse { -} +message RequestCancelActivityExecutionResponse {} message DeleteActivityExecutionRequest { - string namespace_id = 1; + string namespace_id = 1; - temporal.api.workflowservice.v1.DeleteActivityExecutionRequest frontend_request = 2; + temporal.api.workflowservice.v1.DeleteActivityExecutionRequest frontend_request = 2; } -message DeleteActivityExecutionResponse { -} +message DeleteActivityExecutionResponse {} diff --git a/chasm/lib/activity/proto/v1/service.proto b/chasm/lib/activity/proto/v1/service.proto index 6fa8fda429..69810bee55 100644 --- a/chasm/lib/activity/proto/v1/service.proto +++ b/chasm/lib/activity/proto/v1/service.proto @@ -2,40 +2,40 @@ syntax = "proto3"; package temporal.server.chasm.lib.activity.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/activity/gen/activitypb;activitypb"; - import "chasm/lib/activity/proto/v1/request_response.proto"; -import "temporal/server/api/routing/v1/extension.proto"; import "temporal/server/api/common/v1/api_category.proto"; +import "temporal/server/api/routing/v1/extension.proto"; + 
+option go_package = "go.temporal.io/server/chasm/lib/activity/gen/activitypb;activitypb"; service ActivityService { - rpc StartActivityExecution(StartActivityExecutionRequest) returns (StartActivityExecutionResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc DescribeActivityExecution(DescribeActivityExecutionRequest) returns (DescribeActivityExecutionResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc PollActivityExecution(PollActivityExecutionRequest) returns (PollActivityExecutionResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - rpc TerminateActivityExecution(TerminateActivityExecutionRequest) returns (TerminateActivityExecutionResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc RequestCancelActivityExecution(RequestCancelActivityExecutionRequest) returns (RequestCancelActivityExecutionResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc DeleteActivityExecution(DeleteActivityExecutionRequest) returns (DeleteActivityExecutionResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } + rpc StartActivityExecution(StartActivityExecutionRequest) returns 
(StartActivityExecutionResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc DescribeActivityExecution(DescribeActivityExecutionRequest) returns (DescribeActivityExecutionResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc PollActivityExecution(PollActivityExecutionRequest) returns (PollActivityExecutionResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + rpc TerminateActivityExecution(TerminateActivityExecutionRequest) returns (TerminateActivityExecutionResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc RequestCancelActivityExecution(RequestCancelActivityExecutionRequest) returns (RequestCancelActivityExecutionResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc DeleteActivityExecution(DeleteActivityExecutionRequest) returns (DeleteActivityExecutionResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.activity_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } } diff --git a/chasm/lib/activity/proto/v1/tasks.proto b/chasm/lib/activity/proto/v1/tasks.proto index 59d4bd2d90..9a1996e3dd 100644 --- a/chasm/lib/activity/proto/v1/tasks.proto +++ b/chasm/lib/activity/proto/v1/tasks.proto @@ -5,25 +5,24 @@ package 
temporal.server.chasm.lib.activity.proto.v1; option go_package = "go.temporal.io/server/chasm/lib/activity/gen/activitypb;activitypb"; message ActivityDispatchTask { - // The current stamp for this activity execution. Used for task validation. See also [ActivityAttemptState]. - int32 stamp = 1; + // The current stamp for this activity execution. Used for task validation. See also [ActivityAttemptState]. + int32 stamp = 1; } message ScheduleToStartTimeoutTask { - // The current stamp for this activity execution. Used for task validation. See also [ActivityAttemptState]. - int32 stamp = 1; + // The current stamp for this activity execution. Used for task validation. See also [ActivityAttemptState]. + int32 stamp = 1; } -message ScheduleToCloseTimeoutTask { -} +message ScheduleToCloseTimeoutTask {} message StartToCloseTimeoutTask { - // The current stamp for this activity execution. Used for task validation. See also [ActivityAttemptState]. - int32 stamp = 1; + // The current stamp for this activity execution. Used for task validation. See also [ActivityAttemptState]. + int32 stamp = 1; } // HeartbeatTimeoutTask is a pure task that enforces heartbeat timeouts. message HeartbeatTimeoutTask { - // The current stamp for this activity execution. Used for task validation. See also [ActivityAttemptState]. - int32 stamp = 1; + // The current stamp for this activity execution. Used for task validation. See also [ActivityAttemptState]. 
+ int32 stamp = 1; } diff --git a/chasm/lib/callback/proto/v1/message.proto b/chasm/lib/callback/proto/v1/message.proto index 0b0f102e80..057e5c470e 100644 --- a/chasm/lib/callback/proto/v1/message.proto +++ b/chasm/lib/callback/proto/v1/message.proto @@ -2,70 +2,69 @@ syntax = "proto3"; package temporal.server.chasm.lib.callbacks.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/callbacks/gen/callbackspb;callbackspb"; - import "google/protobuf/timestamp.proto"; import "temporal/api/common/v1/message.proto"; import "temporal/api/failure/v1/message.proto"; +option go_package = "go.temporal.io/server/chasm/lib/callbacks/gen/callbackspb;callbackspb"; + message CallbackState { - // Trigger for when the workflow is closed. - message WorkflowClosed {} + // Trigger for when the workflow is closed. + message WorkflowClosed {} - // Information on how this callback should be invoked (e.g. its URL and type). - Callback callback = 1; - // The time when the callback was registered. - google.protobuf.Timestamp registration_time = 3; + // Information on how this callback should be invoked (e.g. its URL and type). + Callback callback = 1; + // The time when the callback was registered. + google.protobuf.Timestamp registration_time = 3; - CallbackStatus status = 4; - // The number of attempts made to deliver the callback. - // This number represents a minimum bound since the attempt is incremented after the callback request completes. - int32 attempt = 5; + CallbackStatus status = 4; + // The number of attempts made to deliver the callback. + // This number represents a minimum bound since the attempt is incremented after the callback request completes. + int32 attempt = 5; - // The time when the last attempt completed. - google.protobuf.Timestamp last_attempt_complete_time = 6; - // The last attempt's failure, if any. - temporal.api.failure.v1.Failure last_attempt_failure = 7; - // The time when the next attempt is scheduled. 
- // NOTE (seankane): this field might go away in the future, discussion: - // https://github.com/temporalio/temporal/pull/8473#discussion_r2427348436 - google.protobuf.Timestamp next_attempt_schedule_time = 8; + // The time when the last attempt completed. + google.protobuf.Timestamp last_attempt_complete_time = 6; + // The last attempt's failure, if any. + temporal.api.failure.v1.Failure last_attempt_failure = 7; + // The time when the next attempt is scheduled. + // NOTE (seankane): this field might go away in the future, discussion: + // https://github.com/temporalio/temporal/pull/8473#discussion_r2427348436 + google.protobuf.Timestamp next_attempt_schedule_time = 8; - // Request ID that added the callback. - string request_id = 9; + // Request ID that added the callback. + string request_id = 9; } - // Status of a callback. enum CallbackStatus { - // Default value, unspecified state. - CALLBACK_STATUS_UNSPECIFIED = 0; - // Callback is standing by, waiting to be triggered. - CALLBACK_STATUS_STANDBY = 1; - // Callback is in the queue waiting to be executed or is currently executing. - CALLBACK_STATUS_SCHEDULED = 2; - // Callback has failed with a retryable error and is backing off before the next attempt. - CALLBACK_STATUS_BACKING_OFF = 3; - // Callback has failed. - CALLBACK_STATUS_FAILED = 4; - // Callback has succeeded. - CALLBACK_STATUS_SUCCEEDED = 5; + // Default value, unspecified state. + CALLBACK_STATUS_UNSPECIFIED = 0; + // Callback is standing by, waiting to be triggered. + CALLBACK_STATUS_STANDBY = 1; + // Callback is in the queue waiting to be executed or is currently executing. + CALLBACK_STATUS_SCHEDULED = 2; + // Callback has failed with a retryable error and is backing off before the next attempt. + CALLBACK_STATUS_BACKING_OFF = 3; + // Callback has failed. + CALLBACK_STATUS_FAILED = 4; + // Callback has succeeded. + CALLBACK_STATUS_SUCCEEDED = 5; } message Callback { - message Nexus { - // Callback URL. 
- // (-- api-linter: core::0140::uri=disabled - // aip.dev/not-precedent: Not respecting aip here. --) - string url = 1; - // Header to attach to callback request. - map<string, string> header = 2; - } + message Nexus { + // Callback URL. + // (-- api-linter: core::0140::uri=disabled + // aip.dev/not-precedent: Not respecting aip here. --) + string url = 1; + // Header to attach to callback request. + map<string, string> header = 2; + } - reserved 1; // For a generic callback mechanism to be added later. - oneof variant { - Nexus nexus = 2; - } + reserved 1; // For a generic callback mechanism to be added later. + oneof variant { + Nexus nexus = 2; + } - repeated temporal.api.common.v1.Link links = 100; + repeated temporal.api.common.v1.Link links = 100; } diff --git a/chasm/lib/callback/proto/v1/tasks.proto b/chasm/lib/callback/proto/v1/tasks.proto index b4a9bf68e3..f4cb65faa6 100644 --- a/chasm/lib/callback/proto/v1/tasks.proto +++ b/chasm/lib/callback/proto/v1/tasks.proto @@ -5,11 +5,11 @@ package temporal.server.chasm.lib.callbacks.proto.v1; option go_package = "go.temporal.io/server/chasm/lib/callbacks/gen/callbackspb;callbackspb"; message InvocationTask { - // The attempt number for this invocation. - int32 attempt = 1; + // The attempt number for this invocation. + int32 attempt = 1; } message BackoffTask { - // The attempt number for this invocation. - int32 attempt = 1; -} \ No newline at end of file + // The attempt number for this invocation.
+ int32 attempt = 1; +} diff --git a/chasm/lib/callback/tasks_test.go b/chasm/lib/callback/tasks_test.go index 1368fae1d6..82248a80fd 100644 --- a/chasm/lib/callback/tasks_test.go +++ b/chasm/lib/callback/tasks_test.go @@ -143,6 +143,7 @@ func TestExecuteInvocationTaskNexus_Outcomes(t *testing.T) { // Setup metrics expectations metricsHandler := metrics.NewMockHandler(ctrl) + metricsHandler.EXPECT().WithTags(gomock.Any()).Return(metricsHandler).AnyTimes() counter := metrics.NewMockCounterIface(ctrl) timer := metrics.NewMockTimerIface(ctrl) metricsHandler.EXPECT().Counter(RequestCounter.Name()).Return(counter) diff --git a/chasm/lib/nexusoperation/proto/v1/operation.proto b/chasm/lib/nexusoperation/proto/v1/operation.proto index b8203d8203..b5eabf3001 100644 --- a/chasm/lib/nexusoperation/proto/v1/operation.proto +++ b/chasm/lib/nexusoperation/proto/v1/operation.proto @@ -10,24 +10,24 @@ message OperationState { enum OperationStatus { // Default value, unspecified status. - OPERATION_STATUS_UNSPECIFIED = 0; + OPERATION_STATUS_UNSPECIFIED = 0; // Operation is in the queue waiting to be executed or is currently executing. - OPERATION_STATUS_SCHEDULED = 1; + OPERATION_STATUS_SCHEDULED = 1; // Operation has failed with a retryable error and is backing off before the next attempt. - OPERATION_STATUS_BACKING_OFF = 2; + OPERATION_STATUS_BACKING_OFF = 2; // Operation was started and will complete asynchronously. - OPERATION_STATUS_STARTED = 3; + OPERATION_STATUS_STARTED = 3; // Operation succeeded. // This may happen either as a response to a start request or as reported via callback. - OPERATION_STATUS_SUCCEEDED = 4; + OPERATION_STATUS_SUCCEEDED = 4; // Operation failed either when a start request encounters a non-retryable error or as reported via callback. - OPERATION_STATUS_FAILED = 5; + OPERATION_STATUS_FAILED = 5; // Operation completed as canceled (may have not ever been delivered). 
// This may happen either as a response to a start request or as reported via callback. - OPERATION_STATUS_CANCELED = 6; + OPERATION_STATUS_CANCELED = 6; // Operation timed out - exceeded the user supplied schedule-to-close timeout. // Any attempts to complete the operation in this status will be ignored. - OPERATION_STATUS_TIMED_OUT = 7; + OPERATION_STATUS_TIMED_OUT = 7; } message CancellationState { @@ -36,17 +36,17 @@ message CancellationState { enum CancellationStatus { // Default value, unspecified status. - CANCELLATION_STATUS_UNSPECIFIED = 0; + CANCELLATION_STATUS_UNSPECIFIED = 0; // Cancellation request is in the queue waiting to be executed or is currently executing. - CANCELLATION_STATUS_SCHEDULED = 1; + CANCELLATION_STATUS_SCHEDULED = 1; // Cancellation request has failed with a retryable error and is backing off before the next attempt. - CANCELLATION_STATUS_BACKING_OFF = 2; + CANCELLATION_STATUS_BACKING_OFF = 2; // Cancellation request succeeded. - CANCELLATION_STATUS_SUCCEEDED = 3; + CANCELLATION_STATUS_SUCCEEDED = 3; // Cancellation request failed with a non-retryable error. - CANCELLATION_STATUS_FAILED = 4; + CANCELLATION_STATUS_FAILED = 4; // The associated operation timed out - exceeded the user supplied schedule-to-close timeout. - CANCELLATION_STATUS_TIMED_OUT = 5; + CANCELLATION_STATUS_TIMED_OUT = 5; // Cancellation request is blocked (eg: by circuit breaker). 
- CANCELLATION_STATUS_BLOCKED = 6; + CANCELLATION_STATUS_BLOCKED = 6; } diff --git a/chasm/lib/nexusoperation/proto/v1/tasks.proto b/chasm/lib/nexusoperation/proto/v1/tasks.proto index a144c98882..a5ccddce06 100644 --- a/chasm/lib/nexusoperation/proto/v1/tasks.proto +++ b/chasm/lib/nexusoperation/proto/v1/tasks.proto @@ -22,4 +22,4 @@ message CancellationTask { message CancellationBackoffTask { int32 attempt = 1; -} \ No newline at end of file +} diff --git a/chasm/lib/scheduler/gen/schedulerpb/v1/message.pb.go b/chasm/lib/scheduler/gen/schedulerpb/v1/message.pb.go index dad02f948a..16e69c8559 100644 --- a/chasm/lib/scheduler/gen/schedulerpb/v1/message.pb.go +++ b/chasm/lib/scheduler/gen/schedulerpb/v1/message.pb.go @@ -601,7 +601,7 @@ var File_temporal_server_chasm_lib_scheduler_proto_v1_message_proto protoreflect const file_temporal_server_chasm_lib_scheduler_proto_v1_message_proto_rawDesc = "" + "\n" + - ":temporal/server/chasm/lib/scheduler/proto/v1/message.proto\x12,temporal.server.chasm.lib.scheduler.proto.v1\x1a$temporal/api/common/v1/message.proto\x1a%temporal/api/failure/v1/message.proto\x1a&temporal/api/schedule/v1/message.proto\x1a-temporal/server/api/schedule/v1/message.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\xbe\x03\n" + + ":temporal/server/chasm/lib/scheduler/proto/v1/message.proto\x12,temporal.server.chasm.lib.scheduler.proto.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a%temporal/api/failure/v1/message.proto\x1a&temporal/api/schedule/v1/message.proto\x1a-temporal/server/api/schedule/v1/message.proto\"\xbe\x03\n" + "\x0eSchedulerState\x12>\n" + "\bschedule\x18\x02 \x01(\v2\".temporal.api.schedule.v1.ScheduleR\bschedule\x12:\n" + "\x04info\x18\x03 \x01(\v2&.temporal.api.schedule.v1.ScheduleInfoR\x04info\x12\x1c\n" + diff --git a/chasm/lib/scheduler/gen/schedulerpb/v1/request_response.pb.go b/chasm/lib/scheduler/gen/schedulerpb/v1/request_response.pb.go index 06b70ebc66..1329a1557b 100644 --- 
a/chasm/lib/scheduler/gen/schedulerpb/v1/request_response.pb.go +++ b/chasm/lib/scheduler/gen/schedulerpb/v1/request_response.pb.go @@ -903,7 +903,7 @@ var File_temporal_server_chasm_lib_scheduler_proto_v1_request_response_proto pro const file_temporal_server_chasm_lib_scheduler_proto_v1_request_response_proto_rawDesc = "" + "\n" + - "Ctemporal/server/chasm/lib/scheduler/proto/v1/request_response.proto\x12,temporal.server.chasm.lib.scheduler.proto.v1\x1a6temporal/api/workflowservice/v1/request_response.proto\x1a:temporal/server/chasm/lib/scheduler/proto/v1/message.proto\"\x9d\x01\n" + + "Ctemporal/server/chasm/lib/scheduler/proto/v1/request_response.proto\x12,temporal.server.chasm.lib.scheduler.proto.v1\x1a:temporal/server/chasm/lib/scheduler/proto/v1/message.proto\x1a6temporal/api/workflowservice/v1/request_response.proto\"\x9d\x01\n" + "\x15CreateScheduleRequest\x12!\n" + "\fnamespace_id\x18\x01 \x01(\tR\vnamespaceId\x12a\n" + "\x10frontend_request\x18\x02 \x01(\v26.temporal.api.workflowservice.v1.CreateScheduleRequestR\x0ffrontendRequest\"~\n" + diff --git a/chasm/lib/scheduler/gen/schedulerpb/v1/service.pb.go b/chasm/lib/scheduler/gen/schedulerpb/v1/service.pb.go index 9a88933374..8c5afe3c65 100644 --- a/chasm/lib/scheduler/gen/schedulerpb/v1/service.pb.go +++ b/chasm/lib/scheduler/gen/schedulerpb/v1/service.pb.go @@ -27,7 +27,7 @@ var File_temporal_server_chasm_lib_scheduler_proto_v1_service_proto protoreflect const file_temporal_server_chasm_lib_scheduler_proto_v1_service_proto_rawDesc = "" + "\n" + - ":temporal/server/chasm/lib/scheduler/proto/v1/service.proto\x12,temporal.server.chasm.lib.scheduler.proto.v1\x1aCtemporal/server/chasm/lib/scheduler/proto/v1/request_response.proto\x1a.temporal/server/api/routing/v1/extension.proto\x1a0temporal/server/api/common/v1/api_category.proto2\xb6\x0e\n" + + 
":temporal/server/chasm/lib/scheduler/proto/v1/service.proto\x12,temporal.server.chasm.lib.scheduler.proto.v1\x1aCtemporal/server/chasm/lib/scheduler/proto/v1/request_response.proto\x1a0temporal/server/api/common/v1/api_category.proto\x1a.temporal/server/api/routing/v1/extension.proto2\xb6\x0e\n" + "\x10SchedulerService\x12\xc5\x01\n" + "\x0eCreateSchedule\x12C.temporal.server.chasm.lib.scheduler.proto.v1.CreateScheduleRequest\x1aD.temporal.server.chasm.lib.scheduler.proto.v1.CreateScheduleResponse\"(\x92\xc4\x03\x1e\x1a\x1cfrontend_request.schedule_id\x8a\xb5\x18\x02\b\x01\x12\xc5\x01\n" + "\x0eUpdateSchedule\x12C.temporal.server.chasm.lib.scheduler.proto.v1.UpdateScheduleRequest\x1aD.temporal.server.chasm.lib.scheduler.proto.v1.UpdateScheduleResponse\"(\x92\xc4\x03\x1e\x1a\x1cfrontend_request.schedule_id\x8a\xb5\x18\x02\b\x01\x12\xc2\x01\n" + diff --git a/chasm/lib/scheduler/proto/v1/message.proto b/chasm/lib/scheduler/proto/v1/message.proto index 0187e7cd8d..999dd406dc 100644 --- a/chasm/lib/scheduler/proto/v1/message.proto +++ b/chasm/lib/scheduler/proto/v1/message.proto @@ -2,122 +2,121 @@ syntax = "proto3"; package temporal.server.chasm.lib.scheduler.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb;schedulerpb"; - +import "google/protobuf/timestamp.proto"; import "temporal/api/common/v1/message.proto"; import "temporal/api/failure/v1/message.proto"; import "temporal/api/schedule/v1/message.proto"; import "temporal/server/api/schedule/v1/message.proto"; -import "google/protobuf/timestamp.proto"; +option go_package = "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb;schedulerpb"; // CHASM scheduler top-level state. message SchedulerState { - // Scheduler request parameters and metadata. - temporal.api.schedule.v1.Schedule schedule = 2; - temporal.api.schedule.v1.ScheduleInfo info = 3; - - // State common to all generators is stored in the top-level machine. 
- string namespace = 5; - string namespace_id = 6; - string schedule_id = 7; - - // Implemented as a sequence number. Used for optimistic locking against - // update requests. - int64 conflict_token = 8; - - // The closed flag is set true after a schedule completes, and the idle timer - // expires. - bool closed = 9; - - // When true, this scheduler is a sentinel that exists only to reserve the - // schedule ID. All API operations return NotFound. - bool sentinel = 10; - - // Set when a migration to workflow-backed scheduler (V1) is pending. - // Unpause operations are blocked while this is set. - WorkflowMigrationState workflow_migration = 11; + // Scheduler request parameters and metadata. + temporal.api.schedule.v1.Schedule schedule = 2; + temporal.api.schedule.v1.ScheduleInfo info = 3; + + // State common to all generators is stored in the top-level machine. + string namespace = 5; + string namespace_id = 6; + string schedule_id = 7; + + // Implemented as a sequence number. Used for optimistic locking against + // update requests. + int64 conflict_token = 8; + + // The closed flag is set true after a schedule completes, and the idle timer + // expires. + bool closed = 9; + + // When true, this scheduler is a sentinel that exists only to reserve the + // schedule ID. All API operations return NotFound. + bool sentinel = 10; + + // Set when a migration to workflow-backed scheduler (V1) is pending. + // Unpause operations are blocked while this is set. + WorkflowMigrationState workflow_migration = 11; } // WorkflowMigrationState tracks the state of an in-progress V2-to-V1 migration. message WorkflowMigrationState { - // The schedule's paused state before migration was initiated. Used to - // restore the correct paused state when passing state to the V1 workflow. - bool pre_migration_paused = 1; + // The schedule's paused state before migration was initiated. Used to + // restore the correct paused state when passing state to the V1 workflow. 
+ bool pre_migration_paused = 1; - // The schedule's notes before migration was initiated. - string pre_migration_notes = 2; + // The schedule's notes before migration was initiated. + string pre_migration_notes = 2; } // CHASM scheduler's Generator internal state. message GeneratorState { - // High water mark. - google.protobuf.Timestamp last_processed_time = 3; + // High water mark. + google.protobuf.Timestamp last_processed_time = 3; - // A list of upcoming times an action will be triggered. - repeated google.protobuf.Timestamp future_action_times = 4; + // A list of upcoming times an action will be triggered. + repeated google.protobuf.Timestamp future_action_times = 4; } // CHASM scheduler's Invoker internal state. message InvokerState { - // Buffered starts that will be started by the Invoker. - repeated temporal.server.api.schedule.v1.BufferedStart buffered_starts = 2; + // Buffered starts that will be started by the Invoker. + repeated temporal.server.api.schedule.v1.BufferedStart buffered_starts = 2; - // Workflow executions that will be cancelled due to overlap policy. - repeated temporal.api.common.v1.WorkflowExecution cancel_workflows = 3; + // Workflow executions that will be cancelled due to overlap policy. + repeated temporal.api.common.v1.WorkflowExecution cancel_workflows = 3; - // Workflow executions that will be terminated due to overlap policy. - repeated temporal.api.common.v1.WorkflowExecution terminate_workflows = 4; + // Workflow executions that will be terminated due to overlap policy. + repeated temporal.api.common.v1.WorkflowExecution terminate_workflows = 4; - // High water mark, used for evaluating when to fire tasks that are backing - // off from a retry. LastProcessedTime is stored as state so that task - // generation will be consistent, regardless of when generation occurs, such - // as after applying a replicated state (as opposed to evaluating based on - // present time). 
- google.protobuf.Timestamp last_processed_time = 5; + // High water mark, used for evaluating when to fire tasks that are backing + // off from a retry. LastProcessedTime is stored as state so that task + // generation will be consistent, regardless of when generation occurs, such + // as after applying a replicated state (as opposed to evaluating based on + // present time). + google.protobuf.Timestamp last_processed_time = 5; - reserved 6; + reserved 6; } // CHASM scheduler's Backfiller internal state. Backfill requests are 1:1 // with Backfiller nodes. Backfiller nodes also handle immediate trigger requests. message BackfillerState { - oneof request { - temporal.api.schedule.v1.BackfillRequest backfill_request = 1; + oneof request { + temporal.api.schedule.v1.BackfillRequest backfill_request = 1; - // When set, immediately buffer a single manual action. - temporal.api.schedule.v1.TriggerImmediatelyRequest trigger_request = 2; - } + // When set, immediately buffer a single manual action. + temporal.api.schedule.v1.TriggerImmediatelyRequest trigger_request = 2; + } - // Every Backfiller should be assigned a unique ID upon creation, used - // for deduplication. - string backfill_id = 6; + // Every Backfiller should be assigned a unique ID upon creation, used + // for deduplication. + string backfill_id = 6; - // High water mark. - google.protobuf.Timestamp last_processed_time = 7; + // High water mark. + google.protobuf.Timestamp last_processed_time = 7; - // Attempt count, incremented when the buffer is full and the Backfiller - // needs to back off before retrying to fill. - int64 attempt = 8; + // Attempt count, incremented when the buffer is full and the Backfiller + // needs to back off before retrying to fill. + int64 attempt = 8; } // CHASM scheduler retains the payload data for the last completed workflow. Both // last success and failure are stored simultaneously. 
message LastCompletionResult { - temporal.api.common.v1.Payload success = 1; - temporal.api.failure.v1.Failure failure = 2; + temporal.api.common.v1.Payload success = 1; + temporal.api.failure.v1.Failure failure = 2; } // SchedulerMigrationState is a stack-agnostic interchange format for migrating // scheduler state between V1 (workflow-backed) and V2 (CHASM) implementations. message SchedulerMigrationState { - SchedulerState scheduler_state = 1; - GeneratorState generator_state = 2; - InvokerState invoker_state = 3; - map<string, BackfillerState> backfillers = 4; - LastCompletionResult last_completion_result = 5; - - // Visibility data. - map<string, temporal.api.common.v1.Payload> search_attributes = 6; - map<string, temporal.api.common.v1.Payload> memo = 7; + SchedulerState scheduler_state = 1; + GeneratorState generator_state = 2; + InvokerState invoker_state = 3; + map<string, BackfillerState> backfillers = 4; + LastCompletionResult last_completion_result = 5; + + // Visibility data. + map<string, temporal.api.common.v1.Payload> search_attributes = 6; + map<string, temporal.api.common.v1.Payload> memo = 7; } diff --git a/chasm/lib/scheduler/proto/v1/request_response.proto b/chasm/lib/scheduler/proto/v1/request_response.proto index 26eb1b05b7..9dab1efa97 100644 --- a/chasm/lib/scheduler/proto/v1/request_response.proto +++ b/chasm/lib/scheduler/proto/v1/request_response.proto @@ -2,106 +2,106 @@ syntax = "proto3"; package temporal.server.chasm.lib.scheduler.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb;schedulerpb"; - -import "temporal/api/workflowservice/v1/request_response.proto"; import "chasm/lib/scheduler/proto/v1/message.proto"; +import "temporal/api/workflowservice/v1/request_response.proto"; + +option go_package = "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb;schedulerpb"; message CreateScheduleRequest { - // Internal namespace ID (UUID).
+ string namespace_id = 1; - temporal.api.workflowservice.v1.CreateScheduleRequest frontend_request = 2; + temporal.api.workflowservice.v1.CreateScheduleRequest frontend_request = 2; } message CreateScheduleResponse { - temporal.api.workflowservice.v1.CreateScheduleResponse frontend_response = 1; + temporal.api.workflowservice.v1.CreateScheduleResponse frontend_response = 1; } message UpdateScheduleRequest { - // Internal namespace ID (UUID). - string namespace_id = 1; + // Internal namespace ID (UUID). + string namespace_id = 1; - temporal.api.workflowservice.v1.UpdateScheduleRequest frontend_request = 2; + temporal.api.workflowservice.v1.UpdateScheduleRequest frontend_request = 2; } message UpdateScheduleResponse { - temporal.api.workflowservice.v1.UpdateScheduleResponse frontend_response = 1; + temporal.api.workflowservice.v1.UpdateScheduleResponse frontend_response = 1; } message PatchScheduleRequest { - // Internal namespace ID (UUID). - string namespace_id = 1; + // Internal namespace ID (UUID). + string namespace_id = 1; - temporal.api.workflowservice.v1.PatchScheduleRequest frontend_request = 2; + temporal.api.workflowservice.v1.PatchScheduleRequest frontend_request = 2; } message PatchScheduleResponse { - temporal.api.workflowservice.v1.PatchScheduleResponse frontend_response = 1; + temporal.api.workflowservice.v1.PatchScheduleResponse frontend_response = 1; } message DeleteScheduleRequest { - // Internal namespace ID (UUID). - string namespace_id = 1; + // Internal namespace ID (UUID). + string namespace_id = 1; - temporal.api.workflowservice.v1.DeleteScheduleRequest frontend_request = 2; + temporal.api.workflowservice.v1.DeleteScheduleRequest frontend_request = 2; } message DeleteScheduleResponse { - temporal.api.workflowservice.v1.DeleteScheduleResponse frontend_response = 1; + temporal.api.workflowservice.v1.DeleteScheduleResponse frontend_response = 1; } message DescribeScheduleRequest { - // Internal namespace ID (UUID). 
- string namespace_id = 1; + // Internal namespace ID (UUID). + string namespace_id = 1; - temporal.api.workflowservice.v1.DescribeScheduleRequest frontend_request = 2; + temporal.api.workflowservice.v1.DescribeScheduleRequest frontend_request = 2; } message DescribeScheduleResponse { - temporal.api.workflowservice.v1.DescribeScheduleResponse frontend_response = 1; + temporal.api.workflowservice.v1.DescribeScheduleResponse frontend_response = 1; } message ListScheduleMatchingTimesRequest { - // Internal namespace ID (UUID). - string namespace_id = 1; + // Internal namespace ID (UUID). + string namespace_id = 1; - temporal.api.workflowservice.v1.ListScheduleMatchingTimesRequest frontend_request = 2; + temporal.api.workflowservice.v1.ListScheduleMatchingTimesRequest frontend_request = 2; } message ListScheduleMatchingTimesResponse { - temporal.api.workflowservice.v1.ListScheduleMatchingTimesResponse frontend_response = 1; + temporal.api.workflowservice.v1.ListScheduleMatchingTimesResponse frontend_response = 1; } message CreateFromMigrationStateRequest { - // Internal namespace ID (UUID). - string namespace_id = 1; + // Internal namespace ID (UUID). + string namespace_id = 1; - SchedulerMigrationState state = 2; + SchedulerMigrationState state = 2; } message CreateFromMigrationStateResponse {} message CreateSentinelRequest { - // Internal namespace ID (UUID). - string namespace_id = 1; + // Internal namespace ID (UUID). + string namespace_id = 1; - string namespace = 2; + string namespace = 2; - string schedule_id = 3; + string schedule_id = 3; } message CreateSentinelResponse {} message MigrateToWorkflowRequest { - // The namespace ID of the schedule to migrate. - string namespace_id = 1; - // The schedule ID to migrate from CHASM to workflow-backed. - string schedule_id = 2; - // The identity of the caller initiating the migration. - string identity = 3; - // A unique request ID for idempotency. 
- string request_id = 4; + // The namespace ID of the schedule to migrate. + string namespace_id = 1; + // The schedule ID to migrate from CHASM to workflow-backed. + string schedule_id = 2; + // The identity of the caller initiating the migration. + string identity = 3; + // A unique request ID for idempotency. + string request_id = 4; } message MigrateToWorkflowResponse {} diff --git a/chasm/lib/scheduler/proto/v1/service.proto b/chasm/lib/scheduler/proto/v1/service.proto index 132811ac82..65c21e949c 100644 --- a/chasm/lib/scheduler/proto/v1/service.proto +++ b/chasm/lib/scheduler/proto/v1/service.proto @@ -2,50 +2,50 @@ syntax = "proto3"; package temporal.server.chasm.lib.scheduler.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb;schedulerpb"; - import "chasm/lib/scheduler/proto/v1/request_response.proto"; -import "temporal/server/api/routing/v1/extension.proto"; import "temporal/server/api/common/v1/api_category.proto"; +import "temporal/server/api/routing/v1/extension.proto"; + +option go_package = "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb;schedulerpb"; service SchedulerService { - rpc CreateSchedule(CreateScheduleRequest) returns (CreateScheduleResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc UpdateSchedule(UpdateScheduleRequest) returns (UpdateScheduleResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc PatchSchedule(PatchScheduleRequest) returns (PatchScheduleResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc 
DeleteSchedule(DeleteScheduleRequest) returns (DeleteScheduleResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc DescribeSchedule(DescribeScheduleRequest) returns (DescribeScheduleResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc ListScheduleMatchingTimes(ListScheduleMatchingTimesRequest) returns (ListScheduleMatchingTimesResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - rpc CreateFromMigrationState(CreateFromMigrationStateRequest) returns (CreateFromMigrationStateResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "state.scheduler_state.schedule_id"; - } - - rpc CreateSentinel(CreateSentinelRequest) returns (CreateSentinelResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "schedule_id"; - } - rpc MigrateToWorkflow(MigrateToWorkflowRequest) returns (MigrateToWorkflowResponse) { - option (temporal.server.api.routing.v1.routing).business_id = "schedule_id"; - } + rpc CreateSchedule(CreateScheduleRequest) returns (CreateScheduleResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc UpdateSchedule(UpdateScheduleRequest) returns (UpdateScheduleResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc PatchSchedule(PatchScheduleRequest) returns (PatchScheduleResponse) 
{ + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc DeleteSchedule(DeleteScheduleRequest) returns (DeleteScheduleResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc DescribeSchedule(DescribeScheduleRequest) returns (DescribeScheduleResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc ListScheduleMatchingTimes(ListScheduleMatchingTimesRequest) returns (ListScheduleMatchingTimesResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "frontend_request.schedule_id"; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + rpc CreateFromMigrationState(CreateFromMigrationStateRequest) returns (CreateFromMigrationStateResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "state.scheduler_state.schedule_id"; + } + + rpc CreateSentinel(CreateSentinelRequest) returns (CreateSentinelResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "schedule_id"; + } + rpc MigrateToWorkflow(MigrateToWorkflowRequest) returns (MigrateToWorkflowResponse) { + option (temporal.server.api.routing.v1.routing).business_id = "schedule_id"; + } } diff --git a/chasm/lib/scheduler/proto/v1/tasks.proto b/chasm/lib/scheduler/proto/v1/tasks.proto index 20700f186d..077e71fe0b 100644 --- a/chasm/lib/scheduler/proto/v1/tasks.proto +++ b/chasm/lib/scheduler/proto/v1/tasks.proto @@ -2,10 +2,10 @@ syntax = "proto3"; package temporal.server.chasm.lib.scheduler.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb;schedulerpb"; 
- import "google/protobuf/duration.proto"; +option go_package = "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb;schedulerpb"; + // Fires when the scheduler's idle period has lapsed, and the scheduler should // be closed. message SchedulerIdleTask { diff --git a/chasm/lib/scheduler/scheduler.go b/chasm/lib/scheduler/scheduler.go index dedd9b4ac9..6107ca8ef1 100644 --- a/chasm/lib/scheduler/scheduler.go +++ b/chasm/lib/scheduler/scheduler.go @@ -744,6 +744,11 @@ func (s *Scheduler) Update( if s.Sentinel { return nil, ErrSentinel } + // UpdateComponent does not reject mutations on completed executions, + // so we must check explicitly here. + if s.Closed { + return nil, ErrClosed + } if !s.validateConflictToken(req.FrontendRequest.ConflictToken) { return nil, ErrConflictTokenMismatch } @@ -796,6 +801,11 @@ func (s *Scheduler) Patch( if s.Sentinel { return nil, ErrSentinel } + // UpdateComponent does not reject mutations on completed executions, + // so we must check explicitly here. + if s.Closed { + return nil, ErrClosed + } // Handle paused status. 
if req.FrontendRequest.Patch.Pause != "" { s.Schedule.State.Paused = true diff --git a/chasm/lib/tests/gen/testspb/v1/service.pb.go b/chasm/lib/tests/gen/testspb/v1/service.pb.go index e3b105a6b0..236ecca1ee 100644 --- a/chasm/lib/tests/gen/testspb/v1/service.pb.go +++ b/chasm/lib/tests/gen/testspb/v1/service.pb.go @@ -27,7 +27,7 @@ var File_temporal_server_chasm_lib_tests_proto_v1_service_proto protoreflect.Fil const file_temporal_server_chasm_lib_tests_proto_v1_service_proto_rawDesc = "" + "\n" + - "6temporal/server/chasm/lib/tests/proto/v1/service.proto\x12(temporal.server.chasm.lib.tests.proto.v1\x1a?temporal/server/chasm/lib/tests/proto/v1/request_response.proto\x1a.temporal/server/api/routing/v1/extension.proto\x1a0temporal/server/api/common/v1/api_category.proto2\x93\x01\n" + + "6temporal/server/chasm/lib/tests/proto/v1/service.proto\x12(temporal.server.chasm.lib.tests.proto.v1\x1a?temporal/server/chasm/lib/tests/proto/v1/request_response.proto\x1a0temporal/server/api/common/v1/api_category.proto\x1a.temporal/server/api/routing/v1/extension.proto2\x93\x01\n" + "\vTestService\x12\x83\x01\n" + "\x04Test\x125.temporal.server.chasm.lib.tests.proto.v1.TestRequest\x1a6.temporal.server.chasm.lib.tests.proto.v1.TestResponse\"\f\x92\xc4\x03\x02\b\x01\x8a\xb5\x18\x02\b\x01B;Z9go.temporal.io/server/chasm/lib/tests/gen/testspb;testspbb\x06proto3" diff --git a/chasm/lib/tests/proto/v1/message.proto b/chasm/lib/tests/proto/v1/message.proto index 3013429a04..98f077d4c2 100644 --- a/chasm/lib/tests/proto/v1/message.proto +++ b/chasm/lib/tests/proto/v1/message.proto @@ -2,24 +2,23 @@ syntax = "proto3"; package temporal.server.chasm.lib.tests.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/tests/gen/testspb;testspb"; - import "google/protobuf/timestamp.proto"; +option go_package = "go.temporal.io/server/chasm/lib/tests/gen/testspb;testspb"; + message TestPayloadStore { - int64 total_count = 1; - int64 total_size = 2; - // (-- api-linter: 
core::0142::time-field-type=disabled --) - map expiration_times = 3; - bool closed = 4; - bool canceled = 5; + int64 total_count = 1; + int64 total_size = 2; + // (-- api-linter: core::0142::time-field-type=disabled --) + map expiration_times = 3; + bool closed = 4; + bool canceled = 5; } message TestPayloadTTLPureTask { - string payload_key = 1; + string payload_key = 1; } message TestPayloadTTLSideEffectTask { - string payload_key = 1; + string payload_key = 1; } - diff --git a/chasm/lib/tests/proto/v1/request_response.proto b/chasm/lib/tests/proto/v1/request_response.proto index ba8f0b4697..43b0a48659 100644 --- a/chasm/lib/tests/proto/v1/request_response.proto +++ b/chasm/lib/tests/proto/v1/request_response.proto @@ -5,10 +5,10 @@ package temporal.server.chasm.lib.tests.proto.v1; option go_package = "go.temporal.io/server/chasm/lib/tests/gen/testspb;testspb"; message TestRequest { - string request_id = 1; + string request_id = 1; } message TestResponse { - string request_id = 1; - bool has_engine_ctx = 2; + string request_id = 1; + bool has_engine_ctx = 2; } diff --git a/chasm/lib/tests/proto/v1/service.proto b/chasm/lib/tests/proto/v1/service.proto index 02c8f9cc65..782e4aa27e 100644 --- a/chasm/lib/tests/proto/v1/service.proto +++ b/chasm/lib/tests/proto/v1/service.proto @@ -2,15 +2,15 @@ syntax = "proto3"; package temporal.server.chasm.lib.tests.proto.v1; -option go_package = "go.temporal.io/server/chasm/lib/tests/gen/testspb;testspb"; - import "chasm/lib/tests/proto/v1/request_response.proto"; -import "temporal/server/api/routing/v1/extension.proto"; import "temporal/server/api/common/v1/api_category.proto"; +import "temporal/server/api/routing/v1/extension.proto"; + +option go_package = "go.temporal.io/server/chasm/lib/tests/gen/testspb;testspb"; service TestService { - rpc Test(TestRequest) returns (TestResponse) { - option (temporal.server.api.routing.v1.routing).random = true; - option (temporal.server.api.common.v1.api_category).category = 
API_CATEGORY_STANDARD; - } + rpc Test(TestRequest) returns (TestResponse) { + option (temporal.server.api.routing.v1.routing).random = true; + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } } diff --git a/chasm/tree.go b/chasm/tree.go index 46ca707b38..ec5ffc3d20 100644 --- a/chasm/tree.go +++ b/chasm/tree.go @@ -2993,7 +2993,14 @@ func (n *Node) ExecutePureTask( defer log.CapturePanic(n.logger, &retErr) - return true, registrableTask.pureTaskExecuteFn( + archetypeTag := metrics.ArchetypeTag("") + if name, ok := n.registry.ArchetypeDisplayName(n.ArchetypeID()); ok { + archetypeTag = metrics.ArchetypeTag(name) + } + chasmTaskTypeTag := metrics.ChasmTaskTypeTag(registrableTask.fqType()) + metricsHandler := n.metricsHandler.WithTags(archetypeTag) + + execErr := registrableTask.pureTaskExecuteFn( executionContext, component, taskAttributes, @@ -3001,12 +3008,21 @@ func (n *Node) ExecutePureTask( n.registry, ) + metrics.ChasmPureTaskRequests.With(metricsHandler).Record(1, chasmTaskTypeTag) + + if execErr != nil { + metrics.ChasmPureTaskErrors.With(metricsHandler).Record(1, chasmTaskTypeTag) + return true, execErr + } + // TODO - a task validator must succeed validation after a task executes // successfully (without error), otherwise it will generate an infinite loop. // Check for this case by marking the in-memory task as having executed, which the // CloseTransaction method will check against. // // See: https://github.com/temporalio/temporal/pull/7701#discussion_r2072026993 + + return true, nil } // ValidatePureTask runs a pure task's associated validator, returning true diff --git a/chasm/visibility.go b/chasm/visibility.go index 9c4b6b4081..50f74530dd 100644 --- a/chasm/visibility.go +++ b/chasm/visibility.go @@ -161,10 +161,12 @@ func NewVisibilityWithData( }, } - if len(customSearchAttributes) != 0 { + // Filter out nil/empty payload values for search attributes. 
+ filteredSA := payload.MergeMapOfPayload(nil, customSearchAttributes) + if len(filteredSA) != 0 { visibility.SA = NewDataField( mutableContext, - &commonpb.SearchAttributes{IndexedFields: customSearchAttributes}, + &commonpb.SearchAttributes{IndexedFields: filteredSA}, ) } if len(customMemo) != 0 { @@ -228,12 +230,16 @@ func (v *Visibility) MergeCustomSearchAttributes( } // ReplaceCustomSearchAttributes replaces the existing custom search attribute fields with the provided ones. -// If `customSearchAttributes` is empty, the underlying search attributes node is deleted. +// Nil/empty payload values are filtered. +// If `customSearchAttributes` is empty or all values are nil after filtering, the underlying search attributes node is deleted. func (v *Visibility) ReplaceCustomSearchAttributes( mutableContext MutableContext, customSearchAttributes map[string]*commonpb.Payload, ) { - if len(customSearchAttributes) == 0 { + // Filter out nil/empty payload values. + filteredSA := payload.MergeMapOfPayload(nil, customSearchAttributes) + + if len(filteredSA) == 0 { _, ok := v.SA.TryGet(mutableContext) if !ok { // Already empty, no-op @@ -244,7 +250,7 @@ func (v *Visibility) ReplaceCustomSearchAttributes( } else { v.SA = NewDataField( mutableContext, - &commonpb.SearchAttributes{IndexedFields: customSearchAttributes}, + &commonpb.SearchAttributes{IndexedFields: filteredSA}, ) } diff --git a/chasm/visibility_test.go b/chasm/visibility_test.go index 9a9436b795..4308791e60 100644 --- a/chasm/visibility_test.go +++ b/chasm/visibility_test.go @@ -131,6 +131,29 @@ func (s *visibilitySuite) TestMergeCustomSearchAttributes() { s.Nil(s.visibility.CustomSearchAttributes(s.mockContext)) } +func (s *visibilitySuite) TestNewVisibilityWithData_FilterNilSearchAttributes() { + stringKey, stringVal := "stringKey", "stringValue" + // SA with 1 valid and 2 nil values - nil values should be filtered out + customSearchAttributes := map[string]*commonpb.Payload{ + stringKey: 
s.mustEncode(stringVal), + "nilKey1": nil, + "nilKey2": nil, + } + // Memo with 1 valid and 2 nil values - nil values should NOT be filtered out + customMemo := map[string]*commonpb.Payload{ + stringKey: s.mustEncode(stringVal), + "nilKey1": nil, + "nilKey2": nil, + } + visibility := NewVisibilityWithData(s.mockMutableContext, customSearchAttributes, customMemo) + // SA should have only 1 field (nil values filtered out) + s.Len(visibility.SA.Get(s.mockContext).IndexedFields, 1) + s.NotNil(visibility.SA.Get(s.mockContext).IndexedFields[stringKey]) + // Memo should have all 3 fields (nil values NOT filtered) + s.Len(visibility.Memo.Get(s.mockContext).Fields, 3) + s.NotNil(visibility.Memo.Get(s.mockContext).Fields[stringKey]) +} + func (s *visibilitySuite) TestReplaceCustomSearchAttributes() { stringKey, stringVal := "stringKey", "stringValue" intKey, intVal := "intKey", 42 @@ -176,6 +199,36 @@ func (s *visibilitySuite) TestReplaceCustomSearchAttributes() { _, ok := s.visibility.SA.TryGet(s.mockContext) s.False(ok) s.Nil(s.visibility.CustomSearchAttributes(s.mockContext)) + + // Test that nil values are filtered out during replace. + s.visibility.ReplaceCustomSearchAttributes( + s.mockMutableContext, + map[string]*commonpb.Payload{ + stringKey: s.mustEncode(stringVal), + intKey: nil, // Should be filtered out + }, + ) + s.Len(s.mockMutableContext.Tasks, 4) + s.assertTaskPayload(5, s.mockMutableContext.Tasks[3].Payload) + + sa = s.visibility.CustomSearchAttributes(s.mockMutableContext) + s.Len(sa, 1, "nil values should be filtered out") + s.NotNil(sa[stringKey]) + s.Nil(sa[intKey]) + + // Test that replacing with all nil values removes the node. 
+ s.visibility.ReplaceCustomSearchAttributes( + s.mockMutableContext, + map[string]*commonpb.Payload{ + stringKey: nil, + intKey: nil, + }, + ) + s.Len(s.mockMutableContext.Tasks, 5) + s.assertTaskPayload(6, s.mockMutableContext.Tasks[4].Payload) + _, ok = s.visibility.SA.TryGet(s.mockContext) + s.False(ok) + s.Nil(s.visibility.CustomSearchAttributes(s.mockContext)) } func (s *visibilitySuite) TestMergeCustomMemo() { diff --git a/common/archiver/gcloud/connector/client.go b/common/archiver/gcloud/connector/client.go index 8fb8b6af49..f6f8dbfe78 100644 --- a/common/archiver/gcloud/connector/client.go +++ b/common/archiver/gcloud/connector/client.go @@ -73,11 +73,10 @@ func NewClientWithParams(clientD GcloudStorageClient) (Client, error) { func (s *storageWrapper) Upload(ctx context.Context, URI archiver.URI, fileName string, file []byte) (err error) { bucket := s.client.Bucket(URI.Hostname()) writer := bucket.Object(formatSinkPath(URI.Path()) + "/" + fileName).NewWriter(ctx) + defer func() { + err = multierr.Combine(err, writer.Close()) + }() _, err = io.Copy(writer, bytes.NewReader(file)) - if err == nil { - err = writer.Close() - } - return err } diff --git a/common/authorization/authorizer.go b/common/authorization/authorizer.go index 9a6202ea4f..ac79cc837e 100644 --- a/common/authorization/authorizer.go +++ b/common/authorization/authorizer.go @@ -7,6 +7,7 @@ import ( "fmt" "strings" + commonpb "go.temporal.io/api/common/v1" "go.temporal.io/server/common/config" ) @@ -40,6 +41,8 @@ type ( Decision Decision // Reason may contain a message explaining the value of the Decision field. Reason string + // Principal is the server-computed identity of the caller. Can be nil when not computed. 
+ Principal *commonpb.Principal } // Decision is enum type for auth decision diff --git a/common/authorization/default_authorizer.go b/common/authorization/default_authorizer.go index dbc1d13dfe..03ddd31cf9 100644 --- a/common/authorization/default_authorizer.go +++ b/common/authorization/default_authorizer.go @@ -3,6 +3,7 @@ package authorization import ( "context" + commonpb "go.temporal.io/api/common/v1" "go.temporal.io/server/common/api" ) @@ -56,7 +57,9 @@ func (a *defaultAuthorizer) Authorize(_ context.Context, claims *Claims, target } if hasRole >= getRequiredRole(metadata.Access) { - return resultAllow, nil + result := Result{Decision: DecisionAllow} + result.Principal = &commonpb.Principal{Type: claims.AuthType, Name: claims.Subject} + return result, nil } return resultDeny, nil } diff --git a/common/authorization/default_jwt_claim_mapper.go b/common/authorization/default_jwt_claim_mapper.go index 42a51a75e3..a4e58d4e72 100644 --- a/common/authorization/default_jwt_claim_mapper.go +++ b/common/authorization/default_jwt_claim_mapper.go @@ -75,7 +75,7 @@ var _ ClaimMapper = (*defaultJWTClaimMapper)(nil) func (a *defaultJWTClaimMapper) GetClaims(authInfo *AuthInfo) (*Claims, error) { - claims := Claims{} + claims := Claims{AuthType: "jwt"} if authInfo.AuthToken == "" { return &claims, nil diff --git a/common/authorization/interceptor.go b/common/authorization/interceptor.go index 2c089a7de2..06a3d26745 100644 --- a/common/authorization/interceptor.go +++ b/common/authorization/interceptor.go @@ -7,6 +7,7 @@ import ( "crypto/x509/pkix" "time" + commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" "go.temporal.io/api/serviceerror" "go.temporal.io/api/workflowservice/v1" @@ -90,6 +91,7 @@ type Interceptor struct { authExtraHeaderName string exposeAuthorizerErrors dynamicconfig.BoolPropertyFn enableCrossNamespaceCommands dynamicconfig.BoolPropertyFn + enablePrincipalPropagation dynamicconfig.BoolPropertyFnWithNamespaceFilter } // 
NewInterceptor creates an authorization interceptor. @@ -104,6 +106,7 @@ func NewInterceptor( authExtraHeaderName string, exposeAuthorizerErrors dynamicconfig.BoolPropertyFn, enableCrossNamespaceCommands dynamicconfig.BoolPropertyFn, + enablePrincipalPropagation dynamicconfig.BoolPropertyFnWithNamespaceFilter, ) *Interceptor { return &Interceptor{ claimMapper: claimMapper, @@ -116,6 +119,7 @@ func NewInterceptor( audienceGetter: audienceGetter, exposeAuthorizerErrors: exposeAuthorizerErrors, enableCrossNamespaceCommands: enableCrossNamespaceCommands, + enablePrincipalPropagation: enablePrincipalPropagation, } } @@ -146,6 +150,10 @@ func (a *Interceptor) Intercept( ctx = a.EnhanceContext(ctx, authInfo, claims) } + // Always strip inbound principal headers to prevent external callers from + // spoofing principal identity, regardless of whether the authorizer is enabled. + ctx = headers.StripPrincipal(ctx) + if a.authorizer != nil { var namespace string requestWithNamespace, ok := req.(hasNamespace) @@ -157,9 +165,13 @@ func (a *Interceptor) Intercept( APIName: info.FullMethod, Request: req, } - if err := a.Authorize(ctx, claims, ct); err != nil { + principal, err := a.Authorize(ctx, claims, ct) + if err != nil { return nil, err } + if a.enablePrincipalPropagation != nil && a.enablePrincipalPropagation(namespace) && principal != nil { + ctx = headers.SetPrincipal(ctx, principal) + } // Authorize target namespaces in cross-namespace commands if err := a.authorizeTargetNamespaces(ctx, claims, namespace, req); err != nil { @@ -224,9 +236,10 @@ func (a *Interceptor) EnhanceContext(ctx context.Context, authInfo *AuthInfo, cl // Authorize uses the policy's authorizer to authorize a request based on provided claims and call target. // Logs and emits metrics when unauthorized. -func (a *Interceptor) Authorize(ctx context.Context, claims *Claims, ct *CallTarget) error { +// Returns the principal identity and any authorization error. 
+func (a *Interceptor) Authorize(ctx context.Context, claims *Claims, ct *CallTarget) (*commonpb.Principal, error) { if a.authorizer == nil { - return nil + return nil, nil } mh := a.getMetricsHandler(ct.Namespace) @@ -238,19 +251,19 @@ func (a *Interceptor) Authorize(ctx context.Context, claims *Claims, ct *CallTar metrics.ServiceErrAuthorizeFailedCounter.With(mh).Record(1) a.logger.Error("Authorization error", tag.Error(err)) if a.exposeAuthorizerErrors() { - return err + return nil, err } - return errUnauthorized // return a generic error to the caller without disclosing details + return nil, errUnauthorized // return a generic error to the caller without disclosing details } if result.Decision != DecisionAllow { metrics.ServiceErrUnauthorizedCounter.With(mh).Record(1) // if a reason is included in the result, include it in the error message if result.Reason != "" { - return serviceerror.NewPermissionDenied(RequestUnauthorized, result.Reason) + return nil, serviceerror.NewPermissionDenied(RequestUnauthorized, result.Reason) } - return errUnauthorized // return a generic error to the caller without disclosing details + return nil, errUnauthorized // return a generic error to the caller without disclosing details } - return nil + return result.Principal, nil } // getMetricsHandler returns a metrics handler with a namespace tag @@ -327,7 +340,7 @@ func (a *Interceptor) authorizeTargetNamespaces( } // Authorize access to target namespace for this specific API - if err := a.Authorize(ctx, claims, &CallTarget{ + if _, err := a.Authorize(ctx, claims, &CallTarget{ APIName: api.WorkflowServicePrefix + apiName, Namespace: targetNamespace, Request: req, diff --git a/common/authorization/interceptor_test.go b/common/authorization/interceptor_test.go index 7dfb3fd80b..965328f4a8 100644 --- a/common/authorization/interceptor_test.go +++ b/common/authorization/interceptor_test.go @@ -8,11 +8,13 @@ import ( "github.com/stretchr/testify/require" 
"github.com/stretchr/testify/suite" commandpb "go.temporal.io/api/command/v1" + commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" "go.temporal.io/api/serviceerror" "go.temporal.io/api/workflowservice/v1" "go.temporal.io/server/common/api" "go.temporal.io/server/common/dynamicconfig" + "go.temporal.io/server/common/headers" "go.temporal.io/server/common/log" "go.temporal.io/server/common/metrics" "go.temporal.io/server/common/namespace" @@ -84,6 +86,7 @@ func (s *authorizerInterceptorSuite) SetupTest() { "", dynamicconfig.GetBoolPropertyFn(false), // exposeAuthorizerErrors dynamicconfig.GetBoolPropertyFn(false), // enableCrossNamespaceCommands + dynamicconfig.GetBoolPropertyFnFilteredByNamespace(false), // enablePrincipalPropagation ) s.handler = func(ctx context.Context, req any) (any, error) { return true, nil } } @@ -159,6 +162,7 @@ func (s *authorizerInterceptorSuite) TestAuthorizationFailedExposed() { "", dynamicconfig.GetBoolPropertyFn(true), // exposeAuthorizerErrors dynamicconfig.GetBoolPropertyFn(false), // enableCrossNamespaceCommands + dynamicconfig.GetBoolPropertyFnFilteredByNamespace(false), // enablePrincipalPropagation ) authErr := serviceerror.NewInternal("intentional test failure") @@ -192,6 +196,7 @@ func (s *authorizerInterceptorSuite) TestNoopClaimMapperWithoutTLS() { "", dynamicconfig.GetBoolPropertyFn(false), // exposeAuthorizerErrors dynamicconfig.GetBoolPropertyFn(false), // enableCrossNamespaceCommands + dynamicconfig.GetBoolPropertyFnFilteredByNamespace(false), // enablePrincipalPropagation ) _, err := interceptor.Intercept(ctx, describeNamespaceRequest, describeNamespaceInfo, s.handler) s.NoError(err) @@ -209,6 +214,7 @@ func (s *authorizerInterceptorSuite) TestAlternateHeaders() { "custom-extra-header", dynamicconfig.GetBoolPropertyFn(false), // exposeAuthorizerErrors dynamicconfig.GetBoolPropertyFn(false), // enableCrossNamespaceCommands + dynamicconfig.GetBoolPropertyFnFilteredByNamespace(false), // 
enablePrincipalPropagation ) cases := []struct { @@ -318,6 +324,7 @@ func (s *authorizerInterceptorSuite) newCrossNamespaceInterceptor(namespaces ... "", dynamicconfig.GetBoolPropertyFn(false), // exposeAuthorizerErrors dynamicconfig.GetBoolPropertyFn(true), // enableCrossNamespaceCommands + dynamicconfig.GetBoolPropertyFnFilteredByNamespace(false), // enablePrincipalPropagation ) } @@ -521,6 +528,78 @@ func (s *authorizerInterceptorSuite) TestMultipleCommands_AuthDeduplication() { s.NoError(err) } +func (s *authorizerInterceptorSuite) TestPrincipalPropagation_Enabled() { + principal := &commonpb.Principal{Type: "user", Name: "alice"} + s.mockAuthorizer.EXPECT().Authorize(gomock.Any(), nil, describeNamespaceTarget). + Return(Result{Decision: DecisionAllow, Principal: principal}, nil) + + interceptor := NewInterceptor( + s.mockClaimMapper, + s.mockAuthorizer, + s.mockMetricsHandler, + log.NewNoopLogger(), + mockNamespaceChecker(testNamespace), + nil, + "", + "", + dynamicconfig.GetBoolPropertyFn(false), // exposeAuthorizerErrors + dynamicconfig.GetBoolPropertyFn(false), // enableCrossNamespaceCommands + dynamicconfig.GetBoolPropertyFnFilteredByNamespace(true), // enablePrincipalPropagation + ) + + inCtx := metadata.NewIncomingContext(ctx, metadata.MD{}) + var gotPrincipal *commonpb.Principal + handler := func(ctx context.Context, req any) (any, error) { + gotPrincipal = headers.GetPrincipal(ctx) + return true, nil + } + + res, err := interceptor.Intercept(inCtx, describeNamespaceRequest, describeNamespaceInfo, handler) + s.True(res.(bool)) + s.NoError(err) + s.Equal(principal, gotPrincipal) +} + +func (s *authorizerInterceptorSuite) TestPrincipalPropagation_Disabled() { + s.mockAuthorizer.EXPECT().Authorize(gomock.Any(), nil, describeNamespaceTarget). 
+ Return(Result{Decision: DecisionAllow, Principal: &commonpb.Principal{Type: "user", Name: "alice"}}, nil) + + inCtx := metadata.NewIncomingContext(ctx, metadata.MD{}) + var gotPrincipal *commonpb.Principal + handler := func(ctx context.Context, req any) (any, error) { + gotPrincipal = headers.GetPrincipal(ctx) + return true, nil + } + + // s.interceptor has enablePrincipalPropagation=false + res, err := s.interceptor.Intercept(inCtx, describeNamespaceRequest, describeNamespaceInfo, handler) + s.True(res.(bool)) + s.NoError(err) + s.Nil(gotPrincipal) +} + +func (s *authorizerInterceptorSuite) TestPrincipalPropagation_SpoofedHeadersStripped() { + s.mockAuthorizer.EXPECT().Authorize(gomock.Any(), nil, describeNamespaceTarget). + Return(Result{Decision: DecisionAllow}, nil) // no principal returned + + // Inject spoofed principal headers in the incoming context. + inCtx := metadata.NewIncomingContext(ctx, metadata.Pairs( + headers.PrincipalTypeHeaderName, "spoofed-type", + headers.PrincipalNameHeaderName, "spoofed-name", + )) + var gotPrincipal *commonpb.Principal + handler := func(ctx context.Context, req any) (any, error) { + gotPrincipal = headers.GetPrincipal(ctx) + return true, nil + } + + // s.interceptor has enablePrincipalPropagation=false + res, err := s.interceptor.Intercept(inCtx, describeNamespaceRequest, describeNamespaceInfo, handler) + s.True(res.(bool)) + s.NoError(err) + s.Nil(gotPrincipal, "spoofed principal headers should be stripped") +} + func (s *authorizerInterceptorSuite) TestMultipleTargetNamespaces() { // Test commands targeting different namespaces request := &workflowservice.RespondWorkflowTaskCompletedRequest{ diff --git a/common/authorization/roles.go b/common/authorization/roles.go index b660941008..da424844ca 100644 --- a/common/authorization/roles.go +++ b/common/authorization/roles.go @@ -31,6 +31,8 @@ type Claims struct { Namespaces map[string]Role // Free form bucket for extra data Extensions any + // AuthType identifies the 
authentication method that produced these claims (e.g., "jwt", "mtls"). + AuthType string } // @@@SNIPEND diff --git a/common/dynamicconfig/constants.go b/common/dynamicconfig/constants.go index 54b2630218..d8fb85d1cc 100644 --- a/common/dynamicconfig/constants.go +++ b/common/dynamicconfig/constants.go @@ -9,6 +9,7 @@ import ( "go.temporal.io/server/common/debug" "go.temporal.io/server/common/primitives" "go.temporal.io/server/common/retrypolicy" + "go.temporal.io/server/common/util" "go.temporal.io/server/service/matching/counter" ) @@ -837,6 +838,12 @@ This config is EXPERIMENTAL and may be changed or removed in a later release.`, false, `ExposeAuthorizerErrors controls whether the frontend authorization interceptor will pass through errors returned by the Authorizer component. If false, a generic PermissionDenied error without details will be returned. Default false.`, + ) + EnablePrincipalPropagation = NewNamespaceBoolSetting( + "frontend.enablePrincipalPropagation", + false, + `EnablePrincipalPropagation controls whether the authorization interceptor propagates the authenticated +principal identity as gRPC headers.`, ) KeepAliveMinTime = NewGlobalDurationSetting( "frontend.keepAliveMinTime", @@ -907,7 +914,7 @@ and deployment interaction in matching and history.`, ) UseRevisionNumberForWorkerVersioning = NewNamespaceBoolSetting( "system.useRevisionNumberForWorkerVersioning", - false, + true, `UseRevisionNumberForWorkerVersioning enables the use of revision number to resolve consistency problems that may arise during task dispatch time.`, ) EnableSuggestCaNOnNewTargetVersion = NewNamespaceBoolSetting( @@ -951,10 +958,17 @@ used when the first cache layer has a miss. Requires server restart for change t FrontendNexusRequestHeadersBlacklist = NewGlobalTypedSettingWithConverter( "frontend.nexusRequestHeadersBlacklist", ConvertWildcardStringListToRegexp, - MatchNothingRE, - `Nexus request headers to be removed before being sent to a user handler. 
-Wildcards (*) are expanded to allow any substring. By default blacklist is empty. -Concrete type should be list of strings.`, + // Failure support is an internal implementation detail that shouldn't propagate to the user. + util.MustWildCardStringsToRegexp([]string{ + "accept-encoding", + "x-forwarded-for", + "xdc-redirection", + "xdc-redirection-api", + "temporal-nexus-failure-support", + }), + `Nexus request headers to be removed before being sent to a user handler. Wildcards (*) are expanded to +allow any substring. By default headers that are meant for internal use are disallowed. Concrete type should be list of +strings.`, ) FrontendNexusForwardRequestUseEndpointDispatch = NewGlobalBoolSetting( "frontend.nexusForwardRequestUseEndpointDispatch", diff --git a/common/headers/caller_info.go b/common/headers/caller_info.go index 387bdc9514..89be3179be 100644 --- a/common/headers/caller_info.go +++ b/common/headers/caller_info.go @@ -2,8 +2,6 @@ package headers import ( "context" - - "google.golang.org/grpc/metadata" ) const ( @@ -153,24 +151,6 @@ func SetOrigin( return setIncomingMD(ctx, map[string]string{CallOriginHeaderName: callOrigin}) } -func setIncomingMD( - ctx context.Context, - kv map[string]string, -) context.Context { - mdIncoming, ok := metadata.FromIncomingContext(ctx) - if !ok { - mdIncoming = metadata.MD{} - } - - for k, v := range kv { - if v != "" { - mdIncoming.Set(k, v) - } - } - - return metadata.NewIncomingContext(ctx, mdIncoming) -} - // GetCallerInfo retrieves caller information from the context if exists. Empty value is returned // if any piece of caller information is not specified in the context. 
func GetCallerInfo( diff --git a/common/headers/headers.go b/common/headers/headers.go index 92ad1818d1..37d0d6bd92 100644 --- a/common/headers/headers.go +++ b/common/headers/headers.go @@ -4,6 +4,7 @@ import ( "context" "strings" + commonpb "go.temporal.io/api/common/v1" "google.golang.org/grpc/metadata" ) @@ -20,6 +21,9 @@ const ( CallerTypeHeaderName = "caller-type" CallOriginHeaderName = "call-initiation" + PrincipalTypeHeaderName = "temporal-principal-type" + PrincipalNameHeaderName = "temporal-principal-name" + ExperimentHeaderName = "temporal-experiment" ) @@ -33,6 +37,8 @@ var ( CallerNameHeaderName, CallerTypeHeaderName, CallOriginHeaderName, + PrincipalTypeHeaderName, + PrincipalNameHeaderName, } ) @@ -115,3 +121,47 @@ func IsExperimentRequested(ctx context.Context, experiment string) bool { return false } + +// StripPrincipal removes principal headers from incoming metadata to prevent +// external callers from spoofing principal identity. +func StripPrincipal(ctx context.Context) context.Context { + mdIncoming, ok := metadata.FromIncomingContext(ctx) + if !ok { + return ctx + } + mdIncoming.Delete(PrincipalTypeHeaderName) + mdIncoming.Delete(PrincipalNameHeaderName) + return metadata.NewIncomingContext(ctx, mdIncoming) +} + +// SetPrincipal sets the principal type and name headers in the incoming metadata. +func SetPrincipal(ctx context.Context, principal *commonpb.Principal) context.Context { + return setIncomingMD(ctx, map[string]string{ + PrincipalTypeHeaderName: principal.GetType(), + PrincipalNameHeaderName: principal.GetName(), + }) +} + +// GetPrincipal retrieves the principal from the context headers. Returns nil if principal is not set. 
+func GetPrincipal(ctx context.Context) *commonpb.Principal { + values := GetValues(ctx, PrincipalTypeHeaderName, PrincipalNameHeaderName) + if values[0] == "" && values[1] == "" { + return nil + } + return &commonpb.Principal{Type: values[0], Name: values[1]} +} + +// setIncomingMD sets the key-value pairs in the incoming metadata. +// Empty values are ignored. +func setIncomingMD(ctx context.Context, kv map[string]string) context.Context { + mdIncoming, ok := metadata.FromIncomingContext(ctx) + if !ok { + mdIncoming = metadata.MD{} + } + for k, v := range kv { + if v != "" { + mdIncoming.Set(k, v) + } + } + return metadata.NewIncomingContext(ctx, mdIncoming) +} diff --git a/common/metrics/metric_defs.go b/common/metrics/metric_defs.go index 8cb783bcf2..cfc9bd598e 100644 --- a/common/metrics/metric_defs.go +++ b/common/metrics/metric_defs.go @@ -31,6 +31,8 @@ const ( WorkerPluginNameTagName = "worker_plugin_name" WorkerStorageDriverTypeTagName = "worker_storage_driver_type" headerCallsiteTagName = "header_callsite" + ArchetypeTagName = "archetype" + ChasmTaskTypeTagName = "chasm_task_type" ) // This package should hold all the metrics and tags for temporal @@ -874,7 +876,15 @@ var ( "task_errors_throttled", WithDescription("The number of history task processing errors caused by resource exhausted errors, excluding workflow busy case."), ) - TaskCorruptionCounter = NewCounterDef("task_errors_corruption") + TaskCorruptionCounter = NewCounterDef("task_errors_corruption") + ChasmPureTaskRequests = NewCounterDef( + "chasm_pure_task_requests", + WithDescription("The number of CHASM pure tasks executed."), + ) + ChasmPureTaskErrors = NewCounterDef( + "chasm_pure_task_errors", + WithDescription("The number of errors during CHASM pure task execution."), + ) TaskScheduleToStartLatency = NewTimerDef("task_schedule_to_start_latency") TaskBatchCompleteCounter = NewCounterDef("task_batch_complete_counter") TaskReschedulerPendingTasks = 
NewDimensionlessHistogramDef("task_rescheduler_pending_tasks") diff --git a/common/metrics/tags.go b/common/metrics/tags.go index 544db883e7..29901b3106 100644 --- a/common/metrics/tags.go +++ b/common/metrics/tags.go @@ -20,24 +20,26 @@ const ( buildPlatformTag = "build_platform" goVersionTag = "go_version" - instance = "instance" - namespace = "namespace" - namespaceID = "namespace_id" - namespaceState = "namespace_state" - sourceCluster = "source_cluster" - targetCluster = "target_cluster" - taskSourceTag = "source" - forwardedTag = "forwarded" - fromCluster = "from_cluster" - toCluster = "to_cluster" - taskQueue = "taskqueue" - workflowType = "workflowType" - activityType = "activityType" - commandType = "commandType" - serviceName = "service_name" - actionType = "action_type" - workerVersion = "worker_version" - destination = "destination" + instance = "instance" + namespace = "namespace" + namespaceID = "namespace_id" + namespaceState = "namespace_state" + sourceCluster = "source_cluster" + targetCluster = "target_cluster" + taskSourceTag = "source" + forwardedTag = "forwarded" + fromCluster = "from_cluster" + toCluster = "to_cluster" + taskQueue = "taskqueue" + workflowType = "workflowType" + activityType = "activityType" + commandType = "commandType" + serviceName = "service_name" + actionType = "action_type" + workerVersion = "worker_version" + workerDeploymentName = "worker_deployment_name" + workerDeploymentBuildID = "worker_build_id" + destination = "destination" // Generic reason tag can be used anywhere a reason is needed. 
reason = "reason" // See server.api.enums.v1.ReplicationTaskType @@ -190,6 +192,20 @@ func WorkerVersionTag(version string, versionBreakdown bool) Tag { return Tag{Key: workerVersion, Value: version} } +func WorkerDeploymentNameTag(deploymentName string, versionBreakdown bool) Tag { + if !versionBreakdown { + deploymentName = "" + } + return Tag{Key: workerDeploymentName, Value: deploymentName} +} + +func WorkerDeploymentBuildIDTag(buildID string, versionBreakdown bool) Tag { + if !versionBreakdown { + buildID = "" + } + return Tag{Key: workerDeploymentBuildID, Value: buildID} +} + // WorkflowTypeTag returns a new workflow type tag. func WorkflowTypeTag(value string) Tag { if len(value) == 0 { @@ -264,6 +280,20 @@ func TaskTypeTag(value string) Tag { return Tag{Key: TaskTypeTagName, Value: value} } +func ArchetypeTag(value string) Tag { + if len(value) == 0 { + value = unknownValue + } + return Tag{Key: ArchetypeTagName, Value: value} +} + +func ChasmTaskTypeTag(value string) Tag { + if len(value) == 0 { + value = unknownValue + } + return Tag{Key: ChasmTaskTypeTagName, Value: value} +} + func PartitionTag(partition string) Tag { return Tag{Key: PartitionTagName, Value: partition} } diff --git a/common/namespace/nsreplication/replication_task_executor.go b/common/namespace/nsreplication/replication_task_executor.go index a745cc2818..5fa3110b1a 100644 --- a/common/namespace/nsreplication/replication_task_executor.go +++ b/common/namespace/nsreplication/replication_task_executor.go @@ -158,6 +158,7 @@ func (h *taskExecutorImpl) handleNamespaceCreationReplicationTask( ReplicationConfig: &persistencespb.NamespaceReplicationConfig{ ActiveClusterName: task.ReplicationConfig.GetActiveClusterName(), Clusters: ConvertClusterReplicationConfigFromProto(task.ReplicationConfig.Clusters), + State: task.ReplicationConfig.GetState(), FailoverHistory: ConvertFailoverHistoryToPersistenceProto(task.GetFailoverHistory()), }, ConfigVersion: task.GetConfigVersion(), diff --git 
a/common/namespace/nsreplication/replication_task_executor_test.go b/common/namespace/nsreplication/replication_task_executor_test.go index 2bde346bd1..bc304ab96f 100644 --- a/common/namespace/nsreplication/replication_task_executor_test.go +++ b/common/namespace/nsreplication/replication_task_executor_test.go @@ -163,6 +163,7 @@ func (s *namespaceReplicationTaskExecutorSuite) TestExecute_RegisterNamespaceTas clusterStandby := "some random standby cluster name" configVersion := int64(0) failoverVersion := int64(59) + replicationState := enumspb.REPLICATION_STATE_NORMAL clusters := []*replicationpb.ClusterReplicationConfig{ { ClusterName: clusterActive, @@ -202,6 +203,7 @@ func (s *namespaceReplicationTaskExecutorSuite) TestExecute_RegisterNamespaceTas ReplicationConfig: &replicationpb.NamespaceReplicationConfig{ ActiveClusterName: clusterActive, Clusters: clusters, + State: replicationState, }, ConfigVersion: configVersion, FailoverVersion: failoverVersion, @@ -230,6 +232,7 @@ func (s *namespaceReplicationTaskExecutorSuite) TestExecute_RegisterNamespaceTas ReplicationConfig: &persistencespb.NamespaceReplicationConfig{ ActiveClusterName: task.ReplicationConfig.ActiveClusterName, Clusters: []string{clusterActive, clusterStandby}, + State: replicationState, FailoverHistory: []*persistencespb.FailoverStatus{ { FailoverTime: timestamppb.New(time.Date(2025, 9, 15, 14, 30, 0, 0, time.UTC)), diff --git a/common/namespace/nsreplication/transmission_task_handler.go b/common/namespace/nsreplication/transmission_task_handler.go index 4916b55ee8..9d23ae6bfa 100644 --- a/common/namespace/nsreplication/transmission_task_handler.go +++ b/common/namespace/nsreplication/transmission_task_handler.go @@ -109,6 +109,12 @@ func (r *replicator) HandleTransmissionTask( }, } + // Only replicate on Create operation, and only if state is Normal + if namespaceOperation == enumsspb.NAMESPACE_OPERATION_CREATE && + replicationConfig.State == enumspb.REPLICATION_STATE_NORMAL { + 
task.NamespaceTaskAttributes.ReplicationConfig.State = replicationConfig.State + } + return r.namespaceReplicationQueue.Publish( ctx, &replicationspb.ReplicationTask{ diff --git a/common/namespace/nsreplication/transmission_task_handler_test.go b/common/namespace/nsreplication/transmission_task_handler_test.go index 28c2dc7645..dc1dc1dcc1 100644 --- a/common/namespace/nsreplication/transmission_task_handler_test.go +++ b/common/namespace/nsreplication/transmission_task_handler_test.go @@ -73,6 +73,7 @@ func (s *transmissionTaskSuite) TestHandleTransmissionTask_RegisterNamespaceTask configVersion := int64(0) failoverVersion := int64(59) clusters := []string{clusterActive, clusterStandby} + replicationState := enumspb.REPLICATION_STATE_NORMAL namespaceOperation := enumsspb.NAMESPACE_OPERATION_CREATE info := &persistencespb.NamespaceInfo{ @@ -94,6 +95,7 @@ func (s *transmissionTaskSuite) TestHandleTransmissionTask_RegisterNamespaceTask replicationConfig := &persistencespb.NamespaceReplicationConfig{ ActiveClusterName: clusterActive, Clusters: clusters, + State: replicationState, } isGlobalNamespace := true @@ -121,6 +123,7 @@ func (s *transmissionTaskSuite) TestHandleTransmissionTask_RegisterNamespaceTask ReplicationConfig: &replicationpb.NamespaceReplicationConfig{ ActiveClusterName: clusterActive, Clusters: convertClusterReplicationConfigToProto(clusters), + State: replicationState, }, ConfigVersion: configVersion, FailoverVersion: failoverVersion, @@ -141,7 +144,7 @@ func (s *transmissionTaskSuite) TestHandleTransmissionTask_RegisterNamespaceTask nil, false, // forceReplicate ) - s.Nil(err) + s.Require().NoError(err) } func (s *transmissionTaskSuite) TestHandleTransmissionTask_RegisterNamespaceTask_NotGlobalNamespace() { @@ -197,7 +200,7 @@ func (s *transmissionTaskSuite) TestHandleTransmissionTask_RegisterNamespaceTask nil, false, // forceReplicate ) - s.Nil(err) + s.Require().NoError(err) } func (s *transmissionTaskSuite) 
TestHandleTransmissionTask_UpdateNamespaceTask_IsGlobalNamespace() { @@ -286,7 +289,98 @@ func (s *transmissionTaskSuite) TestHandleTransmissionTask_UpdateNamespaceTask_I nil, false, // forceReplicate ) - s.Nil(err) + s.Require().NoError(err) +} + +func (s *transmissionTaskSuite) TestHandleTransmissionTask_UpdateNamespaceTask_StateNotReplicated() { + taskType := enumsspb.REPLICATION_TASK_TYPE_NAMESPACE_TASK + id := primitives.NewUUID().String() + name := "some random namespace test name" + state := enumspb.NAMESPACE_STATE_REGISTERED + description := "some random test description" + ownerEmail := "some random test owner" + data := map[string]string{"k": "v"} + retention := 10 * time.Hour * 24 + historyArchivalState := enumspb.ARCHIVAL_STATE_ENABLED + historyArchivalURI := "some random history archival uri" + visibilityArchivalState := enumspb.ARCHIVAL_STATE_ENABLED + visibilityArchivalURI := "some random visibility archival uri" + clusterActive := "some random active cluster name" + clusterStandby := "some random standby cluster name" + configVersion := int64(0) + failoverVersion := int64(59) + clusters := []string{clusterActive, clusterStandby} + + namespaceOperation := enumsspb.NAMESPACE_OPERATION_UPDATE + info := &persistencespb.NamespaceInfo{ + Id: id, + Name: name, + State: state, + Description: description, + Owner: ownerEmail, + Data: data, + } + config := &persistencespb.NamespaceConfig{ + Retention: durationpb.New(retention), + HistoryArchivalState: historyArchivalState, + HistoryArchivalUri: historyArchivalURI, + VisibilityArchivalState: visibilityArchivalState, + VisibilityArchivalUri: visibilityArchivalURI, + BadBinaries: &namespacepb.BadBinaries{Binaries: map[string]*namespacepb.BadBinaryInfo{}}, + } + replicationConfig := &persistencespb.NamespaceReplicationConfig{ + ActiveClusterName: clusterActive, + Clusters: clusters, + State: enumspb.REPLICATION_STATE_NORMAL, + } + isGlobalNamespace := true + + 
s.namespaceReplicationQueue.EXPECT().Publish(gomock.Any(), &replicationspb.ReplicationTask{ + TaskType: taskType, + Attributes: &replicationspb.ReplicationTask_NamespaceTaskAttributes{ + NamespaceTaskAttributes: &replicationspb.NamespaceTaskAttributes{ + NamespaceOperation: namespaceOperation, + Id: id, + Info: &namespacepb.NamespaceInfo{ + Name: name, + State: state, + Description: description, + OwnerEmail: ownerEmail, + Data: data, + }, + Config: &namespacepb.NamespaceConfig{ + WorkflowExecutionRetentionTtl: durationpb.New(retention), + HistoryArchivalState: historyArchivalState, + HistoryArchivalUri: historyArchivalURI, + VisibilityArchivalState: visibilityArchivalState, + VisibilityArchivalUri: visibilityArchivalURI, + BadBinaries: &namespacepb.BadBinaries{Binaries: map[string]*namespacepb.BadBinaryInfo{}}, + }, + ReplicationConfig: &replicationpb.NamespaceReplicationConfig{ + ActiveClusterName: clusterActive, + Clusters: convertClusterReplicationConfigToProto(clusters), + // State must not be set on UPDATE even when source state is NORMAL + }, + ConfigVersion: configVersion, + FailoverVersion: failoverVersion, + }, + }, + }).Return(nil) + + err := s.namespaceReplicator.HandleTransmissionTask( + context.Background(), + namespaceOperation, + info, + config, + replicationConfig, + true, + configVersion, + failoverVersion, + isGlobalNamespace, + nil, + false, // forceReplicate + ) + s.Require().NoError(err) } func (s *transmissionTaskSuite) TestHandleTransmissionTask_UpdateNamespaceTask_NotGlobalNamespace() { @@ -341,7 +435,7 @@ func (s *transmissionTaskSuite) TestHandleTransmissionTask_UpdateNamespaceTask_N nil, false, // forceReplicate ) - s.Nil(err) + s.Require().NoError(err) } func (s *transmissionTaskSuite) TestHandleTransmissionTask_UpdateNamespaceTask_ReplicationClusterListUpdated() { @@ -430,7 +524,7 @@ func (s *transmissionTaskSuite) TestHandleTransmissionTask_UpdateNamespaceTask_R nil, false, // forceReplicate ) - s.Nil(err) + s.Require().NoError(err) 
err = s.namespaceReplicator.HandleTransmissionTask( context.Background(), @@ -445,5 +539,5 @@ func (s *transmissionTaskSuite) TestHandleTransmissionTask_UpdateNamespaceTask_R nil, false, // forceReplicate ) - s.Nil(err) + s.Require().NoError(err) } diff --git a/common/payload/payload.go b/common/payload/payload.go index 691413882d..3c502f118a 100644 --- a/common/payload/payload.go +++ b/common/payload/payload.go @@ -90,3 +90,35 @@ func isEqual(a, b *commonpb.Payload) bool { bEnc := a.GetMetadata()[converter.MetadataEncoding] return bytes.Equal(aEnc, bEnc) && bytes.Equal(a.GetData(), b.GetData()) } + +// FilterNilSearchAttributes returns a new SearchAttributes with nil/empty payload values filtered out. +// If the input is nil or all values are nil/empty, returns nil. +// This is used to filter out nil search attributes from workflow start and continue-as-new events. +// Reuses MergeMapOfPayload which already handles nil payload filtering. +func FilterNilSearchAttributes(sa *commonpb.SearchAttributes) *commonpb.SearchAttributes { + if sa == nil || len(sa.GetIndexedFields()) == 0 { + return nil + } + + filtered := MergeMapOfPayload(nil, sa.GetIndexedFields()) + if len(filtered) == 0 { + return nil + } + return &commonpb.SearchAttributes{IndexedFields: filtered} +} + +// FilterNilMemo returns a new Memo with nil/empty payload values filtered out. +// If the input is nil or all values are nil/empty, returns nil. +// This is used to filter out nil memo fields from workflow start, continue-as-new, and modify-properties events. +// Reuses MergeMapOfPayload which already handles nil payload filtering. 
+func FilterNilMemo(memo *commonpb.Memo) *commonpb.Memo { + if memo == nil || len(memo.GetFields()) == 0 { + return nil + } + + filtered := MergeMapOfPayload(nil, memo.GetFields()) + if len(filtered) == 0 { + return nil + } + return &commonpb.Memo{Fields: filtered} +} diff --git a/common/payload/payload_test.go b/common/payload/payload_test.go index e4e1517b59..f02fa528ff 100644 --- a/common/payload/payload_test.go +++ b/common/payload/payload_test.go @@ -154,3 +154,127 @@ func TestIsEqual(t *testing.T) { b, _ = Encode("foo") s.False(isEqual(a, b)) } + +func TestFilterNilSearchAttributes(t *testing.T) { + s := assert.New(t) + + // nil input returns nil + result := FilterNilSearchAttributes(nil) + s.Nil(result) + + // empty SearchAttributes returns nil + emptySA := &commonpb.SearchAttributes{IndexedFields: map[string]*commonpb.Payload{}} + result = FilterNilSearchAttributes(emptySA) + s.Nil(result) + + // SearchAttributes with only valid values returns filtered copy + validPayload := EncodeString("value") + saNonNil := &commonpb.SearchAttributes{ + IndexedFields: map[string]*commonpb.Payload{ + "key1": validPayload, + }, + } + result = FilterNilSearchAttributes(saNonNil) + s.NotNil(result) + s.Len(result.IndexedFields, 1) + s.Equal(validPayload, result.IndexedFields["key1"]) + + // SearchAttributes with nil values filters them out + nilPayloadVal, _ := Encode(nil) + saMixed := &commonpb.SearchAttributes{ + IndexedFields: map[string]*commonpb.Payload{ + "valid": validPayload, + "nilVal": nilPayloadVal, + }, + } + result = FilterNilSearchAttributes(saMixed) + s.NotNil(result) + s.Len(result.IndexedFields, 1) + s.Equal(validPayload, result.IndexedFields["valid"]) + s.Nil(result.IndexedFields["nilVal"]) + + // SearchAttributes with empty slice values filters them out + emptySlicePayloadVal, _ := Encode([]string{}) + saEmptySlice := &commonpb.SearchAttributes{ + IndexedFields: map[string]*commonpb.Payload{ + "valid": validPayload, + "emptySlice": emptySlicePayloadVal, + 
}, + } + result = FilterNilSearchAttributes(saEmptySlice) + s.NotNil(result) + s.Len(result.IndexedFields, 1) + s.Equal(validPayload, result.IndexedFields["valid"]) + + // SearchAttributes with all nil/empty values returns nil + saAllNil := &commonpb.SearchAttributes{ + IndexedFields: map[string]*commonpb.Payload{ + "nil1": nilPayloadVal, + "nil2": emptySlicePayloadVal, + }, + } + result = FilterNilSearchAttributes(saAllNil) + s.Nil(result) +} + +func TestFilterNilMemo(t *testing.T) { + s := assert.New(t) + + // nil input returns nil + result := FilterNilMemo(nil) + s.Nil(result) + + // empty Memo returns nil + emptyMemo := &commonpb.Memo{Fields: map[string]*commonpb.Payload{}} + result = FilterNilMemo(emptyMemo) + s.Nil(result) + + // Memo with only valid values returns filtered copy + validPayload := EncodeString("value") + memoNonNil := &commonpb.Memo{ + Fields: map[string]*commonpb.Payload{ + "key1": validPayload, + }, + } + result = FilterNilMemo(memoNonNil) + s.NotNil(result) + s.Len(result.Fields, 1) + s.Equal(validPayload, result.Fields["key1"]) + + // Memo with nil values filters them out + nilPayloadVal, _ := Encode(nil) + memoMixed := &commonpb.Memo{ + Fields: map[string]*commonpb.Payload{ + "valid": validPayload, + "nilVal": nilPayloadVal, + }, + } + result = FilterNilMemo(memoMixed) + s.NotNil(result) + s.Len(result.Fields, 1) + s.Equal(validPayload, result.Fields["valid"]) + s.Nil(result.Fields["nilVal"]) + + // Memo with empty slice values filters them out + emptySlicePayloadVal, _ := Encode([]string{}) + memoEmptySlice := &commonpb.Memo{ + Fields: map[string]*commonpb.Payload{ + "valid": validPayload, + "emptySlice": emptySlicePayloadVal, + }, + } + result = FilterNilMemo(memoEmptySlice) + s.NotNil(result) + s.Len(result.Fields, 1) + s.Equal(validPayload, result.Fields["valid"]) + + // Memo with all nil/empty values returns nil + memoAllNil := &commonpb.Memo{ + Fields: map[string]*commonpb.Payload{ + "nil1": nilPayloadVal, + "nil2": 
emptySlicePayloadVal, + }, + } + result = FilterNilMemo(memoAllNil) + s.Nil(result) +} diff --git a/common/rpc/interceptor/health_check.go b/common/rpc/interceptor/health_check.go index 05f61bbfbd..6112e3fd56 100644 --- a/common/rpc/interceptor/health_check.go +++ b/common/rpc/interceptor/health_check.go @@ -131,6 +131,14 @@ func (h *HealthCheckInterceptor) UnaryIntercept( return resp, err } + if IsLongPollGetWorkflowExecutionHistoryRequest(req) { + return resp, err + } + + if IsLongPollDescribeActivityExecutionRequest(req) { + return resp, err + } + // Record health signal for standard APIs h.healthSignalAggregator.Record(elapsed, err) return resp, err diff --git a/common/testing/mocksdk/client_mock.go b/common/testing/mocksdk/client_mock.go index d9e9d46c0e..83d786a799 100644 --- a/common/testing/mocksdk/client_mock.go +++ b/common/testing/mocksdk/client_mock.go @@ -100,6 +100,20 @@ func (mr *MockClientMockRecorder) CompleteActivity(ctx, taskToken, result, err a return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CompleteActivity", reflect.TypeOf((*MockClient)(nil).CompleteActivity), ctx, taskToken, result, err) } +// CompleteActivityByActivityID mocks base method. +func (m *MockClient) CompleteActivityByActivityID(ctx context.Context, namespace, activityID, activityRunID string, result any, err error) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CompleteActivityByActivityID", ctx, namespace, activityID, activityRunID, result, err) + ret0, _ := ret[0].(error) + return ret0 +} + +// CompleteActivityByActivityID indicates an expected call of CompleteActivityByActivityID. 
+func (mr *MockClientMockRecorder) CompleteActivityByActivityID(ctx, namespace, activityID, activityRunID, result, err any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CompleteActivityByActivityID", reflect.TypeOf((*MockClient)(nil).CompleteActivityByActivityID), ctx, namespace, activityID, activityRunID, result, err) +} + // CompleteActivityByID mocks base method. func (m *MockClient) CompleteActivityByID(ctx context.Context, namespace, workflowID, runID, activityID string, result any, err error) error { m.ctrl.T.Helper() @@ -114,6 +128,21 @@ func (mr *MockClientMockRecorder) CompleteActivityByID(ctx, namespace, workflowI return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CompleteActivityByID", reflect.TypeOf((*MockClient)(nil).CompleteActivityByID), ctx, namespace, workflowID, runID, activityID, result, err) } +// CountActivities mocks base method. +func (m *MockClient) CountActivities(ctx context.Context, options client.CountActivitiesOptions) (*client.CountActivitiesResult, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CountActivities", ctx, options) + ret0, _ := ret[0].(*client.CountActivitiesResult) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CountActivities indicates an expected call of CountActivities. +func (mr *MockClientMockRecorder) CountActivities(ctx, options any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CountActivities", reflect.TypeOf((*MockClient)(nil).CountActivities), ctx, options) +} + // CountWorkflow mocks base method. 
func (m *MockClient) CountWorkflow(ctx context.Context, request *workflowservice.CountWorkflowExecutionsRequest) (*workflowservice.CountWorkflowExecutionsResponse, error) { m.ctrl.T.Helper() @@ -203,6 +232,26 @@ func (mr *MockClientMockRecorder) DescribeWorkflowExecution(ctx, workflowID, run return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DescribeWorkflowExecution", reflect.TypeOf((*MockClient)(nil).DescribeWorkflowExecution), ctx, workflowID, runID) } +// ExecuteActivity mocks base method. +func (m *MockClient) ExecuteActivity(ctx context.Context, options client.StartActivityOptions, activity any, args ...any) (client.ActivityHandle, error) { + m.ctrl.T.Helper() + varargs := []any{ctx, options, activity} + for _, a := range args { + varargs = append(varargs, a) + } + ret := m.ctrl.Call(m, "ExecuteActivity", varargs...) + ret0, _ := ret[0].(client.ActivityHandle) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ExecuteActivity indicates an expected call of ExecuteActivity. +func (mr *MockClientMockRecorder) ExecuteActivity(ctx, options, activity any, args ...any) *gomock.Call { + mr.mock.ctrl.T.Helper() + varargs := append([]any{ctx, options, activity}, args...) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ExecuteActivity", reflect.TypeOf((*MockClient)(nil).ExecuteActivity), varargs...) +} + // ExecuteWorkflow mocks base method. func (m *MockClient) ExecuteWorkflow(ctx context.Context, options client.StartWorkflowOptions, workflow any, args ...any) (client.WorkflowRun, error) { m.ctrl.T.Helper() @@ -223,6 +272,20 @@ func (mr *MockClientMockRecorder) ExecuteWorkflow(ctx, options, workflow any, ar return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ExecuteWorkflow", reflect.TypeOf((*MockClient)(nil).ExecuteWorkflow), varargs...) } +// GetActivityHandle mocks base method. 
+func (m *MockClient) GetActivityHandle(options client.GetActivityHandleOptions) client.ActivityHandle { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetActivityHandle", options) + ret0, _ := ret[0].(client.ActivityHandle) + return ret0 +} + +// GetActivityHandle indicates an expected call of GetActivityHandle. +func (mr *MockClientMockRecorder) GetActivityHandle(options any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetActivityHandle", reflect.TypeOf((*MockClient)(nil).GetActivityHandle), options) +} + // GetSearchAttributes mocks base method. func (m *MockClient) GetSearchAttributes(ctx context.Context) (*workflowservice.GetSearchAttributesResponse, error) { m.ctrl.T.Helper() @@ -325,6 +388,21 @@ func (mr *MockClientMockRecorder) GetWorkflowUpdateHandle(ref any) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkflowUpdateHandle", reflect.TypeOf((*MockClient)(nil).GetWorkflowUpdateHandle), ref) } +// ListActivities mocks base method. +func (m *MockClient) ListActivities(ctx context.Context, options client.ListActivitiesOptions) (client.ListActivitiesResult, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ListActivities", ctx, options) + ret0, _ := ret[0].(client.ListActivitiesResult) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ListActivities indicates an expected call of ListActivities. +func (mr *MockClientMockRecorder) ListActivities(ctx, options any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListActivities", reflect.TypeOf((*MockClient)(nil).ListActivities), ctx, options) +} + // ListArchivedWorkflow mocks base method. 
func (m *MockClient) ListArchivedWorkflow(ctx context.Context, request *workflowservice.ListArchivedWorkflowExecutionsRequest) (*workflowservice.ListArchivedWorkflowExecutionsResponse, error) { m.ctrl.T.Helper() diff --git a/common/testing/parallelsuite/guard.go b/common/testing/parallelsuite/guard.go new file mode 100644 index 0000000000..2bdb95c997 --- /dev/null +++ b/common/testing/parallelsuite/guard.go @@ -0,0 +1,58 @@ +package parallelsuite + +import ( + "fmt" + "sync/atomic" + "testing" +) + +// guardT is a [require.TestingT] wrapper that detects mixing of assertions and Run. +// +// Before markHasSubtests: tracks assertion usage via Helper() (sets asserted flag). +// After markHasSubtests: panics on any assertion via Helper()/Errorf()/FailNow(). +type guardT struct { + *testing.T + name string + asserted atomic.Bool + hasSubtests atomic.Bool +} + +func (g *guardT) Helper() { + if g.hasSubtests.Load() { + panic(fmt.Sprintf( + "parallelsuite: assertion called on %q after Run() was called; "+ + "a test must either use assertions OR call Run(), not both — "+ + "use the callback parameter's assertions inside Run() instead", + g.name, + )) + } + g.asserted.Store(true) + g.T.Helper() +} + +func (g *guardT) Errorf(format string, args ...any) { + if g.hasSubtests.Load() { + g.Helper() // panics with clear message + } + g.T.Errorf(format, args...) 
+} + +func (g *guardT) FailNow() { + if g.hasSubtests.Load() { + g.Helper() // panics with clear message + } + g.T.FailNow() +} + +func (g *guardT) markHasSubtests() { + if g.hasSubtests.Swap(true) { + return + } + if g.asserted.Load() { + panic(fmt.Sprintf( + "parallelsuite: Run() called on %q after assertions were already used; "+ + "a test must either use assertions OR call Run(), not both", + g.name, + )) + } +} diff --git a/common/testing/parallelsuite/suite.go b/common/testing/parallelsuite/suite.go new file mode 100644 index 0000000000..027d80e2f4 --- /dev/null +++ b/common/testing/parallelsuite/suite.go @@ -0,0 +1,194 @@ +package parallelsuite + +import ( + "fmt" + "reflect" + "strings" + "testing" + + "github.com/stretchr/testify/require" + testifysuite "github.com/stretchr/testify/suite" + "go.temporal.io/server/common/testing/historyrequire" + "go.temporal.io/server/common/testing/protorequire" +) + +// testingSuite is the constraint for suite types. +type testingSuite interface { + testifysuite.TestingSuite + copySuite(t *testing.T) testingSuite + initSuite(t *testing.T) +} + +// Suite provides parallel test execution with require-style (fail-fast) assertions. +// +// It enforces a strict rule: a test method (or subtest) must either use assertions +// directly OR create subtests via Run — not both. +type Suite[T testingSuite] struct { + testifyBase + *require.Assertions + protorequire.ProtoAssertions + historyrequire.HistoryRequire + + guardT guardT +} + +// copySuite creates a fresh suite instance initialized for the given *testing.T. 
+func (s *Suite[T]) copySuite(t *testing.T) testingSuite { + cp := reflect.New(reflect.TypeFor[T]().Elem()).Interface().(T) + cp.initSuite(t) + return cp +} + +func (s *Suite[T]) initSuite(t *testing.T) { + g := &s.guardT + g.name = t.Name() + g.T = t + g.asserted.Store(false) + g.hasSubtests.Store(false) + s.Assertions = require.New(g) + s.ProtoAssertions = protorequire.New(g) + s.HistoryRequire = historyrequire.New(g) +} + +// T returns the *testing.T, panicking if the guard has been sealed. +func (s *Suite[T]) T() *testing.T { + if s.guardT.hasSubtests.Load() { + panic("parallelsuite: do not call T() after Run(); use the subtest callback's parameter instead") + } + return s.guardT.T +} + +// Run creates a parallel subtest. The callback receives a fresh copy of the +// concrete suite type, initialized for the subtest's *testing.T. +func (s *Suite[T]) Run(name string, fn func(T)) bool { + pt := s.guardT.T // grab T before sealing + s.guardT.markHasSubtests() + return pt.Run(name, func(t *testing.T) { + t.Parallel() //nolint:testifylint // parallelsuite intentionally supports parallel subtests + fn(s.copySuite(t).(T)) + }) +} + +// Run discovers and runs all exported Test* methods on the given suite in parallel. +// +// Each method gets its own fresh suite instance initialized for the subtest's +// *testing.T. Both the suite-level test and each method subtest are marked as +// parallel. Any sequential setup must happen before calling Run. +// +// The suite must embed [Suite] and have no other fields. 
+func Run[T testingSuite](t *testing.T, s T, args ...any) { + t.Helper() + + typ := reflect.TypeOf(s) + if typ.Kind() != reflect.Ptr || typ.Elem().Kind() != reflect.Struct { + panic(fmt.Sprintf("parallelsuite.Run: suite must be a pointer to a struct, got %v", typ)) + } + structType := typ.Elem() + + validateSuiteStruct(structType) + + methods := discoverTestMethods(typ, structType, args) + if len(methods) == 0 { + panic(fmt.Sprintf("parallelsuite.Run: suite %s has no Test* methods", structType.Name())) + } + + argVals := make([]reflect.Value, len(args)) + for i, a := range args { + argVals[i] = reflect.ValueOf(a) + } + + t.Parallel() + + for _, method := range methods { + t.Run(method.Name, func(t *testing.T) { + t.Parallel() + + cpS := s.copySuite(t) + callArgs := append([]reflect.Value{reflect.ValueOf(cpS)}, argVals...) + method.Func.Call(callArgs) + }) + } +} + +var inheritedMethods map[string]bool + +func init() { + type ds struct{ Suite[*ds] } + ptrType := reflect.TypeOf(&ds{}) + inheritedMethods = make(map[string]bool, ptrType.NumMethod()) + for i := 0; i < ptrType.NumMethod(); i++ { + inheritedMethods[ptrType.Method(i).Name] = true + } +} + +func validateSuiteStruct(structType reflect.Type) { + if !strings.HasSuffix(structType.Name(), "Suite") { + panic(fmt.Sprintf("parallelsuite.Run: struct name %q must end with \"Suite\"", structType.Name())) + } + + if structType.NumField() != 1 { + panic(fmt.Sprintf( + "parallelsuite.Run: suite %s must have no fields besides the embedded parallelsuite.Suite; "+ + "pass parameters as extra args to Run instead (got %d fields)", + structType.Name(), structType.NumField(), + )) + } + f := structType.Field(0) + if !f.Anonymous { + panic(fmt.Sprintf( + "parallelsuite.Run: suite %s must embed parallelsuite.Suite, found named field %q", + structType.Name(), f.Name, + )) + } +} + +func discoverTestMethods(ptrType, structType reflect.Type, args []any) []reflect.Method { + expectedNumIn := 1 + len(args) + + for i := 0; i < 
ptrType.NumMethod(); i++ { + name := ptrType.Method(i).Name + if !strings.HasPrefix(name, "Test") && !inheritedMethods[name] { + panic(fmt.Sprintf( + "parallelsuite.Run: suite %s has exported method %s that does not start with Test; "+ + "use a package-level function instead", + structType.Name(), name, + )) + } + } + + var methods []reflect.Method + for i := 0; i < ptrType.NumMethod(); i++ { + method := ptrType.Method(i) + if !strings.HasPrefix(method.Name, "Test") { + continue + } + + mt := method.Type + if mt.NumOut() != 0 { + panic(fmt.Sprintf( + "parallelsuite.Run: method %s.%s must not have return values, got %v", + structType.Name(), method.Name, mt, + )) + } + if mt.NumIn() != expectedNumIn { + panic(fmt.Sprintf( + "parallelsuite.Run: method %s.%s has wrong number of parameters: expected %d, got %d (%v)", + structType.Name(), method.Name, expectedNumIn, mt.NumIn(), mt, + )) + } + + for j, a := range args { + paramType := mt.In(1 + j) + argType := reflect.TypeOf(a) + if !argType.AssignableTo(paramType) { + panic(fmt.Sprintf( + "parallelsuite.Run: method %s.%s parameter %d has type %v but Run arg has type %v", + structType.Name(), method.Name, j+1, paramType, argType, + )) + } + } + + methods = append(methods, method) + } + return methods +} diff --git a/common/testing/parallelsuite/suite_test.go b/common/testing/parallelsuite/suite_test.go new file mode 100644 index 0000000000..7bc9032275 --- /dev/null +++ b/common/testing/parallelsuite/suite_test.go @@ -0,0 +1,119 @@ +package parallelsuite + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +type validSuite struct{ Suite[*validSuite] } + +func (s *validSuite) TestA() { + s.NotNil(s.T()) +} + +type validWithArgsSuite struct{ Suite[*validWithArgsSuite] } + +func (s *validWithArgsSuite) TestA(name string, count int) { + s.Equal("hello", name) + s.Equal(42, count) +} + +type noTestMethodsSuite struct{ Suite[*noTestMethodsSuite] } + +type wrongSigSuite struct{ Suite[*wrongSigSuite] } + +func 
(s *wrongSigSuite) TestBad(t *testing.T) {} //nolint:unused + +type badNameTests struct{ Suite[*badNameTests] } + +func (s *badNameTests) TestA() {} //nolint:unused + +type exportedNonTestSuite struct{ Suite[*exportedNonTestSuite] } + +func (s *exportedNonTestSuite) TestA() {} +func (s *exportedNonTestSuite) Helper() {} //nolint:unused + +type hasExtraFieldsSuite struct { + Suite[*hasExtraFieldsSuite] + x int //nolint:unused +} + +func (s *hasExtraFieldsSuite) TestA() {} //nolint:unused + +type setupTestSuite struct{ Suite[*setupTestSuite] } + +func (s *setupTestSuite) TestA() {} +func (s *setupTestSuite) SetupTest() {} //nolint:unused + +type sealAfterRunSuite struct{ Suite[*sealAfterRunSuite] } + +func (s *sealAfterRunSuite) TestAssertionAfterRun() { + // Calling Run seals the parent's assertions and T(). + s.Run("subtest", func(s *sealAfterRunSuite) { + s.NotNil(s.T()) // subtest assertions work fine + }) + + t := s.guardT.T + + // After Run: even passing assertions panic. + require.Panics(t, func() { s.NotNil(t) }) + + // T() also panics after Run. + require.Panics(t, func() { s.T() }) +} + +type sealRunAfterAssertSuite struct { + Suite[*sealRunAfterAssertSuite] +} + +func (s *sealRunAfterAssertSuite) TestRunAfterAssertion() { + // Use an assertion first. + s.NotNil(s.T()) + + t := s.guardT.T + + // Calling Run after assertions panics. 
+ require.Panics(t, func() { + s.Run("should-not-run", func(*sealRunAfterAssertSuite) {}) + }) +} + +func TestRun_AcceptsSuite(t *testing.T) { + t.Run("no args", func(t *testing.T) { + require.NotPanics(t, func() { Run(t, &validSuite{}) }) + }) + t.Run("with args", func(t *testing.T) { + require.NotPanics(t, func() { Run(t, &validWithArgsSuite{}, "hello", 42) }) + }) +} + +func TestRun_RejectsSuite(t *testing.T) { + t.Run("no Test methods", func(t *testing.T) { + require.Panics(t, func() { Run(t, &noTestMethodsSuite{}) }) + }) + t.Run("wrong method signature", func(t *testing.T) { + require.Panics(t, func() { Run(t, &wrongSigSuite{}) }) + }) + t.Run("extra fields", func(t *testing.T) { + require.Panics(t, func() { Run(t, &hasExtraFieldsSuite{}) }) + }) + t.Run("name not ending in Suite", func(t *testing.T) { + require.Panics(t, func() { Run(t, &badNameTests{}) }) + }) + t.Run("non-Test exported method", func(t *testing.T) { + require.Panics(t, func() { Run(t, &exportedNonTestSuite{}) }) + }) + t.Run("SetupTest forbidden", func(t *testing.T) { + require.Panics(t, func() { Run(t, &setupTestSuite{}) }) + }) +} + +func TestGuardSeal(t *testing.T) { + t.Run("assertion after Run", func(t *testing.T) { + Run(t, &sealAfterRunSuite{}) + }) + t.Run("Run after assertion", func(t *testing.T) { + Run(t, &sealRunAfterAssertSuite{}) + }) +} diff --git a/common/testing/parallelsuite/testify.go b/common/testing/parallelsuite/testify.go new file mode 100644 index 0000000000..67d4696704 --- /dev/null +++ b/common/testing/parallelsuite/testify.go @@ -0,0 +1,37 @@ +package parallelsuite + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + testifysuite "github.com/stretchr/testify/suite" +) + +// testifyBase wraps testify's suite.Suite so that Suite's *require.Assertions +// shadows testify's *assert.Assertions in Go's embedding resolution. +// It disables all testify suite methods with panicking stubs. 
+type testifyBase struct { + testifysuite.Suite +} + +// Deprecated: SetT is managed internally by [Run]. Do not call directly. +func (b *testifyBase) SetT(_ *testing.T) { + panic("parallelsuite: do not call SetT directly; it is managed by parallelsuite.Run") +} + +// Deprecated: SetS is managed internally by [Run]. Do not call directly. +func (b *testifyBase) SetS(_ testifysuite.TestingSuite) { + panic("parallelsuite: do not call SetS directly; it is managed by parallelsuite.Run") +} + +// Deprecated: Assert returns non-fatal assertions which are not supported. +// Use s.NoError, s.Equal, etc. directly (require semantics). +func (b *testifyBase) Assert() *assert.Assertions { + panic("parallelsuite: do not use Assert(); use s.NoError, s.Equal, etc. directly") +} + +// Deprecated: Require bypasses the guard mechanism. Use s.NoError, s.Equal, etc. directly. +func (b *testifyBase) Require() *require.Assertions { + panic("parallelsuite: do not use Require(); use s.NoError, s.Equal, etc. directly") +} diff --git a/common/testing/testvars/test_vars.go b/common/testing/testvars/test_vars.go index 23e823412c..2b12d7ec7c 100644 --- a/common/testing/testvars/test_vars.go +++ b/common/testing/testvars/test_vars.go @@ -282,20 +282,14 @@ func (tv *TestVars) DeploymentVersionTransition() *workflowpb.DeploymentVersionT return ret } -func (tv *TestVars) VersioningOverridePinned(useV32 bool) *workflowpb.VersioningOverride { - if useV32 { - return &workflowpb.VersioningOverride{ - Override: &workflowpb.VersioningOverride_Pinned{ - Pinned: &workflowpb.VersioningOverride_PinnedOverride{ - Behavior: workflowpb.VersioningOverride_PINNED_OVERRIDE_BEHAVIOR_PINNED, - Version: tv.ExternalDeploymentVersion(), - }, - }, - } - } +func (tv *TestVars) VersioningOverridePinned() *workflowpb.VersioningOverride { return &workflowpb.VersioningOverride{ - Behavior: enumspb.VERSIONING_BEHAVIOR_PINNED, - PinnedVersion: tv.DeploymentVersionString(), + Override: &workflowpb.VersioningOverride_Pinned{ + 
Pinned: &workflowpb.VersioningOverride_PinnedOverride{ + Behavior: workflowpb.VersioningOverride_PINNED_OVERRIDE_BEHAVIOR_PINNED, + Version: tv.ExternalDeploymentVersion(), + }, + }, } } diff --git a/common/util/wildcard.go b/common/util/wildcard.go index 80c74aa465..c1cdb95efb 100644 --- a/common/util/wildcard.go +++ b/common/util/wildcard.go @@ -39,3 +39,12 @@ func WildCardStringsToRegexp(patterns []string) (*regexp.Regexp, error) { result.WriteRune('$') return regexp.Compile(result.String()) } + +// MustWildCardStringsToRegexp is like WildCardStringsToRegexp but panics on error. +func MustWildCardStringsToRegexp(patterns []string) *regexp.Regexp { + re, err := WildCardStringsToRegexp(patterns) + if err != nil { + panic(err) //nolint:forbidigo // Must* functions conventionally panic on error. + } + return re +} diff --git a/common/worker_versioning/worker_versioning.go b/common/worker_versioning/worker_versioning.go index 5ebcaeb16a..7d7368530e 100644 --- a/common/worker_versioning/worker_versioning.go +++ b/common/worker_versioning/worker_versioning.go @@ -370,7 +370,7 @@ func checkVersionMembershipViaUserData( return HasDeploymentVersion(tqData.GetDeploymentData(), DeploymentVersionFromDeployment(DeploymentFromExternalDeploymentVersion(version))), nil } -func FindDeploymentVersion(deployments *persistencespb.DeploymentData, v *deploymentspb.WorkerDeploymentVersion) int { +func FindOldDeploymentVersion(deployments *persistencespb.DeploymentData, v *deploymentspb.WorkerDeploymentVersion) int { for i, vd := range deployments.GetVersions() { if proto.Equal(v, vd.GetVersion()) { return i @@ -1139,6 +1139,8 @@ func WorkerDeploymentVersionFromStringV32(s string) (*deploymentspb.WorkerDeploy // CleanupOldDeletedVersions removes versions deleted more than 7 days ago. Also removes more deleted versions if // the limit is being exceeded. Never removes undeleted versions. +// Deprecated. Versions now are deleted serially without using the deleted flag in versionData. 
+// TODO: remove this cleanup logic after next major release. func CleanupOldDeletedVersions(deploymentData *persistencespb.WorkerDeploymentData, maxVersions int) bool { now := time.Now() aWeekAgo := now.Add(-time.Hour * 24 * 7) diff --git a/components/nexusoperations/config.go b/components/nexusoperations/config.go index 35b7e75bc8..722b8c23ec 100644 --- a/components/nexusoperations/config.go +++ b/components/nexusoperations/config.go @@ -107,6 +107,8 @@ var DisallowedOperationHeaders = dynamicconfig.NewGlobalTypedSettingWithConverte headers.CallerNameHeaderName, headers.CallerTypeHeaderName, headers.CallOriginHeaderName, + headers.PrincipalTypeHeaderName, + headers.PrincipalNameHeaderName, }, `Case insensitive list of disallowed header keys for Nexus Operations. ScheduleNexusOperation commands with a "nexus_header" field that contains any of these disallowed keys will be diff --git a/components/nexusoperations/frontend/handler.go b/components/nexusoperations/frontend/handler.go index 05d850b69c..dbc1ca5a28 100644 --- a/components/nexusoperations/frontend/handler.go +++ b/components/nexusoperations/frontend/handler.go @@ -414,7 +414,7 @@ func (c *requestContext) interceptRequest(ctx context.Context, request *nexusrpc ctx = c.AuthInterceptor.EnhanceContext(ctx, authInfo, claims) } - err = c.AuthInterceptor.Authorize(ctx, claims, &authorization.CallTarget{ + _, err = c.AuthInterceptor.Authorize(ctx, claims, &authorization.CallTarget{ APIName: apiName, Namespace: c.namespace.Name().String(), Request: request, diff --git a/docs/Makefile b/docs/Makefile index 5f5ae551b1..b7bdad93db 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -4,8 +4,8 @@ DIAGRAMS := $(wildcard ./_assets/*.d2) install-d2: @printf $(COLOR) "Install d2..." 
- @go install oss.terrastruct.com/d2@v0.6.3 + @go install oss.terrastruct.com/d2@v0.7.1 update-diagrams: install-d2 @printf $(COLOR) "Update diagrams" - $(foreach DIAGRAM, $(DIAGRAMS), $(shell d2 --layout elk --theme 7 --dark-theme 200 $(DIAGRAM) $(NEWLINE))) + $(foreach DIAGRAM, $(DIAGRAMS), $(shell d2 --theme 7 --dark-theme 200 $(DIAGRAM) $(NEWLINE))) diff --git a/docs/_assets/matching-context.d2 b/docs/_assets/matching-context.d2 index 66337ec9c3..40ead8ccd8 100644 --- a/docs/_assets/matching-context.d2 +++ b/docs/_assets/matching-context.d2 @@ -1,6 +1,12 @@ # https://d2lang.com # Generate SVG by running `make` inside of `docs/`. +vars: { + d2-config: { + layout-engine: elk + } +} + Cluster: { grid-rows: 2 vertical-gap: 10 diff --git a/docs/_assets/matching-context.svg b/docs/_assets/matching-context.svg index 1b68689b39..050202ad24 100644 --- a/docs/_assets/matching-context.svg +++ b/docs/_assets/matching-context.svg @@ -1,23 +1,23 @@ -Temporal ClusterOutsideServerWorkerplacehoHistoryMatchingFrontendplaceho1Database Poll TasksPoll Tasks Add Tasks Record Start + .d2-825436234 .fill-N1{fill:#0A0F25;} + .d2-825436234 .fill-N2{fill:#676C7E;} + .d2-825436234 .fill-N3{fill:#9499AB;} + .d2-825436234 .fill-N4{fill:#CFD2DD;} + .d2-825436234 .fill-N5{fill:#DEE1EB;} + .d2-825436234 .fill-N6{fill:#EEF1F8;} + .d2-825436234 .fill-N7{fill:#FFFFFF;} + .d2-825436234 .fill-B1{fill:#170034;} + .d2-825436234 .fill-B2{fill:#7639C5;} + .d2-825436234 .fill-B3{fill:#8F70D1;} + .d2-825436234 .fill-B4{fill:#D0B9F5;} + .d2-825436234 .fill-B5{fill:#E7DEFF;} + .d2-825436234 .fill-B6{fill:#F4F0FF;} + .d2-825436234 .fill-AA2{fill:#0F66B7;} + .d2-825436234 .fill-AA4{fill:#87BFF3;} + .d2-825436234 .fill-AA5{fill:#BCDDFB;} + .d2-825436234 .fill-AB4{fill:#92E3E3;} + .d2-825436234 .fill-AB5{fill:#D7F5F5;} + .d2-825436234 .stroke-N1{stroke:#0A0F25;} + .d2-825436234 .stroke-N2{stroke:#676C7E;} + .d2-825436234 .stroke-N3{stroke:#9499AB;} + .d2-825436234 .stroke-N4{stroke:#CFD2DD;} + .d2-825436234 
.stroke-N5{stroke:#DEE1EB;} + .d2-825436234 .stroke-N6{stroke:#EEF1F8;} + .d2-825436234 .stroke-N7{stroke:#FFFFFF;} + .d2-825436234 .stroke-B1{stroke:#170034;} + .d2-825436234 .stroke-B2{stroke:#7639C5;} + .d2-825436234 .stroke-B3{stroke:#8F70D1;} + .d2-825436234 .stroke-B4{stroke:#D0B9F5;} + .d2-825436234 .stroke-B5{stroke:#E7DEFF;} + .d2-825436234 .stroke-B6{stroke:#F4F0FF;} + .d2-825436234 .stroke-AA2{stroke:#0F66B7;} + .d2-825436234 .stroke-AA4{stroke:#87BFF3;} + .d2-825436234 .stroke-AA5{stroke:#BCDDFB;} + .d2-825436234 .stroke-AB4{stroke:#92E3E3;} + .d2-825436234 .stroke-AB5{stroke:#D7F5F5;} + .d2-825436234 .background-color-N1{background-color:#0A0F25;} + .d2-825436234 .background-color-N2{background-color:#676C7E;} + .d2-825436234 .background-color-N3{background-color:#9499AB;} + .d2-825436234 .background-color-N4{background-color:#CFD2DD;} + .d2-825436234 .background-color-N5{background-color:#DEE1EB;} + .d2-825436234 .background-color-N6{background-color:#EEF1F8;} + .d2-825436234 .background-color-N7{background-color:#FFFFFF;} + .d2-825436234 .background-color-B1{background-color:#170034;} + .d2-825436234 .background-color-B2{background-color:#7639C5;} + .d2-825436234 .background-color-B3{background-color:#8F70D1;} + .d2-825436234 .background-color-B4{background-color:#D0B9F5;} + .d2-825436234 .background-color-B5{background-color:#E7DEFF;} + .d2-825436234 .background-color-B6{background-color:#F4F0FF;} + .d2-825436234 .background-color-AA2{background-color:#0F66B7;} + .d2-825436234 .background-color-AA4{background-color:#87BFF3;} + .d2-825436234 .background-color-AA5{background-color:#BCDDFB;} + .d2-825436234 .background-color-AB4{background-color:#92E3E3;} + .d2-825436234 .background-color-AB5{background-color:#D7F5F5;} + .d2-825436234 .color-N1{color:#0A0F25;} + .d2-825436234 .color-N2{color:#676C7E;} + .d2-825436234 .color-N3{color:#9499AB;} + .d2-825436234 .color-N4{color:#CFD2DD;} + .d2-825436234 .color-N5{color:#DEE1EB;} + .d2-825436234 
.color-N6{color:#EEF1F8;} + .d2-825436234 .color-N7{color:#FFFFFF;} + .d2-825436234 .color-B1{color:#170034;} + .d2-825436234 .color-B2{color:#7639C5;} + .d2-825436234 .color-B3{color:#8F70D1;} + .d2-825436234 .color-B4{color:#D0B9F5;} + .d2-825436234 .color-B5{color:#E7DEFF;} + .d2-825436234 .color-B6{color:#F4F0FF;} + .d2-825436234 .color-AA2{color:#0F66B7;} + .d2-825436234 .color-AA4{color:#87BFF3;} + .d2-825436234 .color-AA5{color:#BCDDFB;} + .d2-825436234 .color-AB4{color:#92E3E3;} + .d2-825436234 .color-AB5{color:#D7F5F5;}.appendix text.text{fill:#0A0F25}.md{--color-fg-default:#0A0F25;--color-fg-muted:#676C7E;--color-fg-subtle:#9499AB;--color-canvas-default:#FFFFFF;--color-canvas-subtle:#EEF1F8;--color-border-default:#170034;--color-border-muted:#7639C5;--color-neutral-muted:#EEF1F8;--color-accent-fg:#7639C5;--color-accent-emphasis:#7639C5;--color-attention-subtle:#676C7E;--color-danger-fg:red;}.sketch-overlay-B1{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-B2{fill:url(#streaks-dark-d2-825436234);mix-blend-mode:overlay}.sketch-overlay-B3{fill:url(#streaks-dark-d2-825436234);mix-blend-mode:overlay}.sketch-overlay-B4{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-B5{fill:url(#streaks-bright-d2-825436234);mix-blend-mode:darken}.sketch-overlay-B6{fill:url(#streaks-bright-d2-825436234);mix-blend-mode:darken}.sketch-overlay-AA2{fill:url(#streaks-dark-d2-825436234);mix-blend-mode:overlay}.sketch-overlay-AA4{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-AA5{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-AB4{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-AB5{fill:url(#streaks-bright-d2-825436234);mix-blend-mode:darken}.sketch-overlay-N1{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-N2{fill:url(#streaks-dark-d2-825436234);mix-blend-mode:overlay}.sketch-overlay-N3{fill:url
(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-N4{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-N5{fill:url(#streaks-bright-d2-825436234);mix-blend-mode:darken}.sketch-overlay-N6{fill:url(#streaks-bright-d2-825436234);mix-blend-mode:darken}.sketch-overlay-N7{fill:url(#streaks-bright-d2-825436234);mix-blend-mode:darken}.light-code{display: block}.dark-code{display: none}@media screen and (prefers-color-scheme:dark){ + .d2-825436234 .fill-N1{fill:#CDD6F4;} + .d2-825436234 .fill-N2{fill:#BAC2DE;} + .d2-825436234 .fill-N3{fill:#A6ADC8;} + .d2-825436234 .fill-N4{fill:#585B70;} + .d2-825436234 .fill-N5{fill:#45475A;} + .d2-825436234 .fill-N6{fill:#313244;} + .d2-825436234 .fill-N7{fill:#1E1E2E;} + .d2-825436234 .fill-B1{fill:#CBA6f7;} + .d2-825436234 .fill-B2{fill:#CBA6f7;} + .d2-825436234 .fill-B3{fill:#6C7086;} + .d2-825436234 .fill-B4{fill:#585B70;} + .d2-825436234 .fill-B5{fill:#45475A;} + .d2-825436234 .fill-B6{fill:#313244;} + .d2-825436234 .fill-AA2{fill:#f38BA8;} + .d2-825436234 .fill-AA4{fill:#45475A;} + .d2-825436234 .fill-AA5{fill:#313244;} + .d2-825436234 .fill-AB4{fill:#45475A;} + .d2-825436234 .fill-AB5{fill:#313244;} + .d2-825436234 .stroke-N1{stroke:#CDD6F4;} + .d2-825436234 .stroke-N2{stroke:#BAC2DE;} + .d2-825436234 .stroke-N3{stroke:#A6ADC8;} + .d2-825436234 .stroke-N4{stroke:#585B70;} + .d2-825436234 .stroke-N5{stroke:#45475A;} + .d2-825436234 .stroke-N6{stroke:#313244;} + .d2-825436234 .stroke-N7{stroke:#1E1E2E;} + .d2-825436234 .stroke-B1{stroke:#CBA6f7;} + .d2-825436234 .stroke-B2{stroke:#CBA6f7;} + .d2-825436234 .stroke-B3{stroke:#6C7086;} + .d2-825436234 .stroke-B4{stroke:#585B70;} + .d2-825436234 .stroke-B5{stroke:#45475A;} + .d2-825436234 .stroke-B6{stroke:#313244;} + .d2-825436234 .stroke-AA2{stroke:#f38BA8;} + .d2-825436234 .stroke-AA4{stroke:#45475A;} + .d2-825436234 .stroke-AA5{stroke:#313244;} + .d2-825436234 .stroke-AB4{stroke:#45475A;} + .d2-825436234 
.stroke-AB5{stroke:#313244;} + .d2-825436234 .background-color-N1{background-color:#CDD6F4;} + .d2-825436234 .background-color-N2{background-color:#BAC2DE;} + .d2-825436234 .background-color-N3{background-color:#A6ADC8;} + .d2-825436234 .background-color-N4{background-color:#585B70;} + .d2-825436234 .background-color-N5{background-color:#45475A;} + .d2-825436234 .background-color-N6{background-color:#313244;} + .d2-825436234 .background-color-N7{background-color:#1E1E2E;} + .d2-825436234 .background-color-B1{background-color:#CBA6f7;} + .d2-825436234 .background-color-B2{background-color:#CBA6f7;} + .d2-825436234 .background-color-B3{background-color:#6C7086;} + .d2-825436234 .background-color-B4{background-color:#585B70;} + .d2-825436234 .background-color-B5{background-color:#45475A;} + .d2-825436234 .background-color-B6{background-color:#313244;} + .d2-825436234 .background-color-AA2{background-color:#f38BA8;} + .d2-825436234 .background-color-AA4{background-color:#45475A;} + .d2-825436234 .background-color-AA5{background-color:#313244;} + .d2-825436234 .background-color-AB4{background-color:#45475A;} + .d2-825436234 .background-color-AB5{background-color:#313244;} + .d2-825436234 .color-N1{color:#CDD6F4;} + .d2-825436234 .color-N2{color:#BAC2DE;} + .d2-825436234 .color-N3{color:#A6ADC8;} + .d2-825436234 .color-N4{color:#585B70;} + .d2-825436234 .color-N5{color:#45475A;} + .d2-825436234 .color-N6{color:#313244;} + .d2-825436234 .color-N7{color:#1E1E2E;} + .d2-825436234 .color-B1{color:#CBA6f7;} + .d2-825436234 .color-B2{color:#CBA6f7;} + .d2-825436234 .color-B3{color:#6C7086;} + .d2-825436234 .color-B4{color:#585B70;} + .d2-825436234 .color-B5{color:#45475A;} + .d2-825436234 .color-B6{color:#313244;} + .d2-825436234 .color-AA2{color:#f38BA8;} + .d2-825436234 .color-AA4{color:#45475A;} + .d2-825436234 .color-AA5{color:#313244;} + .d2-825436234 .color-AB4{color:#45475A;} + .d2-825436234 .color-AB5{color:#313244;}.appendix 
text.text{fill:#CDD6F4}.md{--color-fg-default:#CDD6F4;--color-fg-muted:#BAC2DE;--color-fg-subtle:#A6ADC8;--color-canvas-default:#1E1E2E;--color-canvas-subtle:#313244;--color-border-default:#CBA6f7;--color-border-muted:#CBA6f7;--color-neutral-muted:#313244;--color-accent-fg:#CBA6f7;--color-accent-emphasis:#CBA6f7;--color-attention-subtle:#BAC2DE;--color-danger-fg:red;}.sketch-overlay-B1{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-B2{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-B3{fill:url(#streaks-dark-d2-825436234);mix-blend-mode:overlay}.sketch-overlay-B4{fill:url(#streaks-dark-d2-825436234);mix-blend-mode:overlay}.sketch-overlay-B5{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-B6{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-AA2{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-AA4{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-AA5{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-AB4{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-AB5{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-N1{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-N2{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-N3{fill:url(#streaks-normal-d2-825436234);mix-blend-mode:color-burn}.sketch-overlay-N4{fill:url(#streaks-dark-d2-825436234);mix-blend-mode:overlay}.sketch-overlay-N5{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-N6{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.sketch-overlay-N7{fill:url(#streaks-darker-d2-825436234);mix-blend-mode:lighten}.light-code{display: none}.dark-code{display: block}}]]>Temporal ClusterServerWorkerHistoryMatchingFrontendDatabase Poll TasksPoll Tasks Add Tasks Record 
Start - - - - - - - - - - - - - + + + diff --git a/docs/_assets/retries.d2 b/docs/_assets/retries.d2 index 2cda5dba3e..a4dbaba3f6 100644 --- a/docs/_assets/retries.d2 +++ b/docs/_assets/retries.d2 @@ -1,6 +1,12 @@ # https://d2lang.com # Generate SVG by running `make` inside of `docs/`. +vars: { + d2-config: { + layout-engine: elk + } +} + classes: { invisible: { style.opacity: 0 diff --git a/docs/_assets/retries.svg b/docs/_assets/retries.svg index 5d54bd20a8..a3226d0419 100644 --- a/docs/_assets/retries.svg +++ b/docs/_assets/retries.svg @@ -1,23 +1,23 @@ -ClientFrontendHistory🔁 gRPC interceptor...🔁 History client🔁 gRPC interceptor... gRPC callgRPC call + .d2-3159573827 .fill-N1{fill:#0A0F25;} + .d2-3159573827 .fill-N2{fill:#676C7E;} + .d2-3159573827 .fill-N3{fill:#9499AB;} + .d2-3159573827 .fill-N4{fill:#CFD2DD;} + .d2-3159573827 .fill-N5{fill:#DEE1EB;} + .d2-3159573827 .fill-N6{fill:#EEF1F8;} + .d2-3159573827 .fill-N7{fill:#FFFFFF;} + .d2-3159573827 .fill-B1{fill:#170034;} + .d2-3159573827 .fill-B2{fill:#7639C5;} + .d2-3159573827 .fill-B3{fill:#8F70D1;} + .d2-3159573827 .fill-B4{fill:#D0B9F5;} + .d2-3159573827 .fill-B5{fill:#E7DEFF;} + .d2-3159573827 .fill-B6{fill:#F4F0FF;} + .d2-3159573827 .fill-AA2{fill:#0F66B7;} + .d2-3159573827 .fill-AA4{fill:#87BFF3;} + .d2-3159573827 .fill-AA5{fill:#BCDDFB;} + .d2-3159573827 .fill-AB4{fill:#92E3E3;} + .d2-3159573827 .fill-AB5{fill:#D7F5F5;} + .d2-3159573827 .stroke-N1{stroke:#0A0F25;} + .d2-3159573827 .stroke-N2{stroke:#676C7E;} + .d2-3159573827 .stroke-N3{stroke:#9499AB;} + .d2-3159573827 .stroke-N4{stroke:#CFD2DD;} + .d2-3159573827 .stroke-N5{stroke:#DEE1EB;} + .d2-3159573827 .stroke-N6{stroke:#EEF1F8;} + .d2-3159573827 .stroke-N7{stroke:#FFFFFF;} + .d2-3159573827 .stroke-B1{stroke:#170034;} + .d2-3159573827 .stroke-B2{stroke:#7639C5;} + .d2-3159573827 .stroke-B3{stroke:#8F70D1;} + .d2-3159573827 .stroke-B4{stroke:#D0B9F5;} + .d2-3159573827 .stroke-B5{stroke:#E7DEFF;} + .d2-3159573827 .stroke-B6{stroke:#F4F0FF;} + 
.d2-3159573827 .stroke-AA2{stroke:#0F66B7;} + .d2-3159573827 .stroke-AA4{stroke:#87BFF3;} + .d2-3159573827 .stroke-AA5{stroke:#BCDDFB;} + .d2-3159573827 .stroke-AB4{stroke:#92E3E3;} + .d2-3159573827 .stroke-AB5{stroke:#D7F5F5;} + .d2-3159573827 .background-color-N1{background-color:#0A0F25;} + .d2-3159573827 .background-color-N2{background-color:#676C7E;} + .d2-3159573827 .background-color-N3{background-color:#9499AB;} + .d2-3159573827 .background-color-N4{background-color:#CFD2DD;} + .d2-3159573827 .background-color-N5{background-color:#DEE1EB;} + .d2-3159573827 .background-color-N6{background-color:#EEF1F8;} + .d2-3159573827 .background-color-N7{background-color:#FFFFFF;} + .d2-3159573827 .background-color-B1{background-color:#170034;} + .d2-3159573827 .background-color-B2{background-color:#7639C5;} + .d2-3159573827 .background-color-B3{background-color:#8F70D1;} + .d2-3159573827 .background-color-B4{background-color:#D0B9F5;} + .d2-3159573827 .background-color-B5{background-color:#E7DEFF;} + .d2-3159573827 .background-color-B6{background-color:#F4F0FF;} + .d2-3159573827 .background-color-AA2{background-color:#0F66B7;} + .d2-3159573827 .background-color-AA4{background-color:#87BFF3;} + .d2-3159573827 .background-color-AA5{background-color:#BCDDFB;} + .d2-3159573827 .background-color-AB4{background-color:#92E3E3;} + .d2-3159573827 .background-color-AB5{background-color:#D7F5F5;} + .d2-3159573827 .color-N1{color:#0A0F25;} + .d2-3159573827 .color-N2{color:#676C7E;} + .d2-3159573827 .color-N3{color:#9499AB;} + .d2-3159573827 .color-N4{color:#CFD2DD;} + .d2-3159573827 .color-N5{color:#DEE1EB;} + .d2-3159573827 .color-N6{color:#EEF1F8;} + .d2-3159573827 .color-N7{color:#FFFFFF;} + .d2-3159573827 .color-B1{color:#170034;} + .d2-3159573827 .color-B2{color:#7639C5;} + .d2-3159573827 .color-B3{color:#8F70D1;} + .d2-3159573827 .color-B4{color:#D0B9F5;} + .d2-3159573827 .color-B5{color:#E7DEFF;} + .d2-3159573827 .color-B6{color:#F4F0FF;} + .d2-3159573827 
.color-AA2{color:#0F66B7;} + .d2-3159573827 .color-AA4{color:#87BFF3;} + .d2-3159573827 .color-AA5{color:#BCDDFB;} + .d2-3159573827 .color-AB4{color:#92E3E3;} + .d2-3159573827 .color-AB5{color:#D7F5F5;}.appendix text.text{fill:#0A0F25}.md{--color-fg-default:#0A0F25;--color-fg-muted:#676C7E;--color-fg-subtle:#9499AB;--color-canvas-default:#FFFFFF;--color-canvas-subtle:#EEF1F8;--color-border-default:#170034;--color-border-muted:#7639C5;--color-neutral-muted:#EEF1F8;--color-accent-fg:#7639C5;--color-accent-emphasis:#7639C5;--color-attention-subtle:#676C7E;--color-danger-fg:red;}.sketch-overlay-B1{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-B2{fill:url(#streaks-dark-d2-3159573827);mix-blend-mode:overlay}.sketch-overlay-B3{fill:url(#streaks-dark-d2-3159573827);mix-blend-mode:overlay}.sketch-overlay-B4{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-B5{fill:url(#streaks-bright-d2-3159573827);mix-blend-mode:darken}.sketch-overlay-B6{fill:url(#streaks-bright-d2-3159573827);mix-blend-mode:darken}.sketch-overlay-AA2{fill:url(#streaks-dark-d2-3159573827);mix-blend-mode:overlay}.sketch-overlay-AA4{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-AA5{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-AB4{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-AB5{fill:url(#streaks-bright-d2-3159573827);mix-blend-mode:darken}.sketch-overlay-N1{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-N2{fill:url(#streaks-dark-d2-3159573827);mix-blend-mode:overlay}.sketch-overlay-N3{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-N4{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-N5{fill:url(#streaks-bright-d2-3159573827);mix-blend-mode:darken}.sketch-overlay-N6{fill:url(#streaks-bright-d2-3159573827);mix-blend-mode:darken}.sketch-overla
y-N7{fill:url(#streaks-bright-d2-3159573827);mix-blend-mode:darken}.light-code{display: block}.dark-code{display: none}@media screen and (prefers-color-scheme:dark){ + .d2-3159573827 .fill-N1{fill:#CDD6F4;} + .d2-3159573827 .fill-N2{fill:#BAC2DE;} + .d2-3159573827 .fill-N3{fill:#A6ADC8;} + .d2-3159573827 .fill-N4{fill:#585B70;} + .d2-3159573827 .fill-N5{fill:#45475A;} + .d2-3159573827 .fill-N6{fill:#313244;} + .d2-3159573827 .fill-N7{fill:#1E1E2E;} + .d2-3159573827 .fill-B1{fill:#CBA6f7;} + .d2-3159573827 .fill-B2{fill:#CBA6f7;} + .d2-3159573827 .fill-B3{fill:#6C7086;} + .d2-3159573827 .fill-B4{fill:#585B70;} + .d2-3159573827 .fill-B5{fill:#45475A;} + .d2-3159573827 .fill-B6{fill:#313244;} + .d2-3159573827 .fill-AA2{fill:#f38BA8;} + .d2-3159573827 .fill-AA4{fill:#45475A;} + .d2-3159573827 .fill-AA5{fill:#313244;} + .d2-3159573827 .fill-AB4{fill:#45475A;} + .d2-3159573827 .fill-AB5{fill:#313244;} + .d2-3159573827 .stroke-N1{stroke:#CDD6F4;} + .d2-3159573827 .stroke-N2{stroke:#BAC2DE;} + .d2-3159573827 .stroke-N3{stroke:#A6ADC8;} + .d2-3159573827 .stroke-N4{stroke:#585B70;} + .d2-3159573827 .stroke-N5{stroke:#45475A;} + .d2-3159573827 .stroke-N6{stroke:#313244;} + .d2-3159573827 .stroke-N7{stroke:#1E1E2E;} + .d2-3159573827 .stroke-B1{stroke:#CBA6f7;} + .d2-3159573827 .stroke-B2{stroke:#CBA6f7;} + .d2-3159573827 .stroke-B3{stroke:#6C7086;} + .d2-3159573827 .stroke-B4{stroke:#585B70;} + .d2-3159573827 .stroke-B5{stroke:#45475A;} + .d2-3159573827 .stroke-B6{stroke:#313244;} + .d2-3159573827 .stroke-AA2{stroke:#f38BA8;} + .d2-3159573827 .stroke-AA4{stroke:#45475A;} + .d2-3159573827 .stroke-AA5{stroke:#313244;} + .d2-3159573827 .stroke-AB4{stroke:#45475A;} + .d2-3159573827 .stroke-AB5{stroke:#313244;} + .d2-3159573827 .background-color-N1{background-color:#CDD6F4;} + .d2-3159573827 .background-color-N2{background-color:#BAC2DE;} + .d2-3159573827 .background-color-N3{background-color:#A6ADC8;} + .d2-3159573827 .background-color-N4{background-color:#585B70;} + 
.d2-3159573827 .background-color-N5{background-color:#45475A;} + .d2-3159573827 .background-color-N6{background-color:#313244;} + .d2-3159573827 .background-color-N7{background-color:#1E1E2E;} + .d2-3159573827 .background-color-B1{background-color:#CBA6f7;} + .d2-3159573827 .background-color-B2{background-color:#CBA6f7;} + .d2-3159573827 .background-color-B3{background-color:#6C7086;} + .d2-3159573827 .background-color-B4{background-color:#585B70;} + .d2-3159573827 .background-color-B5{background-color:#45475A;} + .d2-3159573827 .background-color-B6{background-color:#313244;} + .d2-3159573827 .background-color-AA2{background-color:#f38BA8;} + .d2-3159573827 .background-color-AA4{background-color:#45475A;} + .d2-3159573827 .background-color-AA5{background-color:#313244;} + .d2-3159573827 .background-color-AB4{background-color:#45475A;} + .d2-3159573827 .background-color-AB5{background-color:#313244;} + .d2-3159573827 .color-N1{color:#CDD6F4;} + .d2-3159573827 .color-N2{color:#BAC2DE;} + .d2-3159573827 .color-N3{color:#A6ADC8;} + .d2-3159573827 .color-N4{color:#585B70;} + .d2-3159573827 .color-N5{color:#45475A;} + .d2-3159573827 .color-N6{color:#313244;} + .d2-3159573827 .color-N7{color:#1E1E2E;} + .d2-3159573827 .color-B1{color:#CBA6f7;} + .d2-3159573827 .color-B2{color:#CBA6f7;} + .d2-3159573827 .color-B3{color:#6C7086;} + .d2-3159573827 .color-B4{color:#585B70;} + .d2-3159573827 .color-B5{color:#45475A;} + .d2-3159573827 .color-B6{color:#313244;} + .d2-3159573827 .color-AA2{color:#f38BA8;} + .d2-3159573827 .color-AA4{color:#45475A;} + .d2-3159573827 .color-AA5{color:#313244;} + .d2-3159573827 .color-AB4{color:#45475A;} + .d2-3159573827 .color-AB5{color:#313244;}.appendix 
text.text{fill:#CDD6F4}.md{--color-fg-default:#CDD6F4;--color-fg-muted:#BAC2DE;--color-fg-subtle:#A6ADC8;--color-canvas-default:#1E1E2E;--color-canvas-subtle:#313244;--color-border-default:#CBA6f7;--color-border-muted:#CBA6f7;--color-neutral-muted:#313244;--color-accent-fg:#CBA6f7;--color-accent-emphasis:#CBA6f7;--color-attention-subtle:#BAC2DE;--color-danger-fg:red;}.sketch-overlay-B1{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-B2{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-B3{fill:url(#streaks-dark-d2-3159573827);mix-blend-mode:overlay}.sketch-overlay-B4{fill:url(#streaks-dark-d2-3159573827);mix-blend-mode:overlay}.sketch-overlay-B5{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-B6{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-AA2{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-AA4{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-AA5{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-AB4{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-AB5{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-N1{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-N2{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-N3{fill:url(#streaks-normal-d2-3159573827);mix-blend-mode:color-burn}.sketch-overlay-N4{fill:url(#streaks-dark-d2-3159573827);mix-blend-mode:overlay}.sketch-overlay-N5{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-N6{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.sketch-overlay-N7{fill:url(#streaks-darker-d2-3159573827);mix-blend-mode:lighten}.light-code{display: none}.dark-code{display: block}}]]>ClientFrontendHistory🔁 gRPC interceptor...🔁 History client🔁 gRPC interceptor... 
gRPC callgRPC call - - - - - - - - - - + + diff --git a/docs/development/testing.md b/docs/development/testing.md index 95a994913a..d81d126149 100644 --- a/docs/development/testing.md +++ b/docs/development/testing.md @@ -43,13 +43,17 @@ unless there is a reason not to. `make parallelize-tests` can be used to automatically add `t.Parallel()`. Use `//parallelize:ignore` to opt your test out of it. -Functional tests in `tests/` using `testcore.NewEnv(t)` will always use `t.Parallel()`; -unless the `MustRunSequential` option is passed. - ## Test helpers Test helpers can be found in the [common/testing](../../common/testing) package. +### parallelsuite package + +Use `parallelsuite.Suite` to ensure your test suite is fast and safe: it runs all test methods and sub-tests in parallel by default; +and provides assertion helpers and safety mechanisms. + +It replaces all use of `testify`'s `Suite`. + ### testvars package Instead of creating identifiers like task queue name, namespace or worker identity by hand, @@ -155,11 +159,10 @@ will ultimately fail the test. Use `testcore.NewEnv(t)` to create a test environment with access to a Temporal cluster for end-to-end testing. ```go -func TestMyFeatureSuite(t *testing.T) { - t.Run("scenario one", func(t *testing.T) { - s := testcore.NewEnv(t) - // ... - })} +func (s* TestMyFeatureSuite) func TestXYZ(t *testing.T) { + s := testcore.NewEnv(t) + // ... +} ``` Note that each test has its own namespace (`s.Namespace()`) for isolation. 
diff --git a/go.mod b/go.mod index aa2d419616..7ecccf7dd3 100644 --- a/go.mod +++ b/go.mod @@ -33,7 +33,7 @@ require ( github.com/lib/pq v1.10.9 github.com/maruel/panicparse/v2 v2.4.0 github.com/mitchellh/mapstructure v1.5.0 - github.com/nexus-rpc/sdk-go v0.5.2-0.20260211051645-26b0b4c584e5 + github.com/nexus-rpc/sdk-go v0.6.0 github.com/olekukonko/tablewriter v0.0.5 github.com/olivere/elastic/v7 v7.0.32 github.com/prometheus/client_golang v1.21.0 @@ -59,8 +59,8 @@ require ( go.opentelemetry.io/otel/sdk v1.40.0 go.opentelemetry.io/otel/sdk/metric v1.40.0 go.opentelemetry.io/otel/trace v1.40.0 - go.temporal.io/api v1.62.5 - go.temporal.io/sdk v1.38.0 + go.temporal.io/api v1.62.6-0.20260318231552-70f7dc9970b6 + go.temporal.io/sdk v1.41.1 go.uber.org/fx v1.24.0 go.uber.org/mock v0.6.0 go.uber.org/multierr v1.11.0 diff --git a/go.sum b/go.sum index ab7a7f9f22..a3e3a54ecc 100644 --- a/go.sum +++ b/go.sum @@ -238,8 +238,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= -github.com/nexus-rpc/sdk-go v0.5.2-0.20260211051645-26b0b4c584e5 h1:Van9KGGs8lcDgxzSNFbDhEMNeJ80TbBxwZ45f9iBk9U= -github.com/nexus-rpc/sdk-go v0.5.2-0.20260211051645-26b0b4c584e5/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk= +github.com/nexus-rpc/sdk-go v0.6.0 h1:QRgnP2zTbxEbiyWG/aXH8uSC5LV/Mg1fqb19jb4DBlo= +github.com/nexus-rpc/sdk-go v0.6.0/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod 
h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= @@ -376,10 +376,10 @@ go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZY go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= -go.temporal.io/api v1.62.5 h1:9R/9CeyM7xqHSlsNt+QIvapQLcRxCZ38bnXQx4mCN6I= -go.temporal.io/api v1.62.5/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= -go.temporal.io/sdk v1.38.0 h1:4Bok5LEdED7YKpsSjIa3dDqram5VOq+ydBf4pyx0Wo4= -go.temporal.io/sdk v1.38.0/go.mod h1:a+R2Ej28ObvHoILbHaxMyind7M6D+W0L7edt5UJF4SE= +go.temporal.io/api v1.62.6-0.20260318231552-70f7dc9970b6 h1:N3/HbW7JdvRGJHP2woHEJwJ/vFM9gCy+yIM5qWJwSQo= +go.temporal.io/api v1.62.6-0.20260318231552-70f7dc9970b6/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/sdk v1.41.1 h1:yOpvsHyDD1lNuwlGBv/SUodCPhjv9nDeC9lLHW/fJUA= +go.temporal.io/sdk v1.41.1/go.mod h1:/InXQT5guZ6AizYzpmzr5avQ/GMgq1ZObcKlKE2AhTc= go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= diff --git a/proto/internal/temporal/server/api/adminservice/v1/request_response.proto b/proto/internal/temporal/server/api/adminservice/v1/request_response.proto index e8e11a03f7..8240afb392 100644 --- a/proto/internal/temporal/server/api/adminservice/v1/request_response.proto +++ b/proto/internal/temporal/server/api/adminservice/v1/request_response.proto @@ -1,44 +1,42 @@ syntax = "proto3"; package temporal.server.api.adminservice.v1; -option go_package = "go.temporal.io/server/api/adminservice/v1;adminservice"; -import "google/protobuf/timestamp.proto"; import "google/protobuf/duration.proto"; - +import 
"google/protobuf/timestamp.proto"; +import "temporal/api/common/v1/message.proto"; import "temporal/api/enums/v1/common.proto"; import "temporal/api/enums/v1/task_queue.proto"; -import "temporal/api/common/v1/message.proto"; -import "temporal/api/version/v1/message.proto"; -import "temporal/api/workflow/v1/message.proto"; import "temporal/api/namespace/v1/message.proto"; import "temporal/api/replication/v1/message.proto"; import "temporal/api/taskqueue/v1/message.proto"; - +import "temporal/api/version/v1/message.proto"; +import "temporal/api/workflow/v1/message.proto"; import "temporal/server/api/cluster/v1/message.proto"; import "temporal/server/api/common/v1/dlq.proto"; -import "temporal/server/api/enums/v1/common.proto"; import "temporal/server/api/enums/v1/cluster.proto"; -import "temporal/server/api/enums/v1/task.proto"; +import "temporal/server/api/enums/v1/common.proto"; import "temporal/server/api/enums/v1/dlq.proto"; +import "temporal/server/api/enums/v1/task.proto"; +import "temporal/server/api/health/v1/message.proto"; import "temporal/server/api/history/v1/message.proto"; import "temporal/server/api/namespace/v1/message.proto"; -import "temporal/server/api/replication/v1/message.proto"; import "temporal/server/api/persistence/v1/cluster_metadata.proto"; import "temporal/server/api/persistence/v1/executions.proto"; -import "temporal/server/api/persistence/v1/workflow_mutable_state.proto"; -import "temporal/server/api/persistence/v1/tasks.proto"; import "temporal/server/api/persistence/v1/hsm.proto"; +import "temporal/server/api/persistence/v1/tasks.proto"; +import "temporal/server/api/persistence/v1/workflow_mutable_state.proto"; +import "temporal/server/api/replication/v1/message.proto"; import "temporal/server/api/taskqueue/v1/message.proto"; -import "temporal/server/api/health/v1/message.proto"; + +option go_package = "go.temporal.io/server/api/adminservice/v1;adminservice"; message RebuildMutableStateRequest { string namespace = 1; 
temporal.api.common.v1.WorkflowExecution execution = 2; } -message RebuildMutableStateResponse { -} +message RebuildMutableStateResponse {} message ImportWorkflowExecutionRequest { string namespace = 1; @@ -66,7 +64,7 @@ message DescribeMutableStateResponse { string history_addr = 2; // CacheMutableState is only available when mutable state is in cache. temporal.server.api.persistence.v1.WorkflowMutableState cache_mutable_state = 3; - // DatabaseMutableState is always available, + // DatabaseMutableState is always available, // but only loaded from database when mutable state is NOT in cache or skip_force_reload is false. temporal.server.api.persistence.v1.WorkflowMutableState database_mutable_state = 4; } @@ -92,8 +90,7 @@ message CloseShardRequest { int32 shard_id = 1; } -message CloseShardResponse { -} +message CloseShardResponse {} message GetShardRequest { int32 shard_id = 1; @@ -135,8 +132,7 @@ message RemoveTaskRequest { google.protobuf.Timestamp visibility_time = 4; } -message RemoveTaskResponse { -} +message RemoveTaskResponse {} /** * StartEventId defines the beginning of the event to fetch. The first event is exclusive. 
@@ -218,8 +214,7 @@ message ReapplyEventsRequest { temporal.api.common.v1.DataBlob events = 3; } -message ReapplyEventsResponse { -} +message ReapplyEventsResponse {} message AddSearchAttributesRequest { map search_attributes = 1; @@ -228,8 +223,7 @@ message AddSearchAttributesRequest { string namespace = 4; } -message AddSearchAttributesResponse { -} +message AddSearchAttributesResponse {} message RemoveSearchAttributesRequest { repeated string search_attributes = 1; @@ -237,8 +231,7 @@ message RemoveSearchAttributesRequest { string namespace = 3; } -message RemoveSearchAttributesResponse { -} +message RemoveSearchAttributesResponse {} message GetSearchAttributesRequest { string index_name = 1; @@ -292,15 +285,13 @@ message AddOrUpdateRemoteClusterRequest { bool enable_replication = 4; } -message AddOrUpdateRemoteClusterResponse { -} +message AddOrUpdateRemoteClusterResponse {} message RemoveRemoteClusterRequest { string cluster_name = 1; } -message RemoveRemoteClusterResponse { -} +message RemoveRemoteClusterResponse {} message ListClusterMembersRequest { // (-- api-linter: core::0140::prepositions=disabled @@ -344,8 +335,7 @@ message PurgeDLQMessagesRequest { int64 inclusive_end_message_id = 4; } -message PurgeDLQMessagesResponse { -} +message PurgeDLQMessagesResponse {} message MergeDLQMessagesRequest { temporal.server.api.enums.v1.DeadLetterQueueType type = 1; @@ -369,8 +359,7 @@ message RefreshWorkflowTasksRequest { uint32 archetype_id = 5; } -message RefreshWorkflowTasksResponse { -} +message RefreshWorkflowTasksResponse {} message ResendReplicationTasksRequest { string namespace_id = 1; @@ -383,8 +372,7 @@ message ResendReplicationTasksRequest { int64 end_version = 8; } -message ResendReplicationTasksResponse { -} +message ResendReplicationTasksResponse {} message GetTaskQueueTasksRequest { string namespace = 1; @@ -461,7 +449,7 @@ message GetDLQTasksResponse { message PurgeDLQTasksRequest { temporal.server.api.common.v1.HistoryDLQKey dlq_key = 1; - 
temporal.server.api.common.v1. HistoryDLQTaskMetadata inclusive_max_task_metadata = 2; + temporal.server.api.common.v1.HistoryDLQTaskMetadata inclusive_max_task_metadata = 2; } message PurgeDLQTasksResponse { @@ -554,8 +542,7 @@ message ListQueuesResponse { bytes next_page_token = 2; } -message DeepHealthCheckRequest { -} +message DeepHealthCheckRequest {} message DeepHealthCheckResponse { temporal.server.api.enums.v1.HealthState state = 1; @@ -645,40 +632,36 @@ message StartAdminBatchOperationRequest { } } -message StartAdminBatchOperationResponse { -} +message StartAdminBatchOperationResponse {} // BatchOperationRefreshTasks refreshes tasks for batch executions. // This regenerates all pending tasks for each execution. -message BatchOperationRefreshTasks { -} - +message BatchOperationRefreshTasks {} message MigrateScheduleRequest { - // Target scheduler implementation for migration. - enum SchedulerTarget { - SCHEDULER_TARGET_UNSPECIFIED = 0; - // Migrate to CHASM-backed scheduler (V2). - SCHEDULER_TARGET_CHASM = 1; - // Migrate to workflow-backed scheduler (V1). - SCHEDULER_TARGET_WORKFLOW = 2; - } - - // Namespace name. - string namespace = 1; + // Target scheduler implementation for migration. + enum SchedulerTarget { + SCHEDULER_TARGET_UNSPECIFIED = 0; + // Migrate to CHASM-backed scheduler (V2). + SCHEDULER_TARGET_CHASM = 1; + // Migrate to workflow-backed scheduler (V1). + SCHEDULER_TARGET_WORKFLOW = 2; + } - // Schedule ID. - string schedule_id = 2; + // Namespace name. + string namespace = 1; - // Target scheduler implementation. - SchedulerTarget target = 3; + // Schedule ID. + string schedule_id = 2; - // Identity of the caller. - string identity = 4; + // Target scheduler implementation. + SchedulerTarget target = 3; - // Used for request deduplication. - string request_id = 5; + // Identity of the caller. + string identity = 4; + + // Used for request deduplication. 
+ string request_id = 5; } message MigrateScheduleResponse {} - diff --git a/proto/internal/temporal/server/api/adminservice/v1/service.proto b/proto/internal/temporal/server/api/adminservice/v1/service.proto index a170a7b525..40e052fd0c 100644 --- a/proto/internal/temporal/server/api/adminservice/v1/service.proto +++ b/proto/internal/temporal/server/api/adminservice/v1/service.proto @@ -1,229 +1,230 @@ syntax = "proto3"; package temporal.server.api.adminservice.v1; -option go_package = "go.temporal.io/server/api/adminservice/v1;adminservice"; import "temporal/server/api/adminservice/v1/request_response.proto"; import "temporal/server/api/common/v1/api_category.proto"; +option go_package = "go.temporal.io/server/api/adminservice/v1;adminservice"; + // AdminService provides advanced APIs for debugging and analysis with admin privilege service AdminService { - // RebuildMutableState attempts to rebuild mutable state according to persisted history events. - // NOTE: this is experimental API - rpc RebuildMutableState (RebuildMutableStateRequest) returns (RebuildMutableStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // ImportWorkflowExecution attempts to import workflow according to persisted history events. - // NOTE: this is experimental API - rpc ImportWorkflowExecution (ImportWorkflowExecutionRequest) returns (ImportWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // DescribeWorkflowExecution returns information about the internal states of workflow execution. 
- rpc DescribeMutableState (DescribeMutableStateRequest) returns (DescribeMutableStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // DescribeHistoryHost returns information about the internal states of a history host - rpc DescribeHistoryHost (DescribeHistoryHostRequest) returns (DescribeHistoryHostResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc GetShard (GetShardRequest) returns (GetShardResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc CloseShard (CloseShardRequest) returns (CloseShardResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc ListHistoryTasks (ListHistoryTasksRequest) returns (ListHistoryTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc RemoveTask (RemoveTaskRequest) returns (RemoveTaskResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // Returns the raw history of specified workflow execution. It fails with 'NotFound' if specified workflow - // execution in unknown to the service. - // StartEventId defines the beginning of the event to fetch. The first event is inclusive. - // EndEventId and EndEventVersion defines the end of the event to fetch. The end event is exclusive. - rpc GetWorkflowExecutionRawHistoryV2 (GetWorkflowExecutionRawHistoryV2Request) returns (GetWorkflowExecutionRawHistoryV2Response) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // StartEventId defines the beginning of the event to fetch. The first event is inclusive. - // EndEventId and EndEventVersion defines the end of the event to fetch. The end event is inclusive. 
- rpc GetWorkflowExecutionRawHistory (GetWorkflowExecutionRawHistoryRequest) returns (GetWorkflowExecutionRawHistoryResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // GetReplicationMessages returns new replication tasks since the read level provided in the token. - rpc GetReplicationMessages (GetReplicationMessagesRequest) returns (GetReplicationMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // GetNamespaceReplicationMessages returns new namespace replication tasks since last retrieved task Id. - rpc GetNamespaceReplicationMessages (GetNamespaceReplicationMessagesRequest) returns (GetNamespaceReplicationMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // GetDLQReplicationMessages return replication messages based on DLQ info. - rpc GetDLQReplicationMessages(GetDLQReplicationMessagesRequest) returns (GetDLQReplicationMessagesResponse){ - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // ReapplyEvents applies stale events to the current workflow and current run. - rpc ReapplyEvents (ReapplyEventsRequest) returns (ReapplyEventsResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // AddSearchAttributes add custom search attributes and returns comprehensive information about them. - // Deprecated. Use operatorservice instead. - rpc AddSearchAttributes (AddSearchAttributesRequest) returns (AddSearchAttributesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // RemoveSearchAttributes removes custom search attributes and returns comprehensive information about them. - // Deprecated. Use operatorservice instead. 
- rpc RemoveSearchAttributes (RemoveSearchAttributesRequest) returns (RemoveSearchAttributesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // GetSearchAttributes returns comprehensive information about search attributes. - // Deprecated. Use operatorservice instead. - rpc GetSearchAttributes (GetSearchAttributesRequest) returns (GetSearchAttributesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // DescribeCluster returns information about Temporal cluster. - rpc DescribeCluster(DescribeClusterRequest) returns (DescribeClusterResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // ListClusters returns information about Temporal clusters. - rpc ListClusters(ListClustersRequest) returns (ListClustersResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // ListClusterMembers returns information about Temporal cluster members. - rpc ListClusterMembers(ListClusterMembersRequest) returns (ListClusterMembersResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // AddOrUpdateRemoteCluster adds or updates remote cluster. - rpc AddOrUpdateRemoteCluster(AddOrUpdateRemoteClusterRequest) returns (AddOrUpdateRemoteClusterResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // RemoveRemoteCluster removes remote cluster. - rpc RemoveRemoteCluster(RemoveRemoteClusterRequest) returns (RemoveRemoteClusterResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // GetDLQMessages returns messages from DLQ. 
- rpc GetDLQMessages(GetDLQMessagesRequest) returns (GetDLQMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // (-- api-linter: core::0165::response-message-name=disabled - // aip.dev/not-precedent: --) - // PurgeDLQMessages purges messages from DLQ. - rpc PurgeDLQMessages(PurgeDLQMessagesRequest) returns (PurgeDLQMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // MergeDLQMessages merges messages from DLQ. - rpc MergeDLQMessages(MergeDLQMessagesRequest) returns (MergeDLQMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // RefreshWorkflowTasks refreshes all tasks of a workflow. - rpc RefreshWorkflowTasks(RefreshWorkflowTasksRequest) returns (RefreshWorkflowTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // StartAdminBatchOperation starts an admin batch operation. Supports internal operations like RefreshWorkflowTasks. - rpc StartAdminBatchOperation(StartAdminBatchOperationRequest) returns (StartAdminBatchOperationResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // ResendReplicationTasks requests replication tasks from remote cluster and apply tasks to current cluster. - rpc ResendReplicationTasks(ResendReplicationTasksRequest) returns (ResendReplicationTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // GetTaskQueueTasks returns tasks from task queue. 
- rpc GetTaskQueueTasks(GetTaskQueueTasksRequest) returns (GetTaskQueueTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // DeleteWorkflowExecution force deletes a workflow's visibility record, current & concrete execution record and history if possible - rpc DeleteWorkflowExecution(DeleteWorkflowExecutionRequest) returns (DeleteWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc StreamWorkflowReplicationMessages(stream StreamWorkflowReplicationMessagesRequest) returns (stream StreamWorkflowReplicationMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc GetNamespace(GetNamespaceRequest) returns (GetNamespaceResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc GetDLQTasks (GetDLQTasksRequest) returns (GetDLQTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - // (-- api-linter: core::0165::response-message-name=disabled - // aip.dev/not-precedent: --) - rpc PurgeDLQTasks (PurgeDLQTasksRequest) returns (PurgeDLQTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc MergeDLQTasks (MergeDLQTasksRequest) returns (MergeDLQTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc DescribeDLQJob (DescribeDLQJobRequest) returns (DescribeDLQJobResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc CancelDLQJob (CancelDLQJobRequest) returns (CancelDLQJobResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc AddTasks (AddTasksRequest) returns (AddTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = 
API_CATEGORY_SYSTEM; - } - - rpc ListQueues (ListQueuesRequest) returns (ListQueuesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc DeepHealthCheck (DeepHealthCheckRequest) returns (DeepHealthCheckResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc SyncWorkflowState (SyncWorkflowStateRequest) returns (SyncWorkflowStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc GenerateLastHistoryReplicationTasks(GenerateLastHistoryReplicationTasksRequest) returns (GenerateLastHistoryReplicationTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc DescribeTaskQueuePartition (DescribeTaskQueuePartitionRequest) returns (DescribeTaskQueuePartitionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc ForceUnloadTaskQueuePartition (ForceUnloadTaskQueuePartitionRequest) returns (ForceUnloadTaskQueuePartitionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // MigrateSchedule migrates a schedule between V1 (workflow-backed) and V2 (CHASM-backed) implementations. - rpc MigrateSchedule (MigrateScheduleRequest) returns (MigrateScheduleResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } + // RebuildMutableState attempts to rebuild mutable state according to persisted history events. + // NOTE: this is experimental API + rpc RebuildMutableState(RebuildMutableStateRequest) returns (RebuildMutableStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // ImportWorkflowExecution attempts to import workflow according to persisted history events. 
+ // NOTE: this is experimental API + rpc ImportWorkflowExecution(ImportWorkflowExecutionRequest) returns (ImportWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // DescribeWorkflowExecution returns information about the internal states of workflow execution. + rpc DescribeMutableState(DescribeMutableStateRequest) returns (DescribeMutableStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // DescribeHistoryHost returns information about the internal states of a history host + rpc DescribeHistoryHost(DescribeHistoryHostRequest) returns (DescribeHistoryHostResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc GetShard(GetShardRequest) returns (GetShardResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc CloseShard(CloseShardRequest) returns (CloseShardResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc ListHistoryTasks(ListHistoryTasksRequest) returns (ListHistoryTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc RemoveTask(RemoveTaskRequest) returns (RemoveTaskResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // Returns the raw history of specified workflow execution. It fails with 'NotFound' if specified workflow + // execution in unknown to the service. + // StartEventId defines the beginning of the event to fetch. The first event is inclusive. + // EndEventId and EndEventVersion defines the end of the event to fetch. The end event is exclusive. 
+ rpc GetWorkflowExecutionRawHistoryV2(GetWorkflowExecutionRawHistoryV2Request) returns (GetWorkflowExecutionRawHistoryV2Response) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // StartEventId defines the beginning of the event to fetch. The first event is inclusive. + // EndEventId and EndEventVersion defines the end of the event to fetch. The end event is inclusive. + rpc GetWorkflowExecutionRawHistory(GetWorkflowExecutionRawHistoryRequest) returns (GetWorkflowExecutionRawHistoryResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // GetReplicationMessages returns new replication tasks since the read level provided in the token. + rpc GetReplicationMessages(GetReplicationMessagesRequest) returns (GetReplicationMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // GetNamespaceReplicationMessages returns new namespace replication tasks since last retrieved task Id. + rpc GetNamespaceReplicationMessages(GetNamespaceReplicationMessagesRequest) returns (GetNamespaceReplicationMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // GetDLQReplicationMessages return replication messages based on DLQ info. + rpc GetDLQReplicationMessages(GetDLQReplicationMessagesRequest) returns (GetDLQReplicationMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // ReapplyEvents applies stale events to the current workflow and current run. + rpc ReapplyEvents(ReapplyEventsRequest) returns (ReapplyEventsResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // AddSearchAttributes add custom search attributes and returns comprehensive information about them. + // Deprecated. Use operatorservice instead. 
+ rpc AddSearchAttributes(AddSearchAttributesRequest) returns (AddSearchAttributesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // RemoveSearchAttributes removes custom search attributes and returns comprehensive information about them. + // Deprecated. Use operatorservice instead. + rpc RemoveSearchAttributes(RemoveSearchAttributesRequest) returns (RemoveSearchAttributesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // GetSearchAttributes returns comprehensive information about search attributes. + // Deprecated. Use operatorservice instead. + rpc GetSearchAttributes(GetSearchAttributesRequest) returns (GetSearchAttributesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // DescribeCluster returns information about Temporal cluster. + rpc DescribeCluster(DescribeClusterRequest) returns (DescribeClusterResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // ListClusters returns information about Temporal clusters. + rpc ListClusters(ListClustersRequest) returns (ListClustersResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // ListClusterMembers returns information about Temporal cluster members. + rpc ListClusterMembers(ListClusterMembersRequest) returns (ListClusterMembersResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // AddOrUpdateRemoteCluster adds or updates remote cluster. + rpc AddOrUpdateRemoteCluster(AddOrUpdateRemoteClusterRequest) returns (AddOrUpdateRemoteClusterResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // RemoveRemoteCluster removes remote cluster. 
+ rpc RemoveRemoteCluster(RemoveRemoteClusterRequest) returns (RemoveRemoteClusterResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // GetDLQMessages returns messages from DLQ. + rpc GetDLQMessages(GetDLQMessagesRequest) returns (GetDLQMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // (-- api-linter: core::0165::response-message-name=disabled + // aip.dev/not-precedent: --) + // PurgeDLQMessages purges messages from DLQ. + rpc PurgeDLQMessages(PurgeDLQMessagesRequest) returns (PurgeDLQMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // MergeDLQMessages merges messages from DLQ. + rpc MergeDLQMessages(MergeDLQMessagesRequest) returns (MergeDLQMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // RefreshWorkflowTasks refreshes all tasks of a workflow. + rpc RefreshWorkflowTasks(RefreshWorkflowTasksRequest) returns (RefreshWorkflowTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // StartAdminBatchOperation starts an admin batch operation. Supports internal operations like RefreshWorkflowTasks. + rpc StartAdminBatchOperation(StartAdminBatchOperationRequest) returns (StartAdminBatchOperationResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // ResendReplicationTasks requests replication tasks from remote cluster and apply tasks to current cluster. + rpc ResendReplicationTasks(ResendReplicationTasksRequest) returns (ResendReplicationTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // GetTaskQueueTasks returns tasks from task queue. 
+ rpc GetTaskQueueTasks(GetTaskQueueTasksRequest) returns (GetTaskQueueTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // DeleteWorkflowExecution force deletes a workflow's visibility record, current & concrete execution record and history if possible + rpc DeleteWorkflowExecution(DeleteWorkflowExecutionRequest) returns (DeleteWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc StreamWorkflowReplicationMessages(stream StreamWorkflowReplicationMessagesRequest) returns (stream StreamWorkflowReplicationMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc GetNamespace(GetNamespaceRequest) returns (GetNamespaceResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc GetDLQTasks(GetDLQTasksRequest) returns (GetDLQTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + // (-- api-linter: core::0165::response-message-name=disabled + // aip.dev/not-precedent: --) + rpc PurgeDLQTasks(PurgeDLQTasksRequest) returns (PurgeDLQTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc MergeDLQTasks(MergeDLQTasksRequest) returns (MergeDLQTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc DescribeDLQJob(DescribeDLQJobRequest) returns (DescribeDLQJobResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc CancelDLQJob(CancelDLQJobRequest) returns (CancelDLQJobResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc AddTasks(AddTasksRequest) returns (AddTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } 
+ + rpc ListQueues(ListQueuesRequest) returns (ListQueuesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc DeepHealthCheck(DeepHealthCheckRequest) returns (DeepHealthCheckResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc SyncWorkflowState(SyncWorkflowStateRequest) returns (SyncWorkflowStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc GenerateLastHistoryReplicationTasks(GenerateLastHistoryReplicationTasksRequest) returns (GenerateLastHistoryReplicationTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc DescribeTaskQueuePartition(DescribeTaskQueuePartitionRequest) returns (DescribeTaskQueuePartitionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc ForceUnloadTaskQueuePartition(ForceUnloadTaskQueuePartitionRequest) returns (ForceUnloadTaskQueuePartitionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // MigrateSchedule migrates a schedule between V1 (workflow-backed) and V2 (CHASM-backed) implementations. 
+ rpc MigrateSchedule(MigrateScheduleRequest) returns (MigrateScheduleResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } } diff --git a/proto/internal/temporal/server/api/archiver/v1/message.proto b/proto/internal/temporal/server/api/archiver/v1/message.proto index ead6f57eb5..41680046ee 100644 --- a/proto/internal/temporal/server/api/archiver/v1/message.proto +++ b/proto/internal/temporal/server/api/archiver/v1/message.proto @@ -2,47 +2,46 @@ syntax = "proto3"; package temporal.server.api.archiver.v1; -option go_package = "go.temporal.io/server/api/archiver/v1;archiver"; - import "google/protobuf/duration.proto"; import "google/protobuf/timestamp.proto"; - import "temporal/api/common/v1/message.proto"; -import "temporal/api/history/v1/message.proto"; import "temporal/api/enums/v1/workflow.proto"; +import "temporal/api/history/v1/message.proto"; + +option go_package = "go.temporal.io/server/api/archiver/v1;archiver"; message HistoryBlobHeader { - string namespace = 1; - string namespace_id = 2; - string workflow_id = 3; - string run_id = 4; - bool is_last = 5; - int64 first_failover_version = 6; - int64 last_failover_version = 7; - int64 first_event_id = 8; - int64 last_event_id = 9; - int64 event_count = 10; + string namespace = 1; + string namespace_id = 2; + string workflow_id = 3; + string run_id = 4; + bool is_last = 5; + int64 first_failover_version = 6; + int64 last_failover_version = 7; + int64 first_event_id = 8; + int64 last_event_id = 9; + int64 event_count = 10; } -message HistoryBlob { - HistoryBlobHeader header = 1; - repeated temporal.api.history.v1.History body = 2; +message HistoryBlob { + HistoryBlobHeader header = 1; + repeated temporal.api.history.v1.History body = 2; } // VisibilityRecord is a single workflow visibility record in archive. 
message VisibilityRecord { - string namespace_id = 1; - string namespace = 2; - string workflow_id = 3; - string run_id = 4; - string workflow_type_name = 5; - google.protobuf.Timestamp start_time = 6; - google.protobuf.Timestamp execution_time = 7; - google.protobuf.Timestamp close_time = 8; - temporal.api.enums.v1.WorkflowExecutionStatus status = 9; - int64 history_length = 10; - temporal.api.common.v1.Memo memo = 11; - map search_attributes = 12; - string history_archival_uri = 13; - google.protobuf.Duration execution_duration = 14; + string namespace_id = 1; + string namespace = 2; + string workflow_id = 3; + string run_id = 4; + string workflow_type_name = 5; + google.protobuf.Timestamp start_time = 6; + google.protobuf.Timestamp execution_time = 7; + google.protobuf.Timestamp close_time = 8; + temporal.api.enums.v1.WorkflowExecutionStatus status = 9; + int64 history_length = 10; + temporal.api.common.v1.Memo memo = 11; + map search_attributes = 12; + string history_archival_uri = 13; + google.protobuf.Duration execution_duration = 14; } diff --git a/proto/internal/temporal/server/api/batch/v1/request_response.proto b/proto/internal/temporal/server/api/batch/v1/request_response.proto index e30f2456fa..cb4542e05d 100644 --- a/proto/internal/temporal/server/api/batch/v1/request_response.proto +++ b/proto/internal/temporal/server/api/batch/v1/request_response.proto @@ -1,16 +1,17 @@ syntax = "proto3"; package temporal.server.api.batch.v1; -option go_package = "go.temporal.io/server/api/batch/v1;batch"; -import "temporal/api/workflowservice/v1/request_response.proto"; +import "google/protobuf/duration.proto"; import "temporal/api/enums/v1/batch_operation.proto"; +import "temporal/api/workflowservice/v1/request_response.proto"; import "temporal/server/api/adminservice/v1/request_response.proto"; -import "google/protobuf/duration.proto"; -message BatchOperationInput { - string namespace_id = 1; - +option go_package = "go.temporal.io/server/api/batch/v1;batch"; + 
+message BatchOperationInput { + string namespace_id = 1; + int64 concurrency = 2; int64 attempts_on_retryable_error = 3; @@ -29,4 +30,4 @@ message BatchOperationInput { // The request to start an admin batch operation. // Mutually exclusive with StartBatchOperationRequest request. temporal.server.api.adminservice.v1.StartAdminBatchOperationRequest admin_request = 8; -} \ No newline at end of file +} diff --git a/proto/internal/temporal/server/api/chasm/v1/message.proto b/proto/internal/temporal/server/api/chasm/v1/message.proto index 3d4c93aa0a..83a8e5d273 100644 --- a/proto/internal/temporal/server/api/chasm/v1/message.proto +++ b/proto/internal/temporal/server/api/chasm/v1/message.proto @@ -1,22 +1,22 @@ syntax = "proto3"; package temporal.server.api.chasm.v1; -option go_package = "go.temporal.io/server/api/chasm/v1;chasm"; import "google/protobuf/timestamp.proto"; - import "temporal/api/common/v1/message.proto"; +option go_package = "go.temporal.io/server/api/chasm/v1;chasm"; + message VisibilityExecutionInfo { - string business_id = 1; - string run_id = 2; - google.protobuf.Timestamp start_time = 3; - google.protobuf.Timestamp close_time = 4; - int64 history_length = 5; - int64 history_size_bytes = 6; - int64 state_transition_count = 7; - temporal.api.common.v1.SearchAttributes chasm_search_attributes = 8; - temporal.api.common.v1.SearchAttributes custom_search_attributes = 9; - temporal.api.common.v1.Memo memo = 10; - temporal.api.common.v1.Payload chasm_memo = 11; + string business_id = 1; + string run_id = 2; + google.protobuf.Timestamp start_time = 3; + google.protobuf.Timestamp close_time = 4; + int64 history_length = 5; + int64 history_size_bytes = 6; + int64 state_transition_count = 7; + temporal.api.common.v1.SearchAttributes chasm_search_attributes = 8; + temporal.api.common.v1.SearchAttributes custom_search_attributes = 9; + temporal.api.common.v1.Memo memo = 10; + temporal.api.common.v1.Payload chasm_memo = 11; } diff --git 
a/proto/internal/temporal/server/api/checksum/v1/message.proto b/proto/internal/temporal/server/api/checksum/v1/message.proto index 8a48a8e200..d766dad78b 100644 --- a/proto/internal/temporal/server/api/checksum/v1/message.proto +++ b/proto/internal/temporal/server/api/checksum/v1/message.proto @@ -2,44 +2,42 @@ syntax = "proto3"; package temporal.server.api.checksum.v1; -option go_package = "go.temporal.io/server/api/checksum/v1;checksum"; - import "temporal/api/enums/v1/workflow.proto"; - -import "temporal/server/api/history/v1/message.proto"; import "temporal/server/api/enums/v1/workflow.proto"; +import "temporal/server/api/history/v1/message.proto"; -message MutableStateChecksumPayload { - bool cancel_requested = 1; - temporal.server.api.enums.v1.WorkflowExecutionState state = 2; - temporal.api.enums.v1.WorkflowExecutionStatus status = 3; - - int64 last_write_version = 4; - int64 last_write_event_id = 5; - int64 last_first_event_id = 6; - int64 next_event_id = 7; - int64 last_processed_event_id = 8; - - int64 signal_count = 9; - int64 activity_count = 21; - int64 child_execution_count = 22; - int64 user_timer_count = 23; - int64 request_cancel_external_count = 24; - int64 signal_external_count = 25; - - int32 workflow_task_attempt = 10; - int64 workflow_task_version = 11; - int64 workflow_task_scheduled_event_id = 12; - int64 workflow_task_started_event_id = 13; - - repeated int64 pending_timer_started_event_ids = 14; - repeated int64 pending_activity_scheduled_event_ids = 15; - repeated int64 pending_signal_initiated_event_ids = 16; - repeated int64 pending_req_cancel_initiated_event_ids = 17; - repeated int64 pending_child_initiated_event_ids = 18; - repeated string pending_chasm_node_paths = 26; - - string sticky_task_queue_name = 19; - temporal.server.api.history.v1.VersionHistories version_histories = 20; +option go_package = "go.temporal.io/server/api/checksum/v1;checksum"; +message MutableStateChecksumPayload { + bool cancel_requested = 1; + 
temporal.server.api.enums.v1.WorkflowExecutionState state = 2; + temporal.api.enums.v1.WorkflowExecutionStatus status = 3; + + int64 last_write_version = 4; + int64 last_write_event_id = 5; + int64 last_first_event_id = 6; + int64 next_event_id = 7; + int64 last_processed_event_id = 8; + + int64 signal_count = 9; + int64 activity_count = 21; + int64 child_execution_count = 22; + int64 user_timer_count = 23; + int64 request_cancel_external_count = 24; + int64 signal_external_count = 25; + + int32 workflow_task_attempt = 10; + int64 workflow_task_version = 11; + int64 workflow_task_scheduled_event_id = 12; + int64 workflow_task_started_event_id = 13; + + repeated int64 pending_timer_started_event_ids = 14; + repeated int64 pending_activity_scheduled_event_ids = 15; + repeated int64 pending_signal_initiated_event_ids = 16; + repeated int64 pending_req_cancel_initiated_event_ids = 17; + repeated int64 pending_child_initiated_event_ids = 18; + repeated string pending_chasm_node_paths = 26; + + string sticky_task_queue_name = 19; + temporal.server.api.history.v1.VersionHistories version_histories = 20; } diff --git a/proto/internal/temporal/server/api/cli/v1/message.proto b/proto/internal/temporal/server/api/cli/v1/message.proto index f633ec6564..c8cc0dc943 100644 --- a/proto/internal/temporal/server/api/cli/v1/message.proto +++ b/proto/internal/temporal/server/api/cli/v1/message.proto @@ -2,71 +2,70 @@ syntax = "proto3"; package temporal.server.api.cli.v1; -option go_package = "go.temporal.io/server/api/cli/v1;cli"; - import "google/protobuf/timestamp.proto"; - import "temporal/api/common/v1/message.proto"; import "temporal/api/enums/v1/workflow.proto"; import "temporal/api/workflow/v1/message.proto"; +option go_package = "go.temporal.io/server/api/cli/v1;cli"; + message DescribeWorkflowExecutionResponse { - temporal.api.workflow.v1.WorkflowExecutionConfig execution_config = 1; - WorkflowExecutionInfo workflow_execution_info = 2; - repeated PendingActivityInfo 
pending_activities = 3; - repeated temporal.api.workflow.v1.PendingChildExecutionInfo pending_children = 4; - temporal.api.workflow.v1.PendingWorkflowTaskInfo pending_workflow_task = 5; + temporal.api.workflow.v1.WorkflowExecutionConfig execution_config = 1; + WorkflowExecutionInfo workflow_execution_info = 2; + repeated PendingActivityInfo pending_activities = 3; + repeated temporal.api.workflow.v1.PendingChildExecutionInfo pending_children = 4; + temporal.api.workflow.v1.PendingWorkflowTaskInfo pending_workflow_task = 5; } message WorkflowExecutionInfo { - temporal.api.common.v1.WorkflowExecution execution = 1; - temporal.api.common.v1.WorkflowType type = 2; - google.protobuf.Timestamp start_time = 3; - google.protobuf.Timestamp close_time = 4; - temporal.api.enums.v1.WorkflowExecutionStatus status = 5; - int64 history_length = 6; - string parent_namespace_id = 7; - temporal.api.common.v1.WorkflowExecution parent_execution = 8; - google.protobuf.Timestamp execution_time = 9; - temporal.api.common.v1.Memo memo = 10; - SearchAttributes search_attributes = 11; - temporal.api.workflow.v1.ResetPoints auto_reset_points = 12; - int64 state_transition_count = 13; - int64 history_size_bytes = 14; - temporal.api.common.v1.WorkerVersionStamp most_recent_worker_version_stamp = 15; + temporal.api.common.v1.WorkflowExecution execution = 1; + temporal.api.common.v1.WorkflowType type = 2; + google.protobuf.Timestamp start_time = 3; + google.protobuf.Timestamp close_time = 4; + temporal.api.enums.v1.WorkflowExecutionStatus status = 5; + int64 history_length = 6; + string parent_namespace_id = 7; + temporal.api.common.v1.WorkflowExecution parent_execution = 8; + google.protobuf.Timestamp execution_time = 9; + temporal.api.common.v1.Memo memo = 10; + SearchAttributes search_attributes = 11; + temporal.api.workflow.v1.ResetPoints auto_reset_points = 12; + int64 state_transition_count = 13; + int64 history_size_bytes = 14; + temporal.api.common.v1.WorkerVersionStamp 
most_recent_worker_version_stamp = 15; } message PendingActivityInfo { - string activity_id = 1; - temporal.api.common.v1.ActivityType activity_type = 2; - temporal.api.enums.v1.PendingActivityState state = 3; - string heartbeat_details = 4; - google.protobuf.Timestamp last_heartbeat_time = 5; - google.protobuf.Timestamp last_started_time = 6; - int32 attempt = 7; - int32 maximum_attempts = 8; - google.protobuf.Timestamp scheduled_time = 9; - google.protobuf.Timestamp expiration_time = 10; - Failure last_failure = 11; - string last_worker_identity = 12; + string activity_id = 1; + temporal.api.common.v1.ActivityType activity_type = 2; + temporal.api.enums.v1.PendingActivityState state = 3; + string heartbeat_details = 4; + google.protobuf.Timestamp last_heartbeat_time = 5; + google.protobuf.Timestamp last_started_time = 6; + int32 attempt = 7; + int32 maximum_attempts = 8; + google.protobuf.Timestamp scheduled_time = 9; + google.protobuf.Timestamp expiration_time = 10; + Failure last_failure = 11; + string last_worker_identity = 12; } message SearchAttributes { - map indexed_fields = 1; + map indexed_fields = 1; } message Failure { - string message = 1; - string source = 2; - string stack_trace = 3; - Failure cause = 4; - string failure_type = 5; + string message = 1; + string source = 2; + string stack_trace = 3; + Failure cause = 4; + string failure_type = 5; } message AddSearchAttributesResponse { - string index_name = 1; - map custom_search_attributes = 2; - map system_search_attributes = 3; - map mapping = 4; - WorkflowExecutionInfo add_workflow_execution_info = 5; + string index_name = 1; + map custom_search_attributes = 2; + map system_search_attributes = 3; + map mapping = 4; + WorkflowExecutionInfo add_workflow_execution_info = 5; } diff --git a/proto/internal/temporal/server/api/clock/v1/message.proto b/proto/internal/temporal/server/api/clock/v1/message.proto index 8063d8b6b8..d2a58293a5 100644 --- 
a/proto/internal/temporal/server/api/clock/v1/message.proto +++ b/proto/internal/temporal/server/api/clock/v1/message.proto @@ -5,21 +5,21 @@ package temporal.server.api.clock.v1; option go_package = "go.temporal.io/server/api/clock/v1;clock"; message VectorClock { - int32 shard_id = 1; - int64 clock = 2; - int64 cluster_id = 3; + int32 shard_id = 1; + int64 clock = 2; + int64 cluster_id = 3; } // A Hybrid Logical Clock timestamp. // Guarantees strict total ordering for conflict resolution purposes. message HybridLogicalClock { - // Wall clock - A single time source MUST guarantee that 2 consecutive timestamps are monotonically non-decreasing. - // e.g. by storing the last wall clock and returning max(gettimeofday(), lastWallClock). - int64 wall_clock = 1; - // Incremental sequence that is reset every time the system's wallclock moves forward. - // Ensures the clock generates monotonically increasing timestamps. - int32 version = 2; - // The cluster version ID as described in the XDC docs - used as a tie breaker. - // See: https://github.com/uber/cadence/blob/master/docs/design/2290-cadence-ndc.md - int64 cluster_id = 3; + // Wall clock - A single time source MUST guarantee that 2 consecutive timestamps are monotonically non-decreasing. + // e.g. by storing the last wall clock and returning max(gettimeofday(), lastWallClock). + int64 wall_clock = 1; + // Incremental sequence that is reset every time the system's wallclock moves forward. + // Ensures the clock generates monotonically increasing timestamps. + int32 version = 2; + // The cluster version ID as described in the XDC docs - used as a tie breaker. 
+ // See: https://github.com/uber/cadence/blob/master/docs/design/2290-cadence-ndc.md + int64 cluster_id = 3; } diff --git a/proto/internal/temporal/server/api/cluster/v1/message.proto b/proto/internal/temporal/server/api/cluster/v1/message.proto index 4247fff5fb..06e95b5b6c 100644 --- a/proto/internal/temporal/server/api/cluster/v1/message.proto +++ b/proto/internal/temporal/server/api/cluster/v1/message.proto @@ -2,34 +2,33 @@ syntax = "proto3"; package temporal.server.api.cluster.v1; -option go_package = "go.temporal.io/server/api/cluster/v1;cluster"; - import "google/protobuf/timestamp.proto"; - import "temporal/server/api/enums/v1/cluster.proto"; +option go_package = "go.temporal.io/server/api/cluster/v1;cluster"; + message HostInfo { - string identity = 1; + string identity = 1; } message RingInfo { - string role = 1; - int32 member_count = 2; - repeated HostInfo members = 3; + string role = 1; + int32 member_count = 2; + repeated HostInfo members = 3; } message MembershipInfo { - HostInfo current_host = 1; - repeated string reachable_members = 2; - repeated RingInfo rings = 3; + HostInfo current_host = 1; + repeated string reachable_members = 2; + repeated RingInfo rings = 3; } message ClusterMember { - temporal.server.api.enums.v1.ClusterMemberRole role = 1; - string host_id = 2; - string rpc_address = 3; - int32 rpc_port = 4; - google.protobuf.Timestamp session_start_time = 5; - google.protobuf.Timestamp last_heartbit_time = 6; - google.protobuf.Timestamp record_expiry_time = 7; + temporal.server.api.enums.v1.ClusterMemberRole role = 1; + string host_id = 2; + string rpc_address = 3; + int32 rpc_port = 4; + google.protobuf.Timestamp session_start_time = 5; + google.protobuf.Timestamp last_heartbit_time = 6; + google.protobuf.Timestamp record_expiry_time = 7; } diff --git a/proto/internal/temporal/server/api/common/v1/api_category.proto b/proto/internal/temporal/server/api/common/v1/api_category.proto index b6d40e6068..3ac0fb25dd 100644 --- 
a/proto/internal/temporal/server/api/common/v1/api_category.proto +++ b/proto/internal/temporal/server/api/common/v1/api_category.proto @@ -2,10 +2,10 @@ syntax = "proto3"; package temporal.server.api.common.v1; -option go_package = "go.temporal.io/server/api/common/v1;commonspb"; - import "google/protobuf/descriptor.proto"; +option go_package = "go.temporal.io/server/api/common/v1;commonspb"; + extend google.protobuf.MethodOptions { optional ApiCategoryOptions api_category = 50001; } diff --git a/proto/internal/temporal/server/api/common/v1/dlq.proto b/proto/internal/temporal/server/api/common/v1/dlq.proto index 5328b2120e..f3ed852a10 100644 --- a/proto/internal/temporal/server/api/common/v1/dlq.proto +++ b/proto/internal/temporal/server/api/common/v1/dlq.proto @@ -1,10 +1,11 @@ syntax = "proto3"; package temporal.server.api.common.v1; -option go_package = "go.temporal.io/server/api/common/v1;commonspb"; import "temporal/api/common/v1/message.proto"; +option go_package = "go.temporal.io/server/api/common/v1;commonspb"; + message HistoryTask { // shard_id is included to avoid having to deserialize the task blob. 
int32 shard_id = 1; @@ -37,4 +38,3 @@ message HistoryDLQKey { string source_cluster = 2; string target_cluster = 3; } - diff --git a/proto/internal/temporal/server/api/deployment/v1/message.proto b/proto/internal/temporal/server/api/deployment/v1/message.proto index 513069a398..00d24c4243 100644 --- a/proto/internal/temporal/server/api/deployment/v1/message.proto +++ b/proto/internal/temporal/server/api/deployment/v1/message.proto @@ -2,50 +2,50 @@ syntax = "proto3"; package temporal.server.api.deployment.v1; -option go_package = "go.temporal.io/server/api/deployment/v1;deployment"; - -import "temporal/api/enums/v1/task_queue.proto"; -import "temporal/api/enums/v1/deployment.proto"; import "google/protobuf/timestamp.proto"; -import "temporal/api/deployment/v1/message.proto"; import "temporal/api/common/v1/message.proto"; +import "temporal/api/deployment/v1/message.proto"; +import "temporal/api/enums/v1/deployment.proto"; +import "temporal/api/enums/v1/task_queue.proto"; + +option go_package = "go.temporal.io/server/api/deployment/v1;deployment"; // Identifies a Worker Deployment Version. The combination of `deployment_name` and `build_id` // serve as the identifier. message WorkerDeploymentVersion { - // The name of the Deployment this version belongs too. - string deployment_name = 1; - // Build ID uniquely identifies the Deployment Version within a Deployment, but the same Build - // ID can be used in multiple Deployments. - string build_id = 2; + // The name of the Deployment this version belongs too. + string deployment_name = 1; + // Build ID uniquely identifies the Deployment Version within a Deployment, but the same Build + // ID can be used in multiple Deployments. + string build_id = 2; } // The source of truth for this data is in the WorkerDeployment entity workflows, which is // synced to all TQs whenever the source changes. // Deprecated. message DeploymentVersionData { - // Nil means unversioned. 
- WorkerDeploymentVersion version = 1; + // Nil means unversioned. + WorkerDeploymentVersion version = 1; - // Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed. - google.protobuf.Timestamp routing_update_time = 2; + // Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed. + google.protobuf.Timestamp routing_update_time = 2; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not current. - google.protobuf.Timestamp current_since_time = 3; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + // Nil if not current. + google.protobuf.Timestamp current_since_time = 3; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. - google.protobuf.Timestamp ramping_since_time = 4; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + // Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. + google.protobuf.Timestamp ramping_since_time = 4; - // Range: [0, 100]. Must be zero if the version is not ramping (i.e. `ramping_since_time` is nil). - // Can be in the range [0, 100] if the version is ramping. - float ramp_percentage = 5; + // Range: [0, 100]. Must be zero if the version is not ramping (i.e. `ramping_since_time` is nil). + // Can be in the range [0, 100] if the version is ramping. + float ramp_percentage = 5; - // Status of the Worker Deployment Version. 
- temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 6; + // Status of the Worker Deployment Version. + temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 6; } // Information that a TQ should know about a particular Deployment Version. This info is not part of @@ -53,395 +53,395 @@ message DeploymentVersionData { // As of Workflow Version `VersionDataRevisionNumber`, version specific data has its own revision // number, which makes async propagations safer and allows async registration. message WorkerDeploymentVersionData { - // Incremented everytime version data changes. Updates with lower revision number than what is - // already in the TQ will be ignored to avoid stale writes. - int64 revision_number = 1; - // Last update time. Used for garbage collecting deleted versions from TQ user data. - google.protobuf.Timestamp update_time = 2; - // In order to protect against deletes being overwritten by delayed stale writes, we can't - // immediately delete the version data from task queues. instead, we mark them as deleted while - // keeping the revision number. - // Old enough deleted versions are GCed based on update_time. - bool deleted = 3; - - temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 6; + // Incremented everytime version data changes. Updates with lower revision number than what is + // already in the TQ will be ignored to avoid stale writes. + int64 revision_number = 1; + // Last update time. Used for garbage collecting deleted versions from TQ user data. + google.protobuf.Timestamp update_time = 2; + // In order to protect against deletes being overwritten by delayed stale writes, we can't + // immediately delete the version data from task queues. instead, we mark them as deleted while + // keeping the revision number. + // Old enough deleted versions are GCed based on update_time. + // Deprecated. This mechanism is not safe against reactivation of versions after delete. 
+ // Use forget_version flag for synchronous deletion of the version data from TQ. + bool deleted = 3; + + temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 6; } // Local state for Worker Deployment Version message VersionLocalState { - WorkerDeploymentVersion version = 1; - google.protobuf.Timestamp create_time = 2; - - // Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed. - google.protobuf.Timestamp routing_update_time = 3; - - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not current. - google.protobuf.Timestamp current_since_time = 4; - - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. - google.protobuf.Timestamp ramping_since_time = 5; - - // Range: [0, 100]. Must be zero if the version is not ramping (i.e. `ramping_since_time` is nil). - // Can be in the range [0, 100] if the version is ramping. - float ramp_percentage = 6; - - // Timestamp when this version first became current or ramping. - google.protobuf.Timestamp first_activation_time = 12; - - // Timestamp when this version last became current. - // Can be used to determine whether a version has ever been Current. - google.protobuf.Timestamp last_current_time = 16; - - // Timestamp when this version last stopped being current or ramping. - google.protobuf.Timestamp last_deactivation_time = 13; - - // Helps user determine when it is safe to decommission the workers of this - // Version. Not present when version is current or ramping. - // Current limitations: - // - Not supported for Unversioned mode. - // - Periodically refreshed, may have delays up to few minutes (consult the - // last_checked_time value). 
- // - Refreshed only when version is not current or ramping AND the status is not - // "drained" yet. - // - Once the status is changed to "drained", it is not changed until the Version - // becomes Current or Ramping again, at which time the drainage info is cleared. - // This means if the Version is "drained" but new workflows are sent to it via - // Pinned Versioning Override, the status does not account for those Pinned-override - // executions and remains "drained". - temporal.api.deployment.v1.VersionDrainageInfo drainage_info = 7; - - // Arbitrary user-provided metadata attached to this version. - temporal.api.deployment.v1.VersionMetadata metadata = 8; + WorkerDeploymentVersion version = 1; + google.protobuf.Timestamp create_time = 2; + + // Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed. + google.protobuf.Timestamp routing_update_time = 3; + + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + // Nil if not current. + google.protobuf.Timestamp current_since_time = 4; + + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + // Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. + google.protobuf.Timestamp ramping_since_time = 5; + + // Range: [0, 100]. Must be zero if the version is not ramping (i.e. `ramping_since_time` is nil). + // Can be in the range [0, 100] if the version is ramping. + float ramp_percentage = 6; + + // Timestamp when this version first became current or ramping. + google.protobuf.Timestamp first_activation_time = 12; + + // Timestamp when this version last became current. + // Can be used to determine whether a version has ever been Current. 
+ google.protobuf.Timestamp last_current_time = 16; + + // Timestamp when this version last stopped being current or ramping. + google.protobuf.Timestamp last_deactivation_time = 13; + + // Helps user determine when it is safe to decommission the workers of this + // Version. Not present when version is current or ramping. + // Current limitations: + // - Not supported for Unversioned mode. + // - Periodically refreshed, may have delays up to few minutes (consult the + // last_checked_time value). + // - Refreshed only when version is not current or ramping AND the status is not + // "drained" yet. + // - Once the status is changed to "drained", it is not changed until the Version + // becomes Current or Ramping again, at which time the drainage info is cleared. + // This means if the Version is "drained" but new workflows are sent to it via + // Pinned Versioning Override, the status does not account for those Pinned-override + // executions and remains "drained". + temporal.api.deployment.v1.VersionDrainageInfo drainage_info = 7; + + // Arbitrary user-provided metadata attached to this version. + temporal.api.deployment.v1.VersionMetadata metadata = 8; // Deployment workflow should always be running before starting the version workflow. // We should not start the deployment workflow. If we cannot find the deployment workflow when signaling, it means a bug and we should fix it. // Deprecated. bool started_deployment_workflow = 9 [deprecated = true]; - // Key: Task Queue Name - map task_queue_families = 10; + // Key: Task Queue Name + map task_queue_families = 10; - // Number of task queues which will be synced in a single batch. - int32 sync_batch_size = 11; + // Number of task queues which will be synced in a single batch. 
+ int32 sync_batch_size = 11; - message TaskQueueFamilyData { - // Key: Task Queue Type - map task_queues = 1; - } + message TaskQueueFamilyData { + // Key: Task Queue Type + map task_queues = 1; + } - // Status of the Worker Deployment Version. - temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 14; + // Status of the Worker Deployment Version. + temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 14; - // Incremented everytime version data synced to TQ changes. Updates with lower revision number - // than what is already in the TQ will be ignored to avoid stale writes during async operations. - int64 revision_number = 15; + // Incremented everytime version data synced to TQ changes. Updates with lower revision number + // than what is already in the TQ will be ignored to avoid stale writes during async operations. + int64 revision_number = 15; } // Data specific to a task queue, from the perspective of a worker deployment version. -message TaskQueueVersionData { -} +message TaskQueueVersionData {} // used as Worker Deployment Version workflow input: message WorkerDeploymentVersionWorkflowArgs { - string namespace_name = 1; - string namespace_id = 2; - VersionLocalState version_state = 3; + string namespace_name = 1; + string namespace_id = 2; + VersionLocalState version_state = 3; } // used as Worker Deployment workflow input: message WorkerDeploymentWorkflowArgs { - string namespace_name = 1; - string namespace_id = 2; - string deployment_name = 3; - WorkerDeploymentLocalState state = 4; + string namespace_name = 1; + string namespace_id = 2; + string deployment_name = 3; + WorkerDeploymentLocalState state = 4; } // Local state for Worker Deployment message WorkerDeploymentLocalState { - google.protobuf.Timestamp create_time = 1; - // Encapsulates task routing information for this deployment. 
- temporal.api.deployment.v1.RoutingConfig routing_config = 2; - map versions = 3; - bytes conflict_token = 4; - string last_modifier_identity = 5; - // Number of task queues which will be synced in a single batch. - int32 sync_batch_size = 6; - string manager_identity = 7; - // Track async propagations in progress per build ID. Map: build_id -> revision numbers. - // Used to track which propagations are still pending across continue-as-new. - map propagating_revisions = 8; + google.protobuf.Timestamp create_time = 1; + // Encapsulates task routing information for this deployment. + temporal.api.deployment.v1.RoutingConfig routing_config = 2; + map versions = 3; + bytes conflict_token = 4; + string last_modifier_identity = 5; + // Number of task queues which will be synced in a single batch. + int32 sync_batch_size = 6; + string manager_identity = 7; + // Track async propagations in progress per build ID. Map: build_id -> revision numbers. + // Used to track which propagations are still pending across continue-as-new. + map propagating_revisions = 8; } // Tracks revision numbers that are currently propagating for a specific build ID message PropagatingRevisions { - repeated int64 revision_numbers = 1; + repeated int64 revision_numbers = 1; } message WorkerDeploymentVersionSummary { - string version = 1; - google.protobuf.Timestamp create_time = 2; - temporal.api.enums.v1.VersionDrainageStatus drainage_status = 3 [deprecated=true]; - // Information about workflow drainage to help the user determine when it is safe - // to decommission a Version. Not present while version is current or ramping. - temporal.api.deployment.v1.VersionDrainageInfo drainage_info = 4; - // Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed. 
- google.protobuf.Timestamp routing_update_time = 5; + string version = 1; + google.protobuf.Timestamp create_time = 2; + temporal.api.enums.v1.VersionDrainageStatus drainage_status = 3 [deprecated = true]; + // Information about workflow drainage to help the user determine when it is safe + // to decommission a Version. Not present while version is current or ramping. + temporal.api.deployment.v1.VersionDrainageInfo drainage_info = 4; + // Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed. + google.protobuf.Timestamp routing_update_time = 5; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not current. - google.protobuf.Timestamp current_since_time = 6; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + // Nil if not current. + google.protobuf.Timestamp current_since_time = 6; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. - google.protobuf.Timestamp ramping_since_time = 7; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + // Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. + google.protobuf.Timestamp ramping_since_time = 7; - // Timestamp when this version first became current or ramping. - google.protobuf.Timestamp first_activation_time = 8; + // Timestamp when this version first became current or ramping. + google.protobuf.Timestamp first_activation_time = 8; - // Timestamp when this version last became current. 
- // Can be used to determine whether a version has ever been Current. - google.protobuf.Timestamp last_current_time = 11; + // Timestamp when this version last became current. + // Can be used to determine whether a version has ever been Current. + google.protobuf.Timestamp last_current_time = 11; - // Timestamp when this version last stopped being current or ramping. - google.protobuf.Timestamp last_deactivation_time = 9; + // Timestamp when this version last stopped being current or ramping. + google.protobuf.Timestamp last_deactivation_time = 9; - // Status of the Worker Deployment Version. - temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 10; + // Status of the Worker Deployment Version. + temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 10; } // used as Worker Deployment Version workflow update input: message RegisterWorkerInVersionArgs { - string task_queue_name = 1; - temporal.api.enums.v1.TaskQueueType task_queue_type = 2; - int32 max_task_queues = 3; - string version = 4; - temporal.api.deployment.v1.RoutingConfig routing_config = 5; + string task_queue_name = 1; + temporal.api.enums.v1.TaskQueueType task_queue_type = 2; + int32 max_task_queues = 3; + string version = 4; + temporal.api.deployment.v1.RoutingConfig routing_config = 5; } // used as Worker Deployment workflow update input: message RegisterWorkerInWorkerDeploymentArgs { - string task_queue_name = 1; - temporal.api.enums.v1.TaskQueueType task_queue_type = 2; - int32 max_task_queues = 3; - WorkerDeploymentVersion version = 4; + string task_queue_name = 1; + temporal.api.enums.v1.TaskQueueType task_queue_type = 2; + int32 max_task_queues = 3; + WorkerDeploymentVersion version = 4; } // used as Worker Deployment workflow activity input: message DescribeVersionFromWorkerDeploymentActivityArgs { - string version = 1; + string version = 1; } message DescribeVersionFromWorkerDeploymentActivityResult { - // All the Task Queues that have ever polled from this Deployment version. 
- repeated temporal.api.deployment.v1.WorkerDeploymentVersionInfo.VersionTaskQueueInfo task_queue_infos = 1; + // All the Task Queues that have ever polled from this Deployment version. + repeated temporal.api.deployment.v1.WorkerDeploymentVersionInfo.VersionTaskQueueInfo task_queue_infos = 1; } // used as Worker Deployment workflow update input (sent from Worker Deployment workflow): message SyncVersionStateUpdateArgs { - // Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed. - google.protobuf.Timestamp routing_update_time = 1 [deprecated = true]; + // Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed. + google.protobuf.Timestamp routing_update_time = 1 [deprecated = true]; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not current. - google.protobuf.Timestamp current_since_time = 2 [deprecated = true]; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + // Nil if not current. + google.protobuf.Timestamp current_since_time = 2 [deprecated = true]; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. - google.protobuf.Timestamp ramping_since_time = 3 [deprecated = true]; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + // Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. + google.protobuf.Timestamp ramping_since_time = 3 [deprecated = true]; - // Range: [0, 100]. Must be zero if the version is not ramping (i.e. 
`ramping_since_time` is nil). - // Can be in the range [0, 100] if the version is ramping. - float ramp_percentage = 4 [deprecated = true]; + // Range: [0, 100]. Must be zero if the version is not ramping (i.e. `ramping_since_time` is nil). + // Can be in the range [0, 100] if the version is ramping. + float ramp_percentage = 4 [deprecated = true]; - // Full routing config for async propagation mode. When present, the version workflow - // will propagate the entire routing config asynchronously. When absent, sync mode is used. - temporal.api.deployment.v1.RoutingConfig routing_config = 5; + // Full routing config for async propagation mode. When present, the version workflow + // will propagate the entire routing config asynchronously. When absent, sync mode is used. + temporal.api.deployment.v1.RoutingConfig routing_config = 5; } // used as Worker Deployment workflow update response (sent from Worker Deployment workflow): message SyncVersionStateResponse { - // Deprecated. State could be so large, no need to send it to the deployment workflow. - VersionLocalState version_state = 1 [deprecated = true]; - WorkerDeploymentVersionSummary summary = 2; + // Deprecated. State could be so large, no need to send it to the deployment workflow. 
+ VersionLocalState version_state = 1 [deprecated = true]; + WorkerDeploymentVersionSummary summary = 2; } // Sent from Version workflow to Worker Deployment workflow message AddVersionUpdateArgs { - string version = 1; - google.protobuf.Timestamp create_time = 2; + string version = 1; + google.protobuf.Timestamp create_time = 2; } // Sent from Drainage child workflow to Version parent message SyncDrainageInfoSignalArgs { - temporal.api.deployment.v1.VersionDrainageInfo drainage_info = 1; + temporal.api.deployment.v1.VersionDrainageInfo drainage_info = 1; } // Sent from Version workflow to Worker Deployment workflow message SyncDrainageStatusSignalArgs { - string version = 1; - temporal.api.enums.v1.VersionDrainageStatus drainage_status = 2; + string version = 1; + temporal.api.enums.v1.VersionDrainageStatus drainage_status = 2; } // Sent from Version workflow to Worker Deployment workflow when async propagation completes message PropagationCompletionInfo { - int64 revision_number = 1; - string build_id = 2; + int64 revision_number = 1; + string build_id = 2; } - // used as Worker Deployment Version workflow query response: message QueryDescribeVersionResponse { - VersionLocalState version_state = 1; + VersionLocalState version_state = 1; } // used as Worker Deployment Version workflow query response: message QueryDescribeWorkerDeploymentResponse { - WorkerDeploymentLocalState state = 1; + WorkerDeploymentLocalState state = 1; } // used as Worker Deployment Version workflow activity input: message StartWorkerDeploymentRequest { - string deployment_name = 1; - string request_id = 2; + string deployment_name = 1; + string request_id = 2; } // used as Worker Deployment workflow activity input: message StartWorkerDeploymentVersionRequest { - string deployment_name = 1; - string build_id = 2; - string request_id = 3; + string deployment_name = 1; + string build_id = 2; + string request_id = 3; } // used as Worker Deployment Version workflow activity input: message 
SyncDeploymentVersionUserDataRequest { - string deployment_name = 4; - WorkerDeploymentVersion version = 1; - repeated SyncUserData sync = 2; - // if true, the version will be forgotten from the task queue user data. - bool forget_version = 3; - // Async mode: full routing config to propagate (includes revision_number) - temporal.api.deployment.v1.RoutingConfig update_routing_config = 5; - // Async mode: version-specific data to upsert - WorkerDeploymentVersionData upsert_version_data = 6; - - message SyncUserData { - string name = 1; - repeated temporal.api.enums.v1.TaskQueueType types = 2; - DeploymentVersionData data = 3; - } + string deployment_name = 4; + WorkerDeploymentVersion version = 1; + repeated SyncUserData sync = 2; + // if true, the version will be forgotten from the task queue user data. + bool forget_version = 3; + // Async mode: full routing config to propagate (includes revision_number) + temporal.api.deployment.v1.RoutingConfig update_routing_config = 5; + // Async mode: version-specific data to upsert + WorkerDeploymentVersionData upsert_version_data = 6; + + message SyncUserData { + string name = 1; + repeated temporal.api.enums.v1.TaskQueueType types = 2; + DeploymentVersionData data = 3; + } } // used as Worker Deployment Version workflow activity output: message SyncDeploymentVersionUserDataResponse { - map task_queue_max_versions = 1; + map task_queue_max_versions = 1; } // used as Worker Deployment Version workflow activity input: message CheckWorkerDeploymentUserDataPropagationRequest { - map task_queue_max_versions = 1; + map task_queue_max_versions = 1; } // used as Worker Deployment workflow activity input: message SyncUnversionedRampActivityArgs { - string current_version = 1; - SyncVersionStateUpdateArgs update_args = 2; + string current_version = 1; + SyncVersionStateUpdateArgs update_args = 2; } // used as Worker Deployment workflow activity output: message SyncUnversionedRampActivityResponse { - map task_queue_max_versions = 1; + 
map task_queue_max_versions = 1; } // used as Worker Deployment Version workflow update input: message UpdateVersionMetadataArgs { - map upsert_entries = 1; - repeated string remove_entries = 2; - string identity = 3; + map upsert_entries = 1; + repeated string remove_entries = 2; + string identity = 3; } // used as Worker Deployment Version workflow update response: message UpdateVersionMetadataResponse { - temporal.api.deployment.v1.VersionMetadata metadata = 1; + temporal.api.deployment.v1.VersionMetadata metadata = 1; } // used as Worker Deployment workflow update input: message SetCurrentVersionArgs { - string identity = 1; - string version = 2; - bool ignore_missing_task_queues = 3; - bytes conflict_token = 4; - bool allow_no_pollers = 5; + string identity = 1; + string version = 2; + bool ignore_missing_task_queues = 3; + bytes conflict_token = 4; + bool allow_no_pollers = 5; } // used as Worker Deployment update response: message SetCurrentVersionResponse { - string previous_version = 1; - bytes conflict_token = 2; + string previous_version = 1; + bytes conflict_token = 2; } // used as Worker Deployment workflow update input: message DeleteVersionArgs { - string identity = 1; - string version = 2; - bool skip_drainage = 3; - // If true, it would mean that the delete operation is initiated by the server internally. This is done on the - // event that the addition of a version exceeds the max number of versions allowed in a worker-deployment (defaultMaxVersions). - // False elsewhere. - bool server_delete = 4; - // version workflow does not block the update for tq propagation - bool async_propagation = 5; + string identity = 1; + string version = 2; + bool skip_drainage = 3; + // If true, it would mean that the delete operation is initiated by the server internally. This is done on the + // event that the addition of a version exceeds the max number of versions allowed in a worker-deployment (defaultMaxVersions). + // False elsewhere. 
+ bool server_delete = 4; + // version workflow does not block the update for tq propagation + bool async_propagation = 5; } // used as Worker Deployment Activity input: message DeleteVersionActivityArgs { - string identity = 1; - string deployment_name = 2; - string version = 3; - string request_id = 4; - bool skip_drainage = 5; - bool async_propagation = 6; + string identity = 1; + string deployment_name = 2; + string version = 3; + string request_id = 4; + bool skip_drainage = 5; + bool async_propagation = 6; } // used as Worker Deployment Activity input: message CheckTaskQueuesHavePollersActivityArgs { - // Key: Task Queue Name - map task_queues_and_types = 1; + // Key: Task Queue Name + map task_queues_and_types = 1; - message TaskQueueTypes { - repeated temporal.api.enums.v1.TaskQueueType types = 1; - } + message TaskQueueTypes { + repeated temporal.api.enums.v1.TaskQueueType types = 1; + } - WorkerDeploymentVersion worker_deployment_version = 2; + WorkerDeploymentVersion worker_deployment_version = 2; } // used as Worker Deployment workflow update input: message DeleteDeploymentArgs { - string identity = 1; + string identity = 1; } // used as Worker Deployment update response: message SetRampingVersionResponse { - string previous_version = 1; - float previous_percentage = 2; - bytes conflict_token = 3; + string previous_version = 1; + float previous_percentage = 2; + bytes conflict_token = 3; } // used as Worker Deployment workflow update input: message SetRampingVersionArgs { - string identity = 1; - string version = 2; - float percentage = 3; - bool ignore_missing_task_queues = 4; - bytes conflict_token = 5; - bool allow_no_pollers = 6; + string identity = 1; + string version = 2; + float percentage = 3; + bool ignore_missing_task_queues = 4; + bytes conflict_token = 5; + bool allow_no_pollers = 6; } // used as Worker Deployment workflow update input: @@ -461,60 +461,60 @@ message SetManagerIdentityResponse { // used as Worker Deployment activity input: 
message SyncVersionStateActivityArgs { - string deployment_name = 1; - // . or possibly just in the future - string version = 2; - SyncVersionStateUpdateArgs update_args = 3; - string request_id = 4; + string deployment_name = 1; + // . or possibly just in the future + string version = 2; + SyncVersionStateUpdateArgs update_args = 3; + string request_id = 4; } // used as Worker Deployment activity result: message SyncVersionStateActivityResult { - VersionLocalState version_state = 1 [deprecated = true]; - WorkerDeploymentVersionSummary summary = 2; + VersionLocalState version_state = 1 [deprecated = true]; + WorkerDeploymentVersionSummary summary = 2; } // used as Worker Deployment activity input: message IsVersionMissingTaskQueuesArgs { - string prev_current_version = 1; - string new_current_version = 2; + string prev_current_version = 1; + string new_current_version = 2; } // used as Worker Deployment activity output: message IsVersionMissingTaskQueuesResult { - bool is_missing_task_queues = 1; + bool is_missing_task_queues = 1; } // used as Worker Deployment workflow memo: message WorkerDeploymentWorkflowMemo { - string deployment_name = 1; - google.protobuf.Timestamp create_time = 2; - temporal.api.deployment.v1.RoutingConfig routing_config = 3; - temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary latest_version_summary = 4; - temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary current_version_summary = 5; - temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary ramping_version_summary = 6; + string deployment_name = 1; + google.protobuf.Timestamp create_time = 2; + temporal.api.deployment.v1.RoutingConfig routing_config = 3; + temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary latest_version_summary = 4; + temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary current_version_summary = 5; + 
temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary ramping_version_summary = 6; } // Subset of fields of WorkerDeploymentInfo returned in ListWorkerDeploymentsResponse message WorkerDeploymentSummary { - string name = 1; - google.protobuf.Timestamp create_time = 2; - temporal.api.deployment.v1.RoutingConfig routing_config = 3; - temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary latest_version_summary = 4; - temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary current_version_summary = 5; - temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary ramping_version_summary = 6; + string name = 1; + google.protobuf.Timestamp create_time = 2; + temporal.api.deployment.v1.RoutingConfig routing_config = 3; + temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary latest_version_summary = 4; + temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary current_version_summary = 5; + temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary ramping_version_summary = 6; } // Signal input for force-continue-as-new on Deployment workflow message ForceCANDeploymentSignalArgs { - // If provided, this state will be used instead of the current state - // when performing continue-as-new. - WorkerDeploymentLocalState override_state = 1; + // If provided, this state will be used instead of the current state + // when performing continue-as-new. + WorkerDeploymentLocalState override_state = 1; } // Signal input for force-continue-as-new on Version workflow message ForceCANVersionSignalArgs { - // If provided, this state will be used instead of the current state - // when performing continue-as-new. - VersionLocalState override_state = 1; + // If provided, this state will be used instead of the current state + // when performing continue-as-new. 
+ VersionLocalState override_state = 1; } diff --git a/proto/internal/temporal/server/api/enums/v1/cluster.proto b/proto/internal/temporal/server/api/enums/v1/cluster.proto index 5730717527..bcd5e30f91 100644 --- a/proto/internal/temporal/server/api/enums/v1/cluster.proto +++ b/proto/internal/temporal/server/api/enums/v1/cluster.proto @@ -5,21 +5,21 @@ package temporal.server.api.enums.v1; option go_package = "go.temporal.io/server/api/enums/v1;enums"; enum ClusterMemberRole { - CLUSTER_MEMBER_ROLE_UNSPECIFIED = 0; - CLUSTER_MEMBER_ROLE_FRONTEND = 1; - CLUSTER_MEMBER_ROLE_HISTORY = 2; - CLUSTER_MEMBER_ROLE_MATCHING = 3; - CLUSTER_MEMBER_ROLE_WORKER = 4; + CLUSTER_MEMBER_ROLE_UNSPECIFIED = 0; + CLUSTER_MEMBER_ROLE_FRONTEND = 1; + CLUSTER_MEMBER_ROLE_HISTORY = 2; + CLUSTER_MEMBER_ROLE_MATCHING = 3; + CLUSTER_MEMBER_ROLE_WORKER = 4; } enum HealthState { - HEALTH_STATE_UNSPECIFIED = 0; - // The host is in a healthy state. - HEALTH_STATE_SERVING = 1; - // The host is unhealthy through external observation. - HEALTH_STATE_NOT_SERVING = 2; - // The host has marked itself as not ready to serve traffic. - HEALTH_STATE_DECLINED_SERVING = 3; - // An internal error occurred while checking health (e.g. resolver failure). - HEALTH_STATE_INTERNAL_ERROR = 4; + HEALTH_STATE_UNSPECIFIED = 0; + // The host is in a healthy state. + HEALTH_STATE_SERVING = 1; + // The host is unhealthy through external observation. + HEALTH_STATE_NOT_SERVING = 2; + // The host has marked itself as not ready to serve traffic. + HEALTH_STATE_DECLINED_SERVING = 3; + // An internal error occurred while checking health (e.g. resolver failure). 
+ HEALTH_STATE_INTERNAL_ERROR = 4; } diff --git a/proto/internal/temporal/server/api/enums/v1/common.proto b/proto/internal/temporal/server/api/enums/v1/common.proto index 894740a58c..9dc22ccb46 100644 --- a/proto/internal/temporal/server/api/enums/v1/common.proto +++ b/proto/internal/temporal/server/api/enums/v1/common.proto @@ -5,28 +5,28 @@ package temporal.server.api.enums.v1; option go_package = "go.temporal.io/server/api/enums/v1;enums"; enum DeadLetterQueueType { - DEAD_LETTER_QUEUE_TYPE_UNSPECIFIED = 0; - DEAD_LETTER_QUEUE_TYPE_REPLICATION = 1; - DEAD_LETTER_QUEUE_TYPE_NAMESPACE = 2; + DEAD_LETTER_QUEUE_TYPE_UNSPECIFIED = 0; + DEAD_LETTER_QUEUE_TYPE_REPLICATION = 1; + DEAD_LETTER_QUEUE_TYPE_NAMESPACE = 2; } enum ChecksumFlavor { - CHECKSUM_FLAVOR_UNSPECIFIED = 0; - CHECKSUM_FLAVOR_IEEE_CRC32_OVER_PROTO3_BINARY = 1; + CHECKSUM_FLAVOR_UNSPECIFIED = 0; + CHECKSUM_FLAVOR_IEEE_CRC32_OVER_PROTO3_BINARY = 1; } // State of a callback. enum CallbackState { - // Default value, unspecified state. - CALLBACK_STATE_UNSPECIFIED = 0; - // Callback is standing by, waiting to be triggered. - CALLBACK_STATE_STANDBY = 1; - // Callback is in the queue waiting to be executed or is currently executing. - CALLBACK_STATE_SCHEDULED = 2; - // Callback has failed with a retryable error and is backing off before the next attempt. - CALLBACK_STATE_BACKING_OFF = 3; - // Callback has failed. - CALLBACK_STATE_FAILED = 4; - // Callback has succeeded. - CALLBACK_STATE_SUCCEEDED = 5; + // Default value, unspecified state. + CALLBACK_STATE_UNSPECIFIED = 0; + // Callback is standing by, waiting to be triggered. + CALLBACK_STATE_STANDBY = 1; + // Callback is in the queue waiting to be executed or is currently executing. + CALLBACK_STATE_SCHEDULED = 2; + // Callback has failed with a retryable error and is backing off before the next attempt. + CALLBACK_STATE_BACKING_OFF = 3; + // Callback has failed. + CALLBACK_STATE_FAILED = 4; + // Callback has succeeded. 
+ CALLBACK_STATE_SUCCEEDED = 5; } diff --git a/proto/internal/temporal/server/api/enums/v1/fairness_state.proto b/proto/internal/temporal/server/api/enums/v1/fairness_state.proto index 140d79a78d..ebce3c5f71 100644 --- a/proto/internal/temporal/server/api/enums/v1/fairness_state.proto +++ b/proto/internal/temporal/server/api/enums/v1/fairness_state.proto @@ -5,8 +5,8 @@ package temporal.server.api.enums.v1; option go_package = "go.temporal.io/server/api/enums/v1;enums"; enum FairnessState { - FAIRNESS_STATE_UNSPECIFIED = 0; - FAIRNESS_STATE_V0 = 1; - FAIRNESS_STATE_V1 = 2; - FAIRNESS_STATE_V2 = 3; -}; + FAIRNESS_STATE_UNSPECIFIED = 0; + FAIRNESS_STATE_V0 = 1; + FAIRNESS_STATE_V1 = 2; + FAIRNESS_STATE_V2 = 3; +} diff --git a/proto/internal/temporal/server/api/enums/v1/nexus.proto b/proto/internal/temporal/server/api/enums/v1/nexus.proto index 51a820236d..d7c986c013 100644 --- a/proto/internal/temporal/server/api/enums/v1/nexus.proto +++ b/proto/internal/temporal/server/api/enums/v1/nexus.proto @@ -5,23 +5,23 @@ package temporal.server.api.enums.v1; option go_package = "go.temporal.io/server/api/enums/v1;enums"; enum NexusOperationState { - // Default value, unspecified state. - NEXUS_OPERATION_STATE_UNSPECIFIED = 0; - // Operation is in the queue waiting to be executed or is currently executing. - NEXUS_OPERATION_STATE_SCHEDULED = 1; - // Operation has failed with a retryable error and is backing off before the next attempt. - NEXUS_OPERATION_STATE_BACKING_OFF = 2; - // Operation was started and will complete asynchronously. - NEXUS_OPERATION_STATE_STARTED = 3; - // Operation succeeded. - // This may happen either as a response to a start request or as reported via callback. - NEXUS_OPERATION_STATE_SUCCEEDED = 4; - // Operation failed either when a start request encounters a non-retryable error or as reported via callback. - NEXUS_OPERATION_STATE_FAILED = 5; - // Operation completed as canceled (may have not ever been delivered). 
- // This may happen either as a response to a start request or as reported via callback. - NEXUS_OPERATION_STATE_CANCELED = 6; - // Operation timed out - exceeded the user supplied schedule-to-close timeout. - // Any attempts to complete the operation in this state will be ignored. - NEXUS_OPERATION_STATE_TIMED_OUT = 7; + // Default value, unspecified state. + NEXUS_OPERATION_STATE_UNSPECIFIED = 0; + // Operation is in the queue waiting to be executed or is currently executing. + NEXUS_OPERATION_STATE_SCHEDULED = 1; + // Operation has failed with a retryable error and is backing off before the next attempt. + NEXUS_OPERATION_STATE_BACKING_OFF = 2; + // Operation was started and will complete asynchronously. + NEXUS_OPERATION_STATE_STARTED = 3; + // Operation succeeded. + // This may happen either as a response to a start request or as reported via callback. + NEXUS_OPERATION_STATE_SUCCEEDED = 4; + // Operation failed either when a start request encounters a non-retryable error or as reported via callback. + NEXUS_OPERATION_STATE_FAILED = 5; + // Operation completed as canceled (may have not ever been delivered). + // This may happen either as a response to a start request or as reported via callback. + NEXUS_OPERATION_STATE_CANCELED = 6; + // Operation timed out - exceeded the user supplied schedule-to-close timeout. + // Any attempts to complete the operation in this state will be ignored. 
+ NEXUS_OPERATION_STATE_TIMED_OUT = 7; } diff --git a/proto/internal/temporal/server/api/enums/v1/predicate.proto b/proto/internal/temporal/server/api/enums/v1/predicate.proto index 01a3ae191a..1e0abe4533 100644 --- a/proto/internal/temporal/server/api/enums/v1/predicate.proto +++ b/proto/internal/temporal/server/api/enums/v1/predicate.proto @@ -5,17 +5,17 @@ package temporal.server.api.enums.v1; option go_package = "go.temporal.io/server/api/enums/v1;enums"; enum PredicateType { - PREDICATE_TYPE_UNSPECIFIED = 0; - PREDICATE_TYPE_UNIVERSAL = 1; - PREDICATE_TYPE_EMPTY = 2; - PREDICATE_TYPE_AND = 3; - PREDICATE_TYPE_OR = 4; - PREDICATE_TYPE_NOT = 5; - PREDICATE_TYPE_NAMESPACE_ID = 6; - PREDICATE_TYPE_TASK_TYPE = 7; - PREDICATE_TYPE_DESTINATION = 8; - PREDICATE_TYPE_OUTBOUND_TASK_GROUP = 9; - // Predicate used for grouping outbound tasks. Consists of task_group, namespace_id, and destination. - // This replaces a previous implementation which used an AND predicate over 3 separate predicate types. - PREDICATE_TYPE_OUTBOUND_TASK = 10; + PREDICATE_TYPE_UNSPECIFIED = 0; + PREDICATE_TYPE_UNIVERSAL = 1; + PREDICATE_TYPE_EMPTY = 2; + PREDICATE_TYPE_AND = 3; + PREDICATE_TYPE_OR = 4; + PREDICATE_TYPE_NOT = 5; + PREDICATE_TYPE_NAMESPACE_ID = 6; + PREDICATE_TYPE_TASK_TYPE = 7; + PREDICATE_TYPE_DESTINATION = 8; + PREDICATE_TYPE_OUTBOUND_TASK_GROUP = 9; + // Predicate used for grouping outbound tasks. Consists of task_group, namespace_id, and destination. + // This replaces a previous implementation which used an AND predicate over 3 separate predicate types. 
+ PREDICATE_TYPE_OUTBOUND_TASK = 10; } diff --git a/proto/internal/temporal/server/api/enums/v1/replication.proto b/proto/internal/temporal/server/api/enums/v1/replication.proto index be1f2490e7..ce44d76acb 100644 --- a/proto/internal/temporal/server/api/enums/v1/replication.proto +++ b/proto/internal/temporal/server/api/enums/v1/replication.proto @@ -5,29 +5,29 @@ package temporal.server.api.enums.v1; option go_package = "go.temporal.io/server/api/enums/v1;enums"; enum ReplicationTaskType { - REPLICATION_TASK_TYPE_UNSPECIFIED = 0; - REPLICATION_TASK_TYPE_NAMESPACE_TASK = 1; - REPLICATION_TASK_TYPE_HISTORY_TASK = 2; - REPLICATION_TASK_TYPE_SYNC_SHARD_STATUS_TASK = 3; - REPLICATION_TASK_TYPE_SYNC_ACTIVITY_TASK = 4; - REPLICATION_TASK_TYPE_HISTORY_METADATA_TASK = 5; - REPLICATION_TASK_TYPE_HISTORY_V2_TASK = 6; - REPLICATION_TASK_TYPE_SYNC_WORKFLOW_STATE_TASK = 7; - REPLICATION_TASK_TYPE_TASK_QUEUE_USER_DATA = 8; - REPLICATION_TASK_TYPE_SYNC_HSM_TASK = 9; - REPLICATION_TASK_TYPE_BACKFILL_HISTORY_TASK = 10; - REPLICATION_TASK_TYPE_VERIFY_VERSIONED_TRANSITION_TASK = 11; - REPLICATION_TASK_TYPE_SYNC_VERSIONED_TRANSITION_TASK = 12; + REPLICATION_TASK_TYPE_UNSPECIFIED = 0; + REPLICATION_TASK_TYPE_NAMESPACE_TASK = 1; + REPLICATION_TASK_TYPE_HISTORY_TASK = 2; + REPLICATION_TASK_TYPE_SYNC_SHARD_STATUS_TASK = 3; + REPLICATION_TASK_TYPE_SYNC_ACTIVITY_TASK = 4; + REPLICATION_TASK_TYPE_HISTORY_METADATA_TASK = 5; + REPLICATION_TASK_TYPE_HISTORY_V2_TASK = 6; + REPLICATION_TASK_TYPE_SYNC_WORKFLOW_STATE_TASK = 7; + REPLICATION_TASK_TYPE_TASK_QUEUE_USER_DATA = 8; + REPLICATION_TASK_TYPE_SYNC_HSM_TASK = 9; + REPLICATION_TASK_TYPE_BACKFILL_HISTORY_TASK = 10; + REPLICATION_TASK_TYPE_VERIFY_VERSIONED_TRANSITION_TASK = 11; + REPLICATION_TASK_TYPE_SYNC_VERSIONED_TRANSITION_TASK = 12; } enum NamespaceOperation { - NAMESPACE_OPERATION_UNSPECIFIED = 0; - NAMESPACE_OPERATION_CREATE = 1; - NAMESPACE_OPERATION_UPDATE = 2; + NAMESPACE_OPERATION_UNSPECIFIED = 0; + NAMESPACE_OPERATION_CREATE = 1; + 
NAMESPACE_OPERATION_UPDATE = 2; } enum ReplicationFlowControlCommand { - REPLICATION_FLOW_CONTROL_COMMAND_UNSPECIFIED = 0; - REPLICATION_FLOW_CONTROL_COMMAND_RESUME = 1; - REPLICATION_FLOW_CONTROL_COMMAND_PAUSE = 2; + REPLICATION_FLOW_CONTROL_COMMAND_UNSPECIFIED = 0; + REPLICATION_FLOW_CONTROL_COMMAND_RESUME = 1; + REPLICATION_FLOW_CONTROL_COMMAND_PAUSE = 2; } diff --git a/proto/internal/temporal/server/api/enums/v1/task.proto b/proto/internal/temporal/server/api/enums/v1/task.proto index 34e026271a..1f9dba5c4e 100644 --- a/proto/internal/temporal/server/api/enums/v1/task.proto +++ b/proto/internal/temporal/server/api/enums/v1/task.proto @@ -6,65 +6,65 @@ option go_package = "go.temporal.io/server/api/enums/v1;enums"; // TaskSource is the source from which a task was produced. enum TaskSource { - TASK_SOURCE_UNSPECIFIED = 0; - // Task produced by history service. - TASK_SOURCE_HISTORY = 1; - // Task produced from matching db backlog. - TASK_SOURCE_DB_BACKLOG = 2; + TASK_SOURCE_UNSPECIFIED = 0; + // Task produced by history service. + TASK_SOURCE_HISTORY = 1; + // Task produced from matching db backlog. 
+ TASK_SOURCE_DB_BACKLOG = 2; } enum TaskType { - TASK_TYPE_UNSPECIFIED = 0; - TASK_TYPE_REPLICATION_HISTORY = 1; - TASK_TYPE_REPLICATION_SYNC_ACTIVITY = 2; - TASK_TYPE_TRANSFER_WORKFLOW_TASK = 3; - TASK_TYPE_TRANSFER_ACTIVITY_TASK = 4; - TASK_TYPE_TRANSFER_CLOSE_EXECUTION = 5; - TASK_TYPE_TRANSFER_CANCEL_EXECUTION = 6; - TASK_TYPE_TRANSFER_START_CHILD_EXECUTION = 7; - TASK_TYPE_TRANSFER_SIGNAL_EXECUTION = 8; - reserved 9; // TASK_TYPE_TRANSFER_RECORD_WORKFLOW_STARTED - TASK_TYPE_TRANSFER_RESET_WORKFLOW = 10; - reserved 11; // TASK_TYPE_TRANSFER_UPSERT_WORKFLOW_SEARCH_ATTRIBUTES - TASK_TYPE_WORKFLOW_TASK_TIMEOUT = 12; - TASK_TYPE_ACTIVITY_TIMEOUT = 13; - TASK_TYPE_USER_TIMER = 14; - TASK_TYPE_WORKFLOW_RUN_TIMEOUT = 15; - TASK_TYPE_DELETE_HISTORY_EVENT = 16; - TASK_TYPE_ACTIVITY_RETRY_TIMER = 17; - TASK_TYPE_WORKFLOW_BACKOFF_TIMER = 18; - TASK_TYPE_VISIBILITY_START_EXECUTION = 19; - TASK_TYPE_VISIBILITY_UPSERT_EXECUTION = 20; - TASK_TYPE_VISIBILITY_CLOSE_EXECUTION = 21; - TASK_TYPE_VISIBILITY_DELETE_EXECUTION = 22; - reserved 23; - TASK_TYPE_TRANSFER_DELETE_EXECUTION = 24; - TASK_TYPE_REPLICATION_SYNC_WORKFLOW_STATE = 25; - TASK_TYPE_ARCHIVAL_ARCHIVE_EXECUTION = 26; - // An immediate outbound task generated by a state machine. - // Outbound tasks specify a destination that is used to group tasks into a per namespace-and-destination - // scheduler. - TASK_TYPE_STATE_MACHINE_OUTBOUND = 27; - // A timer task generated by a state machine. 
- TASK_TYPE_STATE_MACHINE_TIMER = 28; + TASK_TYPE_UNSPECIFIED = 0; + TASK_TYPE_REPLICATION_HISTORY = 1; + TASK_TYPE_REPLICATION_SYNC_ACTIVITY = 2; + TASK_TYPE_TRANSFER_WORKFLOW_TASK = 3; + TASK_TYPE_TRANSFER_ACTIVITY_TASK = 4; + TASK_TYPE_TRANSFER_CLOSE_EXECUTION = 5; + TASK_TYPE_TRANSFER_CANCEL_EXECUTION = 6; + TASK_TYPE_TRANSFER_START_CHILD_EXECUTION = 7; + TASK_TYPE_TRANSFER_SIGNAL_EXECUTION = 8; + reserved 9; // TASK_TYPE_TRANSFER_RECORD_WORKFLOW_STARTED + TASK_TYPE_TRANSFER_RESET_WORKFLOW = 10; + reserved 11; // TASK_TYPE_TRANSFER_UPSERT_WORKFLOW_SEARCH_ATTRIBUTES + TASK_TYPE_WORKFLOW_TASK_TIMEOUT = 12; + TASK_TYPE_ACTIVITY_TIMEOUT = 13; + TASK_TYPE_USER_TIMER = 14; + TASK_TYPE_WORKFLOW_RUN_TIMEOUT = 15; + TASK_TYPE_DELETE_HISTORY_EVENT = 16; + TASK_TYPE_ACTIVITY_RETRY_TIMER = 17; + TASK_TYPE_WORKFLOW_BACKOFF_TIMER = 18; + TASK_TYPE_VISIBILITY_START_EXECUTION = 19; + TASK_TYPE_VISIBILITY_UPSERT_EXECUTION = 20; + TASK_TYPE_VISIBILITY_CLOSE_EXECUTION = 21; + TASK_TYPE_VISIBILITY_DELETE_EXECUTION = 22; + reserved 23; + TASK_TYPE_TRANSFER_DELETE_EXECUTION = 24; + TASK_TYPE_REPLICATION_SYNC_WORKFLOW_STATE = 25; + TASK_TYPE_ARCHIVAL_ARCHIVE_EXECUTION = 26; + // An immediate outbound task generated by a state machine. + // Outbound tasks specify a destination that is used to group tasks into a per namespace-and-destination + // scheduler. + TASK_TYPE_STATE_MACHINE_OUTBOUND = 27; + // A timer task generated by a state machine. + TASK_TYPE_STATE_MACHINE_TIMER = 28; - // Timeout task for the entire workflow execution chain. - TASK_TYPE_WORKFLOW_EXECUTION_TIMEOUT = 29; + // Timeout task for the entire workflow execution chain. + TASK_TYPE_WORKFLOW_EXECUTION_TIMEOUT = 29; - TASK_TYPE_REPLICATION_SYNC_HSM = 30; - TASK_TYPE_REPLICATION_SYNC_VERSIONED_TRANSITION = 31; + TASK_TYPE_REPLICATION_SYNC_HSM = 30; + TASK_TYPE_REPLICATION_SYNC_VERSIONED_TRANSITION = 31; - // A task that applies a batch of state changes to a CHASM entity. 
- TASK_TYPE_CHASM_PURE = 32; + // A task that applies a batch of state changes to a CHASM entity. + TASK_TYPE_CHASM_PURE = 32; - // A task with side effects generated by a CHASM component. - TASK_TYPE_CHASM = 33; + // A task with side effects generated by a CHASM component. + TASK_TYPE_CHASM = 33; } // TaskPriority is only used for replication task as of May 2024 enum TaskPriority { - TASK_PRIORITY_UNSPECIFIED = 0; - TASK_PRIORITY_HIGH = 1; - // gap between index can be used for future priority levels if needed - TASK_PRIORITY_LOW = 10; + TASK_PRIORITY_UNSPECIFIED = 0; + TASK_PRIORITY_HIGH = 1; + // gap between index can be used for future priority levels if needed + TASK_PRIORITY_LOW = 10; } diff --git a/proto/internal/temporal/server/api/enums/v1/workflow.proto b/proto/internal/temporal/server/api/enums/v1/workflow.proto index cb74c9a0c4..12f529df15 100644 --- a/proto/internal/temporal/server/api/enums/v1/workflow.proto +++ b/proto/internal/temporal/server/api/enums/v1/workflow.proto @@ -5,24 +5,24 @@ package temporal.server.api.enums.v1; option go_package = "go.temporal.io/server/api/enums/v1;enums"; enum WorkflowExecutionState { - WORKFLOW_EXECUTION_STATE_UNSPECIFIED = 0; - WORKFLOW_EXECUTION_STATE_CREATED = 1; - WORKFLOW_EXECUTION_STATE_RUNNING = 2; - WORKFLOW_EXECUTION_STATE_COMPLETED = 3; - WORKFLOW_EXECUTION_STATE_ZOMBIE = 4; - WORKFLOW_EXECUTION_STATE_VOID = 5; - WORKFLOW_EXECUTION_STATE_CORRUPTED = 6; + WORKFLOW_EXECUTION_STATE_UNSPECIFIED = 0; + WORKFLOW_EXECUTION_STATE_CREATED = 1; + WORKFLOW_EXECUTION_STATE_RUNNING = 2; + WORKFLOW_EXECUTION_STATE_COMPLETED = 3; + WORKFLOW_EXECUTION_STATE_ZOMBIE = 4; + WORKFLOW_EXECUTION_STATE_VOID = 5; + WORKFLOW_EXECUTION_STATE_CORRUPTED = 6; } enum WorkflowBackoffType { - WORKFLOW_BACKOFF_TYPE_UNSPECIFIED = 0; - WORKFLOW_BACKOFF_TYPE_RETRY = 1; - WORKFLOW_BACKOFF_TYPE_CRON = 2; - WORKFLOW_BACKOFF_TYPE_DELAY_START = 3; + WORKFLOW_BACKOFF_TYPE_UNSPECIFIED = 0; + WORKFLOW_BACKOFF_TYPE_RETRY = 1; + 
WORKFLOW_BACKOFF_TYPE_CRON = 2; + WORKFLOW_BACKOFF_TYPE_DELAY_START = 3; } enum PausedWorkflowEntityType { - PAUSED_WORKFLOW_ENTITY_TYPE_UNSPECIFIED = 0; - PAUSED_WORKFLOW_ENTITY_TYPE_ACTIVITY = 1; - PAUSED_WORKFLOW_ENTITY_TYPE_WORKFLOW = 2; + PAUSED_WORKFLOW_ENTITY_TYPE_UNSPECIFIED = 0; + PAUSED_WORKFLOW_ENTITY_TYPE_ACTIVITY = 1; + PAUSED_WORKFLOW_ENTITY_TYPE_WORKFLOW = 2; } diff --git a/proto/internal/temporal/server/api/enums/v1/workflow_task_type.proto b/proto/internal/temporal/server/api/enums/v1/workflow_task_type.proto index 62a2d41d93..ab89c64abe 100644 --- a/proto/internal/temporal/server/api/enums/v1/workflow_task_type.proto +++ b/proto/internal/temporal/server/api/enums/v1/workflow_task_type.proto @@ -5,9 +5,9 @@ package temporal.server.api.enums.v1; option go_package = "go.temporal.io/server/api/enums/v1;enums"; enum WorkflowTaskType { - WORKFLOW_TASK_TYPE_UNSPECIFIED = 0; - WORKFLOW_TASK_TYPE_NORMAL = 1; - // TODO (alex): TRANSIENT is not current used. Needs to be set when Attempt>1. - WORKFLOW_TASK_TYPE_TRANSIENT = 2; - WORKFLOW_TASK_TYPE_SPECULATIVE = 3; + WORKFLOW_TASK_TYPE_UNSPECIFIED = 0; + WORKFLOW_TASK_TYPE_NORMAL = 1; + // TODO (alex): TRANSIENT is not current used. Needs to be set when Attempt>1. 
+ WORKFLOW_TASK_TYPE_TRANSIENT = 2; + WORKFLOW_TASK_TYPE_SPECULATIVE = 3; } diff --git a/proto/internal/temporal/server/api/errordetails/v1/message.proto b/proto/internal/temporal/server/api/errordetails/v1/message.proto index d26aca8f6c..8d92a4e5a7 100644 --- a/proto/internal/temporal/server/api/errordetails/v1/message.proto +++ b/proto/internal/temporal/server/api/errordetails/v1/message.proto @@ -4,58 +4,53 @@ syntax = "proto3"; package temporal.server.api.errordetails.v1; -option go_package = "go.temporal.io/server/api/errordetails/v1;errordetails"; - import "temporal/server/api/history/v1/message.proto"; import "temporal/server/api/persistence/v1/hsm.proto"; -message TaskAlreadyStartedFailure { -} +option go_package = "go.temporal.io/server/api/errordetails/v1;errordetails"; + +message TaskAlreadyStartedFailure {} message CurrentBranchChangedFailure { - bytes current_branch_token = 1; - bytes request_branch_token = 2; - temporal.server.api.persistence.v1.VersionedTransition current_versioned_transition = 3; - temporal.server.api.persistence.v1.VersionedTransition request_versioned_transition = 4; + bytes current_branch_token = 1; + bytes request_branch_token = 2; + temporal.server.api.persistence.v1.VersionedTransition current_versioned_transition = 3; + temporal.server.api.persistence.v1.VersionedTransition request_versioned_transition = 4; } message ShardOwnershipLostFailure { - string owner_host = 1; - string current_host = 2; + string owner_host = 1; + string current_host = 2; } message RetryReplicationFailure { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - int64 start_event_id = 4; - int64 start_event_version = 5; - int64 end_event_id = 6; - int64 end_event_version = 7; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + int64 start_event_id = 4; + int64 start_event_version = 5; + int64 end_event_id = 6; + int64 end_event_version = 7; } message SyncStateFailure { - string namespace_id = 1; - string 
workflow_id = 2; - string run_id = 3; - temporal.server.api.persistence.v1.VersionedTransition versioned_transition = 4; - temporal.server.api.history.v1.VersionHistories version_histories = 5; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 6; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + temporal.server.api.persistence.v1.VersionedTransition versioned_transition = 4; + temporal.server.api.history.v1.VersionHistories version_histories = 5; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 6; } -message StickyWorkerUnavailableFailure { -} +message StickyWorkerUnavailableFailure {} // Deprecated. Only used in WV2. [cleanup-old-wv] -message ObsoleteDispatchBuildIdFailure { -} +message ObsoleteDispatchBuildIdFailure {} // Returned when History determines a task that Matching wants to dispatch is no longer valid. -message ObsoleteMatchingTaskFailure { -} +message ObsoleteMatchingTaskFailure {} // Returned when an activity start is rejected by History because the workflow is in a transitioning // between worker deployments. -message ActivityStartDuringTransitionFailure { -} +message ActivityStartDuringTransitionFailure {} diff --git a/proto/internal/temporal/server/api/health/v1/message.proto b/proto/internal/temporal/server/api/health/v1/message.proto index 083ef18252..636c716122 100644 --- a/proto/internal/temporal/server/api/health/v1/message.proto +++ b/proto/internal/temporal/server/api/health/v1/message.proto @@ -2,43 +2,43 @@ syntax = "proto3"; package temporal.server.api.health.v1; -option go_package = "go.temporal.io/server/api/health/v1;health"; - import "temporal/server/api/enums/v1/cluster.proto"; +option go_package = "go.temporal.io/server/api/health/v1;health"; + // Individual health check result. // The check_type field uses human-readable strings rather than an enum for extensibility. 
message HealthCheck { - // Machine-readable check type identifier for programmatic matching. - // Known values defined as Go constants in api/health/v1/types.go: - // "grpc_health", "rpc_latency", "rpc_error_ratio", - // "persistence_latency", "persistence_error_ratio", - // "host_availability", "task_queue_backlog" - // We use strings instead of an enum for flexibility: new check types can be - // added without proto changes. See HealthCheck.message for human-readable details. - string check_type = 1; - temporal.server.api.enums.v1.HealthState state = 2; - // Actual observed value (0 if N/A). - double value = 3; - // Threshold that was exceeded (0 if N/A). - double threshold = 4; - // Human-readable detail describing what happened, e.g. - // "RPC latency 850.00ms exceeded 500.00ms threshold". - string message = 5; + // Machine-readable check type identifier for programmatic matching. + // Known values defined as Go constants in api/health/v1/types.go: + // "grpc_health", "rpc_latency", "rpc_error_ratio", + // "persistence_latency", "persistence_error_ratio", + // "host_availability", "task_queue_backlog" + // We use strings instead of an enum for flexibility: new check types can be + // added without proto changes. See HealthCheck.message for human-readable details. + string check_type = 1; + temporal.server.api.enums.v1.HealthState state = 2; + // Actual observed value (0 if N/A). + double value = 3; + // Threshold that was exceeded (0 if N/A). + double threshold = 4; + // Human-readable detail describing what happened, e.g. + // "RPC latency 850.00ms exceeded 500.00ms threshold". + string message = 5; } // Health details for a single host. message HostHealthDetail { - string address = 1; - temporal.server.api.enums.v1.HealthState state = 2; - repeated HealthCheck checks = 3; + string address = 1; + temporal.server.api.enums.v1.HealthState state = 2; + repeated HealthCheck checks = 3; } // Health details for a service (history, frontend, matching). 
message ServiceHealthDetail { - string service = 1; - temporal.server.api.enums.v1.HealthState state = 2; - repeated HostHealthDetail hosts = 3; - // Service-level diagnostic message (e.g. "no available hosts", "resolver error"). - string message = 4; + string service = 1; + temporal.server.api.enums.v1.HealthState state = 2; + repeated HostHealthDetail hosts = 3; + // Service-level diagnostic message (e.g. "no available hosts", "resolver error"). + string message = 4; } diff --git a/proto/internal/temporal/server/api/history/v1/message.proto b/proto/internal/temporal/server/api/history/v1/message.proto index a90d6dbe65..c9ff843607 100644 --- a/proto/internal/temporal/server/api/history/v1/message.proto +++ b/proto/internal/temporal/server/api/history/v1/message.proto @@ -2,54 +2,53 @@ syntax = "proto3"; package temporal.server.api.history.v1; -option go_package = "go.temporal.io/server/api/history/v1;history"; - import "google/protobuf/timestamp.proto"; - import "temporal/api/history/v1/message.proto"; +option go_package = "go.temporal.io/server/api/history/v1;history"; + message TransientWorkflowTaskInfo { - reserved 1; - reserved 2; + reserved 1; + reserved 2; - // A list of history events that are to be appended to the "real" workflow history. - repeated temporal.api.history.v1.HistoryEvent history_suffix = 3; + // A list of history events that are to be appended to the "real" workflow history. + repeated temporal.api.history.v1.HistoryEvent history_suffix = 3; } // VersionHistoryItem contains signal eventId and the corresponding version. message VersionHistoryItem { - int64 event_id = 1; - int64 version = 2; + int64 event_id = 1; + int64 version = 2; } // VersionHistory contains the version history of a branch. message VersionHistory { - bytes branch_token = 1; - repeated VersionHistoryItem items = 2; + bytes branch_token = 1; + repeated VersionHistoryItem items = 2; } // VersionHistories contains all version histories from all branches. 
message VersionHistories { - int32 current_version_history_index = 1; - repeated VersionHistory histories = 2; + int32 current_version_history_index = 1; + repeated VersionHistory histories = 2; } message TaskKey { - int64 task_id = 1; - google.protobuf.Timestamp fire_time = 2; + int64 task_id = 1; + google.protobuf.Timestamp fire_time = 2; } message TaskRange { - TaskKey inclusive_min_task_key = 1; - TaskKey exclusive_max_task_key = 2; + TaskKey inclusive_min_task_key = 1; + TaskKey exclusive_max_task_key = 2; } // StrippedHistoryEvent is a stripped down version of HistoryEvent that only contains the event_id and version. message StrippedHistoryEvent { - int64 event_id = 1; - int64 version = 4; + int64 event_id = 1; + int64 version = 4; } message StrippedHistoryEvents { - repeated StrippedHistoryEvent events = 1; + repeated StrippedHistoryEvent events = 1; } diff --git a/proto/internal/temporal/server/api/historyservice/v1/request_response.proto b/proto/internal/temporal/server/api/historyservice/v1/request_response.proto index 2a1fba7491..2ed7f59673 100644 --- a/proto/internal/temporal/server/api/historyservice/v1/request_response.proto +++ b/proto/internal/temporal/server/api/historyservice/v1/request_response.proto @@ -1,28 +1,29 @@ syntax = "proto3"; package temporal.server.api.historyservice.v1; -option go_package = "go.temporal.io/server/api/historyservice/v1;historyservice"; import "google/protobuf/descriptor.proto"; import "google/protobuf/duration.proto"; import "google/protobuf/timestamp.proto"; - import "temporal/api/activity/v1/message.proto"; -import "temporal/api/deployment/v1/message.proto"; import "temporal/api/common/v1/message.proto"; -import "temporal/api/history/v1/message.proto"; -import "temporal/api/taskqueue/v1/message.proto"; +import "temporal/api/deployment/v1/message.proto"; import "temporal/api/enums/v1/workflow.proto"; -import "temporal/api/workflow/v1/message.proto"; -import "temporal/api/query/v1/message.proto"; -import 
"temporal/api/protocol/v1/message.proto"; import "temporal/api/failure/v1/message.proto"; +import "temporal/api/history/v1/message.proto"; import "temporal/api/nexus/v1/message.proto"; - +import "temporal/api/protocol/v1/message.proto"; +import "temporal/api/query/v1/message.proto"; +import "temporal/api/taskqueue/v1/message.proto"; +import "temporal/api/workflow/v1/message.proto"; +import "temporal/api/workflowservice/v1/request_response.proto"; +import "temporal/server/api/adminservice/v1/request_response.proto"; import "temporal/server/api/clock/v1/message.proto"; +import "temporal/server/api/common/v1/dlq.proto"; import "temporal/server/api/enums/v1/cluster.proto"; import "temporal/server/api/enums/v1/common.proto"; import "temporal/server/api/enums/v1/workflow.proto"; +import "temporal/server/api/health/v1/message.proto"; import "temporal/server/api/history/v1/message.proto"; import "temporal/server/api/namespace/v1/message.proto"; import "temporal/server/api/persistence/v1/executions.proto"; @@ -33,271 +34,267 @@ import "temporal/server/api/taskqueue/v1/message.proto"; import "temporal/server/api/token/v1/message.proto"; import "temporal/server/api/workflow/v1/message.proto"; -import "temporal/api/workflowservice/v1/request_response.proto"; -import "temporal/server/api/adminservice/v1/request_response.proto"; -import "temporal/server/api/common/v1/dlq.proto"; -import "temporal/server/api/health/v1/message.proto"; +option go_package = "go.temporal.io/server/api/historyservice/v1;historyservice"; extend google.protobuf.MessageOptions { - optional RoutingOptions routing = 7234; + optional RoutingOptions routing = 7234; } // RoutingOptions define how a request is routed to the appropriate host. message RoutingOptions { - // Routing is custom and implemented in the non-generated client/history/client.go. - bool custom = 1; - // Request will be routed to a random host. - bool any_host = 2; - // Request will be routed according to the specified shard ID field. 
- string shard_id = 3; - // Requested routed by task token or workflow ID may also specify how to obtain the namespace ID. Defaults to the - // "namespace_id" field. - string namespace_id = 4; - // Request will be routed by resolving the namespace ID and workflow ID to a given shard. - string workflow_id = 5; - // Request will be routed by resolving the namespace ID and the workflow ID from this task token to a given shard. - string task_token = 6; - // Request will be routed by resolving the namespace ID and the workflow ID from the first task info element. - string task_infos = 7; - // Request will be routed by resolving the namespace ID and the workflow ID from this chasm ref to a given shard. - string chasm_component_ref = 8; + // Routing is custom and implemented in the non-generated client/history/client.go. + bool custom = 1; + // Request will be routed to a random host. + bool any_host = 2; + // Request will be routed according to the specified shard ID field. + string shard_id = 3; + // Requested routed by task token or workflow ID may also specify how to obtain the namespace ID. Defaults to the + // "namespace_id" field. + string namespace_id = 4; + // Request will be routed by resolving the namespace ID and workflow ID to a given shard. + string workflow_id = 5; + // Request will be routed by resolving the namespace ID and the workflow ID from this task token to a given shard. + string task_token = 6; + // Request will be routed by resolving the namespace ID and the workflow ID from the first task info element. + string task_infos = 7; + // Request will be routed by resolving the namespace ID and the workflow ID from this chasm ref to a given shard. 
+ string chasm_component_ref = 8; } message StartWorkflowExecutionRequest { - option (routing).workflow_id = "start_request.workflow_id"; - - string namespace_id = 1; - temporal.api.workflowservice.v1.StartWorkflowExecutionRequest start_request = 2; - temporal.server.api.workflow.v1.ParentExecutionInfo parent_execution_info = 3; - int32 attempt = 4; - google.protobuf.Timestamp workflow_execution_expiration_time = 5; - temporal.api.enums.v1.ContinueAsNewInitiator continue_as_new_initiator = 6; - // History service should use the values of continued_failure and last_completion_result - // here, not the ones in start_request (those are moved into here in the frontend). - temporal.api.failure.v1.Failure continued_failure = 7; - temporal.api.common.v1.Payloads last_completion_result = 8; - google.protobuf.Duration first_workflow_task_backoff = 9; - // For child or continued-as-new workflows, including a version here from the source - // (parent/previous) will set the initial version stamp of this workflow. - // Deprecated. use `inherited_build_id` - temporal.api.common.v1.WorkerVersionStamp source_version_stamp = 10; - // The root execution info of the new workflow. - // For top-level workflows (ie., without parent), this field must be nil. - temporal.server.api.workflow.v1.RootExecutionInfo root_execution_info = 11; - // inherited build ID from parent/previous execution - // Deprecated. Use behavior, version, and task queue fields in `parent_execution_info`. - string inherited_build_id = 12; - // If set, takes precedence over the Versioning Behavior sent by the SDK on Workflow Task completion. - // To unset the override after the workflow is running, use UpdateWorkflowExecutionOptions. 
- temporal.api.workflow.v1.VersioningOverride versioning_override = 13; - // If set, we verify the parent-child relationship before applying ID conflict policy WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING - bool child_workflow_only = 14; - // If present, the new workflow should start on this version with pinned base behavior. - temporal.api.deployment.v1.WorkerDeploymentVersion inherited_pinned_version = 15; - // Passes deployment version and revision number from a parent/previous workflow with AutoUpgrade behavior - // to its child/continued-as-new workflow. The first workflow task of the child/CAN workflow is dispatched to - // either this deployment version or the current version of the task queue, depending on which is the more recent version. - // After the first workflow task, the effective behavior of the workflow is determined by worker-sent values in - // subsequent workflow tasks. - temporal.api.deployment.v1.InheritedAutoUpgradeInfo inherited_auto_upgrade_info = 16; - // The target version that the previous run implicitly declined to upgrade to. - // Computed at continue-as-new time from the previous run's last_notified_target_version - // (if set) or its existing declined value (CaN chain). For retries, passed through - // directly from the started event. Written onto the new run's - // WorkflowExecutionStartedEvent. 
- temporal.api.history.v1.DeclinedTargetVersionUpgrade declined_target_version_upgrade = 17; + option (routing).workflow_id = "start_request.workflow_id"; + + string namespace_id = 1; + temporal.api.workflowservice.v1.StartWorkflowExecutionRequest start_request = 2; + temporal.server.api.workflow.v1.ParentExecutionInfo parent_execution_info = 3; + int32 attempt = 4; + google.protobuf.Timestamp workflow_execution_expiration_time = 5; + temporal.api.enums.v1.ContinueAsNewInitiator continue_as_new_initiator = 6; + // History service should use the values of continued_failure and last_completion_result + // here, not the ones in start_request (those are moved into here in the frontend). + temporal.api.failure.v1.Failure continued_failure = 7; + temporal.api.common.v1.Payloads last_completion_result = 8; + google.protobuf.Duration first_workflow_task_backoff = 9; + // For child or continued-as-new workflows, including a version here from the source + // (parent/previous) will set the initial version stamp of this workflow. + // Deprecated. use `inherited_build_id` + temporal.api.common.v1.WorkerVersionStamp source_version_stamp = 10; + // The root execution info of the new workflow. + // For top-level workflows (ie., without parent), this field must be nil. + temporal.server.api.workflow.v1.RootExecutionInfo root_execution_info = 11; + // inherited build ID from parent/previous execution + // Deprecated. Use behavior, version, and task queue fields in `parent_execution_info`. + string inherited_build_id = 12; + // If set, takes precedence over the Versioning Behavior sent by the SDK on Workflow Task completion. + // To unset the override after the workflow is running, use UpdateWorkflowExecutionOptions. 
+ temporal.api.workflow.v1.VersioningOverride versioning_override = 13; + // If set, we verify the parent-child relationship before applying ID conflict policy WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING + bool child_workflow_only = 14; + // If present, the new workflow should start on this version with pinned base behavior. + temporal.api.deployment.v1.WorkerDeploymentVersion inherited_pinned_version = 15; + // Passes deployment version and revision number from a parent/previous workflow with AutoUpgrade behavior + // to its child/continued-as-new workflow. The first workflow task of the child/CAN workflow is dispatched to + // either this deployment version or the current version of the task queue, depending on which is the more recent version. + // After the first workflow task, the effective behavior of the workflow is determined by worker-sent values in + // subsequent workflow tasks. + temporal.api.deployment.v1.InheritedAutoUpgradeInfo inherited_auto_upgrade_info = 16; + // The target version that the previous run implicitly declined to upgrade to. + // Computed at continue-as-new time from the previous run's last_notified_target_version + // (if set) or its existing declined value (CaN chain). For retries, passed through + // directly from the started event. Written onto the new run's + // WorkflowExecutionStartedEvent. 
+ temporal.api.history.v1.DeclinedTargetVersionUpgrade declined_target_version_upgrade = 17; } message StartWorkflowExecutionResponse { - string run_id = 1; - temporal.server.api.clock.v1.VectorClock clock = 2; - // Set if request_eager_execution is set on the start request - temporal.api.workflowservice.v1.PollWorkflowTaskQueueResponse eager_workflow_task = 3; - bool started = 4; - temporal.api.enums.v1.WorkflowExecutionStatus status = 5; - temporal.api.common.v1.Link link = 6; + string run_id = 1; + temporal.server.api.clock.v1.VectorClock clock = 2; + // Set if request_eager_execution is set on the start request + temporal.api.workflowservice.v1.PollWorkflowTaskQueueResponse eager_workflow_task = 3; + bool started = 4; + temporal.api.enums.v1.WorkflowExecutionStatus status = 5; + temporal.api.common.v1.Link link = 6; } message GetMutableStateRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - int64 expected_next_event_id = 3; - bytes current_branch_token = 4; - temporal.server.api.history.v1.VersionHistoryItem version_history_item = 5; - temporal.server.api.persistence.v1.VersionedTransition versioned_transition = 6; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + int64 expected_next_event_id = 3; + bytes current_branch_token = 4; + temporal.server.api.history.v1.VersionHistoryItem version_history_item = 5; + temporal.server.api.persistence.v1.VersionedTransition versioned_transition = 6; } message GetMutableStateResponse { - temporal.api.common.v1.WorkflowExecution execution = 1; - temporal.api.common.v1.WorkflowType workflow_type = 2; - int64 next_event_id = 3; - int64 previous_started_event_id = 4; - int64 last_first_event_id = 5; - temporal.api.taskqueue.v1.TaskQueue task_queue = 6; - temporal.api.taskqueue.v1.TaskQueue sticky_task_queue = 7; - reserved 8; - 
reserved 9; - reserved 10; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration sticky_task_queue_schedule_to_start_timeout = 11; - reserved 12; - bytes current_branch_token = 13; - reserved 14; - temporal.server.api.enums.v1.WorkflowExecutionState workflow_state = 15; - temporal.api.enums.v1.WorkflowExecutionStatus workflow_status = 16; - temporal.server.api.history.v1.VersionHistories version_histories = 17; - bool is_sticky_task_queue_enabled = 18; - int64 last_first_event_txn_id = 19; - string first_execution_run_id = 20; - // If using build-id based versioning: version stamp of last worker to complete a workflow - // task for this workflow. - temporal.api.common.v1.WorkerVersionStamp most_recent_worker_version_stamp = 21; - // The currently assigned build ID for this execution. Presence of this value means worker versioning is used - // for this execution. - string assigned_build_id = 22; - string inherited_build_id = 23; - repeated temporal.server.api.persistence.v1.VersionedTransition transition_history = 24; - temporal.api.workflow.v1.WorkflowExecutionVersioningInfo versioning_info = 25; - // Transient or speculative workflow task events which are not yet persisted in the history. - // These events should be appended to the history when it is returned to the worker. - temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_or_speculative_tasks = 26; + temporal.api.common.v1.WorkflowExecution execution = 1; + temporal.api.common.v1.WorkflowType workflow_type = 2; + int64 next_event_id = 3; + int64 previous_started_event_id = 4; + int64 last_first_event_id = 5; + temporal.api.taskqueue.v1.TaskQueue task_queue = 6; + temporal.api.taskqueue.v1.TaskQueue sticky_task_queue = 7; + reserved 8; + reserved 9; + reserved 10; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. 
--) + google.protobuf.Duration sticky_task_queue_schedule_to_start_timeout = 11; + reserved 12; + bytes current_branch_token = 13; + reserved 14; + temporal.server.api.enums.v1.WorkflowExecutionState workflow_state = 15; + temporal.api.enums.v1.WorkflowExecutionStatus workflow_status = 16; + temporal.server.api.history.v1.VersionHistories version_histories = 17; + bool is_sticky_task_queue_enabled = 18; + int64 last_first_event_txn_id = 19; + string first_execution_run_id = 20; + // If using build-id based versioning: version stamp of last worker to complete a workflow + // task for this workflow. + temporal.api.common.v1.WorkerVersionStamp most_recent_worker_version_stamp = 21; + // The currently assigned build ID for this execution. Presence of this value means worker versioning is used + // for this execution. + string assigned_build_id = 22; + string inherited_build_id = 23; + repeated temporal.server.api.persistence.v1.VersionedTransition transition_history = 24; + temporal.api.workflow.v1.WorkflowExecutionVersioningInfo versioning_info = 25; + // Transient or speculative workflow task events which are not yet persisted in the history. + // These events should be appended to the history when it is returned to the worker. 
+ temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_or_speculative_tasks = 26; } message PollMutableStateRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - int64 expected_next_event_id = 3; - bytes current_branch_token = 4; - temporal.server.api.history.v1.VersionHistoryItem version_history_item = 5; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + int64 expected_next_event_id = 3; + bytes current_branch_token = 4; + temporal.server.api.history.v1.VersionHistoryItem version_history_item = 5; } message PollMutableStateResponse { - temporal.api.common.v1.WorkflowExecution execution = 1; - temporal.api.common.v1.WorkflowType workflow_type = 2; - int64 next_event_id = 3; - int64 previous_started_event_id = 4; - int64 last_first_event_id = 5; - temporal.api.taskqueue.v1.TaskQueue task_queue = 6; - temporal.api.taskqueue.v1.TaskQueue sticky_task_queue = 7; - reserved 8; - reserved 9; - reserved 10; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. 
--) - google.protobuf.Duration sticky_task_queue_schedule_to_start_timeout = 11; - bytes current_branch_token = 12; - reserved 13; - temporal.server.api.history.v1.VersionHistories version_histories = 14; - temporal.server.api.enums.v1.WorkflowExecutionState workflow_state = 15; - temporal.api.enums.v1.WorkflowExecutionStatus workflow_status = 16; - int64 last_first_event_txn_id = 17; - string first_execution_run_id = 18; + temporal.api.common.v1.WorkflowExecution execution = 1; + temporal.api.common.v1.WorkflowType workflow_type = 2; + int64 next_event_id = 3; + int64 previous_started_event_id = 4; + int64 last_first_event_id = 5; + temporal.api.taskqueue.v1.TaskQueue task_queue = 6; + temporal.api.taskqueue.v1.TaskQueue sticky_task_queue = 7; + reserved 8; + reserved 9; + reserved 10; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration sticky_task_queue_schedule_to_start_timeout = 11; + bytes current_branch_token = 12; + reserved 13; + temporal.server.api.history.v1.VersionHistories version_histories = 14; + temporal.server.api.enums.v1.WorkflowExecutionState workflow_state = 15; + temporal.api.enums.v1.WorkflowExecutionStatus workflow_status = 16; + int64 last_first_event_txn_id = 17; + string first_execution_run_id = 18; } message ResetStickyTaskQueueRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; } -message ResetStickyTaskQueueResponse { -} +message ResetStickyTaskQueueResponse {} message ExecuteMultiOperationRequest { - option (routing).workflow_id = "workflow_id"; + option (routing).workflow_id = "workflow_id"; - string namespace_id = 1; - string workflow_id = 2; - repeated Operation operations = 3; + string 
namespace_id = 1; + string workflow_id = 2; + repeated Operation operations = 3; - message Operation { - oneof operation { - StartWorkflowExecutionRequest start_workflow = 1; - UpdateWorkflowExecutionRequest update_workflow = 2; - } + message Operation { + oneof operation { + StartWorkflowExecutionRequest start_workflow = 1; + UpdateWorkflowExecutionRequest update_workflow = 2; } + } } message ExecuteMultiOperationResponse { - repeated Response responses = 1; + repeated Response responses = 1; - message Response { - oneof response { - StartWorkflowExecutionResponse start_workflow = 1; - UpdateWorkflowExecutionResponse update_workflow = 2; - } + message Response { + oneof response { + StartWorkflowExecutionResponse start_workflow = 1; + UpdateWorkflowExecutionResponse update_workflow = 2; } + } } message RecordWorkflowTaskStartedRequest { - option (routing).workflow_id = "workflow_execution.workflow_id"; - - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - int64 scheduled_event_id = 3; - reserved 4; - // Unique id of each poll request. Used to ensure at most once delivery of tasks. - string request_id = 5; - temporal.api.workflowservice.v1.PollWorkflowTaskQueueRequest poll_request = 6; - temporal.server.api.clock.v1.VectorClock clock = 7; - temporal.server.api.taskqueue.v1.BuildIdRedirectInfo build_id_redirect_info = 8; - // The deployment passed by History when the task was scheduled. - // Deprecated. use `version_directive.deployment`. - temporal.api.deployment.v1.Deployment scheduled_deployment = 9; - // Versioning directive that was sent by history when scheduling the task. - temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 10; - // Stamp value from when the workflow task was scheduled. Used to validate the task is still relevant. - int32 stamp = 11; - // Revision number that was sent by matching when the task was dispatched. 
Used to resolve eventual consistency issues - // that may arise due to stale routing configs in task queue partitions. - int64 task_dispatch_revision_number = 12; - // Target worker deployment version according to matching when starting the task. - // Computed after matching with a poller, right before calling RecordWorkflowTaskStarted. - // Sent only if the target version is different from the poller's version. - temporal.api.deployment.v1.WorkerDeploymentVersion target_deployment_version = 13; + option (routing).workflow_id = "workflow_execution.workflow_id"; + + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + int64 scheduled_event_id = 3; + reserved 4; + // Unique id of each poll request. Used to ensure at most once delivery of tasks. + string request_id = 5; + temporal.api.workflowservice.v1.PollWorkflowTaskQueueRequest poll_request = 6; + temporal.server.api.clock.v1.VectorClock clock = 7; + temporal.server.api.taskqueue.v1.BuildIdRedirectInfo build_id_redirect_info = 8; + // The deployment passed by History when the task was scheduled. + // Deprecated. use `version_directive.deployment`. + temporal.api.deployment.v1.Deployment scheduled_deployment = 9; + // Versioning directive that was sent by history when scheduling the task. + temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 10; + // Stamp value from when the workflow task was scheduled. Used to validate the task is still relevant. + int32 stamp = 11; + // Revision number that was sent by matching when the task was dispatched. Used to resolve eventual consistency issues + // that may arise due to stale routing configs in task queue partitions. + int64 task_dispatch_revision_number = 12; + // Target worker deployment version according to matching when starting the task. + // Computed after matching with a poller, right before calling RecordWorkflowTaskStarted. + // Sent only if the target version is different from the poller's version. 
+ temporal.api.deployment.v1.WorkerDeploymentVersion target_deployment_version = 13; } message RecordWorkflowTaskStartedResponse { - temporal.api.common.v1.WorkflowType workflow_type = 1; - int64 previous_started_event_id = 2; - int64 scheduled_event_id = 3; - int64 started_event_id = 4; - int64 next_event_id = 5; - int32 attempt = 6; - bool sticky_execution_enabled = 7; - temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_workflow_task = 8; - temporal.api.taskqueue.v1.TaskQueue workflow_execution_task_queue = 9; - reserved 10; - bytes branch_token = 11; - google.protobuf.Timestamp scheduled_time = 12; - google.protobuf.Timestamp started_time = 13; - map queries = 14; - temporal.server.api.clock.v1.VectorClock clock = 15; - repeated temporal.api.protocol.v1.Message messages = 16; - int64 version = 17; - temporal.api.history.v1.History history = 18; - bytes next_page_token = 19; - // Deprecated: This field is being replaced by raw_history_bytes which sends raw bytes - // instead of a proto-decoded History. This avoids matching service having to decode history. - // TODO: PRATHYUSH - // DEPRECATION PLAN: - // Two dynamic config flags control the raw history optimization: - // - history.sendRawHistoryBetweenInternalServices: enables raw history (uses field 18 when OFF, field 20/21 when ON) - // - history.sendRawHistoryBytesToMatchingService: selects field 20 (OFF) vs field 21 (ON) - // - // Version timeline (current version: v1.29): - // - v1.31: This change is released. Both flags default to false for backward compatibility. - // - v1.32: Both flags will be enabled by default in code. - // - v1.33: raw_history (field 20) and history (field 18) will be deprecated and removed, - // as raw_history_bytes (field 21) will be the only field used. 
- temporal.api.history.v1.History raw_history = 20 [deprecated = true]; - repeated bytes raw_history_bytes = 21; + temporal.api.common.v1.WorkflowType workflow_type = 1; + int64 previous_started_event_id = 2; + int64 scheduled_event_id = 3; + int64 started_event_id = 4; + int64 next_event_id = 5; + int32 attempt = 6; + bool sticky_execution_enabled = 7; + temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_workflow_task = 8; + temporal.api.taskqueue.v1.TaskQueue workflow_execution_task_queue = 9; + reserved 10; + bytes branch_token = 11; + google.protobuf.Timestamp scheduled_time = 12; + google.protobuf.Timestamp started_time = 13; + map queries = 14; + temporal.server.api.clock.v1.VectorClock clock = 15; + repeated temporal.api.protocol.v1.Message messages = 16; + int64 version = 17; + temporal.api.history.v1.History history = 18; + bytes next_page_token = 19; + // Deprecated: This field is being replaced by raw_history_bytes which sends raw bytes + // instead of a proto-decoded History. This avoids matching service having to decode history. + // TODO: PRATHYUSH + // DEPRECATION PLAN: + // Two dynamic config flags control the raw history optimization: + // - history.sendRawHistoryBetweenInternalServices: enables raw history (uses field 18 when OFF, field 20/21 when ON) + // - history.sendRawHistoryBytesToMatchingService: selects field 20 (OFF) vs field 21 (ON) + // + // Version timeline (current version: v1.29): + // - v1.31: This change is released. Both flags default to false for backward compatibility. + // - v1.32: Both flags will be enabled by default in code. + // - v1.33: raw_history (field 20) and history (field 18) will be deprecated and removed, + // as raw_history_bytes (field 21) will be the only field used. + temporal.api.history.v1.History raw_history = 20 [deprecated = true]; + repeated bytes raw_history_bytes = 21; } // RecordWorkflowTaskStartedResponseWithRawHistory is wire-compatible with RecordWorkflowTaskStartedResponse. 
@@ -319,284 +316,273 @@ message RecordWorkflowTaskStartedResponse { // IMPORTANT: Field numbers and all other fields must remain identical between these two messages. // Any change to RecordWorkflowTaskStartedResponse must be mirrored here. message RecordWorkflowTaskStartedResponseWithRawHistory { - temporal.api.common.v1.WorkflowType workflow_type = 1; - int64 previous_started_event_id = 2; - int64 scheduled_event_id = 3; - int64 started_event_id = 4; - int64 next_event_id = 5; - int32 attempt = 6; - bool sticky_execution_enabled = 7; - temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_workflow_task = 8; - temporal.api.taskqueue.v1.TaskQueue workflow_execution_task_queue = 9; - reserved 10; - bytes branch_token = 11; - google.protobuf.Timestamp scheduled_time = 12; - google.protobuf.Timestamp started_time = 13; - map queries = 14; - temporal.server.api.clock.v1.VectorClock clock = 15; - repeated temporal.api.protocol.v1.Message messages = 16; - int64 version = 17; - temporal.api.history.v1.History history = 18; - bytes next_page_token = 19; - // Deprecated: This field is being replaced by raw_history_bytes which sends raw bytes - // instead of a proto-decoded History. This avoids matching service having to decode history. 
- repeated bytes raw_history = 20 [deprecated = true]; - repeated bytes raw_history_bytes = 21; + temporal.api.common.v1.WorkflowType workflow_type = 1; + int64 previous_started_event_id = 2; + int64 scheduled_event_id = 3; + int64 started_event_id = 4; + int64 next_event_id = 5; + int32 attempt = 6; + bool sticky_execution_enabled = 7; + temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_workflow_task = 8; + temporal.api.taskqueue.v1.TaskQueue workflow_execution_task_queue = 9; + reserved 10; + bytes branch_token = 11; + google.protobuf.Timestamp scheduled_time = 12; + google.protobuf.Timestamp started_time = 13; + map queries = 14; + temporal.server.api.clock.v1.VectorClock clock = 15; + repeated temporal.api.protocol.v1.Message messages = 16; + int64 version = 17; + temporal.api.history.v1.History history = 18; + bytes next_page_token = 19; + // Deprecated: This field is being replaced by raw_history_bytes which sends raw bytes + // instead of a proto-decoded History. This avoids matching service having to decode history. + repeated bytes raw_history = 20 [deprecated = true]; + repeated bytes raw_history_bytes = 21; } message RecordActivityTaskStartedRequest { - option (routing).custom = true; - - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - int64 scheduled_event_id = 3; - reserved 4; - // Unique id of each poll request. Used to ensure at most once delivery of tasks. - string request_id = 5; - temporal.api.workflowservice.v1.PollActivityTaskQueueRequest poll_request = 6; - temporal.server.api.clock.v1.VectorClock clock = 7; - temporal.server.api.taskqueue.v1.BuildIdRedirectInfo build_id_redirect_info = 8; - - // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. - int32 stamp = 9; - // The deployment passed by History when the task was scheduled. - // Deprecated. use `version_directive.deployment`. 
- temporal.api.deployment.v1.Deployment scheduled_deployment = 10; - reserved 11; - // Versioning directive that was sent by history when scheduling the task. - temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 12; - // Revision number that was sent by matching when the task was dispatched. Used to resolve eventual consistency issues - // that may arise due to stale routing configs in task queue partitions. - int64 task_dispatch_revision_number = 13; - // Reference to the Chasm component for activity execution (if applicable). For standalone activities, all necessary - // start information is carried within this component, obviating the need to use the fields that apply to embedded - // activities with the exception of version_directive. - bytes component_ref = 14; + option (routing).custom = true; + + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + int64 scheduled_event_id = 3; + reserved 4; + // Unique id of each poll request. Used to ensure at most once delivery of tasks. + string request_id = 5; + temporal.api.workflowservice.v1.PollActivityTaskQueueRequest poll_request = 6; + temporal.server.api.clock.v1.VectorClock clock = 7; + temporal.server.api.taskqueue.v1.BuildIdRedirectInfo build_id_redirect_info = 8; + + // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. + int32 stamp = 9; + // The deployment passed by History when the task was scheduled. + // Deprecated. use `version_directive.deployment`. + temporal.api.deployment.v1.Deployment scheduled_deployment = 10; + reserved 11; + // Versioning directive that was sent by history when scheduling the task. + temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 12; + // Revision number that was sent by matching when the task was dispatched. Used to resolve eventual consistency issues + // that may arise due to stale routing configs in task queue partitions. 
+ int64 task_dispatch_revision_number = 13; + // Reference to the Chasm component for activity execution (if applicable). For standalone activities, all necessary + // start information is carried within this component, obviating the need to use the fields that apply to embedded + // activities with the exception of version_directive. + bytes component_ref = 14; } message RecordActivityTaskStartedResponse { - temporal.api.history.v1.HistoryEvent scheduled_event = 1; - google.protobuf.Timestamp started_time = 2; - int32 attempt = 3; - google.protobuf.Timestamp current_attempt_scheduled_time = 4; - temporal.api.common.v1.Payloads heartbeat_details = 5; - temporal.api.common.v1.WorkflowType workflow_type = 6; - string workflow_namespace = 7; - temporal.server.api.clock.v1.VectorClock clock = 8; - int64 version = 9; - temporal.api.common.v1.Priority priority = 10; - temporal.api.common.v1.RetryPolicy retry_policy = 11; - int64 start_version = 12; - // ID of the activity run (applicable for standalone activities only) - string activity_run_id = 13; + temporal.api.history.v1.HistoryEvent scheduled_event = 1; + google.protobuf.Timestamp started_time = 2; + int32 attempt = 3; + google.protobuf.Timestamp current_attempt_scheduled_time = 4; + temporal.api.common.v1.Payloads heartbeat_details = 5; + temporal.api.common.v1.WorkflowType workflow_type = 6; + string workflow_namespace = 7; + temporal.server.api.clock.v1.VectorClock clock = 8; + int64 version = 9; + temporal.api.common.v1.Priority priority = 10; + temporal.api.common.v1.RetryPolicy retry_policy = 11; + int64 start_version = 12; + // ID of the activity run (applicable for standalone activities only) + string activity_run_id = 13; } message RespondWorkflowTaskCompletedRequest { - option (routing).task_token = "complete_request.task_token"; + option (routing).task_token = "complete_request.task_token"; - string namespace_id = 1; - temporal.api.workflowservice.v1.RespondWorkflowTaskCompletedRequest complete_request = 
2; + string namespace_id = 1; + temporal.api.workflowservice.v1.RespondWorkflowTaskCompletedRequest complete_request = 2; } message RespondWorkflowTaskCompletedResponse { - RecordWorkflowTaskStartedResponse started_response = 1 [deprecated = true]; - repeated temporal.api.workflowservice.v1.PollActivityTaskQueueResponse activity_tasks = 2; - int64 reset_history_event_id = 3; - temporal.api.workflowservice.v1.PollWorkflowTaskQueueResponse new_workflow_task = 4; + RecordWorkflowTaskStartedResponse started_response = 1 [deprecated = true]; + repeated temporal.api.workflowservice.v1.PollActivityTaskQueueResponse activity_tasks = 2; + int64 reset_history_event_id = 3; + temporal.api.workflowservice.v1.PollWorkflowTaskQueueResponse new_workflow_task = 4; } message RespondWorkflowTaskFailedRequest { - option (routing).task_token = "failed_request.task_token"; + option (routing).task_token = "failed_request.task_token"; - string namespace_id = 1; - temporal.api.workflowservice.v1.RespondWorkflowTaskFailedRequest failed_request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.RespondWorkflowTaskFailedRequest failed_request = 2; } -message RespondWorkflowTaskFailedResponse { -} +message RespondWorkflowTaskFailedResponse {} message IsWorkflowTaskValidRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - temporal.server.api.clock.v1.VectorClock clock = 3; - int64 scheduled_event_id = 4; - int32 stamp = 5; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + temporal.server.api.clock.v1.VectorClock clock = 3; + int64 scheduled_event_id = 4; + int32 stamp = 5; } message IsWorkflowTaskValidResponse { - // whether matching service can call history service to start the workflow task - bool is_valid = 1; + // whether matching service can call history service to start the 
workflow task + bool is_valid = 1; } message RecordActivityTaskHeartbeatRequest { - option (routing).task_token = "heartbeat_request.task_token"; + option (routing).task_token = "heartbeat_request.task_token"; - string namespace_id = 1; - temporal.api.workflowservice.v1.RecordActivityTaskHeartbeatRequest heartbeat_request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.RecordActivityTaskHeartbeatRequest heartbeat_request = 2; } message RecordActivityTaskHeartbeatResponse { - bool cancel_requested = 1; - bool activity_paused = 2; - bool activity_reset = 3; + bool cancel_requested = 1; + bool activity_paused = 2; + bool activity_reset = 3; } message RespondActivityTaskCompletedRequest { - option (routing).task_token = "complete_request.task_token"; + option (routing).task_token = "complete_request.task_token"; - string namespace_id = 1; - temporal.api.workflowservice.v1.RespondActivityTaskCompletedRequest complete_request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.RespondActivityTaskCompletedRequest complete_request = 2; } -message RespondActivityTaskCompletedResponse { -} +message RespondActivityTaskCompletedResponse {} message RespondActivityTaskFailedRequest { - option (routing).task_token = "failed_request.task_token"; + option (routing).task_token = "failed_request.task_token"; - string namespace_id = 1; - temporal.api.workflowservice.v1.RespondActivityTaskFailedRequest failed_request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.RespondActivityTaskFailedRequest failed_request = 2; } -message RespondActivityTaskFailedResponse { -} +message RespondActivityTaskFailedResponse {} message RespondActivityTaskCanceledRequest { - option (routing).task_token = "cancel_request.task_token"; + option (routing).task_token = "cancel_request.task_token"; - string namespace_id = 1; - temporal.api.workflowservice.v1.RespondActivityTaskCanceledRequest cancel_request = 2; + string namespace_id = 1; + 
temporal.api.workflowservice.v1.RespondActivityTaskCanceledRequest cancel_request = 2; } -message RespondActivityTaskCanceledResponse { -} +message RespondActivityTaskCanceledResponse {} message IsActivityTaskValidRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - temporal.server.api.clock.v1.VectorClock clock = 3; - int64 scheduled_event_id = 4; - // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. - int32 stamp = 5; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + temporal.server.api.clock.v1.VectorClock clock = 3; + int64 scheduled_event_id = 4; + // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. + int32 stamp = 5; } message IsActivityTaskValidResponse { - // whether matching service can call history service to start the activity task - bool is_valid = 1; + // whether matching service can call history service to start the activity task + bool is_valid = 1; } message SignalWorkflowExecutionRequest { - option (routing).workflow_id = "signal_request.workflow_execution.workflow_id"; + option (routing).workflow_id = "signal_request.workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.SignalWorkflowExecutionRequest signal_request = 2; - temporal.api.common.v1.WorkflowExecution external_workflow_execution = 3; - bool child_workflow_only = 4; + string namespace_id = 1; + temporal.api.workflowservice.v1.SignalWorkflowExecutionRequest signal_request = 2; + temporal.api.common.v1.WorkflowExecution external_workflow_execution = 3; + bool child_workflow_only = 4; } -message SignalWorkflowExecutionResponse { -} +message SignalWorkflowExecutionResponse {} message SignalWithStartWorkflowExecutionRequest { - option 
(routing).workflow_id = "signal_with_start_request.workflow_id"; + option (routing).workflow_id = "signal_with_start_request.workflow_id"; - string namespace_id = 1; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "with" is needed here. --) - temporal.api.workflowservice.v1.SignalWithStartWorkflowExecutionRequest signal_with_start_request = 2; + string namespace_id = 1; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "with" is needed here. --) + temporal.api.workflowservice.v1.SignalWithStartWorkflowExecutionRequest signal_with_start_request = 2; } message SignalWithStartWorkflowExecutionResponse { - string run_id = 1; - bool started = 2; + string run_id = 1; + bool started = 2; } message RemoveSignalMutableStateRequest { - option (routing).workflow_id = "workflow_execution.workflow_id"; + option (routing).workflow_id = "workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - string request_id = 3; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + string request_id = 3; } -message RemoveSignalMutableStateResponse { -} +message RemoveSignalMutableStateResponse {} message TerminateWorkflowExecutionRequest { - option (routing).workflow_id = "terminate_request.workflow_execution.workflow_id"; + option (routing).workflow_id = "terminate_request.workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.TerminateWorkflowExecutionRequest terminate_request = 2; - temporal.api.common.v1.WorkflowExecution external_workflow_execution = 3; - bool child_workflow_only = 4; + string namespace_id = 1; + temporal.api.workflowservice.v1.TerminateWorkflowExecutionRequest terminate_request = 2; + temporal.api.common.v1.WorkflowExecution external_workflow_execution = 3; + bool child_workflow_only = 4; } -message TerminateWorkflowExecutionResponse { -} +message 
TerminateWorkflowExecutionResponse {} message DeleteWorkflowExecutionRequest { - option (routing).workflow_id = "workflow_execution.workflow_id"; + option (routing).workflow_id = "workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - reserved 3; - bool closed_workflow_only = 4; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + reserved 3; + bool closed_workflow_only = 4; } -message DeleteWorkflowExecutionResponse { -} +message DeleteWorkflowExecutionResponse {} message ResetWorkflowExecutionRequest { - option (routing).workflow_id = "reset_request.workflow_execution.workflow_id"; + option (routing).workflow_id = "reset_request.workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.ResetWorkflowExecutionRequest reset_request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.ResetWorkflowExecutionRequest reset_request = 2; } message ResetWorkflowExecutionResponse { - string run_id = 1; + string run_id = 1; } message RequestCancelWorkflowExecutionRequest { - option (routing).workflow_id = "cancel_request.workflow_execution.workflow_id"; + option (routing).workflow_id = "cancel_request.workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.RequestCancelWorkflowExecutionRequest cancel_request = 2; - int64 external_initiated_event_id = 3; - temporal.api.common.v1.WorkflowExecution external_workflow_execution = 4; - bool child_workflow_only = 5; + string namespace_id = 1; + temporal.api.workflowservice.v1.RequestCancelWorkflowExecutionRequest cancel_request = 2; + int64 external_initiated_event_id = 3; + temporal.api.common.v1.WorkflowExecution external_workflow_execution = 4; + bool child_workflow_only = 5; } -message RequestCancelWorkflowExecutionResponse { -} +message RequestCancelWorkflowExecutionResponse {} message ScheduleWorkflowTaskRequest { - option 
(routing).workflow_id = "workflow_execution.workflow_id"; + option (routing).workflow_id = "workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - bool is_first_workflow_task = 3; - temporal.server.api.clock.v1.VectorClock child_clock = 4; - temporal.server.api.clock.v1.VectorClock parent_clock = 5; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + bool is_first_workflow_task = 3; + temporal.server.api.clock.v1.VectorClock child_clock = 4; + temporal.server.api.clock.v1.VectorClock parent_clock = 5; } -message ScheduleWorkflowTaskResponse { -} +message ScheduleWorkflowTaskResponse {} message VerifyFirstWorkflowTaskScheduledRequest { - option (routing).workflow_id = "workflow_execution.workflow_id"; + option (routing).workflow_id = "workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - temporal.server.api.clock.v1.VectorClock clock = 3; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + temporal.server.api.clock.v1.VectorClock clock = 3; } -message VerifyFirstWorkflowTaskScheduledResponse { -} +message VerifyFirstWorkflowTaskScheduledResponse {} /** * RecordChildExecutionCompletedRequest is used for reporting the completion of child execution to parent workflow @@ -606,501 +592,485 @@ message VerifyFirstWorkflowTaskScheduledResponse { * child creates multiple runs through ContinueAsNew before finally completing. 
**/ message RecordChildExecutionCompletedRequest { - option (routing).workflow_id = "parent_execution.workflow_id"; + option (routing).workflow_id = "parent_execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution parent_execution = 2; - int64 parent_initiated_id = 3; - temporal.api.common.v1.WorkflowExecution child_execution = 4; - temporal.api.history.v1.HistoryEvent completion_event = 5; - temporal.server.api.clock.v1.VectorClock clock = 6; - int64 parent_initiated_version = 7; - string child_first_execution_run_id = 8; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution parent_execution = 2; + int64 parent_initiated_id = 3; + temporal.api.common.v1.WorkflowExecution child_execution = 4; + temporal.api.history.v1.HistoryEvent completion_event = 5; + temporal.server.api.clock.v1.VectorClock clock = 6; + int64 parent_initiated_version = 7; + string child_first_execution_run_id = 8; } -message RecordChildExecutionCompletedResponse { -} +message RecordChildExecutionCompletedResponse {} message VerifyChildExecutionCompletionRecordedRequest { - option (routing).workflow_id = "parent_execution.workflow_id"; + option (routing).workflow_id = "parent_execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution parent_execution = 2; - temporal.api.common.v1.WorkflowExecution child_execution = 3; - int64 parent_initiated_id = 4; - int64 parent_initiated_version = 5; - temporal.server.api.clock.v1.VectorClock clock = 6; - bool resend_parent = 7; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution parent_execution = 2; + temporal.api.common.v1.WorkflowExecution child_execution = 3; + int64 parent_initiated_id = 4; + int64 parent_initiated_version = 5; + temporal.server.api.clock.v1.VectorClock clock = 6; + bool resend_parent = 7; } -message VerifyChildExecutionCompletionRecordedResponse { -} +message VerifyChildExecutionCompletionRecordedResponse {} message 
DescribeWorkflowExecutionRequest { - option (routing).workflow_id = "request.execution.workflow_id"; + option (routing).workflow_id = "request.execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.DescribeWorkflowExecutionRequest request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.DescribeWorkflowExecutionRequest request = 2; } message DescribeWorkflowExecutionResponse { - temporal.api.workflow.v1.WorkflowExecutionConfig execution_config = 1; - temporal.api.workflow.v1.WorkflowExecutionInfo workflow_execution_info = 2; - repeated temporal.api.workflow.v1.PendingActivityInfo pending_activities = 3; - repeated temporal.api.workflow.v1.PendingChildExecutionInfo pending_children = 4; - temporal.api.workflow.v1.PendingWorkflowTaskInfo pending_workflow_task = 5; - repeated temporal.api.workflow.v1.CallbackInfo callbacks = 6; - repeated temporal.api.workflow.v1.PendingNexusOperationInfo pending_nexus_operations = 7; - temporal.api.workflow.v1.WorkflowExecutionExtendedInfo workflow_extended_info = 8; + temporal.api.workflow.v1.WorkflowExecutionConfig execution_config = 1; + temporal.api.workflow.v1.WorkflowExecutionInfo workflow_execution_info = 2; + repeated temporal.api.workflow.v1.PendingActivityInfo pending_activities = 3; + repeated temporal.api.workflow.v1.PendingChildExecutionInfo pending_children = 4; + temporal.api.workflow.v1.PendingWorkflowTaskInfo pending_workflow_task = 5; + repeated temporal.api.workflow.v1.CallbackInfo callbacks = 6; + repeated temporal.api.workflow.v1.PendingNexusOperationInfo pending_nexus_operations = 7; + temporal.api.workflow.v1.WorkflowExecutionExtendedInfo workflow_extended_info = 8; } message ReplicateEventsV2Request { - option (routing).workflow_id = "workflow_execution.workflow_id"; + option (routing).workflow_id = "workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - repeated 
temporal.server.api.history.v1.VersionHistoryItem version_history_items = 3; - temporal.api.common.v1.DataBlob events = 4; - // New run events does not need version history since there is no prior events. - temporal.api.common.v1.DataBlob new_run_events = 5; - temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 6; - string new_run_id = 7; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + repeated temporal.server.api.history.v1.VersionHistoryItem version_history_items = 3; + temporal.api.common.v1.DataBlob events = 4; + // New run events does not need version history since there is no prior events. + temporal.api.common.v1.DataBlob new_run_events = 5; + temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 6; + string new_run_id = 7; } -message ReplicateEventsV2Response { -} +message ReplicateEventsV2Response {} message ReplicateWorkflowStateRequest { - option (routing).workflow_id = "workflow_state.execution_info.workflow_id"; + option (routing).workflow_id = "workflow_state.execution_info.workflow_id"; - temporal.server.api.persistence.v1.WorkflowMutableState workflow_state = 1; - string remote_cluster = 2; - string namespace_id= 3; - bool is_force_replication = 4; - bool is_close_transfer_task_acked = 5; + temporal.server.api.persistence.v1.WorkflowMutableState workflow_state = 1; + string remote_cluster = 2; + string namespace_id = 3; + bool is_force_replication = 4; + bool is_close_transfer_task_acked = 5; } -message ReplicateWorkflowStateResponse { -} +message ReplicateWorkflowStateResponse {} message SyncShardStatusRequest { - option (routing).shard_id = "shard_id"; + option (routing).shard_id = "shard_id"; - string source_cluster = 1; - int32 shard_id = 2; - google.protobuf.Timestamp status_time = 3; + string source_cluster = 1; + int32 shard_id = 2; + google.protobuf.Timestamp status_time = 3; } -message SyncShardStatusResponse { -} +message SyncShardStatusResponse {} 
message SyncActivityRequest { - option (routing).workflow_id = "workflow_id"; - - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - int64 version = 4; - int64 scheduled_event_id = 5; - google.protobuf.Timestamp scheduled_time = 6; - int64 started_event_id = 7; - google.protobuf.Timestamp started_time = 8; - google.protobuf.Timestamp last_heartbeat_time = 9; - temporal.api.common.v1.Payloads details = 10; - int32 attempt = 11; - temporal.api.failure.v1.Failure last_failure = 12; - string last_worker_identity = 13; - temporal.server.api.history.v1.VersionHistory version_history = 14; - temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 15; - // build ID of the worker who received this activity last time - string last_started_build_id = 16; - // workflows redirect_counter value when this activity started last time - int64 last_started_redirect_counter = 17; - - // The first time the activity was scheduled. - google.protobuf.Timestamp first_scheduled_time = 18; - // The last time an activity attempt completion was recorded by the server. - google.protobuf.Timestamp last_attempt_complete_time = 19; - // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. - int32 stamp = 20; - // Indicates if the activity is paused. - bool paused = 21; - - // Retry policy for the activity. 
- google.protobuf.Duration retry_initial_interval = 22; - google.protobuf.Duration retry_maximum_interval = 23; - int32 retry_maximum_attempts = 24; - double retry_backoff_coefficient = 25; - int64 start_version = 26; + option (routing).workflow_id = "workflow_id"; + + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + int64 version = 4; + int64 scheduled_event_id = 5; + google.protobuf.Timestamp scheduled_time = 6; + int64 started_event_id = 7; + google.protobuf.Timestamp started_time = 8; + google.protobuf.Timestamp last_heartbeat_time = 9; + temporal.api.common.v1.Payloads details = 10; + int32 attempt = 11; + temporal.api.failure.v1.Failure last_failure = 12; + string last_worker_identity = 13; + temporal.server.api.history.v1.VersionHistory version_history = 14; + temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 15; + // build ID of the worker who received this activity last time + string last_started_build_id = 16; + // workflows redirect_counter value when this activity started last time + int64 last_started_redirect_counter = 17; + + // The first time the activity was scheduled. + google.protobuf.Timestamp first_scheduled_time = 18; + // The last time an activity attempt completion was recorded by the server. + google.protobuf.Timestamp last_attempt_complete_time = 19; + // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. + int32 stamp = 20; + // Indicates if the activity is paused. + bool paused = 21; + + // Retry policy for the activity. 
+ google.protobuf.Duration retry_initial_interval = 22; + google.protobuf.Duration retry_maximum_interval = 23; + int32 retry_maximum_attempts = 24; + double retry_backoff_coefficient = 25; + int64 start_version = 26; } message SyncActivitiesRequest { - option (routing).workflow_id = "workflow_id"; + option (routing).workflow_id = "workflow_id"; - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - repeated ActivitySyncInfo activities_info = 4; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + repeated ActivitySyncInfo activities_info = 4; } message ActivitySyncInfo { - int64 version = 1; - int64 scheduled_event_id = 2; - google.protobuf.Timestamp scheduled_time = 3; - int64 started_event_id = 4; - google.protobuf.Timestamp started_time = 5 ; - google.protobuf.Timestamp last_heartbeat_time = 6; - temporal.api.common.v1.Payloads details = 7; - int32 attempt = 8; - temporal.api.failure.v1.Failure last_failure = 9; - string last_worker_identity = 10; - temporal.server.api.history.v1.VersionHistory version_history = 11; - // build ID of the worker who received this activity last time - string last_started_build_id = 12; - // workflows redirect_counter value when this activity started last time - int64 last_started_redirect_counter = 13; - - - // The first time the activity was scheduled. - google.protobuf.Timestamp first_scheduled_time = 18; - // The last time an activity attempt completion was recorded by the server. - google.protobuf.Timestamp last_attempt_complete_time = 19; - // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. - int32 stamp = 20; - // Indicates if the activity is paused. - bool paused = 21; - // Retry policy for the activity. It needs to be replicated now, since the activity properties can be updated. 
- google.protobuf.Duration retry_initial_interval = 22; - google.protobuf.Duration retry_maximum_interval = 23; - int32 retry_maximum_attempts = 24; - double retry_backoff_coefficient = 25; - int64 start_version = 26; - -} - -message SyncActivityResponse { -} + int64 version = 1; + int64 scheduled_event_id = 2; + google.protobuf.Timestamp scheduled_time = 3; + int64 started_event_id = 4; + google.protobuf.Timestamp started_time = 5; + google.protobuf.Timestamp last_heartbeat_time = 6; + temporal.api.common.v1.Payloads details = 7; + int32 attempt = 8; + temporal.api.failure.v1.Failure last_failure = 9; + string last_worker_identity = 10; + temporal.server.api.history.v1.VersionHistory version_history = 11; + // build ID of the worker who received this activity last time + string last_started_build_id = 12; + // workflows redirect_counter value when this activity started last time + int64 last_started_redirect_counter = 13; + + // The first time the activity was scheduled. + google.protobuf.Timestamp first_scheduled_time = 18; + // The last time an activity attempt completion was recorded by the server. + google.protobuf.Timestamp last_attempt_complete_time = 19; + // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. + int32 stamp = 20; + // Indicates if the activity is paused. + bool paused = 21; + // Retry policy for the activity. It needs to be replicated now, since the activity properties can be updated. 
+ google.protobuf.Duration retry_initial_interval = 22; + google.protobuf.Duration retry_maximum_interval = 23; + int32 retry_maximum_attempts = 24; + double retry_backoff_coefficient = 25; + int64 start_version = 26; +} + +message SyncActivityResponse {} message DescribeMutableStateRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - bool skip_force_reload = 3; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 4; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + bool skip_force_reload = 3; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 4; } message DescribeMutableStateResponse { - // CacheMutableState is only available when mutable state is in cache. - temporal.server.api.persistence.v1.WorkflowMutableState cache_mutable_state = 1; - // DatabaseMutableState is always available, - // but only loaded from database when mutable state is NOT in cache or skip_force_reload is false. - temporal.server.api.persistence.v1.WorkflowMutableState database_mutable_state = 2; + // CacheMutableState is only available when mutable state is in cache. + temporal.server.api.persistence.v1.WorkflowMutableState cache_mutable_state = 1; + // DatabaseMutableState is always available, + // but only loaded from database when mutable state is NOT in cache or skip_force_reload is false. + temporal.server.api.persistence.v1.WorkflowMutableState database_mutable_state = 2; } // At least one of the parameters needs to be provided. 
message DescribeHistoryHostRequest { - option (routing).custom = true; + option (routing).custom = true; - //ip:port - string host_address = 1; - int32 shard_id = 2; - string namespace_id = 3; - temporal.api.common.v1.WorkflowExecution workflow_execution = 4; + //ip:port + string host_address = 1; + int32 shard_id = 2; + string namespace_id = 3; + temporal.api.common.v1.WorkflowExecution workflow_execution = 4; } message DescribeHistoryHostResponse { - int32 shards_number = 1; - repeated int32 - shard_ids = 2; - temporal.server.api.namespace.v1.NamespaceCacheInfo namespace_cache = 3; - reserved 4; - string address = 5; + int32 shards_number = 1; + repeated int32 shard_ids = 2; + temporal.server.api.namespace.v1.NamespaceCacheInfo namespace_cache = 3; + reserved 4; + string address = 5; } message CloseShardRequest { - option (routing).shard_id = "shard_id"; + option (routing).shard_id = "shard_id"; - int32 shard_id = 1; + int32 shard_id = 1; } -message CloseShardResponse { -} +message CloseShardResponse {} message GetShardRequest { - option (routing).shard_id = "shard_id"; + option (routing).shard_id = "shard_id"; - int32 shard_id = 1; + int32 shard_id = 1; } message GetShardResponse { - temporal.server.api.persistence.v1.ShardInfo shard_info = 1; + temporal.server.api.persistence.v1.ShardInfo shard_info = 1; } message RemoveTaskRequest { - option (routing).shard_id = "shard_id"; + option (routing).shard_id = "shard_id"; - int32 shard_id = 1; - // The task category. See tasks.TaskCategoryRegistry for more. - int32 category = 2; - int64 task_id = 3; - google.protobuf.Timestamp visibility_time = 4; + int32 shard_id = 1; + // The task category. See tasks.TaskCategoryRegistry for more. 
+ int32 category = 2; + int64 task_id = 3; + google.protobuf.Timestamp visibility_time = 4; } -message RemoveTaskResponse { -} +message RemoveTaskResponse {} message GetReplicationMessagesRequest { - option (routing).custom = true; + option (routing).custom = true; - repeated temporal.server.api.replication.v1.ReplicationToken tokens = 1; - string cluster_name = 2; + repeated temporal.server.api.replication.v1.ReplicationToken tokens = 1; + string cluster_name = 2; } message GetReplicationMessagesResponse { - map shard_messages = 1; + map shard_messages = 1; } message GetDLQReplicationMessagesRequest { - option (routing).task_infos = "task_infos"; + option (routing).task_infos = "task_infos"; - repeated temporal.server.api.replication.v1.ReplicationTaskInfo task_infos = 1; + repeated temporal.server.api.replication.v1.ReplicationTaskInfo task_infos = 1; } message GetDLQReplicationMessagesResponse { - repeated temporal.server.api.replication.v1.ReplicationTask replication_tasks = 1; + repeated temporal.server.api.replication.v1.ReplicationTask replication_tasks = 1; } message QueryWorkflowRequest { - option (routing).workflow_id = "request.execution.workflow_id"; + option (routing).workflow_id = "request.execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.QueryWorkflowRequest request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.QueryWorkflowRequest request = 2; } message QueryWorkflowResponse { - temporal.api.workflowservice.v1.QueryWorkflowResponse response = 1; + temporal.api.workflowservice.v1.QueryWorkflowResponse response = 1; } message ReapplyEventsRequest { - option (routing).workflow_id = "request.workflow_execution.workflow_id"; + option (routing).workflow_id = "request.workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.server.api.adminservice.v1.ReapplyEventsRequest request = 2; + string namespace_id = 1; + temporal.server.api.adminservice.v1.ReapplyEventsRequest request = 2; } 
-message ReapplyEventsResponse { -} +message ReapplyEventsResponse {} message GetDLQMessagesRequest { - option (routing).shard_id = "shard_id"; + option (routing).shard_id = "shard_id"; - temporal.server.api.enums.v1.DeadLetterQueueType type = 1; - int32 shard_id = 2; - string source_cluster = 3; - int64 inclusive_end_message_id = 4; - int32 maximum_page_size = 5; - bytes next_page_token = 6; + temporal.server.api.enums.v1.DeadLetterQueueType type = 1; + int32 shard_id = 2; + string source_cluster = 3; + int64 inclusive_end_message_id = 4; + int32 maximum_page_size = 5; + bytes next_page_token = 6; } message GetDLQMessagesResponse { - temporal.server.api.enums.v1.DeadLetterQueueType type = 1; - repeated temporal.server.api.replication.v1.ReplicationTask replication_tasks = 2; - bytes next_page_token = 3; - repeated temporal.server.api.replication.v1.ReplicationTaskInfo replication_tasks_info = 4; + temporal.server.api.enums.v1.DeadLetterQueueType type = 1; + repeated temporal.server.api.replication.v1.ReplicationTask replication_tasks = 2; + bytes next_page_token = 3; + repeated temporal.server.api.replication.v1.ReplicationTaskInfo replication_tasks_info = 4; } message PurgeDLQMessagesRequest { - option (routing).shard_id = "shard_id"; + option (routing).shard_id = "shard_id"; - temporal.server.api.enums.v1.DeadLetterQueueType type = 1; - int32 shard_id = 2; - string source_cluster = 3; - int64 inclusive_end_message_id = 4; + temporal.server.api.enums.v1.DeadLetterQueueType type = 1; + int32 shard_id = 2; + string source_cluster = 3; + int64 inclusive_end_message_id = 4; } -message PurgeDLQMessagesResponse { -} +message PurgeDLQMessagesResponse {} message MergeDLQMessagesRequest { - option (routing).shard_id = "shard_id"; + option (routing).shard_id = "shard_id"; - temporal.server.api.enums.v1.DeadLetterQueueType type = 1; - int32 shard_id = 2; - string source_cluster = 3; - int64 inclusive_end_message_id = 4; - int32 maximum_page_size = 5; - bytes next_page_token 
= 6; + temporal.server.api.enums.v1.DeadLetterQueueType type = 1; + int32 shard_id = 2; + string source_cluster = 3; + int64 inclusive_end_message_id = 4; + int32 maximum_page_size = 5; + bytes next_page_token = 6; } message MergeDLQMessagesResponse { - bytes next_page_token = 1; + bytes next_page_token = 1; } message RefreshWorkflowTasksRequest { - option (routing).workflow_id = "request.execution.workflow_id"; + option (routing).workflow_id = "request.execution.workflow_id"; - string namespace_id = 1; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 3; - temporal.server.api.adminservice.v1.RefreshWorkflowTasksRequest request = 2; + string namespace_id = 1; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 3; + temporal.server.api.adminservice.v1.RefreshWorkflowTasksRequest request = 2; } -message RefreshWorkflowTasksResponse { -} +message RefreshWorkflowTasksResponse {} message GenerateLastHistoryReplicationTasksRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - repeated string target_clusters = 3; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 4; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + repeated string target_clusters = 3; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 4; } message GenerateLastHistoryReplicationTasksResponse { - int64 state_transition_count = 1; - int64 history_length = 2; + int64 state_transition_count = 1; + int64 history_length = 2; } message GetReplicationStatusRequest { - option (routing).custom = true; + option (routing).custom = true; - // Remote cluster names to query for. If omit, will return for all remote clusters. 
- repeated string remote_clusters = 1; + // Remote cluster names to query for. If omit, will return for all remote clusters. + repeated string remote_clusters = 1; } message GetReplicationStatusResponse { - repeated ShardReplicationStatus shards = 1; + repeated ShardReplicationStatus shards = 1; } message ShardReplicationStatus { - int32 shard_id = 1; - // Max replication task id of current cluster - int64 max_replication_task_id = 2; - // Local time on this shard - google.protobuf.Timestamp shard_local_time = 3; - map remote_clusters = 4; - map handover_namespaces = 5; + int32 shard_id = 1; + // Max replication task id of current cluster + int64 max_replication_task_id = 2; + // Local time on this shard + google.protobuf.Timestamp shard_local_time = 3; + map remote_clusters = 4; + map handover_namespaces = 5; - google.protobuf.Timestamp max_replication_task_visibility_time = 6; + google.protobuf.Timestamp max_replication_task_visibility_time = 6; } message HandoverNamespaceInfo { - // max replication task id when namespace transition to Handover state - int64 handover_replication_task_id = 1; + // max replication task id when namespace transition to Handover state + int64 handover_replication_task_id = 1; } message ShardReplicationStatusPerCluster { - // Acked replication task id - int64 acked_task_id = 1; - // Acked replication task creation time - google.protobuf.Timestamp acked_task_visibility_time = 2; + // Acked replication task id + int64 acked_task_id = 1; + // Acked replication task creation time + google.protobuf.Timestamp acked_task_visibility_time = 2; } message RebuildMutableStateRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; } -message RebuildMutableStateResponse { -} +message RebuildMutableStateResponse {} 
message ImportWorkflowExecutionRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - repeated temporal.api.common.v1.DataBlob history_batches = 3; - temporal.server.api.history.v1.VersionHistory version_history = 4; - bytes token = 5; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + repeated temporal.api.common.v1.DataBlob history_batches = 3; + temporal.server.api.history.v1.VersionHistory version_history = 4; + bytes token = 5; } message ImportWorkflowExecutionResponse { - bytes token = 1; - bool events_applied = 2; + bytes token = 1; + bool events_applied = 2; } message DeleteWorkflowVisibilityRecordRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - google.protobuf.Timestamp workflow_start_time = 3; - google.protobuf.Timestamp workflow_close_time = 4; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + google.protobuf.Timestamp workflow_start_time = 3; + google.protobuf.Timestamp workflow_close_time = 4; } -message DeleteWorkflowVisibilityRecordResponse { -} +message DeleteWorkflowVisibilityRecordResponse {} // (-- api-linter: core::0134=disabled // aip.dev/not-precedent: This service does not follow the update method AIP --) message UpdateWorkflowExecutionRequest { - option (routing).workflow_id = "request.workflow_execution.workflow_id"; + option (routing).workflow_id = "request.workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.UpdateWorkflowExecutionRequest request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.UpdateWorkflowExecutionRequest request = 2; } message UpdateWorkflowExecutionResponse { - 
temporal.api.workflowservice.v1.UpdateWorkflowExecutionResponse response = 1; + temporal.api.workflowservice.v1.UpdateWorkflowExecutionResponse response = 1; } message StreamWorkflowReplicationMessagesRequest { - option (routing).custom = true; + option (routing).custom = true; - oneof attributes { - temporal.server.api.replication.v1.SyncReplicationState sync_replication_state = 1; - } + oneof attributes { + temporal.server.api.replication.v1.SyncReplicationState sync_replication_state = 1; + } } message StreamWorkflowReplicationMessagesResponse { - oneof attributes { - temporal.server.api.replication.v1.WorkflowReplicationMessages messages = 1; - } + oneof attributes { + temporal.server.api.replication.v1.WorkflowReplicationMessages messages = 1; + } } message PollWorkflowExecutionUpdateRequest { - option (routing).workflow_id = "request.update_ref.workflow_execution.workflow_id"; + option (routing).workflow_id = "request.update_ref.workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.PollWorkflowExecutionUpdateRequest request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.PollWorkflowExecutionUpdateRequest request = 2; } message PollWorkflowExecutionUpdateResponse { - temporal.api.workflowservice.v1.PollWorkflowExecutionUpdateResponse response = 1; + temporal.api.workflowservice.v1.PollWorkflowExecutionUpdateResponse response = 1; } message GetWorkflowExecutionHistoryRequest { - option (routing).workflow_id = "request.execution.workflow_id"; + option (routing).workflow_id = "request.execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryRequest request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryRequest request = 2; } message GetWorkflowExecutionHistoryResponse { - temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryResponse response = 1; - temporal.api.history.v1.History history = 2; + 
temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryResponse response = 1; + temporal.api.history.v1.History history = 2; } // This message must be wire compatible with GetWorkflowExecutionHistoryResponse. message GetWorkflowExecutionHistoryResponseWithRaw { - temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryResponse response = 1; - repeated bytes history = 2; + temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryResponse response = 1; + repeated bytes history = 2; } message GetWorkflowExecutionHistoryReverseRequest { - option (routing).workflow_id = "request.execution.workflow_id"; + option (routing).workflow_id = "request.execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryReverseRequest request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryReverseRequest request = 2; } message GetWorkflowExecutionHistoryReverseResponse { - temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryReverseResponse response = 1; + temporal.api.workflowservice.v1.GetWorkflowExecutionHistoryReverseResponse response = 1; } /** @@ -1108,332 +1078,328 @@ message GetWorkflowExecutionHistoryReverseResponse { * EndEventId and EndEventVersion defines the end of the event to fetch. The end event is exclusive. 
**/ message GetWorkflowExecutionRawHistoryV2Request { - option (routing).workflow_id = "request.execution.workflow_id"; + option (routing).workflow_id = "request.execution.workflow_id"; - string namespace_id = 1; - temporal.server.api.adminservice.v1.GetWorkflowExecutionRawHistoryV2Request request = 2; + string namespace_id = 1; + temporal.server.api.adminservice.v1.GetWorkflowExecutionRawHistoryV2Request request = 2; } message GetWorkflowExecutionRawHistoryV2Response { - temporal.server.api.adminservice.v1.GetWorkflowExecutionRawHistoryV2Response response = 1; + temporal.server.api.adminservice.v1.GetWorkflowExecutionRawHistoryV2Response response = 1; } message GetWorkflowExecutionRawHistoryRequest { - option (routing).workflow_id = "request.execution.workflow_id"; + option (routing).workflow_id = "request.execution.workflow_id"; - string namespace_id = 1; - temporal.server.api.adminservice.v1.GetWorkflowExecutionRawHistoryRequest request = 2; + string namespace_id = 1; + temporal.server.api.adminservice.v1.GetWorkflowExecutionRawHistoryRequest request = 2; } message GetWorkflowExecutionRawHistoryResponse { - temporal.server.api.adminservice.v1.GetWorkflowExecutionRawHistoryResponse response = 1; + temporal.server.api.adminservice.v1.GetWorkflowExecutionRawHistoryResponse response = 1; } message ForceDeleteWorkflowExecutionRequest { - option (routing).workflow_id = "request.execution.workflow_id"; + option (routing).workflow_id = "request.execution.workflow_id"; - string namespace_id = 1; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 3; - temporal.server.api.adminservice.v1.DeleteWorkflowExecutionRequest request = 2; + string namespace_id = 1; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 3; + temporal.server.api.adminservice.v1.DeleteWorkflowExecutionRequest request = 2; } message ForceDeleteWorkflowExecutionResponse { - 
temporal.server.api.adminservice.v1.DeleteWorkflowExecutionResponse response = 1; + temporal.server.api.adminservice.v1.DeleteWorkflowExecutionResponse response = 1; } message GetDLQTasksRequest { - option (routing).any_host = true; + option (routing).any_host = true; - temporal.server.api.common.v1.HistoryDLQKey dlq_key = 1; - // page_size must be positive. Up to this many tasks will be returned. - int32 page_size = 2; - bytes next_page_token = 3; + temporal.server.api.common.v1.HistoryDLQKey dlq_key = 1; + // page_size must be positive. Up to this many tasks will be returned. + int32 page_size = 2; + bytes next_page_token = 3; } message GetDLQTasksResponse { - repeated temporal.server.api.common.v1.HistoryDLQTask dlq_tasks = 1; - // next_page_token is empty if there are no more results. However, the converse is not true. If there are no more - // results, this field may still be non-empty. This is to avoid having to do a count query to determine whether - // there are more results. - bytes next_page_token = 2; + repeated temporal.server.api.common.v1.HistoryDLQTask dlq_tasks = 1; + // next_page_token is empty if there are no more results. However, the converse is not true. If there are no more + // results, this field may still be non-empty. This is to avoid having to do a count query to determine whether + // there are more results. + bytes next_page_token = 2; } message DeleteDLQTasksRequest { - option (routing).any_host = true; + option (routing).any_host = true; - temporal.server.api.common.v1.HistoryDLQKey dlq_key = 1; - temporal.server.api.common.v1.HistoryDLQTaskMetadata inclusive_max_task_metadata = 2; + temporal.server.api.common.v1.HistoryDLQKey dlq_key = 1; + temporal.server.api.common.v1.HistoryDLQTaskMetadata inclusive_max_task_metadata = 2; } message DeleteDLQTasksResponse { - // messages_deleted is the total number of messages deleted in DeleteDLQTasks operation. 
- int64 messages_deleted = 1; + // messages_deleted is the total number of messages deleted in DeleteDLQTasks operation. + int64 messages_deleted = 1; } message ListQueuesRequest { - option (routing).any_host = true; + option (routing).any_host = true; - int32 queue_type = 1; - int32 page_size = 2; - bytes next_page_token = 3; + int32 queue_type = 1; + int32 page_size = 2; + bytes next_page_token = 3; } message ListQueuesResponse { - message QueueInfo { - string queue_name = 1; - int64 message_count = 2; - int64 last_message_id = 3; - } - repeated QueueInfo queues = 1; - bytes next_page_token = 2; + message QueueInfo { + string queue_name = 1; + int64 message_count = 2; + int64 last_message_id = 3; + } + repeated QueueInfo queues = 1; + bytes next_page_token = 2; } message AddTasksRequest { - option (routing).shard_id = "shard_id"; - - // Even though we can obtain the shard ID from the tasks, we still need the shard_id in the request for routing. If - // not, it would be possible to include tasks for shards that belong to different hosts, and we'd need to fan-out the - // request, which would be more complicated. - int32 shard_id = 1; - - message Task { - // category_id is needed to deserialize the tasks. See TaskCategory for a list of options here. However, keep in mind - // that the list of valid options is registered dynamically with the server in the history/tasks package, so that - // enum is not comprehensive. - int32 category_id = 1; - // blob is the serialized task. - temporal.api.common.v1.DataBlob blob = 2; - } + option (routing).shard_id = "shard_id"; + + // Even though we can obtain the shard ID from the tasks, we still need the shard_id in the request for routing. If + // not, it would be possible to include tasks for shards that belong to different hosts, and we'd need to fan-out the + // request, which would be more complicated. + int32 shard_id = 1; - // A list of tasks to enqueue or re-enqueue. 
- repeated Task tasks = 2; + message Task { + // category_id is needed to deserialize the tasks. See TaskCategory for a list of options here. However, keep in mind + // that the list of valid options is registered dynamically with the server in the history/tasks package, so that + // enum is not comprehensive. + int32 category_id = 1; + // blob is the serialized task. + temporal.api.common.v1.DataBlob blob = 2; + } + + // A list of tasks to enqueue or re-enqueue. + repeated Task tasks = 2; } message AddTasksResponse {} message ListTasksRequest { - option (routing).shard_id = "request.shard_id"; + option (routing).shard_id = "request.shard_id"; - temporal.server.api.adminservice.v1.ListHistoryTasksRequest request = 1; + temporal.server.api.adminservice.v1.ListHistoryTasksRequest request = 1; } message ListTasksResponse { - temporal.server.api.adminservice.v1.ListHistoryTasksResponse response = 1; + temporal.server.api.adminservice.v1.ListHistoryTasksResponse response = 1; } message CompleteNexusOperationChasmRequest { - option (routing).chasm_component_ref = "completion.component_ref"; - - // Completion token - holds information for locating an entity and the corresponding component. - temporal.server.api.token.v1.NexusOperationCompletion completion = 1; - oneof outcome { - // Result of a successful operation, only set if state == successful. - temporal.api.common.v1.Payload success = 2; - // Operation failure, only set if state != successful. - temporal.api.failure.v1.Failure failure = 3; - } - // Time when the operation was closed. - google.protobuf.Timestamp close_time = 4; + option (routing).chasm_component_ref = "completion.component_ref"; + + // Completion token - holds information for locating an entity and the corresponding component. + temporal.server.api.token.v1.NexusOperationCompletion completion = 1; + oneof outcome { + // Result of a successful operation, only set if state == successful. 
+ temporal.api.common.v1.Payload success = 2; + // Operation failure, only set if state != successful. + temporal.api.failure.v1.Failure failure = 3; + } + // Time when the operation was closed. + google.protobuf.Timestamp close_time = 4; } message CompleteNexusOperationChasmResponse {} message CompleteNexusOperationRequest { - option (routing) = { - namespace_id: "completion.namespace_id" - workflow_id: "completion.workflow_id" - }; - - // Completion token - holds information for locating a run and the corresponding operation state machine. - temporal.server.api.token.v1.NexusOperationCompletion completion = 1; - // Operation state - may only be successful / failed / canceled. - string state = 2; - oneof outcome { - // Result of a successful operation, only set if state == successful. - temporal.api.common.v1.Payload success = 3; - // Operation failure, only set if state != successful. - temporal.api.nexus.v1.Failure failure = 4; - } - // Operation token - used when the completion is received before the started response. - string operation_token = 5; - // Time the operation was started. Used when completion is received before the started response. - google.protobuf.Timestamp start_time = 6; - // Links to be attached to a fabricated start event if completion is received before started response. - repeated temporal.api.common.v1.Link links = 7; -} - -message CompleteNexusOperationResponse { -} + option (routing) = { + namespace_id: "completion.namespace_id" + workflow_id: "completion.workflow_id" + }; + + // Completion token - holds information for locating a run and the corresponding operation state machine. + temporal.server.api.token.v1.NexusOperationCompletion completion = 1; + // Operation state - may only be successful / failed / canceled. + string state = 2; + oneof outcome { + // Result of a successful operation, only set if state == successful. + temporal.api.common.v1.Payload success = 3; + // Operation failure, only set if state != successful. 
+ temporal.api.nexus.v1.Failure failure = 4; + } + // Operation token - used when the completion is received before the started response. + string operation_token = 5; + // Time the operation was started. Used when completion is received before the started response. + google.protobuf.Timestamp start_time = 6; + // Links to be attached to a fabricated start event if completion is received before started response. + repeated temporal.api.common.v1.Link links = 7; +} + +message CompleteNexusOperationResponse {} message InvokeStateMachineMethodRequest { - option (routing).workflow_id = "workflow_id"; + option (routing).workflow_id = "workflow_id"; - // TODO(Tianyu): This is the same as NexusOperationsCompletion but obviously is not about Nexus. This is because - // State machine signaling is a generalization of the Nexus mechanisms. Perhaps eventually they should be merged. - // Namespace UUID. - string namespace_id = 1; - // Workflow ID. - string workflow_id = 2; - // Run ID at the time this token was generated. - string run_id = 3; - // Reference including the path to the backing Operation state machine and a version + transition count for - // staleness checks. - temporal.server.api.persistence.v1.StateMachineRef ref = 4; + // TODO(Tianyu): This is the same as NexusOperationsCompletion but obviously is not about Nexus. This is because + // State machine signaling is a generalization of the Nexus mechanisms. Perhaps eventually they should be merged. + // Namespace UUID. + string namespace_id = 1; + // Workflow ID. + string workflow_id = 2; + // Run ID at the time this token was generated. + string run_id = 3; + // Reference including the path to the backing Operation state machine and a version + transition count for + // staleness checks. + temporal.server.api.persistence.v1.StateMachineRef ref = 4; - // The method name to invoke. 
Methods must be explicitly registered for the target state machine in the state - // machine registry, and accept an argument type of HistoryEvent that is the completion event of the completed - // workflow. - string method_name = 5; + // The method name to invoke. Methods must be explicitly registered for the target state machine in the state + // machine registry, and accept an argument type of HistoryEvent that is the completion event of the completed + // workflow. + string method_name = 5; - // Input, in serialized bytes, to the method. Users specify a deserializer during method registration for each state machine. - bytes input = 6; + // Input, in serialized bytes, to the method. Users specify a deserializer during method registration for each state machine. + bytes input = 6; } message InvokeStateMachineMethodResponse { - // Output, in serialized bytes, of the method. Users specify a serializer during method registration for each state machine. - bytes output = 1; + // Output, in serialized bytes, of the method. Users specify a serializer during method registration for each state machine. + bytes output = 1; } message DeepHealthCheckRequest { - option (routing).custom = true; + option (routing).custom = true; - string host_address = 1; + string host_address = 1; } message DeepHealthCheckResponse { - temporal.server.api.enums.v1.HealthState state = 1; - // Per-check diagnostic results. Populated for all checks regardless of state. - repeated temporal.server.api.health.v1.HealthCheck checks = 2; + temporal.server.api.enums.v1.HealthState state = 1; + // Per-check diagnostic results. Populated for all checks regardless of state. 
+ repeated temporal.server.api.health.v1.HealthCheck checks = 2; } message SyncWorkflowStateRequest { - option (routing).workflow_id = "execution.workflow_id"; + option (routing).workflow_id = "execution.workflow_id"; - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - temporal.server.api.persistence.v1.VersionedTransition versioned_transition = 3; - temporal.server.api.history.v1.VersionHistories version_histories = 4; - int32 target_cluster_id = 5; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 6; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + temporal.server.api.persistence.v1.VersionedTransition versioned_transition = 3; + temporal.server.api.history.v1.VersionHistories version_histories = 4; + int32 target_cluster_id = 5; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 6; } message SyncWorkflowStateResponse { - reserved 1; - reserved 2; - reserved 3; - reserved 4; - replication.v1.VersionedTransitionArtifact versioned_transition_artifact = 5; + reserved 1; + reserved 2; + reserved 3; + reserved 4; + replication.v1.VersionedTransitionArtifact versioned_transition_artifact = 5; } // (-- api-linter: core::0134::request-mask-required=disabled // (-- api-linter: core::0134::request-resource-required=disabled message UpdateActivityOptionsRequest { - option (routing).workflow_id = "update_request.execution.workflow_id"; + option (routing).workflow_id = "update_request.execution.workflow_id"; - // Namespace ID of the workflow which scheduled this activity - string namespace_id = 1; + // Namespace ID of the workflow which scheduled this activity + string namespace_id = 1; - temporal.api.workflowservice.v1.UpdateActivityOptionsRequest update_request = 2; + temporal.api.workflowservice.v1.UpdateActivityOptionsRequest update_request = 2; } message UpdateActivityOptionsResponse { - // Activity options after an update - 
temporal.api.activity.v1.ActivityOptions activity_options = 1; + // Activity options after an update + temporal.api.activity.v1.ActivityOptions activity_options = 1; } message PauseActivityRequest { - option (routing).workflow_id = "frontend_request.execution.workflow_id"; + option (routing).workflow_id = "frontend_request.execution.workflow_id"; - // Namespace ID of the workflow which scheduled this activity - string namespace_id = 1; + // Namespace ID of the workflow which scheduled this activity + string namespace_id = 1; - temporal.api.workflowservice.v1.PauseActivityRequest frontend_request = 2; + temporal.api.workflowservice.v1.PauseActivityRequest frontend_request = 2; } -message PauseActivityResponse { -} +message PauseActivityResponse {} message UnpauseActivityRequest { - option (routing).workflow_id = "frontend_request.execution.workflow_id"; + option (routing).workflow_id = "frontend_request.execution.workflow_id"; - // Namespace ID of the workflow which scheduled this activity - string namespace_id = 1; + // Namespace ID of the workflow which scheduled this activity + string namespace_id = 1; - temporal.api.workflowservice.v1.UnpauseActivityRequest frontend_request = 2; + temporal.api.workflowservice.v1.UnpauseActivityRequest frontend_request = 2; } -message UnpauseActivityResponse { -} +message UnpauseActivityResponse {} message ResetActivityRequest { - option (routing).workflow_id = "frontend_request.execution.workflow_id"; + option (routing).workflow_id = "frontend_request.execution.workflow_id"; - // Namespace ID of the workflow which scheduled this activity - string namespace_id = 1; + // Namespace ID of the workflow which scheduled this activity + string namespace_id = 1; - temporal.api.workflowservice.v1.ResetActivityRequest frontend_request = 2; + temporal.api.workflowservice.v1.ResetActivityRequest frontend_request = 2; } -message ResetActivityResponse { -} +message ResetActivityResponse {} // (-- api-linter: 
core::0134::request-mask-required=disabled // (-- api-linter: core::0134::request-resource-required=disabled message UpdateWorkflowExecutionOptionsRequest { - option (routing).workflow_id = "update_request.workflow_execution.workflow_id"; + option (routing).workflow_id = "update_request.workflow_execution.workflow_id"; - string namespace_id = 1; - temporal.api.workflowservice.v1.UpdateWorkflowExecutionOptionsRequest update_request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.UpdateWorkflowExecutionOptionsRequest update_request = 2; } message UpdateWorkflowExecutionOptionsResponse { - // Workflow Execution options after update. - temporal.api.workflow.v1.WorkflowExecutionOptions workflow_execution_options = 1; + // Workflow Execution options after update. + temporal.api.workflow.v1.WorkflowExecutionOptions workflow_execution_options = 1; } message PauseWorkflowExecutionRequest { - option (routing).workflow_id = "pause_request.workflow_id"; + option (routing).workflow_id = "pause_request.workflow_id"; - // Namespace ID of the workflow which is being paused - string namespace_id = 1; + // Namespace ID of the workflow which is being paused + string namespace_id = 1; - temporal.api.workflowservice.v1.PauseWorkflowExecutionRequest pause_request = 2; + temporal.api.workflowservice.v1.PauseWorkflowExecutionRequest pause_request = 2; } -message PauseWorkflowExecutionResponse { } +message PauseWorkflowExecutionResponse {} message UnpauseWorkflowExecutionRequest { - option (routing).workflow_id = "unpause_request.workflow_id"; + option (routing).workflow_id = "unpause_request.workflow_id"; - // Namespace ID of the workflow which is being unpaused - string namespace_id = 1; + // Namespace ID of the workflow which is being unpaused + string namespace_id = 1; - temporal.api.workflowservice.v1.UnpauseWorkflowExecutionRequest unpause_request = 2; + temporal.api.workflowservice.v1.UnpauseWorkflowExecutionRequest unpause_request = 2; } -message 
UnpauseWorkflowExecutionResponse { } +message UnpauseWorkflowExecutionResponse {} message StartNexusOperationRequest { - option (routing).shard_id = "shard_id"; + option (routing).shard_id = "shard_id"; - string namespace_id = 1; - int32 shard_id = 2; - temporal.api.nexus.v1.StartOperationRequest request = 3; + string namespace_id = 1; + int32 shard_id = 2; + temporal.api.nexus.v1.StartOperationRequest request = 3; } message StartNexusOperationResponse { - temporal.api.nexus.v1.StartOperationResponse response = 1; + temporal.api.nexus.v1.StartOperationResponse response = 1; } message CancelNexusOperationRequest { - option (routing).shard_id = "shard_id"; + option (routing).shard_id = "shard_id"; - string namespace_id = 1; - int32 shard_id = 2; - temporal.api.nexus.v1.CancelOperationRequest request = 3; + string namespace_id = 1; + int32 shard_id = 2; + temporal.api.nexus.v1.CancelOperationRequest request = 3; } message CancelNexusOperationResponse { - temporal.api.nexus.v1.CancelOperationResponse response = 1; + temporal.api.nexus.v1.CancelOperationResponse response = 1; } diff --git a/proto/internal/temporal/server/api/historyservice/v1/service.proto b/proto/internal/temporal/server/api/historyservice/v1/service.proto index 9526fb4211..4dcd9ce077 100644 --- a/proto/internal/temporal/server/api/historyservice/v1/service.proto +++ b/proto/internal/temporal/server/api/historyservice/v1/service.proto @@ -1,516 +1,517 @@ syntax = "proto3"; package temporal.server.api.historyservice.v1; -option go_package = "go.temporal.io/server/api/historyservice/v1;historyservice"; -import "temporal/server/api/historyservice/v1/request_response.proto"; import "temporal/server/api/common/v1/api_category.proto"; +import "temporal/server/api/historyservice/v1/request_response.proto"; + +option go_package = "go.temporal.io/server/api/historyservice/v1;historyservice"; // HistoryService provides API to start a new long running workflow instance, as well as query and update the history // 
of workflow instances already created. service HistoryService { - // StartWorkflowExecution starts a new long running workflow instance. It will create the instance with - // 'WorkflowExecutionStarted' event in history and also schedule the first WorkflowTask for the worker to produce the - // initial list of commands for this instance. It will return 'WorkflowExecutionAlreadyStartedError', if an instance already - // exists with same workflowId. - rpc StartWorkflowExecution (StartWorkflowExecutionRequest) returns (StartWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Returns the information from mutable state of workflow execution. - // It fails with 'EntityNotExistError' if specified workflow execution in unknown to the service. - // It returns CurrentBranchChangedError if the workflow version branch has changed. - rpc GetMutableState (GetMutableStateRequest) returns (GetMutableStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Returns the information from mutable state of workflow execution. - // It fails with 'EntityNotExistError' if specified workflow execution in unknown to the service. - // It returns CurrentBranchChangedError if the workflow version branch has changed. - rpc PollMutableState (PollMutableStateRequest) returns (PollMutableStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - // Reset the sticky task queue related information in mutable state of a given workflow. - // Things cleared are: - // 1. StickyTaskQueue - // 2. 
StickyScheduleToStartTimeout - rpc ResetStickyTaskQueue (ResetStickyTaskQueueRequest) returns (ResetStickyTaskQueueResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RecordWorkflowTaskStarted is called by the Matchingservice before it hands a workflow task to the application worker in response to - // a PollWorkflowTaskQueue call. It records in the history the event that the workflow task has started. It will return 'TaskAlreadyStartedError', - // if the workflow's execution history already includes a record of the event starting. - rpc RecordWorkflowTaskStarted (RecordWorkflowTaskStartedRequest) returns (RecordWorkflowTaskStartedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RecordActivityTaskStarted is called by the Matchingservice before it hands a workflow task to the application worker in response to - // a PollActivityTaskQueue call. It records in the history the event that the workflow task has started. It will return 'TaskAlreadyStartedError', - // if the workflow's execution history already includes a record of the event starting. - rpc RecordActivityTaskStarted (RecordActivityTaskStartedRequest) returns (RecordActivityTaskStartedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RespondWorkflowTaskCompleted is called by application worker to complete a WorkflowTask handed as a result of - // 'PollWorkflowTaskQueue' API call. Completing a WorkflowTask will result in new result in new commands for the - // workflow execution and potentially new ActivityTasks created for correspondent commands. It will also create a - // WorkflowTaskCompleted event in the history for that session. Use the 'taskToken' provided as response of - // PollWorkflowTaskQueue API call for completing the WorkflowTask. 
- rpc RespondWorkflowTaskCompleted (RespondWorkflowTaskCompletedRequest) returns (RespondWorkflowTaskCompletedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RespondWorkflowTaskFailed is called by application worker to indicate failure. This results in - // WorkflowTaskFailedEvent written to the history and a new WorkflowTask created. This API can be used by client to - // either clear sticky task queue or report ny panics during WorkflowTask processing. - rpc RespondWorkflowTaskFailed (RespondWorkflowTaskFailedRequest) returns (RespondWorkflowTaskFailedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // IsWorkflowTaskValid is called by matching service checking whether the workflow task is valid. - rpc IsWorkflowTaskValid (IsWorkflowTaskValidRequest) returns (IsWorkflowTaskValidResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RecordActivityTaskHeartbeat is called by application worker while it is processing an ActivityTask. If worker fails - // to heartbeat within 'heartbeatTimeoutSeconds' interval for the ActivityTask, then it will be marked as timedout and - // 'ActivityTaskTimedOut' event will be written to the workflow history. Calling 'RecordActivityTaskHeartbeat' will - // fail with 'EntityNotExistsError' in such situations. Use the 'taskToken' provided as response of - // PollActivityTaskQueue API call for heartbeating. - rpc RecordActivityTaskHeartbeat (RecordActivityTaskHeartbeatRequest) returns (RecordActivityTaskHeartbeatResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RespondActivityTaskCompleted is called by application worker when it is done processing an ActivityTask. 
It will - // result in a new 'ActivityTaskCompleted' event being written to the workflow history and a new WorkflowTask - // created for the workflow so new commands could be made. Use the 'taskToken' provided as response of - // PollActivityTaskQueue API call for completion. It fails with 'EntityNotExistsError' if the taskToken is not valid - // anymore due to activity timeout. - rpc RespondActivityTaskCompleted (RespondActivityTaskCompletedRequest) returns (RespondActivityTaskCompletedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RespondActivityTaskFailed is called by application worker when it is done processing an ActivityTask. It will - // result in a new 'ActivityTaskFailed' event being written to the workflow history and a new WorkflowTask - // created for the workflow instance so new commands could be made. Use the 'taskToken' provided as response of - // PollActivityTaskQueue API call for completion. It fails with 'EntityNotExistsError' if the taskToken is not valid - // anymore due to activity timeout. - rpc RespondActivityTaskFailed (RespondActivityTaskFailedRequest) returns (RespondActivityTaskFailedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RespondActivityTaskCanceled is called by application worker when it is successfully canceled an ActivityTask. It will - // result in a new 'ActivityTaskCanceled' event being written to the workflow history and a new WorkflowTask - // created for the workflow instance so new commands could be made. Use the 'taskToken' provided as response of - // PollActivityTaskQueue API call for completion. It fails with 'EntityNotExistsError' if the taskToken is not valid - // anymore due to activity timeout. 
- rpc RespondActivityTaskCanceled (RespondActivityTaskCanceledRequest) returns (RespondActivityTaskCanceledResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // IsActivityTaskValid is called by matching service checking whether the workflow task is valid. - rpc IsActivityTaskValid (IsActivityTaskValidRequest) returns (IsActivityTaskValidResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // SignalWorkflowExecution is used to send a signal event to running workflow execution. This results in - // WorkflowExecutionSignaled event recorded in the history and a workflow task being created for the execution. - rpc SignalWorkflowExecution (SignalWorkflowExecutionRequest) returns (SignalWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // (-- api-linter: core::0136::prepositions=disabled - // aip.dev/not-precedent: "With" is needed here. --) - // SignalWithStartWorkflowExecution is used to ensure sending a signal event to a workflow execution. - // If workflow is running, this results in WorkflowExecutionSignaled event recorded in the history - // and a workflow task being created for the execution. - // If workflow is not running or not found, it will first try start workflow with given WorkflowIdResuePolicy, - // and record WorkflowExecutionStarted and WorkflowExecutionSignaled event in case of success. - // It will return `WorkflowExecutionAlreadyStartedError` if start workflow failed with given policy. - rpc SignalWithStartWorkflowExecution (SignalWithStartWorkflowExecutionRequest) returns (SignalWithStartWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // ExecuteMultiOperation executes multiple operations within a single workflow. 
- rpc ExecuteMultiOperation (ExecuteMultiOperationRequest) returns (ExecuteMultiOperationResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - // RemoveSignalMutableState is used to remove a signal request Id that was previously recorded. This is currently - // used to clean execution info when signal workflow task finished. - rpc RemoveSignalMutableState (RemoveSignalMutableStateRequest) returns (RemoveSignalMutableStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // TerminateWorkflowExecution terminates an existing workflow execution by recording WorkflowExecutionTerminated event - // in the history and immediately terminating the execution instance. - rpc TerminateWorkflowExecution (TerminateWorkflowExecutionRequest) returns (TerminateWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // DeleteWorkflowExecution asynchronously deletes a specific Workflow Execution (when WorkflowExecution.run_id is - // provided) or the latest Workflow Execution (when WorkflowExecution.run_id is not provided). If the Workflow - // Execution is Running, it will be terminated before deletion. - rpc DeleteWorkflowExecution (DeleteWorkflowExecutionRequest) returns (DeleteWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // ResetWorkflowExecution reset an existing workflow execution by a firstEventId of a existing event batch - // in the history and immediately terminating the current execution instance. - // After reset, the history will grow from nextFirstEventId. 
- rpc ResetWorkflowExecution (ResetWorkflowExecutionRequest) returns (ResetWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // UpdateWorkflowExecutionOptions modifies the options of an existing workflow execution. - // Currently the option that can be updated is setting and unsetting a versioning behavior override. - // (-- api-linter: core::0134::method-signature=disabled - // (-- api-linter: core::0134::response-message-name=disabled - rpc UpdateWorkflowExecutionOptions (UpdateWorkflowExecutionOptionsRequest) returns (UpdateWorkflowExecutionOptionsResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RequestCancelWorkflowExecution is called by application worker when it wants to request cancellation of a workflow instance. - // It will result in a new 'WorkflowExecutionCancelRequested' event being written to the workflow history and a new WorkflowTask - // created for the workflow instance so new commands could be made. It fails with 'EntityNotExistsError' if the workflow is not valid - // anymore due to completion or doesn't exist. - rpc RequestCancelWorkflowExecution (RequestCancelWorkflowExecutionRequest) returns (RequestCancelWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // ScheduleWorkflowTask is used for creating a workflow task for already started workflow execution. This is mainly - // used by transfer queue processor during the processing of StartChildWorkflowExecution task, where it first starts - // child execution without creating the workflow task and then calls this API after updating the mutable state of - // parent execution. 
- rpc ScheduleWorkflowTask (ScheduleWorkflowTaskRequest) returns (ScheduleWorkflowTaskResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // VerifyFirstWorkflowTaskScheduled checks if workflow has its first workflow task scheduled. - // This is only used by standby transfer start child workflow task logic to make sure parent workflow has - // scheduled first workflow task in child after recording child started in its mutable state; otherwise, - // during namespace failover, it's possible that none of the clusters will schedule the first workflow task. - // NOTE: This is an experimental API. If later we found there are more verification API and there's a clear pattern - // of how verification is done, we may unify them into one generic verfication API. - rpc VerifyFirstWorkflowTaskScheduled (VerifyFirstWorkflowTaskScheduledRequest) returns (VerifyFirstWorkflowTaskScheduledResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RecordChildExecutionCompleted is used for reporting the completion of child workflow execution to parent. - // This is mainly called by transfer queue processor during the processing of DeleteExecution task. - rpc RecordChildExecutionCompleted (RecordChildExecutionCompletedRequest) returns (RecordChildExecutionCompletedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // VerifyChildExecutionCompletionRecorded checks if child completion result is recorded in parent workflow. - // This is only used by standby transfer close execution logic to make sure parent workflow has the result - // recorded before completing the task, otherwise during namespace failover, it's possible that none of the - // clusters will record the child result in parent workflow. - // NOTE: This is an experimental API. 
If later we found there are more verification API and there's a clear pattern - // of how verification is done, we may unify them into one generic verfication API. - rpc VerifyChildExecutionCompletionRecorded (VerifyChildExecutionCompletionRecordedRequest) returns (VerifyChildExecutionCompletionRecordedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // DescribeWorkflowExecution returns information about the specified workflow execution. - rpc DescribeWorkflowExecution (DescribeWorkflowExecutionRequest) returns (DescribeWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // ReplicateEventsV2 replicates workflow history events - rpc ReplicateEventsV2 (ReplicateEventsV2Request) returns (ReplicateEventsV2Response) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // ReplicateWorkflowState replicates workflow state - rpc ReplicateWorkflowState(ReplicateWorkflowStateRequest) returns (ReplicateWorkflowStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // SyncShardStatus sync the status between shards. - rpc SyncShardStatus (SyncShardStatusRequest) returns (SyncShardStatusResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // SyncActivity sync the activity status. - rpc SyncActivity (SyncActivityRequest) returns (SyncActivityResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // DescribeMutableState returns information about the internal states of workflow mutable state. 
- rpc DescribeMutableState (DescribeMutableStateRequest) returns (DescribeMutableStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // DescribeHistoryHost returns information about the internal states of a history host. - rpc DescribeHistoryHost (DescribeHistoryHostRequest) returns (DescribeHistoryHostResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // CloseShard close the shard. - rpc CloseShard (CloseShardRequest) returns (CloseShardResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // GetShard gets the ShardInfo - rpc GetShard (GetShardRequest) returns (GetShardResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RemoveTask remove task based on type, taskid, shardid. - rpc RemoveTask (RemoveTaskRequest) returns (RemoveTaskResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // GetReplicationMessages return replication messages based on the read level - rpc GetReplicationMessages (GetReplicationMessagesRequest) returns (GetReplicationMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // GetDLQReplicationMessages return replication messages based on dlq info - rpc GetDLQReplicationMessages(GetDLQReplicationMessagesRequest) returns(GetDLQReplicationMessagesResponse){ - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // QueryWorkflow returns query result for a specified workflow execution. - rpc QueryWorkflow (QueryWorkflowRequest) returns (QueryWorkflowResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - // ReapplyEvents applies stale events to the current workflow and current run. 
- rpc ReapplyEvents (ReapplyEventsRequest) returns (ReapplyEventsResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // GetDLQMessages returns messages from DLQ. - rpc GetDLQMessages(GetDLQMessagesRequest) returns (GetDLQMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // (-- api-linter: core::0165::response-message-name=disabled - // aip.dev/not-precedent: --) - // PurgeDLQMessages purges messages from DLQ. - rpc PurgeDLQMessages(PurgeDLQMessagesRequest) returns (PurgeDLQMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // MergeDLQMessages merges messages from DLQ. - rpc MergeDLQMessages(MergeDLQMessagesRequest) returns (MergeDLQMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // RefreshWorkflowTasks refreshes all tasks of a workflow. - rpc RefreshWorkflowTasks(RefreshWorkflowTasksRequest) returns (RefreshWorkflowTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // GenerateLastHistoryReplicationTasks generate a replication task for last history event for requested workflow execution - rpc GenerateLastHistoryReplicationTasks(GenerateLastHistoryReplicationTasksRequest) returns (GenerateLastHistoryReplicationTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc GetReplicationStatus(GetReplicationStatusRequest) returns (GetReplicationStatusResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RebuildMutableState attempts to rebuild mutable state according to persisted history events. 
- // NOTE: this is experimental API - rpc RebuildMutableState (RebuildMutableStateRequest) returns (RebuildMutableStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // ImportWorkflowExecution attempts to import workflow according to persisted history events. - // NOTE: this is experimental API - rpc ImportWorkflowExecution (ImportWorkflowExecutionRequest) returns (ImportWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // DeleteWorkflowVisibilityRecord force delete a workflow's visibility record. - // This is used by admin delete workflow execution API to delete visibility record as frontend - // visibility manager doesn't support write operations - rpc DeleteWorkflowVisibilityRecord (DeleteWorkflowVisibilityRecordRequest) returns (DeleteWorkflowVisibilityRecordResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - // (-- api-linter: core::0134=disabled - // aip.dev/not-precedent: This service does not follow the update method API --) - rpc UpdateWorkflowExecution(UpdateWorkflowExecutionRequest) returns (UpdateWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - // (-- api-linter: core::0134=disabled - // aip.dev/not-precedent: This service does not follow the update method API --) - rpc PollWorkflowExecutionUpdate(PollWorkflowExecutionUpdateRequest) returns (PollWorkflowExecutionUpdateResponse){ - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - rpc StreamWorkflowReplicationMessages(stream StreamWorkflowReplicationMessagesRequest) returns (stream StreamWorkflowReplicationMessagesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc GetWorkflowExecutionHistory(GetWorkflowExecutionHistoryRequest) returns 
(GetWorkflowExecutionHistoryResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc GetWorkflowExecutionHistoryReverse(GetWorkflowExecutionHistoryReverseRequest) returns (GetWorkflowExecutionHistoryReverseResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc GetWorkflowExecutionRawHistoryV2(GetWorkflowExecutionRawHistoryV2Request) returns (GetWorkflowExecutionRawHistoryV2Response) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc GetWorkflowExecutionRawHistory(GetWorkflowExecutionRawHistoryRequest) returns (GetWorkflowExecutionRawHistoryResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc ForceDeleteWorkflowExecution(ForceDeleteWorkflowExecutionRequest) returns (ForceDeleteWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc GetDLQTasks (GetDLQTasksRequest) returns (GetDLQTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc DeleteDLQTasks (DeleteDLQTasksRequest) returns (DeleteDLQTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc ListQueues (ListQueuesRequest) returns (ListQueuesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // The AddTasks API is used to add history tasks to a shard. The first use-case for this API is the DLQ. When we are - // unable to process history tasks, we add them to a DLQ. When they need to be retried, we take them out of the DLQ - // and add them back using this API. 
We expose this via an API instead of doing this in the history engine because - // replication tasks, which are DLQ'd on the target cluster need to be added back to the queue on the source - // cluster, so there is already a network boundary. There is a maximum of 1000 tasks per request. There must be at - // least one task per request. If any task in the list does not have the same shard ID as the request, the request - // will fail with an InvalidArgument error. It is ok to have tasks for different workflow runs as long as they are - // in the same shard. Calls to the persistence API will be batched by workflow run. - rpc AddTasks (AddTasksRequest) returns (AddTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc ListTasks (ListTasksRequest) returns (ListTasksResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Complete an async Nexus Operation using a completion token. The completion state could be successful, failed, or - // canceled. - // - // Deprecated. Will be renamed to CompleteNexusOperationHsm in a future release. - rpc CompleteNexusOperation (CompleteNexusOperationRequest) returns (CompleteNexusOperationResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Complete an async Nexus Operation using a CHASM reference. The completion - // state could be successful, failed, or canceled. 
- rpc CompleteNexusOperationChasm (CompleteNexusOperationChasmRequest) returns (CompleteNexusOperationChasmResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc InvokeStateMachineMethod (InvokeStateMachineMethodRequest) returns (InvokeStateMachineMethodResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Deep health check history service dependencies health status - rpc DeepHealthCheck (DeepHealthCheckRequest) returns (DeepHealthCheckResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; - } - - rpc SyncWorkflowState(SyncWorkflowStateRequest) returns (SyncWorkflowStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - // UpdateActivityOptions is called by the client to update the options of an activity - // (-- api-linter: core::0134::method-signature=disabled - // (-- api-linter: core::0134::response-message-name=disabled - rpc UpdateActivityOptions (UpdateActivityOptionsRequest) returns (UpdateActivityOptionsResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // PauseActivity pauses the execution of an activity specified by its ID. - // Returns a `NotFound` error if there is no pending activity with the provided ID. - // - // Pausing an activity means: - // - If the activity is currently waiting for a retry or is running and subsequently fails, - // it will not be rescheduled until it is unpause. - // - If the activity is already paused, calling this method will have no effect. - // - If the activity is running and finishes successfully, the activity will be completed. - // - If the activity is running and finishes with failure: - // * if there is no retry left - the activity will be completed. - // * if there are more retries left - the activity will be paused. 
- // For long-running activities: - // - activities in paused state will send a cancellation with "activity_paused" set to 'true' in response to 'RecordActivityTaskHeartbeat'. - // - The activity should respond to the cancellation accordingly. - // For long-running activities: - // - activity in paused state will send a cancellation with "activity_paused" set to 'true' in response to 'RecordActivityTaskHeartbeat'. - // - The activity should respond to the cancellation accordingly. - // (-- api-linter: core::0134::method-signature=disabled - // (-- api-linter: core::0134::response-message-name=disabled - rpc PauseActivity (PauseActivityRequest) returns (PauseActivityResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // UnpauseActivity unpauses the execution of an activity specified by its ID. - // - // If activity is not paused, this call will have no effect. - // If the activity is waiting for retry, it will be scheduled immediately (* see 'jitter' flag). - // Once the activity is unpause, all timeout timers will be regenerated. - // - // Flags: - // 'jitter': the activity will be scheduled at a random time within the jitter duration. - // 'reset_attempts': the number of attempts will be reset. - // 'reset_heartbeat': the activity heartbeat timer and heartbeats will be reset. - // - // Returns a `NotFound` error if there is no pending activity with the provided ID. - // (-- api-linter: core::0134::method-signature=disabled - // (-- api-linter: core::0134::response-message-name=disabled - rpc UnpauseActivity (UnpauseActivityRequest) returns (UnpauseActivityResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // ResetActivity resets the execution of an activity specified by its ID. - // - // Resetting an activity means: - // * number of attempts will be reset to 0. - // * activity timeouts will be reset. 
- // * if the activity is waiting for retry, and it is not paused or 'keep_paused' is not provided: - // it will be scheduled immediately (* see 'jitter' flag), - // - // Flags: - // - // 'jitter': the activity will be scheduled at a random time within the jitter duration. - // If the activity currently paused it will be unpause, unless 'keep_paused' flag is provided. - // 'reset_heartbeats': the activity heartbeat timer and heartbeats will be reset. - // 'keep_paused': if the activity is paused, it will remain paused. - // - // Returns a `NotFound` error if there is no pending activity with the provided ID. - // (-- api-linter: core::0134::method-signature=disabled - // (-- api-linter: core::0134::response-message-name=disabled - rpc ResetActivity (ResetActivityRequest) returns (ResetActivityResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // PauseWorkflowExecution pauses the workflow execution specified in the request. - rpc PauseWorkflowExecution (PauseWorkflowExecutionRequest) returns (PauseWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // UnpauseWorkflowExecution unpauses the workflow execution specified in the request. - rpc UnpauseWorkflowExecution (UnpauseWorkflowExecutionRequest) returns (UnpauseWorkflowExecutionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // StartNexusOperation starts a Nexus operation on the __temporal_system endpoint. - rpc StartNexusOperation (StartNexusOperationRequest) returns (StartNexusOperationResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // CancelNexusOperation cancels a Nexus operation on the __temporal_system endpoint. 
- rpc CancelNexusOperation (CancelNexusOperationRequest) returns (CancelNexusOperationResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } + // StartWorkflowExecution starts a new long running workflow instance. It will create the instance with + // 'WorkflowExecutionStarted' event in history and also schedule the first WorkflowTask for the worker to produce the + // initial list of commands for this instance. It will return 'WorkflowExecutionAlreadyStartedError', if an instance already + // exists with same workflowId. + rpc StartWorkflowExecution(StartWorkflowExecutionRequest) returns (StartWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Returns the information from mutable state of workflow execution. + // It fails with 'EntityNotExistError' if specified workflow execution is unknown to the service. + // It returns CurrentBranchChangedError if the workflow version branch has changed. + rpc GetMutableState(GetMutableStateRequest) returns (GetMutableStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Returns the information from mutable state of workflow execution. + // It fails with 'EntityNotExistError' if specified workflow execution is unknown to the service. + // It returns CurrentBranchChangedError if the workflow version branch has changed. + rpc PollMutableState(PollMutableStateRequest) returns (PollMutableStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + // Reset the sticky task queue related information in mutable state of a given workflow. + // Things cleared are: + // 1. StickyTaskQueue + // 2.
StickyScheduleToStartTimeout + rpc ResetStickyTaskQueue(ResetStickyTaskQueueRequest) returns (ResetStickyTaskQueueResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RecordWorkflowTaskStarted is called by the Matching service before it hands a workflow task to the application worker in response to + // a PollWorkflowTaskQueue call. It records in the history the event that the workflow task has started. It will return 'TaskAlreadyStartedError', + // if the workflow's execution history already includes a record of the event starting. + rpc RecordWorkflowTaskStarted(RecordWorkflowTaskStartedRequest) returns (RecordWorkflowTaskStartedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RecordActivityTaskStarted is called by the Matching service before it hands an activity task to the application worker in response to + // a PollActivityTaskQueue call. It records in the history the event that the activity task has started. It will return 'TaskAlreadyStartedError', + // if the workflow's execution history already includes a record of the event starting. + rpc RecordActivityTaskStarted(RecordActivityTaskStartedRequest) returns (RecordActivityTaskStartedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RespondWorkflowTaskCompleted is called by application worker to complete a WorkflowTask handed as a result of + // 'PollWorkflowTaskQueue' API call. Completing a WorkflowTask will result in new commands for the + // workflow execution and potentially new ActivityTasks created for correspondent commands. It will also create a + // WorkflowTaskCompleted event in the history for that session. Use the 'taskToken' provided as response of + // PollWorkflowTaskQueue API call for completing the WorkflowTask.
+ rpc RespondWorkflowTaskCompleted(RespondWorkflowTaskCompletedRequest) returns (RespondWorkflowTaskCompletedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RespondWorkflowTaskFailed is called by application worker to indicate failure. This results in + // WorkflowTaskFailedEvent written to the history and a new WorkflowTask created. This API can be used by client to + // either clear sticky task queue or report any panics during WorkflowTask processing. + rpc RespondWorkflowTaskFailed(RespondWorkflowTaskFailedRequest) returns (RespondWorkflowTaskFailedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // IsWorkflowTaskValid is called by matching service checking whether the workflow task is valid. + rpc IsWorkflowTaskValid(IsWorkflowTaskValidRequest) returns (IsWorkflowTaskValidResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RecordActivityTaskHeartbeat is called by application worker while it is processing an ActivityTask. If worker fails + // to heartbeat within 'heartbeatTimeoutSeconds' interval for the ActivityTask, then it will be marked as timed out and + // 'ActivityTaskTimedOut' event will be written to the workflow history. Calling 'RecordActivityTaskHeartbeat' will + // fail with 'EntityNotExistsError' in such situations. Use the 'taskToken' provided as response of + // PollActivityTaskQueue API call for heartbeating. + rpc RecordActivityTaskHeartbeat(RecordActivityTaskHeartbeatRequest) returns (RecordActivityTaskHeartbeatResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RespondActivityTaskCompleted is called by application worker when it is done processing an ActivityTask.
It will + // result in a new 'ActivityTaskCompleted' event being written to the workflow history and a new WorkflowTask + // created for the workflow so new commands could be made. Use the 'taskToken' provided as response of + // PollActivityTaskQueue API call for completion. It fails with 'EntityNotExistsError' if the taskToken is not valid + // anymore due to activity timeout. + rpc RespondActivityTaskCompleted(RespondActivityTaskCompletedRequest) returns (RespondActivityTaskCompletedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RespondActivityTaskFailed is called by application worker when it is done processing an ActivityTask. It will + // result in a new 'ActivityTaskFailed' event being written to the workflow history and a new WorkflowTask + // created for the workflow instance so new commands could be made. Use the 'taskToken' provided as response of + // PollActivityTaskQueue API call for completion. It fails with 'EntityNotExistsError' if the taskToken is not valid + // anymore due to activity timeout. + rpc RespondActivityTaskFailed(RespondActivityTaskFailedRequest) returns (RespondActivityTaskFailedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RespondActivityTaskCanceled is called by application worker when it is successfully canceled an ActivityTask. It will + // result in a new 'ActivityTaskCanceled' event being written to the workflow history and a new WorkflowTask + // created for the workflow instance so new commands could be made. Use the 'taskToken' provided as response of + // PollActivityTaskQueue API call for completion. It fails with 'EntityNotExistsError' if the taskToken is not valid + // anymore due to activity timeout. 
+ rpc RespondActivityTaskCanceled(RespondActivityTaskCanceledRequest) returns (RespondActivityTaskCanceledResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // IsActivityTaskValid is called by matching service checking whether the activity task is valid. + rpc IsActivityTaskValid(IsActivityTaskValidRequest) returns (IsActivityTaskValidResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // SignalWorkflowExecution is used to send a signal event to running workflow execution. This results in + // WorkflowExecutionSignaled event recorded in the history and a workflow task being created for the execution. + rpc SignalWorkflowExecution(SignalWorkflowExecutionRequest) returns (SignalWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // (-- api-linter: core::0136::prepositions=disabled + // aip.dev/not-precedent: "With" is needed here. --) + // SignalWithStartWorkflowExecution is used to ensure sending a signal event to a workflow execution. + // If workflow is running, this results in WorkflowExecutionSignaled event recorded in the history + // and a workflow task being created for the execution. + // If workflow is not running or not found, it will first try to start the workflow with given WorkflowIdReusePolicy, + // and record WorkflowExecutionStarted and WorkflowExecutionSignaled event in case of success. + // It will return `WorkflowExecutionAlreadyStartedError` if start workflow failed with given policy. + rpc SignalWithStartWorkflowExecution(SignalWithStartWorkflowExecutionRequest) returns (SignalWithStartWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // ExecuteMultiOperation executes multiple operations within a single workflow.
+ rpc ExecuteMultiOperation(ExecuteMultiOperationRequest) returns (ExecuteMultiOperationResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + // RemoveSignalMutableState is used to remove a signal request Id that was previously recorded. This is currently + // used to clean execution info when signal workflow task finished. + rpc RemoveSignalMutableState(RemoveSignalMutableStateRequest) returns (RemoveSignalMutableStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // TerminateWorkflowExecution terminates an existing workflow execution by recording WorkflowExecutionTerminated event + // in the history and immediately terminating the execution instance. + rpc TerminateWorkflowExecution(TerminateWorkflowExecutionRequest) returns (TerminateWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // DeleteWorkflowExecution asynchronously deletes a specific Workflow Execution (when WorkflowExecution.run_id is + // provided) or the latest Workflow Execution (when WorkflowExecution.run_id is not provided). If the Workflow + // Execution is Running, it will be terminated before deletion. + rpc DeleteWorkflowExecution(DeleteWorkflowExecutionRequest) returns (DeleteWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // ResetWorkflowExecution resets an existing workflow execution by a firstEventId of an existing event batch + // in the history and immediately terminating the current execution instance. + // After reset, the history will grow from nextFirstEventId.
+ rpc ResetWorkflowExecution(ResetWorkflowExecutionRequest) returns (ResetWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // UpdateWorkflowExecutionOptions modifies the options of an existing workflow execution. + // Currently the option that can be updated is setting and unsetting a versioning behavior override. + // (-- api-linter: core::0134::method-signature=disabled + // (-- api-linter: core::0134::response-message-name=disabled + rpc UpdateWorkflowExecutionOptions(UpdateWorkflowExecutionOptionsRequest) returns (UpdateWorkflowExecutionOptionsResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RequestCancelWorkflowExecution is called by application worker when it wants to request cancellation of a workflow instance. + // It will result in a new 'WorkflowExecutionCancelRequested' event being written to the workflow history and a new WorkflowTask + // created for the workflow instance so new commands could be made. It fails with 'EntityNotExistsError' if the workflow is not valid + // anymore due to completion or doesn't exist. + rpc RequestCancelWorkflowExecution(RequestCancelWorkflowExecutionRequest) returns (RequestCancelWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // ScheduleWorkflowTask is used for creating a workflow task for already started workflow execution. This is mainly + // used by transfer queue processor during the processing of StartChildWorkflowExecution task, where it first starts + // child execution without creating the workflow task and then calls this API after updating the mutable state of + // parent execution. 
+ rpc ScheduleWorkflowTask(ScheduleWorkflowTaskRequest) returns (ScheduleWorkflowTaskResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // VerifyFirstWorkflowTaskScheduled checks if workflow has its first workflow task scheduled. + // This is only used by standby transfer start child workflow task logic to make sure parent workflow has + // scheduled first workflow task in child after recording child started in its mutable state; otherwise, + // during namespace failover, it's possible that none of the clusters will schedule the first workflow task. + // NOTE: This is an experimental API. If later we find there are more verification APIs and there's a clear pattern + // of how verification is done, we may unify them into one generic verification API. + rpc VerifyFirstWorkflowTaskScheduled(VerifyFirstWorkflowTaskScheduledRequest) returns (VerifyFirstWorkflowTaskScheduledResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RecordChildExecutionCompleted is used for reporting the completion of child workflow execution to parent. + // This is mainly called by transfer queue processor during the processing of DeleteExecution task. + rpc RecordChildExecutionCompleted(RecordChildExecutionCompletedRequest) returns (RecordChildExecutionCompletedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // VerifyChildExecutionCompletionRecorded checks if child completion result is recorded in parent workflow. + // This is only used by standby transfer close execution logic to make sure parent workflow has the result + // recorded before completing the task, otherwise during namespace failover, it's possible that none of the + // clusters will record the child result in parent workflow. + // NOTE: This is an experimental API.
If later we find there are more verification APIs and there's a clear pattern + // of how verification is done, we may unify them into one generic verification API. + rpc VerifyChildExecutionCompletionRecorded(VerifyChildExecutionCompletionRecordedRequest) returns (VerifyChildExecutionCompletionRecordedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // DescribeWorkflowExecution returns information about the specified workflow execution. + rpc DescribeWorkflowExecution(DescribeWorkflowExecutionRequest) returns (DescribeWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // ReplicateEventsV2 replicates workflow history events + rpc ReplicateEventsV2(ReplicateEventsV2Request) returns (ReplicateEventsV2Response) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // ReplicateWorkflowState replicates workflow state + rpc ReplicateWorkflowState(ReplicateWorkflowStateRequest) returns (ReplicateWorkflowStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // SyncShardStatus sync the status between shards. + rpc SyncShardStatus(SyncShardStatusRequest) returns (SyncShardStatusResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // SyncActivity sync the activity status. + rpc SyncActivity(SyncActivityRequest) returns (SyncActivityResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // DescribeMutableState returns information about the internal states of workflow mutable state.
+ rpc DescribeMutableState(DescribeMutableStateRequest) returns (DescribeMutableStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // DescribeHistoryHost returns information about the internal states of a history host. + rpc DescribeHistoryHost(DescribeHistoryHostRequest) returns (DescribeHistoryHostResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // CloseShard close the shard. + rpc CloseShard(CloseShardRequest) returns (CloseShardResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // GetShard gets the ShardInfo + rpc GetShard(GetShardRequest) returns (GetShardResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RemoveTask remove task based on type, taskid, shardid. + rpc RemoveTask(RemoveTaskRequest) returns (RemoveTaskResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // GetReplicationMessages return replication messages based on the read level + rpc GetReplicationMessages(GetReplicationMessagesRequest) returns (GetReplicationMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // GetDLQReplicationMessages return replication messages based on dlq info + rpc GetDLQReplicationMessages(GetDLQReplicationMessagesRequest) returns (GetDLQReplicationMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // QueryWorkflow returns query result for a specified workflow execution. + rpc QueryWorkflow(QueryWorkflowRequest) returns (QueryWorkflowResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + // ReapplyEvents applies stale events to the current workflow and current run. 
+ rpc ReapplyEvents(ReapplyEventsRequest) returns (ReapplyEventsResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // GetDLQMessages returns messages from DLQ. + rpc GetDLQMessages(GetDLQMessagesRequest) returns (GetDLQMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // (-- api-linter: core::0165::response-message-name=disabled + // aip.dev/not-precedent: --) + // PurgeDLQMessages purges messages from DLQ. + rpc PurgeDLQMessages(PurgeDLQMessagesRequest) returns (PurgeDLQMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // MergeDLQMessages merges messages from DLQ. + rpc MergeDLQMessages(MergeDLQMessagesRequest) returns (MergeDLQMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // RefreshWorkflowTasks refreshes all tasks of a workflow. + rpc RefreshWorkflowTasks(RefreshWorkflowTasksRequest) returns (RefreshWorkflowTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // GenerateLastHistoryReplicationTasks generate a replication task for last history event for requested workflow execution + rpc GenerateLastHistoryReplicationTasks(GenerateLastHistoryReplicationTasksRequest) returns (GenerateLastHistoryReplicationTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc GetReplicationStatus(GetReplicationStatusRequest) returns (GetReplicationStatusResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RebuildMutableState attempts to rebuild mutable state according to persisted history events. 
+ // NOTE: this is experimental API + rpc RebuildMutableState(RebuildMutableStateRequest) returns (RebuildMutableStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // ImportWorkflowExecution attempts to import workflow according to persisted history events. + // NOTE: this is experimental API + rpc ImportWorkflowExecution(ImportWorkflowExecutionRequest) returns (ImportWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // DeleteWorkflowVisibilityRecord force delete a workflow's visibility record. + // This is used by admin delete workflow execution API to delete visibility record as frontend + // visibility manager doesn't support write operations + rpc DeleteWorkflowVisibilityRecord(DeleteWorkflowVisibilityRecordRequest) returns (DeleteWorkflowVisibilityRecordResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + // (-- api-linter: core::0134=disabled + // aip.dev/not-precedent: This service does not follow the update method API --) + rpc UpdateWorkflowExecution(UpdateWorkflowExecutionRequest) returns (UpdateWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + // (-- api-linter: core::0134=disabled + // aip.dev/not-precedent: This service does not follow the update method API --) + rpc PollWorkflowExecutionUpdate(PollWorkflowExecutionUpdateRequest) returns (PollWorkflowExecutionUpdateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + rpc StreamWorkflowReplicationMessages(stream StreamWorkflowReplicationMessagesRequest) returns (stream StreamWorkflowReplicationMessagesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc GetWorkflowExecutionHistory(GetWorkflowExecutionHistoryRequest) returns 
(GetWorkflowExecutionHistoryResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc GetWorkflowExecutionHistoryReverse(GetWorkflowExecutionHistoryReverseRequest) returns (GetWorkflowExecutionHistoryReverseResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc GetWorkflowExecutionRawHistoryV2(GetWorkflowExecutionRawHistoryV2Request) returns (GetWorkflowExecutionRawHistoryV2Response) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc GetWorkflowExecutionRawHistory(GetWorkflowExecutionRawHistoryRequest) returns (GetWorkflowExecutionRawHistoryResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc ForceDeleteWorkflowExecution(ForceDeleteWorkflowExecutionRequest) returns (ForceDeleteWorkflowExecutionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc GetDLQTasks(GetDLQTasksRequest) returns (GetDLQTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc DeleteDLQTasks(DeleteDLQTasksRequest) returns (DeleteDLQTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM; + } + + rpc ListQueues(ListQueuesRequest) returns (ListQueuesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // The AddTasks API is used to add history tasks to a shard. The first use-case for this API is the DLQ. When we are + // unable to process history tasks, we add them to a DLQ. When they need to be retried, we take them out of the DLQ + // and add them back using this API. 
We expose this via an API instead of doing this in the history engine because + // replication tasks, which are DLQ'd on the target cluster need to be added back to the queue on the source + // cluster, so there is already a network boundary. There is a maximum of 1000 tasks per request. There must be at + // least one task per request. If any task in the list does not have the same shard ID as the request, the request + // will fail with an InvalidArgument error. It is ok to have tasks for different workflow runs as long as they are + // in the same shard. Calls to the persistence API will be batched by workflow run. + rpc AddTasks(AddTasksRequest) returns (AddTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc ListTasks(ListTasksRequest) returns (ListTasksResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Complete an async Nexus Operation using a completion token. The completion state could be successful, failed, or + // canceled. + // + // Deprecated. Will be renamed to CompleteNexusOperationHsm in a future release. + rpc CompleteNexusOperation(CompleteNexusOperationRequest) returns (CompleteNexusOperationResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Complete an async Nexus Operation using a CHASM reference. The completion + // state could be successful, failed, or canceled. 
+ rpc CompleteNexusOperationChasm(CompleteNexusOperationChasmRequest) returns (CompleteNexusOperationChasmResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+
+ rpc InvokeStateMachineMethod(InvokeStateMachineMethodRequest) returns (InvokeStateMachineMethodResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+
+ // DeepHealthCheck performs a deep health check of history service dependencies.
+ rpc DeepHealthCheck(DeepHealthCheckRequest) returns (DeepHealthCheckResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_SYSTEM;
+ }
+
+ rpc SyncWorkflowState(SyncWorkflowStateRequest) returns (SyncWorkflowStateResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+ // UpdateActivityOptions is called by the client to update the options of an activity
+ // (-- api-linter: core::0134::method-signature=disabled
+ // (-- api-linter: core::0134::response-message-name=disabled
+ rpc UpdateActivityOptions(UpdateActivityOptionsRequest) returns (UpdateActivityOptionsResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+
+ // PauseActivity pauses the execution of an activity specified by its ID.
+ // Returns a `NotFound` error if there is no pending activity with the provided ID.
+ //
+ // Pausing an activity means:
+ // - If the activity is currently waiting for a retry or is running and subsequently fails,
+ // it will not be rescheduled until it is unpaused.
+ // - If the activity is already paused, calling this method will have no effect.
+ // - If the activity is running and finishes successfully, the activity will be completed.
+ // - If the activity is running and finishes with failure:
+ // * if there is no retry left - the activity will be completed.
+ // * if there are more retries left - the activity will be paused.
+ // For long-running activities:
+ // - activities in paused state will send a cancellation with "activity_paused" set to 'true' in response to 'RecordActivityTaskHeartbeat'.
+ // - The activity should respond to the cancellation accordingly.
+ // (-- api-linter: core::0134::method-signature=disabled
+ // (-- api-linter: core::0134::response-message-name=disabled
+ rpc PauseActivity(PauseActivityRequest) returns (PauseActivityResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+
+ // UnpauseActivity unpauses the execution of an activity specified by its ID.
+ //
+ // If the activity is not paused, this call will have no effect.
+ // If the activity is waiting for retry, it will be scheduled immediately (* see 'jitter' flag).
+ // Once the activity is unpaused, all timeout timers will be regenerated.
+ //
+ // Flags:
+ // 'jitter': the activity will be scheduled at a random time within the jitter duration.
+ // 'reset_attempts': the number of attempts will be reset.
+ // 'reset_heartbeat': the activity heartbeat timer and heartbeats will be reset.
+ //
+ // Returns a `NotFound` error if there is no pending activity with the provided ID.
+ // (-- api-linter: core::0134::method-signature=disabled
+ // (-- api-linter: core::0134::response-message-name=disabled
+ rpc UnpauseActivity(UnpauseActivityRequest) returns (UnpauseActivityResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+
+ // ResetActivity resets the execution of an activity specified by its ID.
+ //
+ // Resetting an activity means:
+ // * number of attempts will be reset to 0.
+ // * activity timeouts will be reset.
+ // * if the activity is waiting for retry, and it is not paused or 'keep_paused' is not provided:
+ // it will be scheduled immediately (* see 'jitter' flag).
+ //
+ // Flags:
+ //
+ // 'jitter': the activity will be scheduled at a random time within the jitter duration.
+ // If the activity is currently paused it will be unpaused, unless the 'keep_paused' flag is provided.
+ // 'reset_heartbeats': the activity heartbeat timer and heartbeats will be reset.
+ // 'keep_paused': if the activity is paused, it will remain paused.
+ //
+ // Returns a `NotFound` error if there is no pending activity with the provided ID.
+ // (-- api-linter: core::0134::method-signature=disabled
+ // (-- api-linter: core::0134::response-message-name=disabled
+ rpc ResetActivity(ResetActivityRequest) returns (ResetActivityResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+
+ // PauseWorkflowExecution pauses the workflow execution specified in the request.
+ rpc PauseWorkflowExecution(PauseWorkflowExecutionRequest) returns (PauseWorkflowExecutionResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+
+ // UnpauseWorkflowExecution unpauses the workflow execution specified in the request.
+ rpc UnpauseWorkflowExecution(UnpauseWorkflowExecutionRequest) returns (UnpauseWorkflowExecutionResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+
+ // StartNexusOperation starts a Nexus operation on the __temporal_system endpoint.
+ rpc StartNexusOperation(StartNexusOperationRequest) returns (StartNexusOperationResponse) {
+ option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD;
+ }
+
+ // CancelNexusOperation cancels a Nexus operation on the __temporal_system endpoint.
+ rpc CancelNexusOperation(CancelNexusOperationRequest) returns (CancelNexusOperationResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } } diff --git a/proto/internal/temporal/server/api/matchingservice/v1/request_response.proto b/proto/internal/temporal/server/api/matchingservice/v1/request_response.proto index 0d42c42375..28bf1593a7 100644 --- a/proto/internal/temporal/server/api/matchingservice/v1/request_response.proto +++ b/proto/internal/temporal/server/api/matchingservice/v1/request_response.proto @@ -1,69 +1,67 @@ syntax = "proto3"; package temporal.server.api.matchingservice.v1; -option go_package = "go.temporal.io/server/api/matchingservice/v1;matchingservice"; import "google/protobuf/duration.proto"; import "google/protobuf/timestamp.proto"; - import "temporal/api/common/v1/message.proto"; import "temporal/api/deployment/v1/message.proto"; import "temporal/api/enums/v1/task_queue.proto"; import "temporal/api/failure/v1/message.proto"; import "temporal/api/history/v1/message.proto"; -import "temporal/api/taskqueue/v1/message.proto"; -import "temporal/api/query/v1/message.proto"; +import "temporal/api/nexus/v1/message.proto"; import "temporal/api/protocol/v1/message.proto"; - +import "temporal/api/query/v1/message.proto"; +import "temporal/api/taskqueue/v1/message.proto"; +import "temporal/api/worker/v1/message.proto"; +import "temporal/api/workflowservice/v1/request_response.proto"; import "temporal/server/api/clock/v1/message.proto"; import "temporal/server/api/deployment/v1/message.proto"; +import "temporal/server/api/enums/v1/fairness_state.proto"; import "temporal/server/api/history/v1/message.proto"; import "temporal/server/api/persistence/v1/nexus.proto"; import "temporal/server/api/persistence/v1/task_queues.proto"; import "temporal/server/api/taskqueue/v1/message.proto"; -import "temporal/server/api/enums/v1/fairness_state.proto"; -import "temporal/api/workflowservice/v1/request_response.proto"; 
-import "temporal/api/nexus/v1/message.proto"; -import "temporal/api/worker/v1/message.proto"; +option go_package = "go.temporal.io/server/api/matchingservice/v1;matchingservice"; message PollWorkflowTaskQueueRequest { - string namespace_id = 1; - string poller_id = 2; - temporal.api.workflowservice.v1.PollWorkflowTaskQueueRequest poll_request = 3; - string forwarded_source = 4; - // Extra conditions on this poll request. Only supported with new matcher. - PollConditions conditions = 5; + string namespace_id = 1; + string poller_id = 2; + temporal.api.workflowservice.v1.PollWorkflowTaskQueueRequest poll_request = 3; + string forwarded_source = 4; + // Extra conditions on this poll request. Only supported with new matcher. + PollConditions conditions = 5; } message PollWorkflowTaskQueueResponse { - bytes task_token = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - temporal.api.common.v1.WorkflowType workflow_type = 3; - int64 previous_started_event_id = 4; - int64 started_event_id = 5; - int32 attempt = 6; - int64 next_event_id = 7; - int64 backlog_count_hint = 8; - bool sticky_execution_enabled = 9; - temporal.api.query.v1.WorkflowQuery query = 10; - temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_workflow_task = 11; - temporal.api.taskqueue.v1.TaskQueue workflow_execution_task_queue = 12; - reserved 13; - bytes branch_token = 14; - google.protobuf.Timestamp scheduled_time = 15; - google.protobuf.Timestamp started_time = 16; - map queries = 17; - repeated temporal.api.protocol.v1.Message messages = 18; - // The history for this workflow, which will either be complete or partial. Partial histories - // are sent to workers who have signaled that they are using a sticky queue when completing - // a workflow task. Sticky query tasks will not include any history. 
- temporal.api.history.v1.History history = 19; - bytes next_page_token = 20; - temporal.api.taskqueue.v1.PollerScalingDecision poller_scaling_decision = 21; - // Raw history bytes sent from matching service when history.sendRawHistoryBetweenInternalServices is enabled. - // Matching client will deserialize this to History when it receives the response. - temporal.api.history.v1.History raw_history = 22; + bytes task_token = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + temporal.api.common.v1.WorkflowType workflow_type = 3; + int64 previous_started_event_id = 4; + int64 started_event_id = 5; + int32 attempt = 6; + int64 next_event_id = 7; + int64 backlog_count_hint = 8; + bool sticky_execution_enabled = 9; + temporal.api.query.v1.WorkflowQuery query = 10; + temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_workflow_task = 11; + temporal.api.taskqueue.v1.TaskQueue workflow_execution_task_queue = 12; + reserved 13; + bytes branch_token = 14; + google.protobuf.Timestamp scheduled_time = 15; + google.protobuf.Timestamp started_time = 16; + map queries = 17; + repeated temporal.api.protocol.v1.Message messages = 18; + // The history for this workflow, which will either be complete or partial. Partial histories + // are sent to workers who have signaled that they are using a sticky queue when completing + // a workflow task. Sticky query tasks will not include any history. + temporal.api.history.v1.History history = 19; + bytes next_page_token = 20; + temporal.api.taskqueue.v1.PollerScalingDecision poller_scaling_decision = 21; + // Raw history bytes sent from matching service when history.sendRawHistoryBetweenInternalServices is enabled. + // Matching client will deserialize this to History when it receives the response. + temporal.api.history.v1.History raw_history = 22; } // PollWorkflowTaskQueueResponseWithRawHistory is wire-compatible with PollWorkflowTaskQueueResponse. 
@@ -86,247 +84,245 @@ message PollWorkflowTaskQueueResponse { // IMPORTANT: Field numbers and all other fields must remain identical between these two messages. // Any change to PollWorkflowTaskQueueResponse must be mirrored here. message PollWorkflowTaskQueueResponseWithRawHistory { - bytes task_token = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - temporal.api.common.v1.WorkflowType workflow_type = 3; - int64 previous_started_event_id = 4; - int64 started_event_id = 5; - int32 attempt = 6; - int64 next_event_id = 7; - int64 backlog_count_hint = 8; - bool sticky_execution_enabled = 9; - temporal.api.query.v1.WorkflowQuery query = 10; - temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_workflow_task = 11; - temporal.api.taskqueue.v1.TaskQueue workflow_execution_task_queue = 12; - reserved 13; - bytes branch_token = 14; - google.protobuf.Timestamp scheduled_time = 15; - google.protobuf.Timestamp started_time = 16; - map queries = 17; - repeated temporal.api.protocol.v1.Message messages = 18; - // The history for this workflow, which will either be complete or partial. Partial histories - // are sent to workers who have signaled that they are using a sticky queue when completing - // a workflow task. Sticky query tasks will not include any history. - temporal.api.history.v1.History history = 19; - bytes next_page_token = 20; - temporal.api.taskqueue.v1.PollerScalingDecision poller_scaling_decision = 21; - // Raw history bytes. Each element is a proto-encoded batch of history events. - // When matching client deserializes this to PollWorkflowTaskQueueResponse, this field - // will be automatically deserialized to the raw_history field as History. 
- repeated bytes raw_history = 22; + bytes task_token = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + temporal.api.common.v1.WorkflowType workflow_type = 3; + int64 previous_started_event_id = 4; + int64 started_event_id = 5; + int32 attempt = 6; + int64 next_event_id = 7; + int64 backlog_count_hint = 8; + bool sticky_execution_enabled = 9; + temporal.api.query.v1.WorkflowQuery query = 10; + temporal.server.api.history.v1.TransientWorkflowTaskInfo transient_workflow_task = 11; + temporal.api.taskqueue.v1.TaskQueue workflow_execution_task_queue = 12; + reserved 13; + bytes branch_token = 14; + google.protobuf.Timestamp scheduled_time = 15; + google.protobuf.Timestamp started_time = 16; + map queries = 17; + repeated temporal.api.protocol.v1.Message messages = 18; + // The history for this workflow, which will either be complete or partial. Partial histories + // are sent to workers who have signaled that they are using a sticky queue when completing + // a workflow task. Sticky query tasks will not include any history. + temporal.api.history.v1.History history = 19; + bytes next_page_token = 20; + temporal.api.taskqueue.v1.PollerScalingDecision poller_scaling_decision = 21; + // Raw history bytes. Each element is a proto-encoded batch of history events. + // When matching client deserializes this to PollWorkflowTaskQueueResponse, this field + // will be automatically deserialized to the raw_history field as History. + repeated bytes raw_history = 22; } message PollActivityTaskQueueRequest { - string namespace_id = 1; - string poller_id = 2; - temporal.api.workflowservice.v1.PollActivityTaskQueueRequest poll_request = 3; - string forwarded_source = 4; - // Extra conditions on this poll request. Only supported with new matcher. 
- PollConditions conditions = 5; + string namespace_id = 1; + string poller_id = 2; + temporal.api.workflowservice.v1.PollActivityTaskQueueRequest poll_request = 3; + string forwarded_source = 4; + // Extra conditions on this poll request. Only supported with new matcher. + PollConditions conditions = 5; } message PollActivityTaskQueueResponse { - bytes task_token = 1; - temporal.api.common.v1.WorkflowExecution workflow_execution = 2; - string activity_id = 3; - temporal.api.common.v1.ActivityType activity_type = 4; - temporal.api.common.v1.Payloads input = 5; - google.protobuf.Timestamp scheduled_time = 6; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration schedule_to_close_timeout = 7; - google.protobuf.Timestamp started_time = 8; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration start_to_close_timeout = 9; - google.protobuf.Duration heartbeat_timeout = 10; - int32 attempt = 11; - google.protobuf.Timestamp current_attempt_scheduled_time = 12; - temporal.api.common.v1.Payloads heartbeat_details = 13; - temporal.api.common.v1.WorkflowType workflow_type = 14; - string workflow_namespace = 15; - temporal.api.common.v1.Header header = 16; - temporal.api.taskqueue.v1.PollerScalingDecision poller_scaling_decision = 17; - temporal.api.common.v1.Priority priority = 18; - temporal.api.common.v1.RetryPolicy retry_policy = 19; - // ID of the activity run (applicable for standalone activities only) - string activity_run_id = 20; + bytes task_token = 1; + temporal.api.common.v1.WorkflowExecution workflow_execution = 2; + string activity_id = 3; + temporal.api.common.v1.ActivityType activity_type = 4; + temporal.api.common.v1.Payloads input = 5; + google.protobuf.Timestamp scheduled_time = 6; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is 
used to indicate interval. --) + google.protobuf.Duration schedule_to_close_timeout = 7; + google.protobuf.Timestamp started_time = 8; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration start_to_close_timeout = 9; + google.protobuf.Duration heartbeat_timeout = 10; + int32 attempt = 11; + google.protobuf.Timestamp current_attempt_scheduled_time = 12; + temporal.api.common.v1.Payloads heartbeat_details = 13; + temporal.api.common.v1.WorkflowType workflow_type = 14; + string workflow_namespace = 15; + temporal.api.common.v1.Header header = 16; + temporal.api.taskqueue.v1.PollerScalingDecision poller_scaling_decision = 17; + temporal.api.common.v1.Priority priority = 18; + temporal.api.common.v1.RetryPolicy retry_policy = 19; + // ID of the activity run (applicable for standalone activities only) + string activity_run_id = 20; } message AddWorkflowTaskRequest { - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - temporal.api.taskqueue.v1.TaskQueue task_queue = 3; - int64 scheduled_event_id = 4; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration schedule_to_start_timeout = 5; - temporal.server.api.clock.v1.VectorClock clock = 9; - // How this task should be directed by matching. (Missing means the default - // for TaskVersionDirective, which is unversioned.) - temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 10; - temporal.server.api.taskqueue.v1.TaskForwardInfo forward_info = 11; - temporal.api.common.v1.Priority priority = 12; - // Stamp value from when the workflow task was scheduled. Used to validate the task is still relevant. 
- int32 stamp = 13; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + temporal.api.taskqueue.v1.TaskQueue task_queue = 3; + int64 scheduled_event_id = 4; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration schedule_to_start_timeout = 5; + temporal.server.api.clock.v1.VectorClock clock = 9; + // How this task should be directed by matching. (Missing means the default + // for TaskVersionDirective, which is unversioned.) + temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 10; + temporal.server.api.taskqueue.v1.TaskForwardInfo forward_info = 11; + temporal.api.common.v1.Priority priority = 12; + // Stamp value from when the workflow task was scheduled. Used to validate the task is still relevant. + int32 stamp = 13; } message AddWorkflowTaskResponse { - // When present, it means that the task is spooled to a versioned queue of this build ID - // Deprecated. [cleanup-old-wv] - string assigned_build_id = 1; + // When present, it means that the task is spooled to a versioned queue of this build ID + // Deprecated. [cleanup-old-wv] + string assigned_build_id = 1; } message AddActivityTaskRequest { - string namespace_id = 1; - temporal.api.common.v1.WorkflowExecution execution = 2; - reserved 3; - temporal.api.taskqueue.v1.TaskQueue task_queue = 4; - int64 scheduled_event_id = 5; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration schedule_to_start_timeout = 6; - temporal.server.api.clock.v1.VectorClock clock = 9; - // How this task should be directed by matching. (Missing means the default - // for TaskVersionDirective, which is unversioned.) 
- temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 10; - temporal.server.api.taskqueue.v1.TaskForwardInfo forward_info = 11; - int32 stamp = 12; - temporal.api.common.v1.Priority priority = 13; - // Reference to the Chasm component for activity execution (if applicable). For standalone activities, all - // necessary start information is carried within this component, obviating the need to use the fields that apply to - // embedded activities. - bytes component_ref = 14; + string namespace_id = 1; + temporal.api.common.v1.WorkflowExecution execution = 2; + reserved 3; + temporal.api.taskqueue.v1.TaskQueue task_queue = 4; + int64 scheduled_event_id = 5; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration schedule_to_start_timeout = 6; + temporal.server.api.clock.v1.VectorClock clock = 9; + // How this task should be directed by matching. (Missing means the default + // for TaskVersionDirective, which is unversioned.) + temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 10; + temporal.server.api.taskqueue.v1.TaskForwardInfo forward_info = 11; + int32 stamp = 12; + temporal.api.common.v1.Priority priority = 13; + // Reference to the Chasm component for activity execution (if applicable). For standalone activities, all + // necessary start information is carried within this component, obviating the need to use the fields that apply to + // embedded activities. + bytes component_ref = 14; } message AddActivityTaskResponse { - // When present, it means that the task is spooled to a versioned queue of this build ID - // Deprecated. [cleanup-old-wv] - string assigned_build_id = 1; + // When present, it means that the task is spooled to a versioned queue of this build ID + // Deprecated. 
[cleanup-old-wv] + string assigned_build_id = 1; } message QueryWorkflowRequest { - string namespace_id = 1; - temporal.api.taskqueue.v1.TaskQueue task_queue = 2; - temporal.api.workflowservice.v1.QueryWorkflowRequest query_request = 3; - // How this task should be directed by matching. (Missing means the default - // for TaskVersionDirective, which is unversioned.) - temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 5; - temporal.server.api.taskqueue.v1.TaskForwardInfo forward_info = 6; - temporal.api.common.v1.Priority priority = 7; + string namespace_id = 1; + temporal.api.taskqueue.v1.TaskQueue task_queue = 2; + temporal.api.workflowservice.v1.QueryWorkflowRequest query_request = 3; + // How this task should be directed by matching. (Missing means the default + // for TaskVersionDirective, which is unversioned.) + temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 5; + temporal.server.api.taskqueue.v1.TaskForwardInfo forward_info = 6; + temporal.api.common.v1.Priority priority = 7; } message QueryWorkflowResponse { - temporal.api.common.v1.Payloads query_result = 1; - temporal.api.query.v1.QueryRejected query_rejected = 2; + temporal.api.common.v1.Payloads query_result = 1; + temporal.api.query.v1.QueryRejected query_rejected = 2; } message RespondQueryTaskCompletedRequest { - string namespace_id = 1; - temporal.api.taskqueue.v1.TaskQueue task_queue = 2; - string task_id = 3; - temporal.api.workflowservice.v1.RespondQueryTaskCompletedRequest completed_request = 4; + string namespace_id = 1; + temporal.api.taskqueue.v1.TaskQueue task_queue = 2; + string task_id = 3; + temporal.api.workflowservice.v1.RespondQueryTaskCompletedRequest completed_request = 4; } -message RespondQueryTaskCompletedResponse { -} +message RespondQueryTaskCompletedResponse {} message CancelOutstandingPollRequest { - string namespace_id = 1; - temporal.api.enums.v1.TaskQueueType task_queue_type = 2; - temporal.api.taskqueue.v1.TaskQueue 
task_queue = 3; - string poller_id = 4; + string namespace_id = 1; + temporal.api.enums.v1.TaskQueueType task_queue_type = 2; + temporal.api.taskqueue.v1.TaskQueue task_queue = 3; + string poller_id = 4; } -message CancelOutstandingPollResponse { -} +message CancelOutstandingPollResponse {} // CancelOutstandingWorkerPollsRequest cancels all outstanding polls for a given worker instance key. message CancelOutstandingWorkerPollsRequest { - string namespace_id = 1; - temporal.api.taskqueue.v1.TaskQueue task_queue = 2; - temporal.api.enums.v1.TaskQueueType task_queue_type = 3; - string worker_instance_key = 4; - // Worker identity string (e.g., "pid@hostname"). Used to eagerly remove the worker - // from pollerHistory so DescribeTaskQueue doesn't show stale pollers. - // Note: pollerHistory predates worker_instance_key and uses identity as its key, - // so we pass both for backward compatibility. - string worker_identity = 5; + string namespace_id = 1; + temporal.api.taskqueue.v1.TaskQueue task_queue = 2; + temporal.api.enums.v1.TaskQueueType task_queue_type = 3; + string worker_instance_key = 4; + // Worker identity string (e.g., "pid@hostname"). Used to eagerly remove the worker + // from pollerHistory so DescribeTaskQueue doesn't show stale pollers. + // Note: pollerHistory predates worker_instance_key and uses identity as its key, + // so we pass both for backward compatibility. + string worker_identity = 5; } message CancelOutstandingWorkerPollsResponse { - // Used for debugging. - int32 cancelled_count = 1; + // Used for debugging. 
+ int32 cancelled_count = 1; } message DescribeTaskQueueRequest { - string namespace_id = 1; - temporal.api.workflowservice.v1.DescribeTaskQueueRequest desc_request = 2; - temporal.server.api.deployment.v1.WorkerDeploymentVersion version = 3; + string namespace_id = 1; + temporal.api.workflowservice.v1.DescribeTaskQueueRequest desc_request = 2; + temporal.server.api.deployment.v1.WorkerDeploymentVersion version = 3; } message DescribeTaskQueueResponse { - reserved 1 to 2; - temporal.api.workflowservice.v1.DescribeTaskQueueResponse desc_response = 3; + reserved 1 to 2; + temporal.api.workflowservice.v1.DescribeTaskQueueResponse desc_response = 3; } message DescribeVersionedTaskQueuesRequest { - string namespace_id = 1; + string namespace_id = 1; - // This task queue is for routing purposes. - temporal.api.enums.v1.TaskQueueType task_queue_type = 2; - temporal.api.taskqueue.v1.TaskQueue task_queue = 3; + // This task queue is for routing purposes. + temporal.api.enums.v1.TaskQueueType task_queue_type = 2; + temporal.api.taskqueue.v1.TaskQueue task_queue = 3; - temporal.server.api.deployment.v1.WorkerDeploymentVersion version = 4; + temporal.server.api.deployment.v1.WorkerDeploymentVersion version = 4; - // List of task queues to describe. - repeated VersionTaskQueue version_task_queues = 5; - // (-- api-linter: core::0123::resource-annotation=disabled --) - message VersionTaskQueue { - string name = 1; - temporal.api.enums.v1.TaskQueueType type = 2; - } + // List of task queues to describe. 
+ repeated VersionTaskQueue version_task_queues = 5; + // (-- api-linter: core::0123::resource-annotation=disabled --) + message VersionTaskQueue { + string name = 1; + temporal.api.enums.v1.TaskQueueType type = 2; + } } message DescribeVersionedTaskQueuesResponse { - repeated VersionTaskQueue version_task_queues = 1; - // (-- api-linter: core::0123::resource-annotation=disabled --) - message VersionTaskQueue { - string name = 1; - temporal.api.enums.v1.TaskQueueType type = 2; - temporal.api.taskqueue.v1.TaskQueueStats stats = 3; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "by" is used to clarify the key. --) - map stats_by_priority_key = 4; - } + repeated VersionTaskQueue version_task_queues = 1; + // (-- api-linter: core::0123::resource-annotation=disabled --) + message VersionTaskQueue { + string name = 1; + temporal.api.enums.v1.TaskQueueType type = 2; + temporal.api.taskqueue.v1.TaskQueueStats stats = 3; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "by" is used to clarify the key. 
--) + map stats_by_priority_key = 4; + } } message DescribeTaskQueuePartitionRequest { - string namespace_id = 1; - temporal.server.api.taskqueue.v1.TaskQueuePartition task_queue_partition = 2; + string namespace_id = 1; + temporal.server.api.taskqueue.v1.TaskQueuePartition task_queue_partition = 2; - temporal.api.taskqueue.v1.TaskQueueVersionSelection versions = 3; + temporal.api.taskqueue.v1.TaskQueueVersionSelection versions = 3; - // Report task queue stats for the requested task queue types and versions - bool report_stats = 4; - // Report list of pollers for requested task queue types and versions - bool report_pollers = 5; - bool report_internal_task_queue_status = 6; + // Report task queue stats for the requested task queue types and versions + bool report_stats = 4; + // Report list of pollers for requested task queue types and versions + bool report_pollers = 5; + bool report_internal_task_queue_status = 6; } message DescribeTaskQueuePartitionResponse { - map versions_info_internal = 1; + map versions_info_internal = 1; } message ListTaskQueuePartitionsRequest { - string namespace = 1; - string namespace_id = 3; - temporal.api.taskqueue.v1.TaskQueue task_queue = 2; + string namespace = 1; + string namespace_id = 3; + temporal.api.taskqueue.v1.TaskQueue task_queue = 2; } message ListTaskQueuePartitionsResponse { - repeated temporal.api.taskqueue.v1.TaskQueuePartitionMetadata activity_task_queue_partitions = 1; - repeated temporal.api.taskqueue.v1.TaskQueuePartitionMetadata workflow_task_queue_partitions = 2; + repeated temporal.api.taskqueue.v1.TaskQueuePartitionMetadata activity_task_queue_partitions = 1; + repeated temporal.api.taskqueue.v1.TaskQueuePartitionMetadata workflow_task_queue_partitions = 2; } // (-- api-linter: core::0134::request-mask-required=disabled @@ -334,42 +330,42 @@ message ListTaskQueuePartitionsResponse { // (-- api-linter: core::0134::request-resource-required=disabled // aip.dev/not-precedent: 
UpdateWorkerBuildIdCompatibilityRequest RPC doesn't follow Google API format. --) message UpdateWorkerBuildIdCompatibilityRequest { - // Apply request from public API. - message ApplyPublicRequest { - temporal.api.workflowservice.v1.UpdateWorkerBuildIdCompatibilityRequest request = 1; - } + // Apply request from public API. + message ApplyPublicRequest { + temporal.api.workflowservice.v1.UpdateWorkerBuildIdCompatibilityRequest request = 1; + } - // Remove build ids (internal only) - message RemoveBuildIds { - // The last known user data version, used to prevent concurrent updates. - int64 known_user_data_version = 1; - // List of build ids to remove. - repeated string build_ids = 2; - } + // Remove build ids (internal only) + message RemoveBuildIds { + // The last known user data version, used to prevent concurrent updates. + int64 known_user_data_version = 1; + // List of build ids to remove. + repeated string build_ids = 2; + } - string namespace_id = 1; - string task_queue = 2; + string namespace_id = 1; + string task_queue = 2; - oneof operation { - ApplyPublicRequest apply_public_request = 3; - RemoveBuildIds remove_build_ids = 4; - string persist_unknown_build_id = 5; - } + oneof operation { + ApplyPublicRequest apply_public_request = 3; + RemoveBuildIds remove_build_ids = 4; + string persist_unknown_build_id = 5; + } } message UpdateWorkerBuildIdCompatibilityResponse {} message GetWorkerVersioningRulesRequest { - string namespace_id = 1; - string task_queue = 2; + string namespace_id = 1; + string task_queue = 2; - oneof command { - temporal.api.workflowservice.v1.GetWorkerVersioningRulesRequest request = 3; - } + oneof command { + temporal.api.workflowservice.v1.GetWorkerVersioningRulesRequest request = 3; + } } message GetWorkerVersioningRulesResponse { - temporal.api.workflowservice.v1.GetWorkerVersioningRulesResponse response = 1; + temporal.api.workflowservice.v1.GetWorkerVersioningRulesResponse response = 1; } // (-- api-linter: 
core::0134::request-mask-required=disabled @@ -377,141 +373,139 @@ message GetWorkerVersioningRulesResponse { // (-- api-linter: core::0134::request-resource-required=disabled // aip.dev/not-precedent: UpdateWorkerVersioningRulesRequest RPC doesn't follow Google API format. --) message UpdateWorkerVersioningRulesRequest { - string namespace_id = 1; - string task_queue = 2; + string namespace_id = 1; + string task_queue = 2; - oneof command { - temporal.api.workflowservice.v1.UpdateWorkerVersioningRulesRequest request = 3; - } + oneof command { + temporal.api.workflowservice.v1.UpdateWorkerVersioningRulesRequest request = 3; + } } message UpdateWorkerVersioningRulesResponse { - temporal.api.workflowservice.v1.UpdateWorkerVersioningRulesResponse response = 1; + temporal.api.workflowservice.v1.UpdateWorkerVersioningRulesResponse response = 1; } message GetWorkerBuildIdCompatibilityRequest { - string namespace_id = 1; - temporal.api.workflowservice.v1.GetWorkerBuildIdCompatibilityRequest request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.GetWorkerBuildIdCompatibilityRequest request = 2; } message GetWorkerBuildIdCompatibilityResponse { - temporal.api.workflowservice.v1.GetWorkerBuildIdCompatibilityResponse response = 1; + temporal.api.workflowservice.v1.GetWorkerBuildIdCompatibilityResponse response = 1; } message GetTaskQueueUserDataRequest { - string namespace_id = 1; - // The task queue to fetch data from. The task queue is always considered as a normal - // queue, since sticky queues have no user data. - string task_queue = 2; - temporal.api.enums.v1.TaskQueueType task_queue_type = 5; - // The value of the last known user data version. - // If the requester has no data, it should set this to 0. - // This value must not be set to a negative number (note that our linter suggests avoiding uint64). - int64 last_known_user_data_version = 3; - // The value of the last known ephemeral data version. 
- // If the requester has no data yet, it should use 0. - // If the requester doesn't want ephemeral data (i.e. it's root of an activity/nexus - // queue which have separate ephemeral data), it should use -1 (noEphemeralDataVersion). - int64 last_known_ephemeral_data_version = 7; - // If set and last_known_{user_data,ephemeral_data}_version is the current version, - // block until new data is available (or timeout). - bool wait_new_data = 4; - // If set, do not load task queue if unloaded. (Returns FailedPrecondition error in that case.) - bool only_if_loaded = 6; + string namespace_id = 1; + // The task queue to fetch data from. The task queue is always considered as a normal + // queue, since sticky queues have no user data. + string task_queue = 2; + temporal.api.enums.v1.TaskQueueType task_queue_type = 5; + // The value of the last known user data version. + // If the requester has no data, it should set this to 0. + // This value must not be set to a negative number (note that our linter suggests avoiding uint64). + int64 last_known_user_data_version = 3; + // The value of the last known ephemeral data version. + // If the requester has no data yet, it should use 0. + // If the requester doesn't want ephemeral data (i.e. it's root of an activity/nexus + // queue which have separate ephemeral data), it should use -1 (noEphemeralDataVersion). + int64 last_known_ephemeral_data_version = 7; + // If set and last_known_{user_data,ephemeral_data}_version is the current version, + // block until new data is available (or timeout). + bool wait_new_data = 4; + // If set, do not load task queue if unloaded. (Returns FailedPrecondition error in that case.) + bool only_if_loaded = 6; } message GetTaskQueueUserDataResponse { - reserved 1; - // Versioned user data, set if the task queue has user data and the request's last_known_user_data_version is less - // than the version cached in the root partition. 
- temporal.server.api.persistence.v1.VersionedTaskQueueUserData user_data = 2; - temporal.server.api.taskqueue.v1.VersionedEphemeralData ephemeral_data = 3; + reserved 1; + // Versioned user data, set if the task queue has user data and the request's last_known_user_data_version is less + // than the version cached in the root partition. + temporal.server.api.persistence.v1.VersionedTaskQueueUserData user_data = 2; + temporal.server.api.taskqueue.v1.VersionedEphemeralData ephemeral_data = 3; } message SyncDeploymentUserDataRequest { - string namespace_id = 1; - string task_queue = 2; - // Required, unless deprecated fields are used. - // (-- api-linter: core::0203::required=disabled - // aip.dev/not-precedent: Not following Google API format --) - string deployment_name = 9; - reserved 3, 4, 5; - repeated temporal.api.enums.v1.TaskQueueType task_queue_types = 8; - - oneof operation { - // The deployment version and its data that is being updated. - temporal.server.api.deployment.v1.DeploymentVersionData update_version_data = 6 [deprecated = true]; - // The version whose data should be cleaned from the task queue. - temporal.server.api.deployment.v1.WorkerDeploymentVersion forget_version = 7 [deprecated = true]; - } - - // Absent means no change. - // Ignored by the task queue if new revision number is not greater that what it has. - temporal.api.deployment.v1.RoutingConfig update_routing_config = 10; - // Optional map of build id to upsert version data. - // (-- api-linter: core::0203::required=disabled - // aip.dev/not-precedent: Not following Google API format --) - map upsert_versions_data = 11; - // List of build ids to forget from task queue. - // Deprecated. Use upsert_versions_data with deleted=true. - repeated string forget_versions = 12; + string namespace_id = 1; + string task_queue = 2; + // Required, unless deprecated fields are used. 
+ // (-- api-linter: core::0203::required=disabled + // aip.dev/not-precedent: Not following Google API format --) + string deployment_name = 9; + reserved 3, 4, 5; + repeated temporal.api.enums.v1.TaskQueueType task_queue_types = 8; + + oneof operation { + // The deployment version and its data that is being updated. + temporal.server.api.deployment.v1.DeploymentVersionData update_version_data = 6 [deprecated = true]; + // The version whose data should be cleaned from the task queue. + temporal.server.api.deployment.v1.WorkerDeploymentVersion forget_version = 7 [deprecated = true]; + } + + // Absent means no change. + // Ignored by the task queue if new revision number is not greater that what it has. + temporal.api.deployment.v1.RoutingConfig update_routing_config = 10; + // Optional map of build id to upsert version data. + // (-- api-linter: core::0203::required=disabled + // aip.dev/not-precedent: Not following Google API format --) + map upsert_versions_data = 11; + // List of build ids to forget from task queue. + repeated string forget_versions = 12; } message SyncDeploymentUserDataResponse { - // New task queue user data version. Can be used to wait for propagation. - int64 version = 1; - // If the routing config changed after applying this operation. Compared base on revision number. - // Deprecated. using this is not totaly safe in case of retries. - bool routing_config_changed = 2 [deprecated = true]; + // New task queue user data version. Can be used to wait for propagation. + int64 version = 1; + // If the routing config changed after applying this operation. Compared base on revision number. + // Deprecated. using this is not totaly safe in case of retries. 
+ bool routing_config_changed = 2 [deprecated = true]; } message ApplyTaskQueueUserDataReplicationEventRequest { - string namespace_id = 1; - string task_queue = 2; - temporal.server.api.persistence.v1.TaskQueueUserData user_data = 3; + string namespace_id = 1; + string task_queue = 2; + temporal.server.api.persistence.v1.TaskQueueUserData user_data = 3; } -message ApplyTaskQueueUserDataReplicationEventResponse { -} +message ApplyTaskQueueUserDataReplicationEventResponse {} message GetBuildIdTaskQueueMappingRequest { - string namespace_id = 1; - string build_id = 2; + string namespace_id = 1; + string build_id = 2; } message GetBuildIdTaskQueueMappingResponse { - repeated string task_queues = 1; + repeated string task_queues = 1; } message ForceLoadTaskQueuePartitionRequest { - string namespace_id = 1; + string namespace_id = 1; - temporal.server.api.taskqueue.v1.TaskQueuePartition task_queue_partition = 2; + temporal.server.api.taskqueue.v1.TaskQueuePartition task_queue_partition = 2; } message ForceLoadTaskQueuePartitionResponse { - bool was_unloaded = 1; + bool was_unloaded = 1; } // TODO Shivam - Please remove this in 123 message ForceUnloadTaskQueueRequest { - string namespace_id = 1; - string task_queue = 2; - temporal.api.enums.v1.TaskQueueType task_queue_type = 3; + string namespace_id = 1; + string task_queue = 2; + temporal.api.enums.v1.TaskQueueType task_queue_type = 3; } // TODO Shivam - Please remove this in 123 message ForceUnloadTaskQueueResponse { - bool was_loaded = 1; + bool was_loaded = 1; } message ForceUnloadTaskQueuePartitionRequest { - string namespace_id = 1; - temporal.server.api.taskqueue.v1.TaskQueuePartition task_queue_partition = 2; + string namespace_id = 1; + temporal.server.api.taskqueue.v1.TaskQueuePartition task_queue_partition = 2; } message ForceUnloadTaskQueuePartitionResponse { - bool was_loaded = 1; + bool was_loaded = 1; } // (-- api-linter: core::0134::request-mask-required=disabled @@ -519,100 +513,95 @@ message 
ForceUnloadTaskQueuePartitionResponse { // (-- api-linter: core::0134::request-resource-required=disabled // aip.dev/not-precedent: UpdateTaskQueueUserDataRequest RPC doesn't follow Google API format. --) message UpdateTaskQueueUserDataRequest { - string namespace_id = 1; - string task_queue = 2; - // Versioned user data, set if the task queue has user data and the request's last_known_user_data_version is less - // than the version cached in the root partition. - temporal.server.api.persistence.v1.VersionedTaskQueueUserData user_data = 3; - // List of added build ids - repeated string build_ids_added = 4; - // List of removed build ids - repeated string build_ids_removed = 5; + string namespace_id = 1; + string task_queue = 2; + // Versioned user data, set if the task queue has user data and the request's last_known_user_data_version is less + // than the version cached in the root partition. + temporal.server.api.persistence.v1.VersionedTaskQueueUserData user_data = 3; + // List of added build ids + repeated string build_ids_added = 4; + // List of removed build ids + repeated string build_ids_removed = 5; } -message UpdateTaskQueueUserDataResponse { -} +message UpdateTaskQueueUserDataResponse {} message ReplicateTaskQueueUserDataRequest { - string namespace_id = 1; - string task_queue = 2; - temporal.server.api.persistence.v1.TaskQueueUserData user_data = 3; + string namespace_id = 1; + string task_queue = 2; + temporal.server.api.persistence.v1.TaskQueueUserData user_data = 3; } -message ReplicateTaskQueueUserDataResponse { -} +message ReplicateTaskQueueUserDataResponse {} message CheckTaskQueueUserDataPropagationRequest { - string namespace_id = 1; - string task_queue = 2; - int64 version = 3; + string namespace_id = 1; + string task_queue = 2; + int64 version = 3; } -message CheckTaskQueueUserDataPropagationResponse { -} +message CheckTaskQueueUserDataPropagationResponse {} message DispatchNexusTaskRequest { - string namespace_id = 1; - 
temporal.api.taskqueue.v1.TaskQueue task_queue = 2; - // Nexus request extracted by the frontend and translated into Temporal API format. - temporal.api.nexus.v1.Request request = 3; - temporal.server.api.taskqueue.v1.TaskForwardInfo forward_info = 4; + string namespace_id = 1; + temporal.api.taskqueue.v1.TaskQueue task_queue = 2; + // Nexus request extracted by the frontend and translated into Temporal API format. + temporal.api.nexus.v1.Request request = 3; + temporal.server.api.taskqueue.v1.TaskForwardInfo forward_info = 4; } message DispatchNexusTaskResponse { - message Timeout {} + message Timeout {} - oneof outcome { - // Deprecated. Use failure field instead. - temporal.api.nexus.v1.HandlerError handler_error = 1 [deprecated = true]; - // Set if the worker's handler responded successfully to the nexus task. - temporal.api.nexus.v1.Response response = 2; - Timeout request_timeout = 3; - // Set if the worker's handler failed the nexus task. Must contain a NexusHandlerFailureInfo object. - temporal.api.failure.v1.Failure failure = 4; - } + oneof outcome { + // Deprecated. Use failure field instead. + temporal.api.nexus.v1.HandlerError handler_error = 1 [deprecated = true]; + // Set if the worker's handler responded successfully to the nexus task. + temporal.api.nexus.v1.Response response = 2; + Timeout request_timeout = 3; + // Set if the worker's handler failed the nexus task. Must contain a NexusHandlerFailureInfo object. + temporal.api.failure.v1.Failure failure = 4; + } } message PollNexusTaskQueueRequest { - string namespace_id = 1; - // A unique ID generated by the frontend for this request. - string poller_id = 2; - // Original WorkflowService poll request as received by the frontend. - temporal.api.workflowservice.v1.PollNexusTaskQueueRequest request = 3; - // Non-empty if this poll was forwarded from a child partition. - string forwarded_source = 4; - // Extra conditions on this poll request. Only supported with new matcher. 
- PollConditions conditions = 5; + string namespace_id = 1; + // A unique ID generated by the frontend for this request. + string poller_id = 2; + // Original WorkflowService poll request as received by the frontend. + temporal.api.workflowservice.v1.PollNexusTaskQueueRequest request = 3; + // Non-empty if this poll was forwarded from a child partition. + string forwarded_source = 4; + // Extra conditions on this poll request. Only supported with new matcher. + PollConditions conditions = 5; } message PollNexusTaskQueueResponse { - // Response that should be delivered to the worker containing a request from DispatchNexusTaskRequest. - temporal.api.workflowservice.v1.PollNexusTaskQueueResponse response = 1; + // Response that should be delivered to the worker containing a request from DispatchNexusTaskRequest. + temporal.api.workflowservice.v1.PollNexusTaskQueueResponse response = 1; } message RespondNexusTaskCompletedRequest { - string namespace_id = 1; - temporal.api.taskqueue.v1.TaskQueue task_queue = 2; - // A unique ID for this task generated by the matching engine. Decoded from the incoming request's task token. - string task_id = 3; - // Original completion as received by the frontend. - temporal.api.workflowservice.v1.RespondNexusTaskCompletedRequest request = 4; + string namespace_id = 1; + temporal.api.taskqueue.v1.TaskQueue task_queue = 2; + // A unique ID for this task generated by the matching engine. Decoded from the incoming request's task token. + string task_id = 3; + // Original completion as received by the frontend. + temporal.api.workflowservice.v1.RespondNexusTaskCompletedRequest request = 4; } -message RespondNexusTaskCompletedResponse { -} +message RespondNexusTaskCompletedResponse {} message RespondNexusTaskFailedRequest { - string namespace_id = 1; - temporal.api.taskqueue.v1.TaskQueue task_queue = 2; - // A unique ID for this task generated by the matching engine. Decoded from the incoming request's task token. 
- string task_id = 3; - // Original failure as received by the frontend. - temporal.api.workflowservice.v1.RespondNexusTaskFailedRequest request = 4; + string namespace_id = 1; + temporal.api.taskqueue.v1.TaskQueue task_queue = 2; + // A unique ID for this task generated by the matching engine. Decoded from the incoming request's task token. + string task_id = 3; + // Original failure as received by the frontend. + temporal.api.workflowservice.v1.RespondNexusTaskFailedRequest request = 4; } -message RespondNexusTaskFailedResponse { -} +message RespondNexusTaskFailedResponse {} // (-- api-linter: core::0133::request-unknown-fields=disabled // aip.dev/not-precedent: CreateNexusEndpoint RPC doesn't follow Google API format. --) @@ -621,11 +610,11 @@ message RespondNexusTaskFailedResponse { // (-- api-linter: core::0133::request-parent-required=disabled // aip.dev/not-precedent: CreateNexusEndpoint RPC doesn't follow Google API format. --) message CreateNexusEndpointRequest { - temporal.server.api.persistence.v1.NexusEndpointSpec spec = 1; + temporal.server.api.persistence.v1.NexusEndpointSpec spec = 1; } message CreateNexusEndpointResponse { - temporal.server.api.persistence.v1.NexusEndpointEntry entry = 1; + temporal.server.api.persistence.v1.NexusEndpointEntry entry = 1; } // (-- api-linter: core::0134::request-resource-required=disabled @@ -633,16 +622,16 @@ message CreateNexusEndpointResponse { // (-- api-linter: core::0134::request-mask-required=disabled // aip.dev/not-precedent: UpdateNexusEndpoint RPC doesn't follow Google API format. --) message UpdateNexusEndpointRequest { - // ID of the endpoint to update. - string id = 1; - // Version of the endpoint, used for optimistic concurrency. Must match current version in persistence or the - // request will fail a FAILED_PRECONDITION error. - int64 version = 2; - temporal.server.api.persistence.v1.NexusEndpointSpec spec = 3; + // ID of the endpoint to update. 
+ string id = 1; + // Version of the endpoint, used for optimistic concurrency. Must match current version in persistence or the + // request will fail a FAILED_PRECONDITION error. + int64 version = 2; + temporal.server.api.persistence.v1.NexusEndpointSpec spec = 3; } message UpdateNexusEndpointResponse { - temporal.server.api.persistence.v1.NexusEndpointEntry entry = 1; + temporal.server.api.persistence.v1.NexusEndpointEntry entry = 1; } // (-- api-linter: core::0135::request-name-behavior=disabled @@ -650,58 +639,55 @@ message UpdateNexusEndpointResponse { // (-- api-linter: core::0135::request-name-reference=disabled // aip.dev/not-precedent: DeleteNexusEndpointRequest RPC doesn't follow Google API format. --) message DeleteNexusEndpointRequest { - // ID of the endpoint to delete. - string id = 1; + // ID of the endpoint to delete. + string id = 1; } -message DeleteNexusEndpointResponse { -} +message DeleteNexusEndpointResponse {} message ListNexusEndpointsRequest { - // To get the next page, pass in `ListNexusEndpointsResponse.next_page_token` from the previous page's response. The - // token will be empty if there's no other page. - // Note: the last page may be empty if the total number of services registered is a multiple of the page size. - // Mutually exclusive with wait. Specifying both will result in an invalid argument error. - bytes next_page_token = 1; - int32 page_size = 2; - // The nexus_endpoints table has a monotonically increasing version number that is incremented on every change to - // the table. This field can be used to provide the last known table version in conjuction with the `wait` field to - // long poll on changes to the table. - // If next_page_token is not empty and the current table version does not match this field, this request will fail - // with a failed precondition error. - int64 last_known_table_version = 3; - // If true, this request becomes a long poll and will be unblocked once the DB version is incremented. 
- // Mutually exclusive with next_page_token. Specifying both will result in an invalid argument error. - bool wait = 4; + // To get the next page, pass in `ListNexusEndpointsResponse.next_page_token` from the previous page's response. The + // token will be empty if there's no other page. + // Note: the last page may be empty if the total number of services registered is a multiple of the page size. + // Mutually exclusive with wait. Specifying both will result in an invalid argument error. + bytes next_page_token = 1; + int32 page_size = 2; + // The nexus_endpoints table has a monotonically increasing version number that is incremented on every change to + // the table. This field can be used to provide the last known table version in conjuction with the `wait` field to + // long poll on changes to the table. + // If next_page_token is not empty and the current table version does not match this field, this request will fail + // with a failed precondition error. + int64 last_known_table_version = 3; + // If true, this request becomes a long poll and will be unblocked once the DB version is incremented. + // Mutually exclusive with next_page_token. Specifying both will result in an invalid argument error. + bool wait = 4; } message ListNexusEndpointsResponse { - // Token for getting the next page. - bytes next_page_token = 1; - int64 table_version = 2; - repeated temporal.server.api.persistence.v1.NexusEndpointEntry entries = 3; + // Token for getting the next page. 
+ bytes next_page_token = 1; + int64 table_version = 2; + repeated temporal.server.api.persistence.v1.NexusEndpointEntry entries = 3; } message RecordWorkerHeartbeatRequest { - string namespace_id = 1; - temporal.api.workflowservice.v1.RecordWorkerHeartbeatRequest heartbeart_request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.RecordWorkerHeartbeatRequest heartbeart_request = 2; } -message RecordWorkerHeartbeatResponse { - -} +message RecordWorkerHeartbeatResponse {} message ListWorkersRequest { - string namespace_id = 1; - temporal.api.workflowservice.v1.ListWorkersRequest list_request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.ListWorkersRequest list_request = 2; } message ListWorkersResponse { - // Deprecated: Use workers instead. This field returns full WorkerInfo which - // includes expensive runtime metrics. We will stop populating this field in the future. - repeated temporal.api.worker.v1.WorkerInfo workers_info = 1 [deprecated = true]; - bytes next_page_token = 2; - repeated temporal.api.worker.v1.WorkerListInfo workers = 3; + // Deprecated: Use workers instead. This field returns full WorkerInfo which + // includes expensive runtime metrics. We will stop populating this field in the future. + repeated temporal.api.worker.v1.WorkerInfo workers_info = 1 [deprecated = true]; + bytes next_page_token = 2; + repeated temporal.api.worker.v1.WorkerListInfo workers = 3; } // (-- api-linter: core::0134::request-resource-required=disabled @@ -711,21 +697,21 @@ message ListWorkersResponse { // (-- api-linter: core::0134::method-signature=disabled // aip.dev/not-precedent: UpdateTaskQueueConfigRequest RPC doesn't follow Google API format. 
--) message UpdateTaskQueueConfigRequest { - string namespace_id = 1; - temporal.api.workflowservice.v1.UpdateTaskQueueConfigRequest update_taskqueue_config = 3; + string namespace_id = 1; + temporal.api.workflowservice.v1.UpdateTaskQueueConfigRequest update_taskqueue_config = 3; } message UpdateTaskQueueConfigResponse { - temporal.api.taskqueue.v1.TaskQueueConfig updated_taskqueue_config = 1; + temporal.api.taskqueue.v1.TaskQueueConfig updated_taskqueue_config = 1; } message DescribeWorkerRequest { - string namespace_id = 1; - temporal.api.workflowservice.v1.DescribeWorkerRequest request = 2; + string namespace_id = 1; + temporal.api.workflowservice.v1.DescribeWorkerRequest request = 2; } message DescribeWorkerResponse { - temporal.api.worker.v1.WorkerInfo worker_info = 1; + temporal.api.worker.v1.WorkerInfo worker_info = 1; } // (-- api-linter: core::0134::request-resource-required=disabled @@ -735,34 +721,33 @@ message DescribeWorkerResponse { // (-- api-linter: core::0134::method-signature=disabled // aip.dev/not-precedent: UpdateFairnessStateRequest RPC doesn't follow Google API format. 
--) message UpdateFairnessStateRequest { - string namespace_id = 1; - string task_queue = 2; - temporal.api.enums.v1.TaskQueueType task_queue_type = 3; - temporal.server.api.enums.v1.FairnessState fairness_state = 4; + string namespace_id = 1; + string task_queue = 2; + temporal.api.enums.v1.TaskQueueType task_queue_type = 3; + temporal.server.api.enums.v1.FairnessState fairness_state = 4; } -message UpdateFairnessStateResponse { -} +message UpdateFairnessStateResponse {} message CheckTaskQueueVersionMembershipRequest { - string namespace_id = 1; - string task_queue = 2; - temporal.api.enums.v1.TaskQueueType task_queue_type = 3; - temporal.server.api.deployment.v1.WorkerDeploymentVersion version = 4; + string namespace_id = 1; + string task_queue = 2; + temporal.api.enums.v1.TaskQueueType task_queue_type = 3; + temporal.server.api.deployment.v1.WorkerDeploymentVersion version = 4; } message CheckTaskQueueVersionMembershipResponse { - bool is_member = 1; + bool is_member = 1; } // PollConditions are extra conditions to set on the poll. Only supported with new matcher. message PollConditions { - // If set (non-zero), this poll will not match a task with lower priority than this value. - // Note that "min" priority is "max" numeric value, e.g. "min_priority: 3" means to match - // tasks with priority 1, 2, or 3. - int32 min_priority = 1; - // If true, don't block waiting for a task, just return a task immediately or an empty - // response. This is most useful combined with min_priority, to poll for task at a specific - // priority level on a partition that you think is there. - bool no_wait = 2; + // If set (non-zero), this poll will not match a task with lower priority than this value. + // Note that "min" priority is "max" numeric value, e.g. "min_priority: 3" means to match + // tasks with priority 1, 2, or 3. + int32 min_priority = 1; + // If true, don't block waiting for a task, just return a task immediately or an empty + // response. 
This is most useful combined with min_priority, to poll for task at a specific + // priority level on a partition that you think is there. + bool no_wait = 2; } diff --git a/proto/internal/temporal/server/api/matchingservice/v1/service.proto b/proto/internal/temporal/server/api/matchingservice/v1/service.proto index b375b27407..908f0b3eee 100644 --- a/proto/internal/temporal/server/api/matchingservice/v1/service.proto +++ b/proto/internal/temporal/server/api/matchingservice/v1/service.proto @@ -1,10 +1,11 @@ syntax = "proto3"; package temporal.server.api.matchingservice.v1; -option go_package = "go.temporal.io/server/api/matchingservice/v1;matchingservice"; -import "temporal/server/api/matchingservice/v1/request_response.proto"; import "temporal/server/api/common/v1/api_category.proto"; +import "temporal/server/api/matchingservice/v1/request_response.proto"; + +option go_package = "go.temporal.io/server/api/matchingservice/v1;matchingservice"; // MatchingService API is exposed to provide support for polling from long running applications. // Such applications are expected to have a worker which regularly polls for WorkflowTask and ActivityTask. For each @@ -12,268 +13,267 @@ import "temporal/server/api/common/v1/api_category.proto"; // commands. For each ActivityTask, application is expected to execute the actual logic for that task and respond back // with completion or failure. service MatchingService { - // PollWorkflowTaskQueue is called by frontend to process WorkflowTask from a specific task queue. A - // WorkflowTask is dispatched to callers for active workflow executions, with pending workflow tasks. - rpc PollWorkflowTaskQueue (PollWorkflowTaskQueueRequest) returns (PollWorkflowTaskQueueResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - // PollActivityTaskQueue is called by frontend to process ActivityTask from a specific task queue. 
ActivityTask - // is dispatched to callers whenever a ScheduleTask command is made for a workflow execution. - rpc PollActivityTaskQueue (PollActivityTaskQueueRequest) returns (PollActivityTaskQueueResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - // AddWorkflowTask is called by the history service when a workflow task is scheduled, so that it can be dispatched - // by the MatchingEngine. - rpc AddWorkflowTask (AddWorkflowTaskRequest) returns (AddWorkflowTaskResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // AddActivityTask is called by the history service when a workflow task is scheduled, so that it can be dispatched - // by the MatchingEngine. - rpc AddActivityTask (AddActivityTaskRequest) returns (AddActivityTaskResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // QueryWorkflow is called by frontend to query a workflow. - rpc QueryWorkflow (QueryWorkflowRequest) returns (QueryWorkflowResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - // RespondQueryTaskCompleted is called by frontend to respond query completed. - rpc RespondQueryTaskCompleted (RespondQueryTaskCompletedRequest) returns (RespondQueryTaskCompletedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Request from frontend to synchronously dispatch a nexus task to a worker. - rpc DispatchNexusTask (DispatchNexusTaskRequest) returns (DispatchNexusTaskResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Request from worker (via frontend) to long poll on nexus tasks. 
- rpc PollNexusTaskQueue (PollNexusTaskQueueRequest) returns (PollNexusTaskQueueResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; - } - - // Response from a worker (via frontend) to a Nexus task, unblocks the corresponding DispatchNexusTask request. - rpc RespondNexusTaskCompleted (RespondNexusTaskCompletedRequest) returns (RespondNexusTaskCompletedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Response from a worker (via frontend) to a Nexus task, unblocks the corresponding DispatchNexusTask request. - rpc RespondNexusTaskFailed (RespondNexusTaskFailedRequest) returns (RespondNexusTaskFailedResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // CancelOutstandingPoll is called by frontend to unblock long polls on matching for zombie pollers. - // Our rpc stack does not support context propagation, so when a client connection goes away frontend sees - // cancellation of context for that handler, but any corresponding calls (long-poll) to matching service does not - // see the cancellation propagated so it can unblock corresponding long-polls on its end. This results is tasks - // being dispatched to zombie pollers in this situation. This API is added so every time frontend makes a long-poll - // api call to matching it passes in a pollerId and then calls this API when it detects client connection is closed - // to unblock long polls for this poller and prevent tasks being sent to these zombie pollers. - rpc CancelOutstandingPoll (CancelOutstandingPollRequest) returns (CancelOutstandingPollResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // CancelOutstandingWorkerPolls cancels any outstanding polls for a given worker instance key. - // These polls could be waiting on different partitions of the task queue. 
- // This is called during worker shutdown to eagerly cancel polls and avoid giving out tasks to workers that are shutting down. - // Note: This only cancels polls that are currently outstanding. The caller must ensure no new polls - // are issued after calling this RPC, otherwise those polls will not be cancelled. - rpc CancelOutstandingWorkerPolls (CancelOutstandingWorkerPollsRequest) returns (CancelOutstandingWorkerPollsResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // DescribeTaskQueue returns information about the target task queue, right now this API returns the - // pollers which polled this task queue in last few minutes. - rpc DescribeTaskQueue (DescribeTaskQueueRequest) returns (DescribeTaskQueueResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // DescribeTaskQueuePartition returns information about the target task queue partition. - rpc DescribeTaskQueuePartition (DescribeTaskQueuePartitionRequest) returns (DescribeTaskQueuePartitionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // DescribeVersionedTaskQueues returns details about the requested versioned task queues. - // It is an internal API; there is no direct user-facing equivalent. - rpc DescribeVersionedTaskQueues (DescribeVersionedTaskQueuesRequest) returns (DescribeVersionedTaskQueuesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // ListTaskQueuePartitions returns a map of partitionKey and hostAddress for a task queue. 
- rpc ListTaskQueuePartitions(ListTaskQueuePartitionsRequest) returns (ListTaskQueuePartitionsResponse){ - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // (-- api-linter: core::0134::response-message-name=disabled - // aip.dev/not-precedent: UpdateWorkerBuildIdCompatibility RPC doesn't follow Google API format. --) - // (-- api-linter: core::0134::method-signature=disabled - // aip.dev/not-precedent: UpdateWorkerBuildIdCompatibility RPC doesn't follow Google API format. --) - rpc UpdateWorkerBuildIdCompatibility (UpdateWorkerBuildIdCompatibilityRequest) returns (UpdateWorkerBuildIdCompatibilityResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - rpc GetWorkerBuildIdCompatibility (GetWorkerBuildIdCompatibilityRequest) returns (GetWorkerBuildIdCompatibilityResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Fetch user data for a task queue, this request should always be routed to the node holding the root partition of the workflow task queue. - rpc GetTaskQueueUserData (GetTaskQueueUserDataRequest) returns (GetTaskQueueUserDataResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Allows updating the Build ID assignment and redirect rules for a given Task Queue. - // (-- api-linter: core::0134::method-signature=disabled - // aip.dev/not-precedent: UpdateWorkerVersioningRulesRequest RPC doesn't follow Google API format. --) - // (-- api-linter: core::0134::response-message-name=disabled - // aip.dev/not-precedent: UpdateWorkerVersioningRulesRequest RPC doesn't follow Google API format. 
--) - rpc UpdateWorkerVersioningRules (UpdateWorkerVersioningRulesRequest) returns (UpdateWorkerVersioningRulesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Fetches the Build ID assignment and redirect rules for a Task Queue - // (-- api-linter: core::0127::resource-name-extraction=disabled - // aip.dev/not-precedent: GetWorkerVersioningRulesRequest RPC doesn't follow Google API format. --) - // (-- api-linter: core::0131::http-uri-name=disabled - // aip.dev/not-precedent: GetWorkerVersioningRulesRequest RPC doesn't follow Google API format. --) - rpc GetWorkerVersioningRules (GetWorkerVersioningRulesRequest) returns (GetWorkerVersioningRulesResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // This request should always be routed to the node holding the root partition of the workflow task queue. - rpc SyncDeploymentUserData (SyncDeploymentUserDataRequest) returns (SyncDeploymentUserDataResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Apply a user data replication event. - rpc ApplyTaskQueueUserDataReplicationEvent (ApplyTaskQueueUserDataReplicationEventRequest) returns (ApplyTaskQueueUserDataReplicationEventResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Gets all task queue names mapped to a given build ID - rpc GetBuildIdTaskQueueMapping (GetBuildIdTaskQueueMappingRequest) returns (GetBuildIdTaskQueueMappingResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - // Force loading a task queue partition. Used by matching node owning root partition. - // When root partition is loaded this is called for all child partitions. 
- // This addresses the posibility of unloaded child partitions having backlog, - // but not being forwarded/synced to the root partition to find the polling - // worker which triggered the root partition being loaded in the first place. - rpc ForceLoadTaskQueuePartition (ForceLoadTaskQueuePartitionRequest) returns (ForceLoadTaskQueuePartitionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // TODO Shivam - remove this in 123. Present for backwards compatibility. - rpc ForceUnloadTaskQueue (ForceUnloadTaskQueueRequest) returns (ForceUnloadTaskQueueResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Force unloading a task queue partition. - rpc ForceUnloadTaskQueuePartition (ForceUnloadTaskQueuePartitionRequest) returns (ForceUnloadTaskQueuePartitionResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Update task queue user data in owning node for all updates in namespace. - // All user data updates must first go through the task queue owner using the `UpdateWorkerBuildIdCompatibility` - // API. - // (-- api-linter: core::0134::response-message-name=disabled - // aip.dev/not-precedent: UpdateTaskQueueUserData RPC doesn't follow Google API format. --) - // (-- api-linter: core::0134::method-signature=disabled - // aip.dev/not-precedent: UpdateTaskQueueUserData RPC doesn't follow Google API format. --) - rpc UpdateTaskQueueUserData(UpdateTaskQueueUserDataRequest) returns (UpdateTaskQueueUserDataResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Replicate task queue user data across clusters, must be done via the owning node for updates in namespace. 
- rpc ReplicateTaskQueueUserData(ReplicateTaskQueueUserDataRequest) returns (ReplicateTaskQueueUserDataResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Blocks on user data propagation to all loaded partitions. If successful, all loaded - // workflow + activity partitions have the requested version or higher. - // Routed to user data owner (root partition of workflow task queue). - rpc CheckTaskQueueUserDataPropagation(CheckTaskQueueUserDataPropagationRequest) returns (CheckTaskQueueUserDataPropagationResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Create a Nexus endpoint. - // (-- api-linter: core::0133::method-signature=disabled - // aip.dev/not-precedent: CreateNexusEndpoint RPC doesn't follow Google API format. --) - // (-- api-linter: core::0133::response-message-name=disabled - // aip.dev/not-precedent: CreateNexusEndpoint RPC doesn't follow Google API format. --) - // (-- api-linter: core::0133::http-uri-parent=disabled - // aip.dev/not-precedent: CreateNexusEndpoint RPC doesn't follow Google API format. --) - rpc CreateNexusEndpoint(CreateNexusEndpointRequest) returns (CreateNexusEndpointResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - // Optimistically update a Nexus endpoint based on provided version. - // If this request is accepted, the input is considered the "current" state of this service at the time it was - // persisted and the updated version is returned. - // (-- api-linter: core::0134::method-signature=disabled - // aip.dev/not-precedent: UpdateNexusEndpoint RPC doesn't follow Google API format. --) - // (-- api-linter: core::0134::response-message-name=disabled - // aip.dev/not-precedent: UpdateNexusEndpoint RPC doesn't follow Google API format. 
--) - // (-- api-linter: core::0134::request-resource-required=disabled - // aip.dev/not-precedent: UpdateNexusEndpoint RPC doesn't follow Google API format. --) - rpc UpdateNexusEndpoint(UpdateNexusEndpointRequest) returns (UpdateNexusEndpointResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - // Delete a service by its name. - rpc DeleteNexusEndpoint(DeleteNexusEndpointRequest) returns (DeleteNexusEndpointResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - // List all registered services. - rpc ListNexusEndpoints(ListNexusEndpointsRequest) returns (ListNexusEndpointsResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // RecordWorkerHeartbeat receive heartbeat request from the worker. - rpc RecordWorkerHeartbeat (RecordWorkerHeartbeatRequest) returns (RecordWorkerHeartbeatResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // ListWorkers retrieves a list of workers in the specified namespace that match the provided filters. - // Supports pagination for large result sets. Returns an empty list if no workers match the criteria. - // Returns an error if the namespace doesn't exist. - rpc ListWorkers (ListWorkersRequest) returns (ListWorkersResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // Set the persisted task queue configuration. - // (-- api-linter: core::0134::method-signature=disabled - // aip.dev/not-precedent: UpdateTaskQueueConfig RPC doesn't follow Google API format. --) - // (-- api-linter: core::0134::response-message-name=disabled - // aip.dev/not-precedent: UpdateTaskQueueConfig RPC doesn't follow Google API format. --) - // (-- api-linter: core::0134::request-resource-required=disabled - // aip.dev/not-precedent: UpdateTaskQueueConfig RPC doesn't follow Google API format. 
--) - rpc UpdateTaskQueueConfig (UpdateTaskQueueConfigRequest) returns (UpdateTaskQueueConfigResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // DescribeWorker retrieves a worker information in the specified namespace that match the provided instance key. - // Returns an error if the namespace or worker doesn't exist. - rpc DescribeWorker (DescribeWorkerRequest) returns (DescribeWorkerResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // UpdateFairnessState changes the fairness_state stored in UserData for automatically enabling - // priority and fairness. - // (-- api-linter: core::0134::method-signature=disabled - // aip.dev/not-precedent: UpdateFairnessState RPC doesn't follow Google API format. --) - // (-- api-linter: core::0134::response-message-name=disabled - // aip.dev/not-precedent: UpdateFairnessState RPC doesn't follow Google API format. --) - // (-- api-linter: core::0134::request-resource-required=disabled - // aip.dev/not-precedent: UpdateFairnessState RPC doesn't follow Google API format. --) - rpc UpdateFairnessState(UpdateFairnessStateRequest) returns (UpdateFairnessStateResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - - // CheckTaskQueueVersionMembership checks if a task queue is part of a specific deployment version. - rpc CheckTaskQueueVersionMembership(CheckTaskQueueVersionMembershipRequest) returns (CheckTaskQueueVersionMembershipResponse) { - option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; - } - + // PollWorkflowTaskQueue is called by frontend to process WorkflowTask from a specific task queue. A + // WorkflowTask is dispatched to callers for active workflow executions, with pending workflow tasks. 
+ rpc PollWorkflowTaskQueue(PollWorkflowTaskQueueRequest) returns (PollWorkflowTaskQueueResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + // PollActivityTaskQueue is called by frontend to process ActivityTask from a specific task queue. ActivityTask + // is dispatched to callers whenever a ScheduleTask command is made for a workflow execution. + rpc PollActivityTaskQueue(PollActivityTaskQueueRequest) returns (PollActivityTaskQueueResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + // AddWorkflowTask is called by the history service when a workflow task is scheduled, so that it can be dispatched + // by the MatchingEngine. + rpc AddWorkflowTask(AddWorkflowTaskRequest) returns (AddWorkflowTaskResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // AddActivityTask is called by the history service when a workflow task is scheduled, so that it can be dispatched + // by the MatchingEngine. + rpc AddActivityTask(AddActivityTaskRequest) returns (AddActivityTaskResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // QueryWorkflow is called by frontend to query a workflow. + rpc QueryWorkflow(QueryWorkflowRequest) returns (QueryWorkflowResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + // RespondQueryTaskCompleted is called by frontend to respond query completed. + rpc RespondQueryTaskCompleted(RespondQueryTaskCompletedRequest) returns (RespondQueryTaskCompletedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Request from frontend to synchronously dispatch a nexus task to a worker. 
+ rpc DispatchNexusTask(DispatchNexusTaskRequest) returns (DispatchNexusTaskResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Request from worker (via frontend) to long poll on nexus tasks. + rpc PollNexusTaskQueue(PollNexusTaskQueueRequest) returns (PollNexusTaskQueueResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_LONG_POLL; + } + + // Response from a worker (via frontend) to a Nexus task, unblocks the corresponding DispatchNexusTask request. + rpc RespondNexusTaskCompleted(RespondNexusTaskCompletedRequest) returns (RespondNexusTaskCompletedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Response from a worker (via frontend) to a Nexus task, unblocks the corresponding DispatchNexusTask request. + rpc RespondNexusTaskFailed(RespondNexusTaskFailedRequest) returns (RespondNexusTaskFailedResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // CancelOutstandingPoll is called by frontend to unblock long polls on matching for zombie pollers. + // Our rpc stack does not support context propagation, so when a client connection goes away frontend sees + // cancellation of context for that handler, but any corresponding calls (long-poll) to matching service does not + // see the cancellation propagated so it can unblock corresponding long-polls on its end. This results is tasks + // being dispatched to zombie pollers in this situation. This API is added so every time frontend makes a long-poll + // api call to matching it passes in a pollerId and then calls this API when it detects client connection is closed + // to unblock long polls for this poller and prevent tasks being sent to these zombie pollers. 
+ rpc CancelOutstandingPoll(CancelOutstandingPollRequest) returns (CancelOutstandingPollResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // CancelOutstandingWorkerPolls cancels any outstanding polls for a given worker instance key. + // These polls could be waiting on different partitions of the task queue. + // This is called during worker shutdown to eagerly cancel polls and avoid giving out tasks to workers that are shutting down. + // Note: This only cancels polls that are currently outstanding. The caller must ensure no new polls + // are issued after calling this RPC, otherwise those polls will not be cancelled. + rpc CancelOutstandingWorkerPolls(CancelOutstandingWorkerPollsRequest) returns (CancelOutstandingWorkerPollsResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // DescribeTaskQueue returns information about the target task queue, right now this API returns the + // pollers which polled this task queue in last few minutes. + rpc DescribeTaskQueue(DescribeTaskQueueRequest) returns (DescribeTaskQueueResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // DescribeTaskQueuePartition returns information about the target task queue partition. + rpc DescribeTaskQueuePartition(DescribeTaskQueuePartitionRequest) returns (DescribeTaskQueuePartitionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // DescribeVersionedTaskQueues returns details about the requested versioned task queues. + // It is an internal API; there is no direct user-facing equivalent. 
+ rpc DescribeVersionedTaskQueues(DescribeVersionedTaskQueuesRequest) returns (DescribeVersionedTaskQueuesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // ListTaskQueuePartitions returns a map of partitionKey and hostAddress for a task queue. + rpc ListTaskQueuePartitions(ListTaskQueuePartitionsRequest) returns (ListTaskQueuePartitionsResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // (-- api-linter: core::0134::response-message-name=disabled + // aip.dev/not-precedent: UpdateWorkerBuildIdCompatibility RPC doesn't follow Google API format. --) + // (-- api-linter: core::0134::method-signature=disabled + // aip.dev/not-precedent: UpdateWorkerBuildIdCompatibility RPC doesn't follow Google API format. --) + rpc UpdateWorkerBuildIdCompatibility(UpdateWorkerBuildIdCompatibilityRequest) returns (UpdateWorkerBuildIdCompatibilityResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + rpc GetWorkerBuildIdCompatibility(GetWorkerBuildIdCompatibilityRequest) returns (GetWorkerBuildIdCompatibilityResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Fetch user data for a task queue, this request should always be routed to the node holding the root partition of the workflow task queue. + rpc GetTaskQueueUserData(GetTaskQueueUserDataRequest) returns (GetTaskQueueUserDataResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Allows updating the Build ID assignment and redirect rules for a given Task Queue. + // (-- api-linter: core::0134::method-signature=disabled + // aip.dev/not-precedent: UpdateWorkerVersioningRulesRequest RPC doesn't follow Google API format. 
--) + // (-- api-linter: core::0134::response-message-name=disabled + // aip.dev/not-precedent: UpdateWorkerVersioningRulesRequest RPC doesn't follow Google API format. --) + rpc UpdateWorkerVersioningRules(UpdateWorkerVersioningRulesRequest) returns (UpdateWorkerVersioningRulesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Fetches the Build ID assignment and redirect rules for a Task Queue + // (-- api-linter: core::0127::resource-name-extraction=disabled + // aip.dev/not-precedent: GetWorkerVersioningRulesRequest RPC doesn't follow Google API format. --) + // (-- api-linter: core::0131::http-uri-name=disabled + // aip.dev/not-precedent: GetWorkerVersioningRulesRequest RPC doesn't follow Google API format. --) + rpc GetWorkerVersioningRules(GetWorkerVersioningRulesRequest) returns (GetWorkerVersioningRulesResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // This request should always be routed to the node holding the root partition of the workflow task queue. + rpc SyncDeploymentUserData(SyncDeploymentUserDataRequest) returns (SyncDeploymentUserDataResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Apply a user data replication event. + rpc ApplyTaskQueueUserDataReplicationEvent(ApplyTaskQueueUserDataReplicationEventRequest) returns (ApplyTaskQueueUserDataReplicationEventResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Gets all task queue names mapped to a given build ID + rpc GetBuildIdTaskQueueMapping(GetBuildIdTaskQueueMappingRequest) returns (GetBuildIdTaskQueueMappingResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + // Force loading a task queue partition. Used by matching node owning root partition. 
+ // When root partition is loaded this is called for all child partitions. + // This addresses the posibility of unloaded child partitions having backlog, + // but not being forwarded/synced to the root partition to find the polling + // worker which triggered the root partition being loaded in the first place. + rpc ForceLoadTaskQueuePartition(ForceLoadTaskQueuePartitionRequest) returns (ForceLoadTaskQueuePartitionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // TODO Shivam - remove this in 123. Present for backwards compatibility. + rpc ForceUnloadTaskQueue(ForceUnloadTaskQueueRequest) returns (ForceUnloadTaskQueueResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Force unloading a task queue partition. + rpc ForceUnloadTaskQueuePartition(ForceUnloadTaskQueuePartitionRequest) returns (ForceUnloadTaskQueuePartitionResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Update task queue user data in owning node for all updates in namespace. + // All user data updates must first go through the task queue owner using the `UpdateWorkerBuildIdCompatibility` + // API. + // (-- api-linter: core::0134::response-message-name=disabled + // aip.dev/not-precedent: UpdateTaskQueueUserData RPC doesn't follow Google API format. --) + // (-- api-linter: core::0134::method-signature=disabled + // aip.dev/not-precedent: UpdateTaskQueueUserData RPC doesn't follow Google API format. --) + rpc UpdateTaskQueueUserData(UpdateTaskQueueUserDataRequest) returns (UpdateTaskQueueUserDataResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Replicate task queue user data across clusters, must be done via the owning node for updates in namespace. 
+ rpc ReplicateTaskQueueUserData(ReplicateTaskQueueUserDataRequest) returns (ReplicateTaskQueueUserDataResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Blocks on user data propagation to all loaded partitions. If successful, all loaded + // workflow + activity partitions have the requested version or higher. + // Routed to user data owner (root partition of workflow task queue). + rpc CheckTaskQueueUserDataPropagation(CheckTaskQueueUserDataPropagationRequest) returns (CheckTaskQueueUserDataPropagationResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Create a Nexus endpoint. + // (-- api-linter: core::0133::method-signature=disabled + // aip.dev/not-precedent: CreateNexusEndpoint RPC doesn't follow Google API format. --) + // (-- api-linter: core::0133::response-message-name=disabled + // aip.dev/not-precedent: CreateNexusEndpoint RPC doesn't follow Google API format. --) + // (-- api-linter: core::0133::http-uri-parent=disabled + // aip.dev/not-precedent: CreateNexusEndpoint RPC doesn't follow Google API format. --) + rpc CreateNexusEndpoint(CreateNexusEndpointRequest) returns (CreateNexusEndpointResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + // Optimistically update a Nexus endpoint based on provided version. + // If this request is accepted, the input is considered the "current" state of this service at the time it was + // persisted and the updated version is returned. + // (-- api-linter: core::0134::method-signature=disabled + // aip.dev/not-precedent: UpdateNexusEndpoint RPC doesn't follow Google API format. --) + // (-- api-linter: core::0134::response-message-name=disabled + // aip.dev/not-precedent: UpdateNexusEndpoint RPC doesn't follow Google API format. 
--) + // (-- api-linter: core::0134::request-resource-required=disabled + // aip.dev/not-precedent: UpdateNexusEndpoint RPC doesn't follow Google API format. --) + rpc UpdateNexusEndpoint(UpdateNexusEndpointRequest) returns (UpdateNexusEndpointResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + // Delete a service by its name. + rpc DeleteNexusEndpoint(DeleteNexusEndpointRequest) returns (DeleteNexusEndpointResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + // List all registered services. + rpc ListNexusEndpoints(ListNexusEndpointsRequest) returns (ListNexusEndpointsResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // RecordWorkerHeartbeat receive heartbeat request from the worker. + rpc RecordWorkerHeartbeat(RecordWorkerHeartbeatRequest) returns (RecordWorkerHeartbeatResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // ListWorkers retrieves a list of workers in the specified namespace that match the provided filters. + // Supports pagination for large result sets. Returns an empty list if no workers match the criteria. + // Returns an error if the namespace doesn't exist. + rpc ListWorkers(ListWorkersRequest) returns (ListWorkersResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // Set the persisted task queue configuration. + // (-- api-linter: core::0134::method-signature=disabled + // aip.dev/not-precedent: UpdateTaskQueueConfig RPC doesn't follow Google API format. --) + // (-- api-linter: core::0134::response-message-name=disabled + // aip.dev/not-precedent: UpdateTaskQueueConfig RPC doesn't follow Google API format. --) + // (-- api-linter: core::0134::request-resource-required=disabled + // aip.dev/not-precedent: UpdateTaskQueueConfig RPC doesn't follow Google API format. 
--) + rpc UpdateTaskQueueConfig(UpdateTaskQueueConfigRequest) returns (UpdateTaskQueueConfigResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // DescribeWorker retrieves a worker information in the specified namespace that match the provided instance key. + // Returns an error if the namespace or worker doesn't exist. + rpc DescribeWorker(DescribeWorkerRequest) returns (DescribeWorkerResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // UpdateFairnessState changes the fairness_state stored in UserData for automatically enabling + // priority and fairness. + // (-- api-linter: core::0134::method-signature=disabled + // aip.dev/not-precedent: UpdateFairnessState RPC doesn't follow Google API format. --) + // (-- api-linter: core::0134::response-message-name=disabled + // aip.dev/not-precedent: UpdateFairnessState RPC doesn't follow Google API format. --) + // (-- api-linter: core::0134::request-resource-required=disabled + // aip.dev/not-precedent: UpdateFairnessState RPC doesn't follow Google API format. --) + rpc UpdateFairnessState(UpdateFairnessStateRequest) returns (UpdateFairnessStateResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } + + // CheckTaskQueueVersionMembership checks if a task queue is part of a specific deployment version. 
+ rpc CheckTaskQueueVersionMembership(CheckTaskQueueVersionMembershipRequest) returns (CheckTaskQueueVersionMembershipResponse) { + option (temporal.server.api.common.v1.api_category).category = API_CATEGORY_STANDARD; + } } diff --git a/proto/internal/temporal/server/api/metrics/v1/message.proto b/proto/internal/temporal/server/api/metrics/v1/message.proto index 3ec31791e8..5d290db0e8 100644 --- a/proto/internal/temporal/server/api/metrics/v1/message.proto +++ b/proto/internal/temporal/server/api/metrics/v1/message.proto @@ -5,5 +5,5 @@ package temporal.server.api.metrics.v1; option go_package = "go.temporal.io/server/api/metrics/v1;metrics"; message Baggage { - map counters_int = 1; + map counters_int = 1; } diff --git a/proto/internal/temporal/server/api/namespace/v1/message.proto b/proto/internal/temporal/server/api/namespace/v1/message.proto index 5add2b28f3..2b50bb2877 100644 --- a/proto/internal/temporal/server/api/namespace/v1/message.proto +++ b/proto/internal/temporal/server/api/namespace/v1/message.proto @@ -5,10 +5,10 @@ package temporal.server.api.namespace.v1; option go_package = "go.temporal.io/server/api/namespace/v1;namespace"; message NamespaceCacheInfo { - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "in" and "by" are needed here. --) - int64 items_in_cache_by_id_count = 1; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "in" and "by" are needed here. --) - int64 items_in_cache_by_name_count = 2; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "in" and "by" are needed here. --) + int64 items_in_cache_by_id_count = 1; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "in" and "by" are needed here. 
--) + int64 items_in_cache_by_name_count = 2; } diff --git a/proto/internal/temporal/server/api/persistence/v1/chasm.proto b/proto/internal/temporal/server/api/persistence/v1/chasm.proto index 85db47d9d6..f557cdcb96 100644 --- a/proto/internal/temporal/server/api/persistence/v1/chasm.proto +++ b/proto/internal/temporal/server/api/persistence/v1/chasm.proto @@ -1,68 +1,69 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "google/protobuf/timestamp.proto"; import "temporal/api/common/v1/message.proto"; import "temporal/api/failure/v1/message.proto"; import "temporal/server/api/persistence/v1/hsm.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + message ChasmNode { - // Metadata present for all nodes. - ChasmNodeMetadata metadata = 1; + // Metadata present for all nodes. + ChasmNodeMetadata metadata = 1; - // User data for any type of node that stores it. - temporal.api.common.v1.DataBlob data = 2; + // User data for any type of node that stores it. + temporal.api.common.v1.DataBlob data = 2; } message ChasmNodeMetadata { - // Versioned transition when the node was instantiated. - VersionedTransition initial_versioned_transition = 1; - // Versioned transition when the node was last updated. - VersionedTransition last_update_versioned_transition = 2; - - oneof attributes { - ChasmComponentAttributes component_attributes = 11; - ChasmDataAttributes data_attributes = 12; - ChasmCollectionAttributes collection_attributes = 13; - ChasmPointerAttributes pointer_attributes = 14; - } + // Versioned transition when the node was instantiated. + VersionedTransition initial_versioned_transition = 1; + // Versioned transition when the node was last updated. 
+ VersionedTransition last_update_versioned_transition = 2; + + oneof attributes { + ChasmComponentAttributes component_attributes = 11; + ChasmDataAttributes data_attributes = 12; + ChasmCollectionAttributes collection_attributes = 13; + ChasmPointerAttributes pointer_attributes = 14; + } } message ChasmComponentAttributes { - message Task { - // Registered task's type ID. - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 type_id = 1; - string destination = 2; - google.protobuf.Timestamp scheduled_time = 3; - temporal.api.common.v1.DataBlob data = 4; - // Versioned transition of the execution when the task was created. - VersionedTransition versioned_transition = 5; - // The xth task generated in this versioned transition. - // Together with the versioned transition, this is a unique identifier for - // this task. - int64 versioned_transition_offset = 6; - // If a physical task is created for this task in this cluster. - // NOTE: this is a cluster-specific field and can not be replicated. - // Changes to this field also doesn't require an increase in versioned transition. - int32 physical_task_status = 7; - } - - // Registered component's type ID. + message Task { + // Registered task's type ID. // (-- api-linter: core::0141::forbidden-types=disabled --) uint32 type_id = 1; - // Tasks are in their insertion order, - // i.e. by versioned transtion and versioned_transition_offset. - repeated Task side_effect_tasks = 2; - // Tasks are ordered by their scheduled time, breaking ties by - // versioned transition and versioned_transition_offset. - repeated Task pure_tasks = 3; - // When true, this component ignores parent lifecycle validation. - // Detached components can continue operating, accepting writes and executing - // tasks, even when their parent is closed/terminated. 
- bool detached = 4; + string destination = 2; + google.protobuf.Timestamp scheduled_time = 3; + temporal.api.common.v1.DataBlob data = 4; + // Versioned transition of the execution when the task was created. + VersionedTransition versioned_transition = 5; + // The xth task generated in this versioned transition. + // Together with the versioned transition, this is a unique identifier for + // this task. + int64 versioned_transition_offset = 6; + // If a physical task is created for this task in this cluster. + // NOTE: this is a cluster-specific field and can not be replicated. + // Changes to this field also doesn't require an increase in versioned transition. + int32 physical_task_status = 7; + } + + // Registered component's type ID. + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 type_id = 1; + // Tasks are in their insertion order, + // i.e. by versioned transtion and versioned_transition_offset. + repeated Task side_effect_tasks = 2; + // Tasks are ordered by their scheduled time, breaking ties by + // versioned transition and versioned_transition_offset. + repeated Task pure_tasks = 3; + // When true, this component ignores parent lifecycle validation. + // Detached components can continue operating, accepting writes and executing + // tasks, even when their parent is closed/terminated. + bool detached = 4; } message ChasmDataAttributes {} @@ -70,62 +71,61 @@ message ChasmDataAttributes {} message ChasmCollectionAttributes {} message ChasmPointerAttributes { - repeated string node_path = 1; + repeated string node_path = 1; } - // ChasmTaskInfo includes component-facing task metadata message ChasmTaskInfo { - // Initial versioned transition of the component being referenced. - VersionedTransition component_initial_versioned_transition = 1; + // Initial versioned transition of the component being referenced. 
+ VersionedTransition component_initial_versioned_transition = 1; - // Last updated transition of the component being referenced at the time the - // reference was created. Can be used to invalidate this reference. - VersionedTransition component_last_update_versioned_transition = 2; + // Last updated transition of the component being referenced at the time the + // reference was created. Can be used to invalidate this reference. + VersionedTransition component_last_update_versioned_transition = 2; - // Path to the component. - repeated string path = 3; + // Path to the component. + repeated string path = 3; - // Registered task's type ID. - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 type_id = 4; + // Registered task's type ID. + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 type_id = 4; - // Opaque attached task data. May be nil. Usable by components, not the CHASM - // framework itself. - temporal.api.common.v1.DataBlob data = 5; + // Opaque attached task data. May be nil. Usable by components, not the CHASM + // framework itself. + temporal.api.common.v1.DataBlob data = 5; - // ArchetypeID of the execution that generated this task. - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 6; + // ArchetypeID of the execution that generated this task. + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 6; } // ChasmComponentRef references a specific chasm component. message ChasmComponentRef { - string namespace_id = 1; - string business_id = 2; - string run_id = 3; + string namespace_id = 1; + string business_id = 2; + string run_id = 3; - // Executions's root component's type ID. - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 4; - - VersionedTransition execution_versioned_transition = 5; + // Executions's root component's type ID. 
+ // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 4; + + VersionedTransition execution_versioned_transition = 5; - repeated string component_path = 6; - VersionedTransition component_initial_versioned_transition = 7; + repeated string component_path = 6; + VersionedTransition component_initial_versioned_transition = 7; } // ChasmNexusCompletion includes details about a completed Nexus operation. message ChasmNexusCompletion { - oneof outcome { - // Result of a successful operation, only set if state == successful. - temporal.api.common.v1.Payload success = 1; - // Operation failure, only set if state != successful. - temporal.api.failure.v1.Failure failure = 2; - } - // Time when the operation was closed. - google.protobuf.Timestamp close_time = 3; - // Request ID embedded in the NexusOperationScheduledEvent. - // Allows completing a started operation after a workflow has been reset. - string request_id = 4; + oneof outcome { + // Result of a successful operation, only set if state == successful. + temporal.api.common.v1.Payload success = 1; + // Operation failure, only set if state != successful. + temporal.api.failure.v1.Failure failure = 2; + } + // Time when the operation was closed. + google.protobuf.Timestamp close_time = 3; + // Request ID embedded in the NexusOperationScheduledEvent. + // Allows completing a started operation after a workflow has been reset. 
+ string request_id = 4; } diff --git a/proto/internal/temporal/server/api/persistence/v1/chasm_visibility.proto b/proto/internal/temporal/server/api/persistence/v1/chasm_visibility.proto index 6254a50426..cb475fbe19 100644 --- a/proto/internal/temporal/server/api/persistence/v1/chasm_visibility.proto +++ b/proto/internal/temporal/server/api/persistence/v1/chasm_visibility.proto @@ -1,12 +1,13 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; + option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; message ChasmVisibilityData { - int64 transition_count = 1; + int64 transition_count = 1; } message ChasmVisibilityTaskData { - int64 transition_count = 1; + int64 transition_count = 1; } diff --git a/proto/internal/temporal/server/api/persistence/v1/cluster_metadata.proto b/proto/internal/temporal/server/api/persistence/v1/cluster_metadata.proto index 952aefaba2..82f69aa0ea 100644 --- a/proto/internal/temporal/server/api/persistence/v1/cluster_metadata.proto +++ b/proto/internal/temporal/server/api/persistence/v1/cluster_metadata.proto @@ -1,30 +1,31 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "temporal/api/enums/v1/common.proto"; import "temporal/api/version/v1/message.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + // data column message ClusterMetadata { - string cluster_name = 1; - int32 history_shard_count = 2; - string cluster_id = 3; - temporal.api.version.v1.VersionInfo version_info = 4; - map index_search_attributes = 5; - string cluster_address = 6; - string http_address = 13; - int64 failover_version_increment = 7; - int64 initial_failover_version = 8; - bool is_global_namespace_enabled = 9; - bool is_connection_enabled = 10; - bool use_cluster_id_membership = 11; - map tags = 12; - // is_replication_enabled controls whether replication streams are active. 
- bool is_replication_enabled = 14; + string cluster_name = 1; + int32 history_shard_count = 2; + string cluster_id = 3; + temporal.api.version.v1.VersionInfo version_info = 4; + map index_search_attributes = 5; + string cluster_address = 6; + string http_address = 13; + int64 failover_version_increment = 7; + int64 initial_failover_version = 8; + bool is_global_namespace_enabled = 9; + bool is_connection_enabled = 10; + bool use_cluster_id_membership = 11; + map tags = 12; + // is_replication_enabled controls whether replication streams are active. + bool is_replication_enabled = 14; } -message IndexSearchAttributes{ - map custom_search_attributes = 1; +message IndexSearchAttributes { + map custom_search_attributes = 1; } diff --git a/proto/internal/temporal/server/api/persistence/v1/executions.proto b/proto/internal/temporal/server/api/persistence/v1/executions.proto index 0c1f412cfb..d6c26a1af1 100644 --- a/proto/internal/temporal/server/api/persistence/v1/executions.proto +++ b/proto/internal/temporal/server/api/persistence/v1/executions.proto @@ -1,904 +1,901 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "google/protobuf/duration.proto"; import "google/protobuf/timestamp.proto"; - import "temporal/api/common/v1/message.proto"; +import "temporal/api/deployment/v1/message.proto"; import "temporal/api/enums/v1/common.proto"; import "temporal/api/enums/v1/event_type.proto"; import "temporal/api/enums/v1/failed_cause.proto"; import "temporal/api/enums/v1/workflow.proto"; import "temporal/api/failure/v1/message.proto"; -import "temporal/api/workflow/v1/message.proto"; import "temporal/api/history/v1/message.proto"; -import "temporal/api/deployment/v1/message.proto"; - +import "temporal/api/workflow/v1/message.proto"; import "temporal/server/api/clock/v1/message.proto"; import "temporal/server/api/enums/v1/common.proto"; import 
"temporal/server/api/enums/v1/nexus.proto"; -import "temporal/server/api/enums/v1/workflow.proto"; import "temporal/server/api/enums/v1/task.proto"; +import "temporal/server/api/enums/v1/workflow.proto"; import "temporal/server/api/enums/v1/workflow_task_type.proto"; import "temporal/server/api/history/v1/message.proto"; import "temporal/server/api/persistence/v1/chasm.proto"; -import "temporal/server/api/persistence/v1/queues.proto"; import "temporal/server/api/persistence/v1/hsm.proto"; +import "temporal/server/api/persistence/v1/queues.proto"; import "temporal/server/api/persistence/v1/update.proto"; import "temporal/server/api/workflow/v1/message.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + // shard column message ShardInfo { - int32 shard_id = 1; - int64 range_id = 2; - string owner = 3; - reserved 4; - reserved 5; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "since" is needed here. --) - int32 stolen_since_renew = 6; - google.protobuf.Timestamp update_time = 7; - reserved 8; - reserved 9; - reserved 10; - reserved 11; - reserved 12; - map replication_dlq_ack_level = 13; - reserved 14; - reserved 15; - reserved 16; - map queue_states = 17; + int32 shard_id = 1; + int64 range_id = 2; + string owner = 3; + reserved 4; + reserved 5; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "since" is needed here. 
--) + int32 stolen_since_renew = 6; + google.protobuf.Timestamp update_time = 7; + reserved 8; + reserved 9; + reserved 10; + reserved 11; + reserved 12; + map replication_dlq_ack_level = 13; + reserved 14; + reserved 15; + reserved 16; + map queue_states = 17; } // execution column message WorkflowExecutionInfo { - string namespace_id = 1; - string workflow_id = 2; - string parent_namespace_id = 3; - string parent_workflow_id = 4; - string parent_run_id = 5; - int64 parent_initiated_id = 6; - int64 completion_event_batch_id = 7; - reserved 8; - string task_queue = 9; - string workflow_type_name = 10; - google.protobuf.Duration workflow_execution_timeout = 11; - google.protobuf.Duration workflow_run_timeout = 12; - google.protobuf.Duration default_workflow_task_timeout = 13; - reserved 14; - reserved 15; - reserved 16; - int64 last_running_clock = 17; - int64 last_first_event_id = 18; - int64 last_completed_workflow_task_started_event_id = 19; - // Deprecated. use `WorkflowExecutionState.start_time` - google.protobuf.Timestamp start_time = 20; - google.protobuf.Timestamp last_update_time = 21; - - // Workflow task fields. 
- int64 workflow_task_version = 22; - int64 workflow_task_scheduled_event_id = 23; - int64 workflow_task_started_event_id = 24; - google.protobuf.Duration workflow_task_timeout = 25; - int32 workflow_task_attempt = 26; - google.protobuf.Timestamp workflow_task_started_time = 27; - google.protobuf.Timestamp workflow_task_scheduled_time = 28; - google.protobuf.Timestamp workflow_task_original_scheduled_time = 30; - string workflow_task_request_id = 31; - temporal.server.api.enums.v1.WorkflowTaskType workflow_task_type = 68; - bool workflow_task_suggest_continue_as_new = 69; - repeated temporal.api.enums.v1.SuggestContinueAsNewReason workflow_task_suggest_continue_as_new_reasons = 110; - bool workflow_task_target_worker_deployment_version_changed = 112; - int64 workflow_task_history_size_bytes = 70; - // tracks the started build ID for transient/speculative WFT. This info is used for two purposes: - // - verify WFT completes by the same Build ID that started in the latest attempt - // - when persisting transient/speculative WFT, the right Build ID is used in the WFT started event - // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - string workflow_task_build_id = 88; - // tracks the started build ID redirect counter for transient/speculative WFT. This info is to - // ensure the right redirect counter is used in the WFT started event created later - // for a transient/speculative WFT. - // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - int64 workflow_task_build_id_redirect_counter = 89; - // Stamp represents the "version" of the workflow's internal state. - // It increases monotonically when the workflow's options are modified. - // It is used to check if a workflow task is still relevant to the corresponding workflow state machine. - int32 workflow_task_stamp = 109; - // AttemptsSinceLastSuccess tracks the number of workflow task attempts since the last successful workflow task. 
- // This is carried over when buffered events are applied after workflow task failures. - // Used by the TemporalReportedProblems search attribute to track continuous failure count. - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "since" is needed here. --) - int32 workflow_task_attempts_since_last_success = 111; - - bool cancel_requested = 29; - string cancel_request_id = 32; - string sticky_task_queue = 33; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration sticky_schedule_to_start_timeout = 34; - int32 attempt = 35; - google.protobuf.Duration retry_initial_interval = 36; - google.protobuf.Duration retry_maximum_interval = 37; - int32 retry_maximum_attempts = 38; - double retry_backoff_coefficient = 39; - google.protobuf.Timestamp workflow_execution_expiration_time = 40; - repeated string retry_non_retryable_error_types = 41; - bool has_retry_policy = 42; - string cron_schedule = 43; - reserved 44; - reserved 45; - int64 signal_count = 46; - int64 activity_count = 71; - int64 child_execution_count = 72; - int64 user_timer_count = 73; - int64 request_cancel_external_count = 74; - int64 signal_external_count = 75; - int64 update_count = 77; - reserved 47; - reserved 48; - reserved 49; - reserved 50; - temporal.api.workflow.v1.ResetPoints auto_reset_points = 51; - map search_attributes = 52; - map memo = 53; - temporal.server.api.history.v1.VersionHistories version_histories = 54; - string first_execution_run_id = 55; - ExecutionStats execution_stats = 56; - google.protobuf.Timestamp workflow_run_expiration_time = 57; - // Transaction Id of the first event in the last batch of events. - int64 last_first_event_txn_id = 58; - int64 state_transition_count = 59; - google.protobuf.Timestamp execution_time = 60; - // If continued-as-new, or retried, or cron, holds the new run id. 
- string new_execution_run_id = 61; - temporal.server.api.clock.v1.VectorClock parent_clock = 62; - // version of child execution initiated event in parent workflow - int64 parent_initiated_version = 63; - // Used to check if transfer close task is processed before deleting the workflow execution. - int64 close_transfer_task_id = 64; - // Used to check if visibility close task is processed before deleting the workflow execution. - int64 close_visibility_task_id = 65; - google.protobuf.Timestamp close_time = 66; - // Relocatable attributes are memo and search attributes. If they were removed, then they are not - // present in the mutable state, and they should be in visibility store. - bool relocatable_attributes_removed = 67; - temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 76; - // If using build-id based versioning: version stamp of the last worker to complete a - // workflow tasks for this workflow. - // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - temporal.api.common.v1.WorkerVersionStamp most_recent_worker_version_stamp = 78; - // The currently assigned build ID for this execution. Presence of this value means worker versioning is used - // for this execution. Assigned build ID is selected by matching based on Worker Versioning Assignment Rules - // when the first workflow task of the execution is scheduled. If the first workflow task fails and is scheduled - // again, the assigned build ID may change according to the latest versioning rules. - // Assigned build ID can also change in the middle of a execution if Compatible Redirect Rules are applied to - // this execution. - // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - string assigned_build_id = 85; - // Build ID inherited from a previous/parent execution. If present, assigned_build_id will be set to this, instead - // of using the assignment rules. - // Deprecated. Clean up with versioning-2. 
[cleanup-old-wv] - string inherited_build_id = 86; - // Tracks the number of times a redirect rule is applied to this workflow. Used to apply redirects in the right - // order when mutable state is rebuilt from history events. - // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - int64 build_id_redirect_counter = 87; - - // index of update IDs and pointers to associated history events. - map update_infos = 79; - - // Transition history encodes all transitions a mutable state object has gone through in a compact way. - // Here the transition_count field of VersionedTransition represents the maximum transition count the mutable state object - // has gone through for the corresponding namespace failover version. - // For example, if the transition history is `[{v: 1, t: 3}, {v: 2, t: 5}]`, it means transition 1-3 have failover version 1, - // and transition 4-5 have failover version 2. - // - // Each task generated by the HSM framework is imprinted with the current VersionedTransition at the end of the transaction. - // When a task is being processed, the transition history is compared with the imprinted task information to - // verify that a task is not referencing a stale state or that the task itself is not stale. - // For example, with the same transition history above, task A `{v: 2, t: 4}` **is not** - // referencing stale state because for version `2` transitions `4-5` are valid, while task B `{v: 2, t: 6}` **is** - // referencing stale state because the transition count is out of range for version `2`. - // Furthermore, task C `{v: 1, t: 4}` itself is stale because it is referencing an impossible state, likely due to post - // split-brain reconciliation. - repeated VersionedTransition transition_history = 80; - // Map of state machine type to map of machine by ID. - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "by" is used to clarify the keys and values. 
--) - map sub_state_machines_by_type = 81; - - // This field is for tracking if the workflow execution timer task is created or not. - // We don't need this field if we always create the execution timer task when the first - // workflow in a workflow chain starts. However, this execution timer logic is later added. - // To maintain backward compatibility, we need to track if the execution timer task is created - // for a workflow chain since later workflows in the chain also need to create the execution - // timer task if it is not created yet. - // NOTE: Task status is clsuter specific information, so when replicating mutable state, this - // field need to be sanitized. - int32 workflow_execution_timer_task_status = 82; - - // The root workflow execution is defined as follows: - // 1. A workflow without parent workflow is its own root workflow. - // 2. A workflow that has a parent workflow has the same root workflow as its parent workflow. - string root_workflow_id = 83; - string root_run_id = 84; - - // Timer tasks emitted from state machines are stored in this array, grouped and sorted by their deadline. Only the - // next state machine timer task is generated at a time per mutable state. When that task is processed it iterates - // this array and triggers timers that are ready. - // NOTE: Task status is cluster specific information, so when replicating mutable state, this field needs to be - // sanitized. - repeated StateMachineTimerGroup state_machine_timers = 90; - - // The shard clock's timestamp at the time the first valid task was created for this mutable state (either for a new - // mutable state or when rebuilding from events). The field should be updated whenever we refresh tasks, marking - // older generation tasks obsolete. - // This field is used for task staleness checks when mutable state is rebuilt. - // NOTE: Task status is cluster specific information, so when replicating mutable state, this field needs to be - // sanitized. 
- // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: Ignoring api-linter rules for clarity --) - // (-- api-linter: core::0142::time-field-type=disabled - // aip.dev/not-precedent: This is a vector clock, not a timestamp --) - int64 task_generation_shard_clock_timestamp = 91; - - VersionedTransition workflow_task_last_update_versioned_transition = 92; - VersionedTransition visibility_last_update_versioned_transition = 93; - VersionedTransition signal_request_ids_last_update_versioned_transition = 94; - - repeated StateMachineTombstoneBatch sub_state_machine_tombstone_batches = 95; - - // The workflow has been reset. - bool workflow_was_reset = 96; - - // Reset Run ID points to the new nun when this execution is reset. If the execution is reset multiple times, it points to the latest run. - string reset_run_id = 97; - - // When present, it means the workflow execution is versioned, or is transitioning from - // unversioned workers to versioned ones. - // Note: Deployment objects inside versioning info are immutable, never change their fields. - // (-- api-linter: core::0203::immutable=disabled - // aip.dev/not-precedent: field_behavior annotation is not yet used in this repo --) - temporal.api.workflow.v1.WorkflowExecutionVersioningInfo versioning_info = 98; - - // This is the run id when the WorkflowExecutionStarted event was written. - // A workflow reset changes the execution run_id, but preserves this field so that we have a reference to the original workflow execution that was reset. 
- string original_execution_run_id = 99; - - // These two fields are to record the transition history when the transition history is cleaned up due to disabling transition history - // Should be deprecated once the transition history is fully launched - repeated VersionedTransition previous_transition_history = 100; - VersionedTransition last_transition_history_break_point = 101; - - // This is a set of child workflows that were initialized after the reset point in the parent workflow. - // The children are identified by the key "workflow_type:workflow_id". When the parent starts to make progress after reset, it uses this data to - // determine the right start policy to apply to the child. This list will include children initiated in continue-as-new runs. - map children_initialized_post_reset_point = 102; - // The worker deployment that completed the last WFT. - string worker_deployment_name = 103; - - // Priority contains metadata that controls relative ordering of task processing - // when tasks are backed up in a queue. - temporal.api.common.v1.Priority priority = 104; - - // Run ID of the execution that supersedes this one (via terminate or continue-as-new). - string successor_run_id = 105; - - // Pause info contains the details of the request to pause the workflow. - WorkflowPauseInfo pause_info = 106; - - // Last workflow task failure category and cause are used to track the last workflow task failure category and cause. - oneof last_workflow_task_failure { - temporal.api.enums.v1.WorkflowTaskFailedCause last_workflow_task_failure_cause = 107; - temporal.api.enums.v1.TimeoutType last_workflow_task_timed_out_type = 108; - } - - // The last target version for which the server set targetDeploymentVersionChanged - // to true on a workflow task started event. Updated on each workflow task start, - // set only when the server decides to set the targetDeploymentVersionChanged flag - // to true. 
- // - // This is a wrapper message to distinguish "never notified" (nil wrapper) from - // "notified about an unversioned target" (non-nil wrapper with nil deployment_version). - // - // Read at continue-as-new time: if set, it becomes the declined_target_version_upgrade - // for the next run. If nil, the existing declined value is preserved (CaN chain). - LastNotifiedTargetVersion last_notified_target_version = 113; - - // The target version that the SDK previously declined to upgrade to. Inherited - // from a previous run via continue-as-new or retry. At CaN time, computed as: - // if last_notified_target_version != nil → use that (latest signal was declined) - // else → preserve existing declined value (CaN chain, never re-signaled) - // - // Wrapper distinguishes "never declined" (nil) from "declined unversioned" (non-nil, nil version). - temporal.api.history.v1.DeclinedTargetVersionUpgrade declined_target_version_upgrade = 114; + string namespace_id = 1; + string workflow_id = 2; + string parent_namespace_id = 3; + string parent_workflow_id = 4; + string parent_run_id = 5; + int64 parent_initiated_id = 6; + int64 completion_event_batch_id = 7; + reserved 8; + string task_queue = 9; + string workflow_type_name = 10; + google.protobuf.Duration workflow_execution_timeout = 11; + google.protobuf.Duration workflow_run_timeout = 12; + google.protobuf.Duration default_workflow_task_timeout = 13; + reserved 14; + reserved 15; + reserved 16; + int64 last_running_clock = 17; + int64 last_first_event_id = 18; + int64 last_completed_workflow_task_started_event_id = 19; + // Deprecated. use `WorkflowExecutionState.start_time` + google.protobuf.Timestamp start_time = 20; + google.protobuf.Timestamp last_update_time = 21; + + // Workflow task fields. 
+ int64 workflow_task_version = 22; + int64 workflow_task_scheduled_event_id = 23; + int64 workflow_task_started_event_id = 24; + google.protobuf.Duration workflow_task_timeout = 25; + int32 workflow_task_attempt = 26; + google.protobuf.Timestamp workflow_task_started_time = 27; + google.protobuf.Timestamp workflow_task_scheduled_time = 28; + google.protobuf.Timestamp workflow_task_original_scheduled_time = 30; + string workflow_task_request_id = 31; + temporal.server.api.enums.v1.WorkflowTaskType workflow_task_type = 68; + bool workflow_task_suggest_continue_as_new = 69; + repeated temporal.api.enums.v1.SuggestContinueAsNewReason workflow_task_suggest_continue_as_new_reasons = 110; + bool workflow_task_target_worker_deployment_version_changed = 112; + int64 workflow_task_history_size_bytes = 70; + // tracks the started build ID for transient/speculative WFT. This info is used for two purposes: + // - verify WFT completes by the same Build ID that started in the latest attempt + // - when persisting transient/speculative WFT, the right Build ID is used in the WFT started event + // Deprecated. Clean up with versioning-2. [cleanup-old-wv] + string workflow_task_build_id = 88; + // tracks the started build ID redirect counter for transient/speculative WFT. This info is to + // ensure the right redirect counter is used in the WFT started event created later + // for a transient/speculative WFT. + // Deprecated. Clean up with versioning-2. [cleanup-old-wv] + int64 workflow_task_build_id_redirect_counter = 89; + // Stamp represents the "version" of the workflow's internal state. + // It increases monotonically when the workflow's options are modified. + // It is used to check if a workflow task is still relevant to the corresponding workflow state machine. + int32 workflow_task_stamp = 109; + // AttemptsSinceLastSuccess tracks the number of workflow task attempts since the last successful workflow task. 
+ // This is carried over when buffered events are applied after workflow task failures. + // Used by the TemporalReportedProblems search attribute to track continuous failure count. + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "since" is needed here. --) + int32 workflow_task_attempts_since_last_success = 111; + + bool cancel_requested = 29; + string cancel_request_id = 32; + string sticky_task_queue = 33; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration sticky_schedule_to_start_timeout = 34; + int32 attempt = 35; + google.protobuf.Duration retry_initial_interval = 36; + google.protobuf.Duration retry_maximum_interval = 37; + int32 retry_maximum_attempts = 38; + double retry_backoff_coefficient = 39; + google.protobuf.Timestamp workflow_execution_expiration_time = 40; + repeated string retry_non_retryable_error_types = 41; + bool has_retry_policy = 42; + string cron_schedule = 43; + reserved 44; + reserved 45; + int64 signal_count = 46; + int64 activity_count = 71; + int64 child_execution_count = 72; + int64 user_timer_count = 73; + int64 request_cancel_external_count = 74; + int64 signal_external_count = 75; + int64 update_count = 77; + reserved 47; + reserved 48; + reserved 49; + reserved 50; + temporal.api.workflow.v1.ResetPoints auto_reset_points = 51; + map search_attributes = 52; + map memo = 53; + temporal.server.api.history.v1.VersionHistories version_histories = 54; + string first_execution_run_id = 55; + ExecutionStats execution_stats = 56; + google.protobuf.Timestamp workflow_run_expiration_time = 57; + // Transaction Id of the first event in the last batch of events. + int64 last_first_event_txn_id = 58; + int64 state_transition_count = 59; + google.protobuf.Timestamp execution_time = 60; + // If continued-as-new, or retried, or cron, holds the new run id. 
+ string new_execution_run_id = 61;
+ temporal.server.api.clock.v1.VectorClock parent_clock = 62;
+ // version of child execution initiated event in parent workflow
+ int64 parent_initiated_version = 63;
+ // Used to check if transfer close task is processed before deleting the workflow execution.
+ int64 close_transfer_task_id = 64;
+ // Used to check if visibility close task is processed before deleting the workflow execution.
+ int64 close_visibility_task_id = 65;
+ google.protobuf.Timestamp close_time = 66;
+ // Relocatable attributes are memo and search attributes. If they were removed, then they are not
+ // present in the mutable state, and they should be in visibility store.
+ bool relocatable_attributes_removed = 67;
+ temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 76;
+ // If using build-id based versioning: version stamp of the last worker to complete a
+ // workflow tasks for this workflow.
+ // Deprecated. Clean up with versioning-2. [cleanup-old-wv]
+ temporal.api.common.v1.WorkerVersionStamp most_recent_worker_version_stamp = 78;
+ // The currently assigned build ID for this execution. Presence of this value means worker versioning is used
+ // for this execution. Assigned build ID is selected by matching based on Worker Versioning Assignment Rules
+ // when the first workflow task of the execution is scheduled. If the first workflow task fails and is scheduled
+ // again, the assigned build ID may change according to the latest versioning rules.
+ // Assigned build ID can also change in the middle of an execution if Compatible Redirect Rules are applied to
+ // this execution.
+ // Deprecated. Clean up with versioning-2. [cleanup-old-wv]
+ string assigned_build_id = 85;
+ // Build ID inherited from a previous/parent execution. If present, assigned_build_id will be set to this, instead
+ // of using the assignment rules.
+ // Deprecated. Clean up with versioning-2.
[cleanup-old-wv] + string inherited_build_id = 86; + // Tracks the number of times a redirect rule is applied to this workflow. Used to apply redirects in the right + // order when mutable state is rebuilt from history events. + // Deprecated. Clean up with versioning-2. [cleanup-old-wv] + int64 build_id_redirect_counter = 87; + + // index of update IDs and pointers to associated history events. + map update_infos = 79; + + // Transition history encodes all transitions a mutable state object has gone through in a compact way. + // Here the transition_count field of VersionedTransition represents the maximum transition count the mutable state object + // has gone through for the corresponding namespace failover version. + // For example, if the transition history is `[{v: 1, t: 3}, {v: 2, t: 5}]`, it means transition 1-3 have failover version 1, + // and transition 4-5 have failover version 2. + // + // Each task generated by the HSM framework is imprinted with the current VersionedTransition at the end of the transaction. + // When a task is being processed, the transition history is compared with the imprinted task information to + // verify that a task is not referencing a stale state or that the task itself is not stale. + // For example, with the same transition history above, task A `{v: 2, t: 4}` **is not** + // referencing stale state because for version `2` transitions `4-5` are valid, while task B `{v: 2, t: 6}` **is** + // referencing stale state because the transition count is out of range for version `2`. + // Furthermore, task C `{v: 1, t: 4}` itself is stale because it is referencing an impossible state, likely due to post + // split-brain reconciliation. + repeated VersionedTransition transition_history = 80; + // Map of state machine type to map of machine by ID. + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "by" is used to clarify the keys and values. 
--)
+ map sub_state_machines_by_type = 81;
+
+ // This field is for tracking if the workflow execution timer task is created or not.
+ // We don't need this field if we always create the execution timer task when the first
+ // workflow in a workflow chain starts. However, this execution timer logic is later added.
+ // To maintain backward compatibility, we need to track if the execution timer task is created
+ // for a workflow chain since later workflows in the chain also need to create the execution
+ // timer task if it is not created yet.
+ // NOTE: Task status is cluster specific information, so when replicating mutable state, this
+ // field needs to be sanitized.
+ int32 workflow_execution_timer_task_status = 82;
+
+ // The root workflow execution is defined as follows:
+ // 1. A workflow without parent workflow is its own root workflow.
+ // 2. A workflow that has a parent workflow has the same root workflow as its parent workflow.
+ string root_workflow_id = 83;
+ string root_run_id = 84;
+
+ // Timer tasks emitted from state machines are stored in this array, grouped and sorted by their deadline. Only the
+ // next state machine timer task is generated at a time per mutable state. When that task is processed it iterates
+ // this array and triggers timers that are ready.
+ // NOTE: Task status is cluster specific information, so when replicating mutable state, this field needs to be
+ // sanitized.
+ repeated StateMachineTimerGroup state_machine_timers = 90;
+
+ // The shard clock's timestamp at the time the first valid task was created for this mutable state (either for a new
+ // mutable state or when rebuilding from events). The field should be updated whenever we refresh tasks, marking
+ // older generation tasks obsolete.
+ // This field is used for task staleness checks when mutable state is rebuilt.
+ // NOTE: Task status is cluster specific information, so when replicating mutable state, this field needs to be
+ // sanitized.
+ // (-- api-linter: core::0140::prepositions=disabled
+ // aip.dev/not-precedent: Ignoring api-linter rules for clarity --)
+ // (-- api-linter: core::0142::time-field-type=disabled
+ // aip.dev/not-precedent: This is a vector clock, not a timestamp --)
+ int64 task_generation_shard_clock_timestamp = 91;
+
+ VersionedTransition workflow_task_last_update_versioned_transition = 92;
+ VersionedTransition visibility_last_update_versioned_transition = 93;
+ VersionedTransition signal_request_ids_last_update_versioned_transition = 94;
+
+ repeated StateMachineTombstoneBatch sub_state_machine_tombstone_batches = 95;
+
+ // The workflow has been reset.
+ bool workflow_was_reset = 96;
+
+ // Reset Run ID points to the new run when this execution is reset. If the execution is reset multiple times, it points to the latest run.
+ string reset_run_id = 97;
+
+ // When present, it means the workflow execution is versioned, or is transitioning from
+ // unversioned workers to versioned ones.
+ // Note: Deployment objects inside versioning info are immutable, never change their fields.
+ // (-- api-linter: core::0203::immutable=disabled
+ // aip.dev/not-precedent: field_behavior annotation is not yet used in this repo --)
+ temporal.api.workflow.v1.WorkflowExecutionVersioningInfo versioning_info = 98;
+
+ // This is the run id when the WorkflowExecutionStarted event was written.
+ // A workflow reset changes the execution run_id, but preserves this field so that we have a reference to the original workflow execution that was reset.
+ string original_execution_run_id = 99; + + // These two fields are to record the transition history when the transition history is cleaned up due to disabling transition history + // Should be deprecated once the transition history is fully launched + repeated VersionedTransition previous_transition_history = 100; + VersionedTransition last_transition_history_break_point = 101; + + // This is a set of child workflows that were initialized after the reset point in the parent workflow. + // The children are identified by the key "workflow_type:workflow_id". When the parent starts to make progress after reset, it uses this data to + // determine the right start policy to apply to the child. This list will include children initiated in continue-as-new runs. + map children_initialized_post_reset_point = 102; + // The worker deployment that completed the last WFT. + string worker_deployment_name = 103; + + // Priority contains metadata that controls relative ordering of task processing + // when tasks are backed up in a queue. + temporal.api.common.v1.Priority priority = 104; + + // Run ID of the execution that supersedes this one (via terminate or continue-as-new). + string successor_run_id = 105; + + // Pause info contains the details of the request to pause the workflow. + WorkflowPauseInfo pause_info = 106; + + // Last workflow task failure category and cause are used to track the last workflow task failure category and cause. + oneof last_workflow_task_failure { + temporal.api.enums.v1.WorkflowTaskFailedCause last_workflow_task_failure_cause = 107; + temporal.api.enums.v1.TimeoutType last_workflow_task_timed_out_type = 108; + } + + // The last target version for which the server set targetDeploymentVersionChanged + // to true on a workflow task started event. Updated on each workflow task start, + // set only when the server decides to set the targetDeploymentVersionChanged flag + // to true. 
+ // + // This is a wrapper message to distinguish "never notified" (nil wrapper) from + // "notified about an unversioned target" (non-nil wrapper with nil deployment_version). + // + // Read at continue-as-new time: if set, it becomes the declined_target_version_upgrade + // for the next run. If nil, the existing declined value is preserved (CaN chain). + LastNotifiedTargetVersion last_notified_target_version = 113; + + // The target version that the SDK previously declined to upgrade to. Inherited + // from a previous run via continue-as-new or retry. At CaN time, computed as: + // if last_notified_target_version != nil → use that (latest signal was declined) + // else → preserve existing declined value (CaN chain, never re-signaled) + // + // Wrapper distinguishes "never declined" (nil) from "declined unversioned" (non-nil, nil version). + temporal.api.history.v1.DeclinedTargetVersionUpgrade declined_target_version_upgrade = 114; } // Internal wrapper message to distinguish "never notified" (nil wrapper) from // "notified about an unversioned target" (non-nil wrapper with nil deployment_version). // Used only within server persistence; never flows to the public API. message LastNotifiedTargetVersion { - temporal.api.deployment.v1.WorkerDeploymentVersion deployment_version = 1; + temporal.api.deployment.v1.WorkerDeploymentVersion deployment_version = 1; } message ExecutionStats { - int64 history_size = 1; - // Total size in bytes of all external payloads referenced in the entire history tree of the execution, not just the current branch. - // This number doesn't include payloads in buffered events. - int64 external_payload_size = 2; - // Total count of external payloads referenced in the entire history tree of the execution, not just the current branch. - // This number doesn't include payloads in buffered events. 
- int64 external_payload_count = 3; + int64 history_size = 1; + // Total size in bytes of all external payloads referenced in the entire history tree of the execution, not just the current branch. + // This number doesn't include payloads in buffered events. + int64 external_payload_size = 2; + // Total count of external payloads referenced in the entire history tree of the execution, not just the current branch. + // This number doesn't include payloads in buffered events. + int64 external_payload_count = 3; } // execution_state column message WorkflowExecutionState { - string create_request_id = 1; - string run_id = 2; - temporal.server.api.enums.v1.WorkflowExecutionState state = 3; - temporal.api.enums.v1.WorkflowExecutionStatus status = 4; - VersionedTransition last_update_versioned_transition = 5; - google.protobuf.Timestamp start_time = 6; - // Request IDs that are attached to the workflow execution. It can be the request ID that started - // the workflow execution or request IDs that were attached to an existing running workflow - // execution via StartWorkflowExecutionRequest.OnConflictOptions. - map request_ids = 7; + string create_request_id = 1; + string run_id = 2; + temporal.server.api.enums.v1.WorkflowExecutionState state = 3; + temporal.api.enums.v1.WorkflowExecutionStatus status = 4; + VersionedTransition last_update_versioned_transition = 5; + google.protobuf.Timestamp start_time = 6; + // Request IDs that are attached to the workflow execution. It can be the request ID that started + // the workflow execution or request IDs that were attached to an existing running workflow + // execution via StartWorkflowExecutionRequest.OnConflictOptions. 
+ map request_ids = 7; } message RequestIDInfo { - temporal.api.enums.v1.EventType event_type = 1; - int64 event_id = 2; + temporal.api.enums.v1.EventType event_type = 1; + int64 event_id = 2; } // transfer column message TransferTaskInfo { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - temporal.server.api.enums.v1.TaskType task_type = 4; - string target_namespace_id = 5; - string target_workflow_id = 6; - string target_run_id = 7; - string task_queue = 8; - bool target_child_workflow_only = 9; - int64 scheduled_event_id = 10; - int64 version = 11; - int64 task_id = 12; - google.protobuf.Timestamp visibility_time = 13; - reserved 14; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "after" is used to indicate sequence of actions. --) - bool delete_after_close = 15; - message CloseExecutionTaskDetails { - // can_skip_visibility_archival is set to true when we can guarantee that visibility records will be archived - // by some other task, so this task doesn't need to worry about it. - bool can_skip_visibility_archival = 1; - } - oneof task_details { - CloseExecutionTaskDetails close_execution_task_details = 16; - - // If the task addresses a CHASM component, this field will be set. - ChasmTaskInfo chasm_task_info = 18; - } - // Stamp represents the "version" of the entity's internal state for which the transfer task was created. - // It increases monotonically when the entity's options are modified. - // It is used to check if a task is still relevant to the entity's corresponding state machine. 
- int32 stamp = 17; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + temporal.server.api.enums.v1.TaskType task_type = 4; + string target_namespace_id = 5; + string target_workflow_id = 6; + string target_run_id = 7; + string task_queue = 8; + bool target_child_workflow_only = 9; + int64 scheduled_event_id = 10; + int64 version = 11; + int64 task_id = 12; + google.protobuf.Timestamp visibility_time = 13; + reserved 14; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "after" is used to indicate sequence of actions. --) + bool delete_after_close = 15; + message CloseExecutionTaskDetails { + // can_skip_visibility_archival is set to true when we can guarantee that visibility records will be archived + // by some other task, so this task doesn't need to worry about it. + bool can_skip_visibility_archival = 1; + } + oneof task_details { + CloseExecutionTaskDetails close_execution_task_details = 16; + + // If the task addresses a CHASM component, this field will be set. + ChasmTaskInfo chasm_task_info = 18; + } + // Stamp represents the "version" of the entity's internal state for which the transfer task was created. + // It increases monotonically when the entity's options are modified. + // It is used to check if a task is still relevant to the entity's corresponding state machine. 
+ int32 stamp = 17; } // replication column message ReplicationTaskInfo { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - temporal.server.api.enums.v1.TaskType task_type = 4; - int64 version = 5; - int64 first_event_id = 6; - int64 next_event_id = 7; - int64 scheduled_event_id = 8; - reserved 9; - reserved 10; - bytes branch_token = 11; - reserved 12; - bytes new_run_branch_token = 13; - reserved 14; - int64 task_id = 15; - google.protobuf.Timestamp visibility_time = 16; - string new_run_id = 17; - temporal.server.api.enums.v1.TaskPriority priority = 18; - VersionedTransition versioned_transition = 19; - // A list of event-based replication tasks that, together, are equivalent - // to this state-based task. - // TODO: Remove this field when state-based replication is stable and - // doesn't need to be disabled. - repeated ReplicationTaskInfo task_equivalents = 20; - history.v1.VersionHistoryItem last_version_history_item = 21; - bool is_first_task = 22; - repeated string target_clusters = 23; - bool is_force_replication = 24; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 25; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + temporal.server.api.enums.v1.TaskType task_type = 4; + int64 version = 5; + int64 first_event_id = 6; + int64 next_event_id = 7; + int64 scheduled_event_id = 8; + reserved 9; + reserved 10; + bytes branch_token = 11; + reserved 12; + bytes new_run_branch_token = 13; + reserved 14; + int64 task_id = 15; + google.protobuf.Timestamp visibility_time = 16; + string new_run_id = 17; + temporal.server.api.enums.v1.TaskPriority priority = 18; + VersionedTransition versioned_transition = 19; + // A list of event-based replication tasks that, together, are equivalent + // to this state-based task. + // TODO: Remove this field when state-based replication is stable and + // doesn't need to be disabled. 
+ repeated ReplicationTaskInfo task_equivalents = 20; + history.v1.VersionHistoryItem last_version_history_item = 21; + bool is_first_task = 22; + repeated string target_clusters = 23; + bool is_force_replication = 24; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 25; } // visibility_task_data column message VisibilityTaskInfo { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - temporal.server.api.enums.v1.TaskType task_type = 4; - int64 version = 5; - int64 task_id = 6; - google.protobuf.Timestamp visibility_time = 7; - reserved 8; - reserved 9; - int64 close_visibility_task_id = 10; - google.protobuf.Timestamp close_time = 11; - - oneof task_details { - // If the task addresses a CHASM component, this field will be set. - ChasmTaskInfo chasm_task_info = 12; - } + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + temporal.server.api.enums.v1.TaskType task_type = 4; + int64 version = 5; + int64 task_id = 6; + google.protobuf.Timestamp visibility_time = 7; + reserved 8; + reserved 9; + int64 close_visibility_task_id = 10; + google.protobuf.Timestamp close_time = 11; + + oneof task_details { + // If the task addresses a CHASM component, this field will be set. + ChasmTaskInfo chasm_task_info = 12; + } } // timer column message TimerTaskInfo { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - temporal.server.api.enums.v1.TaskType task_type = 4; - temporal.api.enums.v1.TimeoutType timeout_type = 5; - temporal.server.api.enums.v1.WorkflowBackoffType workflow_backoff_type = 6; - int64 version = 7; - int32 schedule_attempt = 8; - int64 event_id = 9; - int64 task_id = 10; - google.protobuf.Timestamp visibility_time = 11; - bytes branch_token = 12; - // If this is true, we can bypass archival before deleting. Only defined for DeleteHistoryEventTasks. - bool already_archived = 13; - - // Number of transitions on the corresponding mutable state object. 
Used to verify that a task is not referencing a
- stale state or, in some situations, that the task itself is not stale.
- // If task addresses a sub-statemachine (e.g. callback), this field will be set.
- int64 mutable_state_transition_count = 14;
-
- // If specified, the task is a for a workflow chain instead of a specific workflow run.
- // A workflow chain is identified by the run_id of the first workflow in the chain.
- string first_run_id = 15;
-
- // Stamp represents the "version" of the entity's internal state for which the timer task was created.
- // It increases monotonically when the entity's options are modified.
- // It is used to check if a task is still relevant to the entity's corresponding state machine.
- int32 stamp = 16;
-
- oneof task_details {
- // If the task addresses a CHASM component, this field will be set.
- ChasmTaskInfo chasm_task_info = 17;
- }
+ string namespace_id = 1;
+ string workflow_id = 2;
+ string run_id = 3;
+ temporal.server.api.enums.v1.TaskType task_type = 4;
+ temporal.api.enums.v1.TimeoutType timeout_type = 5;
+ temporal.server.api.enums.v1.WorkflowBackoffType workflow_backoff_type = 6;
+ int64 version = 7;
+ int32 schedule_attempt = 8;
+ int64 event_id = 9;
+ int64 task_id = 10;
+ google.protobuf.Timestamp visibility_time = 11;
+ bytes branch_token = 12;
+ // If this is true, we can bypass archival before deleting. Only defined for DeleteHistoryEventTasks.
+ bool already_archived = 13;
+
+ // Number of transitions on the corresponding mutable state object. Used to verify that a task is not referencing a
+ // stale state or, in some situations, that the task itself is not stale.
+ // If task addresses a sub-statemachine (e.g. callback), this field will be set.
+ int64 mutable_state_transition_count = 14;
+
+ // If specified, the task is for a workflow chain instead of a specific workflow run.
+ // A workflow chain is identified by the run_id of the first workflow in the chain.
+ string first_run_id = 15; + + // Stamp represents the "version" of the entity's internal state for which the timer task was created. + // It increases monotonically when the entity's options are modified. + // It is used to check if a task is still relevant to the entity's corresponding state machine. + int32 stamp = 16; + + oneof task_details { + // If the task addresses a CHASM component, this field will be set. + ChasmTaskInfo chasm_task_info = 17; + } } message ArchivalTaskInfo { - int64 task_id = 1; - string namespace_id = 2; - string workflow_id = 3; - string run_id = 4; - temporal.server.api.enums.v1.TaskType task_type = 5; - int64 version = 6; - google.protobuf.Timestamp visibility_time = 7; + int64 task_id = 1; + string namespace_id = 2; + string workflow_id = 3; + string run_id = 4; + temporal.server.api.enums.v1.TaskType task_type = 5; + int64 version = 6; + google.protobuf.Timestamp visibility_time = 7; } message OutboundTaskInfo { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; - temporal.server.api.enums.v1.TaskType task_type = 4; - int64 task_id = 5; - google.protobuf.Timestamp visibility_time = 6; + temporal.server.api.enums.v1.TaskType task_type = 4; + int64 task_id = 5; + google.protobuf.Timestamp visibility_time = 6; - // Destination of this task (e.g. protocol+host+port for callbacks). - // Outbound tasks are grouped by this field (and the namespace ID) when scheduling. - string destination = 7; + // Destination of this task (e.g. protocol+host+port for callbacks). + // Outbound tasks are grouped by this field (and the namespace ID) when scheduling. + string destination = 7; - oneof task_details { - // If task addresses a sub-statemachine (e.g. callback), this field will be set. - StateMachineTaskInfo state_machine_info = 8; + oneof task_details { + // If task addresses a sub-statemachine (e.g. callback), this field will be set. 
+ StateMachineTaskInfo state_machine_info = 8; - // If the task addresses a CHASM component, this field will be set. - ChasmTaskInfo chasm_task_info = 9; - } + // If the task addresses a CHASM component, this field will be set. + ChasmTaskInfo chasm_task_info = 9; + } } message NexusInvocationTaskInfo { - int32 attempt = 1; + int32 attempt = 1; } message NexusCancelationTaskInfo { - int32 attempt = 1; + int32 attempt = 1; } // activity_map column message ActivityInfo { - - int64 version = 1; - int64 scheduled_event_batch_id = 2; - reserved 3; - google.protobuf.Timestamp scheduled_time = 4; - int64 started_event_id = 5; - reserved 6; - google.protobuf.Timestamp started_time = 7; - string activity_id = 8; - string request_id = 9; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration schedule_to_start_timeout = 10; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration schedule_to_close_timeout = 11; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. 
--) - google.protobuf.Duration start_to_close_timeout = 12; - google.protobuf.Duration heartbeat_timeout = 13; - bool cancel_requested = 14; - int64 cancel_request_id = 15; - int32 timer_task_status = 16; - int32 attempt = 17; - string task_queue = 18; - string started_identity = 19; - bool has_retry_policy = 20; - google.protobuf.Duration retry_initial_interval = 21; - google.protobuf.Duration retry_maximum_interval = 22; - int32 retry_maximum_attempts = 23; - google.protobuf.Timestamp retry_expiration_time = 24; - double retry_backoff_coefficient = 25; - repeated string retry_non_retryable_error_types = 26; - temporal.api.failure.v1.Failure retry_last_failure = 27; - string retry_last_worker_identity = 28; - reserved 29; - int64 scheduled_event_id = 30; - temporal.api.common.v1.Payloads last_heartbeat_details = 31; - google.protobuf.Timestamp last_heartbeat_update_time = 32; - // When true, it means the activity is assigned to the build ID of its workflow (only set for old versioning) - // Deprecated. use `use_workflow_build_id` + int64 version = 1; + int64 scheduled_event_batch_id = 2; + reserved 3; + google.protobuf.Timestamp scheduled_time = 4; + int64 started_event_id = 5; + reserved 6; + google.protobuf.Timestamp started_time = 7; + string activity_id = 8; + string request_id = 9; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration schedule_to_start_timeout = 10; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration schedule_to_close_timeout = 11; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. 
--) + google.protobuf.Duration start_to_close_timeout = 12; + google.protobuf.Duration heartbeat_timeout = 13; + bool cancel_requested = 14; + int64 cancel_request_id = 15; + int32 timer_task_status = 16; + int32 attempt = 17; + string task_queue = 18; + string started_identity = 19; + bool has_retry_policy = 20; + google.protobuf.Duration retry_initial_interval = 21; + google.protobuf.Duration retry_maximum_interval = 22; + int32 retry_maximum_attempts = 23; + google.protobuf.Timestamp retry_expiration_time = 24; + double retry_backoff_coefficient = 25; + repeated string retry_non_retryable_error_types = 26; + temporal.api.failure.v1.Failure retry_last_failure = 27; + string retry_last_worker_identity = 28; + reserved 29; + int64 scheduled_event_id = 30; + temporal.api.common.v1.Payloads last_heartbeat_details = 31; + google.protobuf.Timestamp last_heartbeat_update_time = 32; + // When true, it means the activity is assigned to the build ID of its workflow (only set for old versioning) + // Deprecated. use `use_workflow_build_id` + // Deprecated. Clean up with versioning-2. [cleanup-old-wv] + bool use_compatible_version = 33; + temporal.api.common.v1.ActivityType activity_type = 34; + // Absence of `assigned_build_id` generally means this task is on an "unversioned" task queue. + // In rare cases, it can also mean that the task queue is versioned but we failed to write activity's + // independently-assigned build ID to the database. This case heals automatically once the task is dispatched. + // Deprecated. Clean up with versioning-2. [cleanup-old-wv] + oneof build_id_info { + // When present, it means this activity is assigned to the build ID of its workflow. // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - bool use_compatible_version = 33; - temporal.api.common.v1.ActivityType activity_type = 34; - // Absence of `assigned_build_id` generally means this task is on an "unversioned" task queue. 
- // In rare cases, it can also mean that the task queue is versioned but we failed to write activity's - // independently-assigned build ID to the database. This case heals automatically once the task is dispatched. + UseWorkflowBuildIdInfo use_workflow_build_id_info = 35; + // This means the activity is independently versioned and not bound to the build ID of its workflow. + // If the task fails and is scheduled again, the assigned build ID may change according to the latest versioning + // rules. This value also updates if a redirect rule is applied to the activity task to reflect the build ID + // of the worker who received the task. // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - oneof build_id_info { - // When present, it means this activity is assigned to the build ID of its workflow. - // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - UseWorkflowBuildIdInfo use_workflow_build_id_info = 35; - // This means the activity is independently versioned and not bound to the build ID of its workflow. - // If the task fails and is scheduled again, the assigned build ID may change according to the latest versioning - // rules. This value also updates if a redirect rule is applied to the activity task to reflect the build ID - // of the worker who received the task. - // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - string last_independently_assigned_build_id = 36; - } - // The version stamp of the worker to whom this activity was most-recently dispatched - // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - temporal.api.common.v1.WorkerVersionStamp last_worker_version_stamp = 37; - VersionedTransition last_update_versioned_transition = 38; + string last_independently_assigned_build_id = 36; + } + // The version stamp of the worker to whom this activity was most-recently dispatched + // Deprecated. Clean up with versioning-2. 
[cleanup-old-wv] + temporal.api.common.v1.WorkerVersionStamp last_worker_version_stamp = 37; + VersionedTransition last_update_versioned_transition = 38; + + // Deprecated. Clean up with versioning-2. [cleanup-old-wv] + message UseWorkflowBuildIdInfo { + // build ID of the wf when this activity started last time (which is the build ID of + // the worker who received this activity) + string last_used_build_id = 1; + // workflows redirect_counter value when this activity started last time + int64 last_redirect_counter = 2; + } + + // The first time the activity was scheduled. + google.protobuf.Timestamp first_scheduled_time = 39; + // The last time an activity attempt completion was recorded by the server. + google.protobuf.Timestamp last_attempt_complete_time = 40; + + // Stamp represents the “version” of the activity's internal state and can/will be changed with Activity API. + // It increases monotonically when the activity's options are modified. + // It is used to check if an activity task is still relevant to the corresponding activity state machine. + int32 stamp = 41; + + // Paused state. When activity is paused it will not advance until unpaused. + // It will not be scheduled, timer tasks will not be processed, etc. + // Note: it still can be cancelled/completed. + bool paused = 42; + + // The deployment this activity was dispatched to most recently. Present only if the activity + // was dispatched to a versioned worker. + // Deprecated. Replaced by last_worker_deployment_version. + temporal.api.deployment.v1.Deployment last_started_deployment = 43; + + // The deployment this activity was dispatched to most recently. Present only if the activity + // was dispatched to a versioned worker. + // Deprecated. Clean up with versioning-3.1. [cleanup-old-wv] + string last_worker_deployment_version = 44; + + // The deployment version this activity was dispatched to most recently. Present only if the activity + // was dispatched to a versioned worker.
+ temporal.api.deployment.v1.WorkerDeploymentVersion last_deployment_version = 49; + + // Priority metadata. If this message is not present, or any fields are not + // present, they inherit the values from the workflow. + temporal.api.common.v1.Priority priority = 45; + + message PauseInfo { + // The time when the activity was paused. + google.protobuf.Timestamp pause_time = 1; - // Deprecated. Clean up with versioning-2. [cleanup-old-wv] - message UseWorkflowBuildIdInfo { - // build ID of the wf when this activity started last time (which is the build ID of - // the worker who received this activity) - string last_used_build_id = 1; - // workflows redirect_counter value when this activity started last time - int64 last_redirect_counter = 2; + message Manual { + // The identity of the actor that paused the activity. + string identity = 1; + // Reason for pausing the activity. + string reason = 2; } - // The first time the activity was scheduled. - google.protobuf.Timestamp first_scheduled_time = 39; - // The last time an activity attempt completion was recorded by the server. - google.protobuf.Timestamp last_attempt_complete_time = 40; - - // Stamp represents the “version” of the activity's internal state and can/will be changed with Activity API. - // It increases monotonically when the activity's options are modified. - // It is used to check if an activity task is still relevant to the corresponding activity state machine. - int32 stamp = 41; - - // Paused state. When activity is paused it will not advance until unpaused. - // Iw will not be scheduled, timer tasks will not be processed, etc. - // Note: it still can be cancelled/completed. - bool paused = 42; - - // The deployment this activity was dispatched to most recently. Present only if the activity - // was dispatched to a versioned worker. - // Deprecated. Replaced by last_worker_deployment_version. 
- temporal.api.deployment.v1.Deployment last_started_deployment = 43; - - // The deployment this activity was dispatched to most recently. Present only if the activity - // was dispatched to a versioned worker. - // Deprecated. Clean up with versioning-3.1. [cleanup-old-wv] - string last_worker_deployment_version = 44; - - // The deployment version this activity was dispatched to most recently. Present only if the activity - // was dispatched to a versioned worker. - temporal.api.deployment.v1.WorkerDeploymentVersion last_deployment_version = 49; - - - // Priority metadata. If this message is not present, or any fields are not - // present, they inherit the values from the workflow. - temporal.api.common.v1.Priority priority = 45; - - message PauseInfo { - // The time when the activity was paused. - google.protobuf.Timestamp pause_time = 1; - - message Manual { - // The identity of the actor that paused the activity. - string identity = 1; - // Reason for pausing the activity. - string reason = 2; - } - - oneof paused_by { - // activity was paused by the manual intervention - Manual manual = 2; - - // Id of the rule that paused the activity. - string rule_id = 3; - } + oneof paused_by { + // activity was paused by the manual intervention + Manual manual = 2; + + // Id of the rule that paused the activity. 
+ string rule_id = 3; } + } - PauseInfo pause_info = 46; + PauseInfo pause_info = 46; - // set to true if there was an activity reset while activity is still running on the worker - bool activity_reset = 47; + // set to true if there was an activity reset while activity is still running on the worker + bool activity_reset = 47; - // set to true if reset heartbeat flag was set with an activity reset - bool reset_heartbeats = 48; + // set to true if reset heartbeat flag was set with an activity reset + bool reset_heartbeats = 48; - int64 start_version = 50; + int64 start_version = 50; } // timer_map column message TimerInfo { - int64 version = 1; - int64 started_event_id = 2; - google.protobuf.Timestamp expiry_time = 3; - int64 task_status = 4; - // timerId serves the purpose of indicating whether a timer task is generated for this timer info. - string timer_id = 5; - VersionedTransition last_update_versioned_transition = 6; + int64 version = 1; + int64 started_event_id = 2; + google.protobuf.Timestamp expiry_time = 3; + int64 task_status = 4; + // timerId serves the purpose of indicating whether a timer task is generated for this timer info. 
+ string timer_id = 5; + VersionedTransition last_update_versioned_transition = 6; } // child_executions_map column message ChildExecutionInfo { - int64 version = 1; - int64 initiated_event_batch_id = 2; - int64 started_event_id = 3; - reserved 4; - string started_workflow_id = 5; - string started_run_id = 6; - reserved 7; - string create_request_id = 8; - string namespace = 9; - string workflow_type_name = 10; - temporal.api.enums.v1.ParentClosePolicy parent_close_policy = 11; - int64 initiated_event_id = 12; - temporal.server.api.clock.v1.VectorClock clock = 13; - string namespace_id = 14; - VersionedTransition last_update_versioned_transition = 15; - temporal.api.common.v1.Priority priority = 16; + int64 version = 1; + int64 initiated_event_batch_id = 2; + int64 started_event_id = 3; + reserved 4; + string started_workflow_id = 5; + string started_run_id = 6; + reserved 7; + string create_request_id = 8; + string namespace = 9; + string workflow_type_name = 10; + temporal.api.enums.v1.ParentClosePolicy parent_close_policy = 11; + int64 initiated_event_id = 12; + temporal.server.api.clock.v1.VectorClock clock = 13; + string namespace_id = 14; + VersionedTransition last_update_versioned_transition = 15; + temporal.api.common.v1.Priority priority = 16; } // request_cancel_map column message RequestCancelInfo { - int64 version = 1; - int64 initiated_event_batch_id = 2; - string cancel_request_id = 3; - int64 initiated_event_id = 4; - VersionedTransition last_update_versioned_transition = 5; + int64 version = 1; + int64 initiated_event_batch_id = 2; + string cancel_request_id = 3; + int64 initiated_event_id = 4; + VersionedTransition last_update_versioned_transition = 5; } // signal_map column message SignalInfo { - int64 version = 1; - int64 initiated_event_batch_id = 2; - string request_id = 3; - reserved 4; - reserved 5; - reserved 6; - int64 initiated_event_id = 7; - reserved 8; - VersionedTransition last_update_versioned_transition = 9; + int64 version = 1; + 
int64 initiated_event_batch_id = 2; + string request_id = 3; + reserved 4; + reserved 5; + reserved 6; + int64 initiated_event_id = 7; + reserved 8; + VersionedTransition last_update_versioned_transition = 9; } // checksum column message Checksum { - int32 version = 1; - temporal.server.api.enums.v1.ChecksumFlavor flavor = 2; - bytes value = 3; + int32 version = 1; + temporal.server.api.enums.v1.ChecksumFlavor flavor = 2; + bytes value = 3; } message Callback { - message Nexus { - // Callback URL. - // (-- api-linter: core::0140::uri=disabled - // aip.dev/not-precedent: Not respecting aip here. --) - string url = 1; - // Header to attach to callback request. - map header = 2; - } - - message HSM { - // namespace id of the target state machine. - string namespace_id = 1; - // ID of the workflow that the target state machine is attached to. - string workflow_id = 2; - // Run id of said workflow. - string run_id = 3; - // A reference to the state machine. - temporal.server.api.persistence.v1.StateMachineRef ref = 4; - // The method name to invoke. Methods must be explicitly registered for the target state machine in the state - // machine registry, and accept an argument type of HistoryEvent that is the completion event of the completed - // workflow. - string method = 5; - } - - reserved 1; // For a generic callback mechanism to be added later. - oneof variant { - Nexus nexus = 2; - HSM hsm = 3; - } - - repeated temporal.api.common.v1.Link links = 100; -} - -message HSMCompletionCallbackArg { - // namespace ID of the workflow that just completed. + message Nexus { + // Callback URL. + // (-- api-linter: core::0140::uri=disabled + // aip.dev/not-precedent: Not respecting aip here. --) + string url = 1; + // Header to attach to callback request. + map header = 2; + } + + message HSM { + // namespace id of the target state machine. string namespace_id = 1; - // ID of the workflow that just completed. + // ID of the workflow that the target state machine is attached to. 
string workflow_id = 2; - // run ID of the workflow that just completed. + // Run id of said workflow. string run_id = 3; - // Last event of the completed workflow. - temporal.api.history.v1.HistoryEvent last_event = 4; + // A reference to the state machine. + temporal.server.api.persistence.v1.StateMachineRef ref = 4; + // The method name to invoke. Methods must be explicitly registered for the target state machine in the state + // machine registry, and accept an argument type of HistoryEvent that is the completion event of the completed + // workflow. + string method = 5; + } + + reserved 1; // For a generic callback mechanism to be added later. + oneof variant { + Nexus nexus = 2; + HSM hsm = 3; + } + + repeated temporal.api.common.v1.Link links = 100; +} + +message HSMCompletionCallbackArg { + // namespace ID of the workflow that just completed. + string namespace_id = 1; + // ID of the workflow that just completed. + string workflow_id = 2; + // run ID of the workflow that just completed. + string run_id = 3; + // Last event of the completed workflow. + temporal.api.history.v1.HistoryEvent last_event = 4; } message CallbackInfo { - // Trigger for when the workflow is closed. - message WorkflowClosed {} + // Trigger for when the workflow is closed. + message WorkflowClosed {} - message Trigger { - oneof variant { - WorkflowClosed workflow_closed = 1; - } + message Trigger { + oneof variant { + WorkflowClosed workflow_closed = 1; } - - // Information on how this callback should be invoked (e.g. its URL and type). - Callback callback = 1; - // Trigger for this callback. - Trigger trigger = 2; - // The time when the callback was registered. - google.protobuf.Timestamp registration_time = 3; - - temporal.server.api.enums.v1.CallbackState state = 4; - // The number of attempts made to deliver the callback. - // This number represents a minimum bound since the attempt is incremented after the callback request completes. 
- int32 attempt = 5; - - // The time when the last attempt completed. - google.protobuf.Timestamp last_attempt_complete_time = 6; - // The last attempt's failure, if any. - temporal.api.failure.v1.Failure last_attempt_failure = 7; - // The time when the next attempt is scheduled. - google.protobuf.Timestamp next_attempt_schedule_time = 8; - - // Request ID that added the callback. - string request_id = 9; + } + + // Information on how this callback should be invoked (e.g. its URL and type). + Callback callback = 1; + // Trigger for this callback. + Trigger trigger = 2; + // The time when the callback was registered. + google.protobuf.Timestamp registration_time = 3; + + temporal.server.api.enums.v1.CallbackState state = 4; + // The number of attempts made to deliver the callback. + // This number represents a minimum bound since the attempt is incremented after the callback request completes. + int32 attempt = 5; + + // The time when the last attempt completed. + google.protobuf.Timestamp last_attempt_complete_time = 6; + // The last attempt's failure, if any. + temporal.api.failure.v1.Failure last_attempt_failure = 7; + // The time when the next attempt is scheduled. + google.protobuf.Timestamp next_attempt_schedule_time = 8; + + // Request ID that added the callback. + string request_id = 9; } // NexusOperationInfo contains the state of a nexus operation. message NexusOperationInfo { - // Endpoint name. - // Resolved the endpoint registry for this workflow's namespace. - string endpoint = 1; + // Endpoint name. + // Resolved the endpoint registry for this workflow's namespace. + string endpoint = 1; - // Service name. - string service = 2; - // Operation name. - string operation = 3; + // Service name. + string service = 2; + // Operation name. + string operation = 3; - // reserved due to removal of delete_on_completion - reserved 4; + // reserved due to removal of delete_on_completion + reserved 4; - // Token for fetching the scheduled event. 
- bytes scheduled_event_token = 5; + // Token for fetching the scheduled event. + bytes scheduled_event_token = 5; - // Operation token. Only set for asynchronous operations after a successful StartOperation call. - string operation_token = 6; + // Operation token. Only set for asynchronous operations after a successful StartOperation call. + string operation_token = 6; - // Schedule-to-close timeout for this operation. - // This is the only timeout settable by a workflow. - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "since" is needed here. --) - google.protobuf.Duration schedule_to_close_timeout = 7; + // Schedule-to-close timeout for this operation. + // This is the only timeout settable by a workflow. + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "since" is needed here. --) + google.protobuf.Duration schedule_to_close_timeout = 7; - // The time when the operation was scheduled. - google.protobuf.Timestamp scheduled_time = 8; + // The time when the operation was scheduled. + google.protobuf.Timestamp scheduled_time = 8; - // Unique request ID allocated for all retry attempts of the StartOperation request. - string request_id = 9; + // Unique request ID allocated for all retry attempts of the StartOperation request. + string request_id = 9; - temporal.server.api.enums.v1.NexusOperationState state = 10; + temporal.server.api.enums.v1.NexusOperationState state = 10; - // The number of attempts made to deliver the start operation request. - // This number represents a minimum bound since the attempt is incremented after the request completes. - int32 attempt = 11; + // The number of attempts made to deliver the start operation request. + // This number represents a minimum bound since the attempt is incremented after the request completes. + int32 attempt = 11; - // The time when the last attempt completed. 
- google.protobuf.Timestamp last_attempt_complete_time = 12; - // The last attempt's failure, if any. - temporal.api.failure.v1.Failure last_attempt_failure = 13; - // The time when the next attempt is scheduled. - google.protobuf.Timestamp next_attempt_schedule_time = 14; + // The time when the last attempt completed. + google.protobuf.Timestamp last_attempt_complete_time = 12; + // The last attempt's failure, if any. + temporal.api.failure.v1.Failure last_attempt_failure = 13; + // The time when the next attempt is scheduled. + google.protobuf.Timestamp next_attempt_schedule_time = 14; - // Endpoint ID, the name is also stored here (field 1) but we use the ID internally to avoid failing operation - // requests when an endpoint is renamed. - string endpoint_id = 15; + // Endpoint ID, the name is also stored here (field 1) but we use the ID internally to avoid failing operation + // requests when an endpoint is renamed. + string endpoint_id = 15; - // Schedule-to-start timeout for this operation. - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration schedule_to_start_timeout = 16; + // Schedule-to-start timeout for this operation. + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration schedule_to_start_timeout = 16; - // Start-to-close timeout for this operation. - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "to" is used to indicate interval. --) - google.protobuf.Duration start_to_close_timeout = 17; + // Start-to-close timeout for this operation. + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "to" is used to indicate interval. --) + google.protobuf.Duration start_to_close_timeout = 17; - // Time the operation was started (only available for async operations). 
- google.protobuf.Timestamp started_time = 18; + // Time the operation was started (only available for async operations). + google.protobuf.Timestamp started_time = 18; } // NexusOperationCancellationInfo contains the state of a nexus operation cancelation. message NexusOperationCancellationInfo { - // The time when cancelation was requested. - google.protobuf.Timestamp requested_time = 1; + // The time when cancelation was requested. + google.protobuf.Timestamp requested_time = 1; - temporal.api.enums.v1.NexusOperationCancellationState state = 2; + temporal.api.enums.v1.NexusOperationCancellationState state = 2; - // The number of attempts made to deliver the cancel operation request. - // This number represents a minimum bound since the attempt is incremented after the request completes. - int32 attempt = 3; + // The number of attempts made to deliver the cancel operation request. + // This number represents a minimum bound since the attempt is incremented after the request completes. + int32 attempt = 3; - // The time when the last attempt completed. - google.protobuf.Timestamp last_attempt_complete_time = 4; - // The last attempt's failure, if any. - temporal.api.failure.v1.Failure last_attempt_failure = 5; - // The time when the next attempt is scheduled. - google.protobuf.Timestamp next_attempt_schedule_time = 6; + // The time when the last attempt completed. + google.protobuf.Timestamp last_attempt_complete_time = 4; + // The last attempt's failure, if any. + temporal.api.failure.v1.Failure last_attempt_failure = 5; + // The time when the next attempt is scheduled. + google.protobuf.Timestamp next_attempt_schedule_time = 6; - // The event ID of the NEXUS_OPERATION_CANCEL_REQUESTED event for this cancelation. - int64 requested_event_id = 7; + // The event ID of the NEXUS_OPERATION_CANCEL_REQUESTED event for this cancelation. 
+ int64 requested_event_id = 7; } // ResetChildInfo contains the state and actions to be performed on children when a parent workflow resumes after reset. message ResetChildInfo { - // If true, the parent workflow should terminate the child before starting it. - bool should_terminate_and_start = 1; + // If true, the parent workflow should terminate the child before starting it. + bool should_terminate_and_start = 1; } message WorkflowPauseInfo { - // The time when the workflow was paused. - google.protobuf.Timestamp pause_time = 1; + // The time when the workflow was paused. + google.protobuf.Timestamp pause_time = 1; - // The identity of the actor that paused the workflow. - string identity = 2; + // The identity of the actor that paused the workflow. + string identity = 2; - // The reason for pausing the workflow. - string reason = 3; + // The reason for pausing the workflow. + string reason = 3; - // A unique identifier for this pause request (for idempotency checks) - string request_id = 4; + // A unique identifier for this pause request (for idempotency checks) + string request_id = 4; } diff --git a/proto/internal/temporal/server/api/persistence/v1/history_tree.proto b/proto/internal/temporal/server/api/persistence/v1/history_tree.proto index b53883f10d..f60a67dbbe 100644 --- a/proto/internal/temporal/server/api/persistence/v1/history_tree.proto +++ b/proto/internal/temporal/server/api/persistence/v1/history_tree.proto @@ -1,34 +1,35 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "google/protobuf/timestamp.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + // branch column message HistoryTreeInfo { - HistoryBranch branch_info = 1; - // For fork operation to prevent race condition of leaking event data when forking branches fail. Also can be used for clean up leaked data. 
- google.protobuf.Timestamp fork_time = 2; - // For lookup back to workflow during debugging, also background cleanup when fork operation cannot finish self cleanup due to crash. - string info = 3; - // Deprecating branch token in favor of branch info. - bytes branch_token = 4 [deprecated = true]; + HistoryBranch branch_info = 1; + // For fork operation to prevent race condition of leaking event data when forking branches fail. Also can be used for clean up leaked data. + google.protobuf.Timestamp fork_time = 2; + // For lookup back to workflow during debugging, also background cleanup when fork operation cannot finish self cleanup due to crash. + string info = 3; + // Deprecating branch token in favor of branch info. + bytes branch_token = 4 [deprecated = true]; } // For history persistence to serialize/deserialize branch details. message HistoryBranch { - string tree_id = 1; - string branch_id = 2; - repeated HistoryBranchRange ancestors = 3; + string tree_id = 1; + string branch_id = 2; + repeated HistoryBranchRange ancestors = 3; } // HistoryBranchRange represents a piece of range for a branch. message HistoryBranchRange { - // BranchId of original branch forked from. - string branch_id = 1; - // Beginning node for the range, inclusive. - int64 begin_node_id = 2; - // Ending node for the range, exclusive. - int64 end_node_id = 3; + // BranchId of original branch forked from. + string branch_id = 1; + // Beginning node for the range, inclusive. + int64 begin_node_id = 2; + // Ending node for the range, exclusive. 
+ int64 end_node_id = 3; } diff --git a/proto/internal/temporal/server/api/persistence/v1/hsm.proto b/proto/internal/temporal/server/api/persistence/v1/hsm.proto index c980cd3a1b..0cd062adee 100644 --- a/proto/internal/temporal/server/api/persistence/v1/hsm.proto +++ b/proto/internal/temporal/server/api/persistence/v1/hsm.proto @@ -1,142 +1,142 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "google/protobuf/timestamp.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + // A node in a hierarchical state machine tree. message StateMachineNode { - // Serialized data of the underlying state machine. - bytes data = 1; - // Map of state machine type to a map of machines by ID. - map children = 2; - - // Versioned transition when the node was instantiated. - // This field, plus node path uniquely identifies a state machine node in a mutable state instance. - // This field will always be set even when transition history is disabled. - // NOTE: If transition history is disabled, the transition_count field will be 0 and - // cannot be used to uniquely identify a node. - // NOTE: Node deletion is not yet implemented at the time of writing so we can still uniquely identify a node just - // with the initial namespace failover version. - VersionedTransition initial_versioned_transition = 3; - - // Versioned transition when the node was last updated. - // This field will always be set even when transition history is disabled. - // NOTE: If transition history is disabled, the transition_count field will be 0 and - // cannot be used for non-concurrent task staleness check or to determine whether this node should be synced - // during state replication. - VersionedTransition last_update_versioned_transition = 4; - - // Number of transitions on this state machine object. 
- // Used to verify that a task is not stale if the state machine does not allow concurrent task execution. - // The transition count monotonically increases with each state transition and only resets when the entire - // mutable state was rebuilt. This case is handled by the task_generation_shard_clock_timestamp field in - // WorkflowExecutionInfo. - // NOTE: This field is cluster specific and cannot be replicated. - // NOTE: This field will be made obsolete when transition history is enabled in favor of - // last_update_versioned_transition. - int64 transition_count = 100; + // Serialized data of the underlying state machine. + bytes data = 1; + // Map of state machine type to a map of machines by ID. + map children = 2; + + // Versioned transition when the node was instantiated. + // This field, plus node path uniquely identifies a state machine node in a mutable state instance. + // This field will always be set even when transition history is disabled. + // NOTE: If transition history is disabled, the transition_count field will be 0 and + // cannot be used to uniquely identify a node. + // NOTE: Node deletion is not yet implemented at the time of writing so we can still uniquely identify a node just + // with the initial namespace failover version. + VersionedTransition initial_versioned_transition = 3; + + // Versioned transition when the node was last updated. + // This field will always be set even when transition history is disabled. + // NOTE: If transition history is disabled, the transition_count field will be 0 and + // cannot be used for non-concurrent task staleness check or to determine whether this node should be synced + // during state replication. + VersionedTransition last_update_versioned_transition = 4; + + // Number of transitions on this state machine object. + // Used to verify that a task is not stale if the state machine does not allow concurrent task execution. 
+ // The transition count monotonically increases with each state transition and only resets when the entire + // mutable state was rebuilt. This case is handled by the task_generation_shard_clock_timestamp field in + // WorkflowExecutionInfo. + // NOTE: This field is cluster specific and cannot be replicated. + // NOTE: This field will be made obsolete when transition history is enabled in favor of + // last_update_versioned_transition. + int64 transition_count = 100; } // Map of state machine ID to StateMachineNode. message StateMachineMap { - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "by" is used to clarify the keys and values. --) - map machines_by_id = 1; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "by" is used to clarify the keys and values. --) + map machines_by_id = 1; } message StateMachineKey { - // Addressable type of the corresponding state machine in a single tree level. - string type = 1; - // Addressable ID of the corresponding state machine in a single tree level. - string id = 2; + // Addressable type of the corresponding state machine in a single tree level. + string type = 1; + // Addressable ID of the corresponding state machine in a single tree level. + string id = 2; } // A reference to a state machine at a point in time. message StateMachineRef { - // Nested path to a state machine. - repeated StateMachineKey path = 1; - - // Versioned transition of the ref was instantiated. - // Used to verify that the ref is not referencing a stale state or, in some situations, - // that the ref itself is not stale. - // NOTE: If transition history is disabled, the field will not be specified and - // cannot be used for staleness check. - VersionedTransition mutable_state_versioned_transition = 2; - - // Versioned transition when the state machine node was instantiated. - // This field, plus node path uniquely identifies a state machine node in a mutable state instance. 
- // This field will always be set even when transition history is disabled. - // NOTE: If transition history is disabled, the transition_count field will be 0 and - // cannot be used to uniquely identify a node. - // NOTE: Node deletion is not yet implemented at the time of writing so we can still uniquely identify a node just - // with the initial namespace failover version. - VersionedTransition machine_initial_versioned_transition = 3; - - // Versioned transition when the state machine node was last updated. - // If not specified, this reference is considered non-concurrent, - // and should match the last_update_versioned_transition on the corresponding state machine node. - // NOTE: If transition history is disabled, the transition_count field will be 0 and - // cannot be used for non-concurrent task staleness check. - VersionedTransition machine_last_update_versioned_transition = 4; - - // Number of transitions executed on the referenced state machine node at the time this Ref is instantiated. - // If non-zero, this reference is considered non-concurrent and this number should match the number of state - // transitions on the corresponding state machine node. - // This field will be obsolete once mutable state transition history is productionized. - int64 machine_transition_count = 100; + // Nested path to a state machine. + repeated StateMachineKey path = 1; + + // Versioned transition when the ref was instantiated. + // Used to verify that the ref is not referencing a stale state or, in some situations, + // that the ref itself is not stale. + // NOTE: If transition history is disabled, the field will not be specified and + // cannot be used for staleness check. + VersionedTransition mutable_state_versioned_transition = 2; + + // Versioned transition when the state machine node was instantiated. + // This field, plus node path uniquely identifies a state machine node in a mutable state instance.
+ // This field will always be set even when transition history is disabled. + // NOTE: If transition history is disabled, the transition_count field will be 0 and + // cannot be used to uniquely identify a node. + // NOTE: Node deletion is not yet implemented at the time of writing so we can still uniquely identify a node just + // with the initial namespace failover version. + VersionedTransition machine_initial_versioned_transition = 3; + + // Versioned transition when the state machine node was last updated. + // If not specified, this reference is considered non-concurrent, + // and should match the last_update_versioned_transition on the corresponding state machine node. + // NOTE: If transition history is disabled, the transition_count field will be 0 and + // cannot be used for non-concurrent task staleness check. + VersionedTransition machine_last_update_versioned_transition = 4; + + // Number of transitions executed on the referenced state machine node at the time this Ref is instantiated. + // If non-zero, this reference is considered non-concurrent and this number should match the number of state + // transitions on the corresponding state machine node. + // This field will be obsolete once mutable state transition history is productionized. + int64 machine_transition_count = 100; } message StateMachineTaskInfo { - // Reference to a state machine. - StateMachineRef ref = 1; - // Task type. Not to be confused with the state machine's type in the `ref` field. - string type = 2; - // Opaque data attached to this task. May be nil. Deserialized by a registered TaskSerializer for this type. - bytes data = 3; + // Reference to a state machine. + StateMachineRef ref = 1; + // Task type. Not to be confused with the state machine's type in the `ref` field. + string type = 2; + // Opaque data attached to this task. May be nil. Deserialized by a registered TaskSerializer for this type. 
+ bytes data = 3; } // A group of state machine timer tasks for a given deadline, used for collapsing state machine timer tasks. message StateMachineTimerGroup { - // Task information. - repeated StateMachineTaskInfo infos = 1; - // When this timer should be fired. - // (-- api-linter: core::0142::time-field-names=disabled - // aip.dev/not-precedent: Ignoring lint rules. --) - google.protobuf.Timestamp deadline = 2; - // Whether or not a task was put in the queue for this group's deadline. - bool scheduled = 3; + // Task information. + repeated StateMachineTaskInfo infos = 1; + // When this timer should be fired. + // (-- api-linter: core::0142::time-field-names=disabled + // aip.dev/not-precedent: Ignoring lint rules. --) + google.protobuf.Timestamp deadline = 2; + // Whether or not a task was put in the queue for this group's deadline. + bool scheduled = 3; } // VersionedTransition is a unique identifier for a specific mutable state transition. message VersionedTransition { - // The namespace failover version at transition time. - int64 namespace_failover_version = 1; - // State transition count perceived during the specified namespace_failover_version. - int64 transition_count = 2; + // The namespace failover version at transition time. + int64 namespace_failover_version = 1; + // State transition count perceived during the specified namespace_failover_version. + int64 transition_count = 2; } message StateMachineTombstoneBatch { - // The versioned transition in which the tombstones were created. - VersionedTransition versioned_transition = 1; - repeated StateMachineTombstone state_machine_tombstones = 2; + // The versioned transition in which the tombstones were created. 
+ VersionedTransition versioned_transition = 1; + repeated StateMachineTombstone state_machine_tombstones = 2; } message StateMachineTombstone { - oneof state_machine_key { - int64 activity_scheduled_event_id = 1; - string timer_id = 2; - int64 child_execution_initiated_event_id = 3; - int64 request_cancel_initiated_event_id = 4; - int64 signal_external_initiated_event_id = 5; - string update_id = 6; - StateMachinePath state_machine_path = 7; - string chasm_node_path = 8; - } + oneof state_machine_key { + int64 activity_scheduled_event_id = 1; + string timer_id = 2; + int64 child_execution_initiated_event_id = 3; + int64 request_cancel_initiated_event_id = 4; + int64 signal_external_initiated_event_id = 5; + string update_id = 6; + StateMachinePath state_machine_path = 7; + string chasm_node_path = 8; + } } message StateMachinePath { - repeated StateMachineKey path = 1; + repeated StateMachineKey path = 1; } - diff --git a/proto/internal/temporal/server/api/persistence/v1/namespaces.proto b/proto/internal/temporal/server/api/persistence/v1/namespaces.proto index 64daf4910e..86591a9e3b 100644 --- a/proto/internal/temporal/server/api/persistence/v1/namespaces.proto +++ b/proto/internal/temporal/server/api/persistence/v1/namespaces.proto @@ -1,56 +1,56 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "google/protobuf/duration.proto"; import "google/protobuf/timestamp.proto"; - import "temporal/api/enums/v1/namespace.proto"; import "temporal/api/namespace/v1/message.proto"; import "temporal/api/rules/v1/message.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + // detail column message NamespaceDetail { - NamespaceInfo info = 1; - NamespaceConfig config = 2; - NamespaceReplicationConfig replication_config = 3; - int64 config_version = 4; - int64 failover_notification_version = 5; - int64 failover_version = 6; - 
google.protobuf.Timestamp failover_end_time = 7; + NamespaceInfo info = 1; + NamespaceConfig config = 2; + NamespaceReplicationConfig replication_config = 3; + int64 config_version = 4; + int64 failover_notification_version = 5; + int64 failover_version = 6; + google.protobuf.Timestamp failover_end_time = 7; } message NamespaceInfo { - string id = 1; - temporal.api.enums.v1.NamespaceState state = 2; - string name = 3; - string description = 4; - string owner = 5; - map data = 6; + string id = 1; + temporal.api.enums.v1.NamespaceState state = 2; + string name = 3; + string description = 4; + string owner = 5; + map data = 6; } message NamespaceConfig { - google.protobuf.Duration retention = 1; - string archival_bucket = 2; - temporal.api.namespace.v1.BadBinaries bad_binaries = 3; - temporal.api.enums.v1.ArchivalState history_archival_state = 4; - string history_archival_uri = 5; - temporal.api.enums.v1.ArchivalState visibility_archival_state = 6; - string visibility_archival_uri = 7; - map custom_search_attribute_aliases = 8; - map workflow_rules = 9; + google.protobuf.Duration retention = 1; + string archival_bucket = 2; + temporal.api.namespace.v1.BadBinaries bad_binaries = 3; + temporal.api.enums.v1.ArchivalState history_archival_state = 4; + string history_archival_uri = 5; + temporal.api.enums.v1.ArchivalState visibility_archival_state = 6; + string visibility_archival_uri = 7; + map custom_search_attribute_aliases = 8; + map workflow_rules = 9; } message NamespaceReplicationConfig { - string active_cluster_name = 1; - repeated string clusters = 2; - temporal.api.enums.v1.ReplicationState state = 3; - repeated FailoverStatus failover_history = 8; + string active_cluster_name = 1; + repeated string clusters = 2; + temporal.api.enums.v1.ReplicationState state = 3; + repeated FailoverStatus failover_history = 8; } // Represents a historical replication status of a Namespace message FailoverStatus { - google.protobuf.Timestamp failover_time = 1; - int64 
failover_version = 2; + google.protobuf.Timestamp failover_time = 1; + int64 failover_version = 2; } diff --git a/proto/internal/temporal/server/api/persistence/v1/nexus.proto b/proto/internal/temporal/server/api/persistence/v1/nexus.proto index ce8f39c0c0..38fa5f8cbd 100644 --- a/proto/internal/temporal/server/api/persistence/v1/nexus.proto +++ b/proto/internal/temporal/server/api/persistence/v1/nexus.proto @@ -1,70 +1,71 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "google/protobuf/timestamp.proto"; import "temporal/api/common/v1/message.proto"; import "temporal/server/api/clock/v1/message.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + // Contains mutable fields for an Endpoint. Duplicated from the public API's temporal.api.nexus.v1.EndpointSpec where // the worker target has a namespace name. // We store an ID in persistence to prevent namespace renames from breaking references. message NexusEndpointSpec { - // Endpoint name, unique for this cluster. Must match `[a-zA-Z_][a-zA-Z0-9_]*`. - // Renaming an endpoint breaks all workflow callers that reference this endpoint, causing operations to fail. - string name = 1; - temporal.api.common.v1.Payload description = 2; + // Endpoint name, unique for this cluster. Must match `[a-zA-Z_][a-zA-Z0-9_]*`. + // Renaming an endpoint breaks all workflow callers that reference this endpoint, causing operations to fail. + string name = 1; + temporal.api.common.v1.Payload description = 2; - // Target to route requests to. - NexusEndpointTarget target = 3; + // Target to route requests to. + NexusEndpointTarget target = 3; } // Target to route requests to. // Duplicated from the public API's temporal.api.nexus.v1.EndpointTarget where the worker target has a namespace name. // We store an ID in persistence to prevent namespace renames from breaking references. 
message NexusEndpointTarget { - // Target a worker polling on a Nexus task queue in a specific namespace. - message Worker { - // Namespace ID to route requests to. - string namespace_id = 1; - // Nexus task queue to route requests to. - string task_queue = 2; - } - - // Target an external server by URL. - // At a later point, this will support providing credentials, in the meantime, an http.RoundTripper can be injected - // into the server to modify the request. - message External { - // URL to call. - // (-- api-linter: core::0140::uri=disabled - // aip.dev/not-precedent: Not following linter rules. --) - string url = 1; - } + // Target a worker polling on a Nexus task queue in a specific namespace. + message Worker { + // Namespace ID to route requests to. + string namespace_id = 1; + // Nexus task queue to route requests to. + string task_queue = 2; + } - oneof variant { - Worker worker = 1; - External external = 2; - } + // Target an external server by URL. + // At a later point, this will support providing credentials, in the meantime, an http.RoundTripper can be injected + // into the server to modify the request. + message External { + // URL to call. + // (-- api-linter: core::0140::uri=disabled + // aip.dev/not-precedent: Not following linter rules. --) + string url = 1; + } + + oneof variant { + Worker worker = 1; + External external = 2; + } } message NexusEndpoint { - // The last recorded cluster-local Hybrid Logical Clock timestamp for _this_ endpoint. - // Updated whenever the endpoint is directly updated due to a user action but not when applying replication events. - // The clock is referenced when new timestamps are generated to ensure it produces monotonically increasing - // timestamps. - temporal.server.api.clock.v1.HybridLogicalClock clock = 1; - // Endpoint specification. This is a mirror of the public API and is intended to be mutable. - NexusEndpointSpec spec = 2; - // The date and time when the endpoint was created. 
- // (-- api-linter: core::0142::time-field-names=disabled - // aip.dev/not-precedent: Not following linter rules. --) - google.protobuf.Timestamp created_time = 3; + // The last recorded cluster-local Hybrid Logical Clock timestamp for _this_ endpoint. + // Updated whenever the endpoint is directly updated due to a user action but not when applying replication events. + // The clock is referenced when new timestamps are generated to ensure it produces monotonically increasing + // timestamps. + temporal.server.api.clock.v1.HybridLogicalClock clock = 1; + // Endpoint specification. This is a mirror of the public API and is intended to be mutable. + NexusEndpointSpec spec = 2; + // The date and time when the endpoint was created. + // (-- api-linter: core::0142::time-field-names=disabled + // aip.dev/not-precedent: Not following linter rules. --) + google.protobuf.Timestamp created_time = 3; } // Container for a version, a UUID, and a NexusEndpoint. message NexusEndpointEntry { - int64 version = 1; - string id = 2; - NexusEndpoint endpoint = 3; + int64 version = 1; + string id = 2; + NexusEndpoint endpoint = 3; } diff --git a/proto/internal/temporal/server/api/persistence/v1/predicates.proto b/proto/internal/temporal/server/api/persistence/v1/predicates.proto index e3d47d526b..b6d2c71f39 100644 --- a/proto/internal/temporal/server/api/persistence/v1/predicates.proto +++ b/proto/internal/temporal/server/api/persistence/v1/predicates.proto @@ -1,66 +1,65 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "temporal/server/api/enums/v1/predicate.proto"; import "temporal/server/api/enums/v1/task.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + message Predicate { - temporal.server.api.enums.v1.PredicateType predicate_type = 1; - oneof attributes { - UniversalPredicateAttributes universal_predicate_attributes = 2; - 
EmptyPredicateAttributes empty_predicate_attributes = 3; - AndPredicateAttributes and_predicate_attributes = 4; - OrPredicateAttributes or_predicate_attributes = 5; - NotPredicateAttributes not_predicate_attributes = 6; - NamespaceIdPredicateAttributes namespace_id_predicate_attributes = 7; - TaskTypePredicateAttributes task_type_predicate_attributes = 8; - DestinationPredicateAttributes destination_predicate_attributes = 9; - OutboundTaskGroupPredicateAttributes outbound_task_group_predicate_attributes = 10; - OutboundTaskPredicateAttributes outbound_task_predicate_attributes = 11; - } + temporal.server.api.enums.v1.PredicateType predicate_type = 1; + oneof attributes { + UniversalPredicateAttributes universal_predicate_attributes = 2; + EmptyPredicateAttributes empty_predicate_attributes = 3; + AndPredicateAttributes and_predicate_attributes = 4; + OrPredicateAttributes or_predicate_attributes = 5; + NotPredicateAttributes not_predicate_attributes = 6; + NamespaceIdPredicateAttributes namespace_id_predicate_attributes = 7; + TaskTypePredicateAttributes task_type_predicate_attributes = 8; + DestinationPredicateAttributes destination_predicate_attributes = 9; + OutboundTaskGroupPredicateAttributes outbound_task_group_predicate_attributes = 10; + OutboundTaskPredicateAttributes outbound_task_predicate_attributes = 11; + } } -message UniversalPredicateAttributes { -} +message UniversalPredicateAttributes {} -message EmptyPredicateAttributes { -} +message EmptyPredicateAttributes {} message AndPredicateAttributes { - repeated Predicate predicates = 1; + repeated Predicate predicates = 1; } message OrPredicateAttributes { - repeated Predicate predicates = 1; + repeated Predicate predicates = 1; } message NotPredicateAttributes { - Predicate predicate = 1; + Predicate predicate = 1; } message NamespaceIdPredicateAttributes { - repeated string namespace_ids = 1; + repeated string namespace_ids = 1; } message TaskTypePredicateAttributes { - repeated 
temporal.server.api.enums.v1.TaskType task_types = 1; + repeated temporal.server.api.enums.v1.TaskType task_types = 1; } message DestinationPredicateAttributes { - repeated string destinations = 1; + repeated string destinations = 1; } message OutboundTaskGroupPredicateAttributes { - repeated string groups = 1; + repeated string groups = 1; } message OutboundTaskPredicateAttributes { - message Group { - string task_group = 1; - string namespace_id = 2; - string destination = 3; - } - repeated Group groups = 1; + message Group { + string task_group = 1; + string namespace_id = 2; + string destination = 3; + } + repeated Group groups = 1; } diff --git a/proto/internal/temporal/server/api/persistence/v1/queue_metadata.proto b/proto/internal/temporal/server/api/persistence/v1/queue_metadata.proto index 76a061fba4..c7c3538f43 100644 --- a/proto/internal/temporal/server/api/persistence/v1/queue_metadata.proto +++ b/proto/internal/temporal/server/api/persistence/v1/queue_metadata.proto @@ -1,9 +1,10 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; + option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; // data column message QueueMetadata { - map cluster_ack_levels = 1; + map cluster_ack_levels = 1; } diff --git a/proto/internal/temporal/server/api/persistence/v1/queues.proto b/proto/internal/temporal/server/api/persistence/v1/queues.proto index c87b9c6084..dc62e05408 100644 --- a/proto/internal/temporal/server/api/persistence/v1/queues.proto +++ b/proto/internal/temporal/server/api/persistence/v1/queues.proto @@ -1,54 +1,54 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "temporal/api/common/v1/message.proto"; import "temporal/server/api/persistence/v1/predicates.proto"; import "temporal/server/api/persistence/v1/tasks.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + message QueueState { - map 
reader_states = 1; - TaskKey exclusive_reader_high_watermark = 2; + map reader_states = 1; + TaskKey exclusive_reader_high_watermark = 2; } message QueueReaderState { - repeated QueueSliceScope scopes = 1; + repeated QueueSliceScope scopes = 1; } message QueueSliceScope { - QueueSliceRange range = 1; - Predicate predicate = 2; + QueueSliceRange range = 1; + Predicate predicate = 2; } message QueueSliceRange { - TaskKey inclusive_min = 1; - TaskKey exclusive_max = 2; + TaskKey inclusive_min = 1; + TaskKey exclusive_max = 2; } message ReadQueueMessagesNextPageToken { - int64 last_read_message_id = 1; + int64 last_read_message_id = 1; } message ListQueuesNextPageToken { - int64 last_read_queue_number = 1; + int64 last_read_queue_number = 1; } // HistoryTask represents an internal history service task for a particular shard. We use a blob because there is no // common proto for all task proto types. message HistoryTask { - // shard_id that this task belonged to when it was created. Technically, you can derive this from the task data - // blob, but it's useful to have it here for quick access and to avoid deserializing the blob. Note that this may be - // different from the shard id of this task in the current cluster because it could have come from a cluster with a - // different shard id. This will always be the shard id of the task in its original cluster. - int32 shard_id = 1; - // blob that contains the history task proto. There is a GoLang-specific generic deserializer for this blob, but - // there is no common proto for all task proto types, so deserializing in other languages will require a custom - // switch on the task category, which should be available from the metadata for the queue that this task came from. - temporal.api.common.v1.DataBlob blob = 2; + // shard_id that this task belonged to when it was created. 
Technically, you can derive this from the task data + // blob, but it's useful to have it here for quick access and to avoid deserializing the blob. Note that this may be + // different from the shard id of this task in the current cluster because it could have come from a cluster with a + // different shard id. This will always be the shard id of the task in its original cluster. + int32 shard_id = 1; + // blob that contains the history task proto. There is a GoLang-specific generic deserializer for this blob, but + // there is no common proto for all task proto types, so deserializing in other languages will require a custom + // switch on the task category, which should be available from the metadata for the queue that this task came from. + temporal.api.common.v1.DataBlob blob = 2; } - message QueuePartition { // min_message_id is less than or equal to the id of every message in the queue. The min_message_id is mainly used to // skip over tombstones in Cassandra: let's say we deleted the first 1K messages from a queue with 1.1K messages. 
If diff --git a/proto/internal/temporal/server/api/persistence/v1/task_queues.proto b/proto/internal/temporal/server/api/persistence/v1/task_queues.proto index ea9d317eaf..536f0887ba 100644 --- a/proto/internal/temporal/server/api/persistence/v1/task_queues.proto +++ b/proto/internal/temporal/server/api/persistence/v1/task_queues.proto @@ -1,7 +1,6 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "temporal/api/deployment/v1/message.proto"; import "temporal/api/taskqueue/v1/message.proto"; @@ -9,122 +8,124 @@ import "temporal/server/api/clock/v1/message.proto"; import "temporal/server/api/deployment/v1/message.proto"; import "temporal/server/api/enums/v1/fairness_state.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + // BuildId is an identifier with a timestamped status used to identify workers for task queue versioning purposes. message BuildId { - enum State { - STATE_UNSPECIFIED = 0; - STATE_ACTIVE = 1; - STATE_DELETED = 2; - }; - - string id = 1; - State state = 2; - // HLC timestamp representing when the state was updated or the when build ID was originally inserted. - // (-- api-linter: core::0142::time-field-type=disabled - // aip.dev/not-precedent: Using HLC instead of wall clock. --) - temporal.server.api.clock.v1.HybridLogicalClock state_update_timestamp = 3; - // HLC timestamp representing when this build ID was last made default in its version set. - // (-- api-linter: core::0142::time-field-type=disabled - // aip.dev/not-precedent: Using HLC instead of wall clock. --) - temporal.server.api.clock.v1.HybridLogicalClock became_default_timestamp = 4; + enum State { + STATE_UNSPECIFIED = 0; + STATE_ACTIVE = 1; + STATE_DELETED = 2; + } + + string id = 1; + State state = 2; + // HLC timestamp representing when the state was updated or the when build ID was originally inserted. 
+ // (-- api-linter: core::0142::time-field-type=disabled + // aip.dev/not-precedent: Using HLC instead of wall clock. --) + temporal.server.api.clock.v1.HybridLogicalClock state_update_timestamp = 3; + // HLC timestamp representing when this build ID was last made default in its version set. + // (-- api-linter: core::0142::time-field-type=disabled + // aip.dev/not-precedent: Using HLC instead of wall clock. --) + temporal.server.api.clock.v1.HybridLogicalClock became_default_timestamp = 4; } // An internal representation of temporal.api.taskqueue.v1.CompatibleVersionSet message CompatibleVersionSet { - // Set IDs are used internally by matching. - // A set typically has one set ID and extra care is taken to enforce this. - // In some situations, including: - // - Replication race between task queue user data and history events - // - Replication split-brain + later merge - // - Delayed user data propagation between partitions - // - Cross-task-queue activities/child workflows/CAN where the user has not set up parallel - // versioning data - // we have to guess the set id for a build ID. If that happens, and then the build ID is - // discovered to be in a different set, then the sets will be merged and both (or more) - // build ids will be preserved, so that we don't lose tasks. - // The first set id is considered the "primary", and the others are "demoted". Once a build - // id is demoted, it cannot be made the primary again. - repeated string set_ids = 1; - // All the compatible versions, unordered except for the last element, which is considered the set "default". - repeated BuildId build_ids = 2; - // HLC timestamp representing when this set was last made the default for the queue. - // (-- api-linter: core::0142::time-field-type=disabled - // aip.dev/not-precedent: Using HLC instead of wall clock. --) - temporal.server.api.clock.v1.HybridLogicalClock became_default_timestamp = 4; + // Set IDs are used internally by matching. 
+ // A set typically has one set ID and extra care is taken to enforce this. + // In some situations, including: + // - Replication race between task queue user data and history events + // - Replication split-brain + later merge + // - Delayed user data propagation between partitions + // - Cross-task-queue activities/child workflows/CAN where the user has not set up parallel + // versioning data + // we have to guess the set id for a build ID. If that happens, and then the build ID is + // discovered to be in a different set, then the sets will be merged and both (or more) + // build ids will be preserved, so that we don't lose tasks. + // The first set id is considered the "primary", and the others are "demoted". Once a build + // id is demoted, it cannot be made the primary again. + repeated string set_ids = 1; + // All the compatible versions, unordered except for the last element, which is considered the set "default". + repeated BuildId build_ids = 2; + // HLC timestamp representing when this set was last made the default for the queue. + // (-- api-linter: core::0142::time-field-type=disabled + // aip.dev/not-precedent: Using HLC instead of wall clock. --) + temporal.server.api.clock.v1.HybridLogicalClock became_default_timestamp = 4; } message AssignmentRule { - temporal.api.taskqueue.v1.BuildIdAssignmentRule rule = 1; - // (-- api-linter: core::0142::time-field-type=disabled - // aip.dev/not-precedent: Using HLC instead of wall clock. --) - temporal.server.api.clock.v1.HybridLogicalClock create_timestamp = 2; - // when delete_timestamp is present the rule should be treated as deleted - // (-- api-linter: core::0142::time-field-type=disabled - // aip.dev/not-precedent: Using HLC instead of wall clock. --) - temporal.server.api.clock.v1.HybridLogicalClock delete_timestamp = 3; + temporal.api.taskqueue.v1.BuildIdAssignmentRule rule = 1; + // (-- api-linter: core::0142::time-field-type=disabled + // aip.dev/not-precedent: Using HLC instead of wall clock. 
--) + temporal.server.api.clock.v1.HybridLogicalClock create_timestamp = 2; + // when delete_timestamp is present the rule should be treated as deleted + // (-- api-linter: core::0142::time-field-type=disabled + // aip.dev/not-precedent: Using HLC instead of wall clock. --) + temporal.server.api.clock.v1.HybridLogicalClock delete_timestamp = 3; } message RedirectRule { - temporal.api.taskqueue.v1.CompatibleBuildIdRedirectRule rule = 1; - // (-- api-linter: core::0142::time-field-type=disabled - // aip.dev/not-precedent: Using HLC instead of wall clock. --) - temporal.server.api.clock.v1.HybridLogicalClock create_timestamp = 2; - // when delete_timestamp is present the rule should be treated as deleted - // (-- api-linter: core::0142::time-field-type=disabled - // aip.dev/not-precedent: Using HLC instead of wall clock. --) - temporal.server.api.clock.v1.HybridLogicalClock delete_timestamp = 3; + temporal.api.taskqueue.v1.CompatibleBuildIdRedirectRule rule = 1; + // (-- api-linter: core::0142::time-field-type=disabled + // aip.dev/not-precedent: Using HLC instead of wall clock. --) + temporal.server.api.clock.v1.HybridLogicalClock create_timestamp = 2; + // when delete_timestamp is present the rule should be treated as deleted + // (-- api-linter: core::0142::time-field-type=disabled + // aip.dev/not-precedent: Using HLC instead of wall clock. --) + temporal.server.api.clock.v1.HybridLogicalClock delete_timestamp = 3; } // Holds all the data related to worker versioning for a task queue. // Backwards-incompatible changes cannot be made, as this would make existing stored data unreadable. message VersioningData { - // All the incompatible version sets, unordered except for the last element, which is considered the set "default". - repeated CompatibleVersionSet version_sets = 1; - // Ordered list of assignment rules. Also contains recently-deleted rules. - repeated AssignmentRule assignment_rules = 2; - // Unordered list of redirect rules. 
Also contains recently-deleted rules. - repeated RedirectRule redirect_rules = 3; + // All the incompatible version sets, unordered except for the last element, which is considered the set "default". + repeated CompatibleVersionSet version_sets = 1; + // Ordered list of assignment rules. Also contains recently-deleted rules. + repeated AssignmentRule assignment_rules = 2; + // Unordered list of redirect rules. Also contains recently-deleted rules. + repeated RedirectRule redirect_rules = 3; } message DeploymentData { - reserved 1; - // Set of worker deployment versions that this task queue belongs to. - // Current Version is defined implicitly as the version with `current_since_time!=nil` and the most - // recent `routing_update_time`. - // Ramping Version is defined implicitly as the version with `ramping_since_time!=nil` and the most - // recent `routing_update_time`. - // The Ramping Version receives a share of unversioned/unpinned tasks according to its - // `ramp_percentage`. If there is no Ramping Version, all the unversioned/unpinned tasks are - // routed to the Current Version. If there is no Current Version, any poller with UNVERSIONED - // (or unspecified) WorkflowVersioningMode will receive the tasks. - // Remove after `AsyncSetCurrentAndRamping` workflow version is irreversibly enabled. - repeated temporal.server.api.deployment.v1.DeploymentVersionData versions = 2 [deprecated = true]; - - // Present if the task queue's ramping version is unversioned. - // Remove after `AsyncSetCurrentAndRamping` workflow version is irreversibly enabled. - temporal.server.api.deployment.v1.DeploymentVersionData unversioned_ramp_data = 3 [deprecated = true]; - - // Routing and version membership data for all worker deployments that this task queue belongs to. - // Key is the deployment name. - map deployments_data = 4; + reserved 1; + // Set of worker deployment versions that this task queue belongs to. 
+ // Current Version is defined implicitly as the version with `current_since_time!=nil` and the most + // recent `routing_update_time`. + // Ramping Version is defined implicitly as the version with `ramping_since_time!=nil` and the most + // recent `routing_update_time`. + // The Ramping Version receives a share of unversioned/unpinned tasks according to its + // `ramp_percentage`. If there is no Ramping Version, all the unversioned/unpinned tasks are + // routed to the Current Version. If there is no Current Version, any poller with UNVERSIONED + // (or unspecified) WorkflowVersioningMode will receive the tasks. + // Remove after `AsyncSetCurrentAndRamping` workflow version is irreversibly enabled. + repeated temporal.server.api.deployment.v1.DeploymentVersionData versions = 2 [deprecated = true]; + + // Present if the task queue's ramping version is unversioned. + // Remove after `AsyncSetCurrentAndRamping` workflow version is irreversibly enabled. + temporal.server.api.deployment.v1.DeploymentVersionData unversioned_ramp_data = 3 [deprecated = true]; + + // Routing and version membership data for all worker deployments that this task queue belongs to. + // Key is the deployment name. + map deployments_data = 4; } // Routing config and version membership data for a given worker deployment that a TQ should know. message WorkerDeploymentData { - temporal.api.deployment.v1.RoutingConfig routing_config = 1; - // This map tracks the membership of the task queue in the deployment versions. A version is - // present here iff the task queue has ever been polled from the version. - // Key is the build id. - map versions = 2; + temporal.api.deployment.v1.RoutingConfig routing_config = 1; + // This map tracks the membership of the task queue in the deployment versions. A version is + // present here iff the task queue has ever been polled from the version. + // Key is the build id. 
+ map versions = 2; } // Container for all persistent user data that varies per task queue type within a family. message TaskQueueTypeUserData { - DeploymentData deployment_data = 1; + DeploymentData deployment_data = 1; - temporal.api.taskqueue.v1.TaskQueueConfig config = 2; + temporal.api.taskqueue.v1.TaskQueueConfig config = 2; - temporal.server.api.enums.v1.FairnessState fairness_state = 3; + temporal.server.api.enums.v1.FairnessState fairness_state = 3; } // Container for all persistent user provided data for a task queue family. @@ -133,21 +134,21 @@ message TaskQueueTypeUserData { // This data must all fit in a single DB column and is kept cached in-memory, take extra care to ensure data added here // has reasonable size limits imposed on it. message TaskQueueUserData { - // The last recorded cluster-local Hybrid Logical Clock timestamp for _this_ task queue family. - // Updated whenever user data is directly updated due to a user action but not when applying replication events. - // The clock is referenced when new timestamps are generated to ensure it produces monotonically increasing - // timestamps. - temporal.server.api.clock.v1.HybridLogicalClock clock = 1; - VersioningData versioning_data = 2; + // The last recorded cluster-local Hybrid Logical Clock timestamp for _this_ task queue family. + // Updated whenever user data is directly updated due to a user action but not when applying replication events. + // The clock is referenced when new timestamps are generated to ensure it produces monotonically increasing + // timestamps. + temporal.server.api.clock.v1.HybridLogicalClock clock = 1; + VersioningData versioning_data = 2; - // Map from task queue type (workflow, activity, nexus) to per-type data. - map per_type = 3; + // Map from task queue type (workflow, activity, nexus) to per-type data. + map per_type = 3; - // For future use: description, rate limits, manual partition control, etc... 
+// For future use: description, rate limits, manual partition control, etc... } // Simple wrapper that includes a TaskQueueUserData and its storage version. message VersionedTaskQueueUserData { - TaskQueueUserData data = 1; - int64 version = 2; + TaskQueueUserData data = 1; + int64 version = 2; } diff --git a/proto/internal/temporal/server/api/persistence/v1/tasks.proto b/proto/internal/temporal/server/api/persistence/v1/tasks.proto index 0026b4f40f..fae67cc7d0 100644 --- a/proto/internal/temporal/server/api/persistence/v1/tasks.proto +++ b/proto/internal/temporal/server/api/persistence/v1/tasks.proto @@ -1,111 +1,110 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "google/protobuf/timestamp.proto"; - import "temporal/api/common/v1/message.proto"; import "temporal/api/enums/v1/task_queue.proto"; - import "temporal/server/api/clock/v1/message.proto"; import "temporal/server/api/taskqueue/v1/message.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + // task column message AllocatedTaskInfo { - TaskInfo data = 1; - int64 task_pass = 3; - int64 task_id = 2; + TaskInfo data = 1; + int64 task_pass = 3; + int64 task_id = 2; } message TaskInfo { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - int64 scheduled_event_id = 4; - google.protobuf.Timestamp create_time = 5; - google.protobuf.Timestamp expiry_time = 6; - temporal.server.api.clock.v1.VectorClock clock = 7; - // How this task should be directed. (Missing means the default for - // TaskVersionDirective, which is unversioned.) 
- temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 8; - // Stamp field allows to differentiate between different instances of the same task - int32 stamp = 9; - temporal.api.common.v1.Priority priority = 10; - // Reference to any chasm component associated with this task - bytes component_ref = 11; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + int64 scheduled_event_id = 4; + google.protobuf.Timestamp create_time = 5; + google.protobuf.Timestamp expiry_time = 6; + temporal.server.api.clock.v1.VectorClock clock = 7; + // How this task should be directed. (Missing means the default for + // TaskVersionDirective, which is unversioned.) + temporal.server.api.taskqueue.v1.TaskVersionDirective version_directive = 8; + // Stamp field allows to differentiate between different instances of the same task + int32 stamp = 9; + temporal.api.common.v1.Priority priority = 10; + // Reference to any chasm component associated with this task + bytes component_ref = 11; } // task_queue column message TaskQueueInfo { - string namespace_id = 1; - string name = 2; - temporal.api.enums.v1.TaskQueueType task_type = 3; - temporal.api.enums.v1.TaskQueueKind kind = 4; - // After data is migrated into subqueues, this contains a copy of the ack level for subqueue 0. - int64 ack_level = 5; - google.protobuf.Timestamp expiry_time = 6; - google.protobuf.Timestamp last_update_time = 7; - // After data is migrated into subqueues, this contains a copy of the count for subqueue 0. - int64 approximate_backlog_count = 8; - - // Subqueues contains one entry for each subqueue in this physical task queue. - // Tasks are split into subqueues to implement priority and fairness. - // Subqueues are indexed starting from 0, the zero subqueue is always present - // and corresponds to the "main" queue before subqueues were introduced. - // - // The message at index n describes the subqueue at index n. 
- // - // Each subqueue has its own ack level and approx backlog count, but they share - // the range id. For compatibility, ack level and backlog count for subqueue 0 - // is copied into TaskQueueInfo. - repeated SubqueueInfo subqueues = 9; - - // For transitioning from tasks (v1) to tasks_v2 and back: - // - // If this TaskQueueInfo is in v1 and this is set, then v2 may have tasks. - // If this TaskQueueInfo is in v2 and this is set, then v1 may have tasks. - // - // New metadata starts with this flag set (we could skip this when useNewMatcher is off). - // Whenever locking any metadata as the inactive one (drain-only), this should be set. - // If the flag is true, no tasks should be written to the active table until the inactive - // table has also been locked (and the flag set there for a potential reverse transition). - // After determinining that the inactive table has no more tasks left, then this - // can be cleared on the active table. - bool other_has_tasks = 10; + string namespace_id = 1; + string name = 2; + temporal.api.enums.v1.TaskQueueType task_type = 3; + temporal.api.enums.v1.TaskQueueKind kind = 4; + // After data is migrated into subqueues, this contains a copy of the ack level for subqueue 0. + int64 ack_level = 5; + google.protobuf.Timestamp expiry_time = 6; + google.protobuf.Timestamp last_update_time = 7; + // After data is migrated into subqueues, this contains a copy of the count for subqueue 0. + int64 approximate_backlog_count = 8; + + // Subqueues contains one entry for each subqueue in this physical task queue. + // Tasks are split into subqueues to implement priority and fairness. + // Subqueues are indexed starting from 0, the zero subqueue is always present + // and corresponds to the "main" queue before subqueues were introduced. + // + // The message at index n describes the subqueue at index n. + // + // Each subqueue has its own ack level and approx backlog count, but they share + // the range id. 
 For compatibility, ack level and backlog count for subqueue 0 + // is copied into TaskQueueInfo. + repeated SubqueueInfo subqueues = 9; + + // For transitioning from tasks (v1) to tasks_v2 and back: + // + // If this TaskQueueInfo is in v1 and this is set, then v2 may have tasks. + // If this TaskQueueInfo is in v2 and this is set, then v1 may have tasks. + // + // New metadata starts with this flag set (we could skip this when useNewMatcher is off). + // Whenever locking any metadata as the inactive one (drain-only), this should be set. + // If the flag is true, no tasks should be written to the active table until the inactive + // table has also been locked (and the flag set there for a potential reverse transition). + // After determining that the inactive table has no more tasks left, then this + // can be cleared on the active table. + bool other_has_tasks = 10; + } message SubqueueInfo { - // Key is the information used by a splitting algorithm to decide which tasks should go in - // this subqueue. It should not change after being registered in TaskQueueInfo. - SubqueueKey key = 1; + // Key is the information used by a splitting algorithm to decide which tasks should go in + // this subqueue. It should not change after being registered in TaskQueueInfo. + SubqueueKey key = 1; - // The rest are mutable state for the subqueue: - int64 ack_level = 2; - temporal.server.api.taskqueue.v1.FairLevel fair_ack_level = 4; + // The rest are mutable state for the subqueue: + int64 ack_level = 2; + temporal.server.api.taskqueue.v1.FairLevel fair_ack_level = 4; - int64 approximate_backlog_count = 3; + int64 approximate_backlog_count = 3; - // Max read level keeps track of the highest task level ever written, but is only - // maintained best-effort.
Do not trust these values. + temporal.server.api.taskqueue.v1.FairLevel fair_max_read_level = 5; - // We can persist a limited number of fairness key counts in task queue - // metadata so they're not lost on migration. - repeated FairnessKeyCount top_k_fairness_counts = 6; + // We can persist a limited number of fairness key counts in task queue + // metadata so they're not lost on migration. + repeated FairnessKeyCount top_k_fairness_counts = 6; } message FairnessKeyCount { - string key = 1; - int64 count = 2; + string key = 1; + int64 count = 2; } message SubqueueKey { - // Each subqueue contains tasks from only one priority level. - int32 priority = 1; + // Each subqueue contains tasks from only one priority level. + int32 priority = 1; } message TaskKey { - google.protobuf.Timestamp fire_time = 1; - int64 task_id = 2; + google.protobuf.Timestamp fire_time = 1; + int64 task_id = 2; } diff --git a/proto/internal/temporal/server/api/persistence/v1/update.proto b/proto/internal/temporal/server/api/persistence/v1/update.proto index 334111f09c..05405cd50e 100644 --- a/proto/internal/temporal/server/api/persistence/v1/update.proto +++ b/proto/internal/temporal/server/api/persistence/v1/update.proto @@ -1,53 +1,54 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "temporal/server/api/persistence/v1/hsm.proto"; +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; + // UpdateAdmissionInfo contains information about a durably admitted update. Note that updates in Admitted state are typically // non-durable (i.e. do not have a corresponding event in history). Durably admitted updates arise as a result of // workflow reset or history event replication conflict: in these cases a WorkflowExecutionUpdateAdmittedEvent event is // created when an accepted update (on one branch of workflow history) is converted into an admitted update (on another // branch). 
message UpdateAdmissionInfo { - message HistoryPointer { - // the event ID of the WorkflowExecutionUpdateAdmittedEvent - int64 event_id = 1; - // the ID of the event batch containing the event_id - int64 event_batch_id = 2; - } - - oneof location { - HistoryPointer history_pointer = 1; - } + message HistoryPointer { + // the event ID of the WorkflowExecutionUpdateAdmittedEvent + int64 event_id = 1; + // the ID of the event batch containing the event_id + int64 event_batch_id = 2; + } + + oneof location { + HistoryPointer history_pointer = 1; + } } // UpdateAcceptanceInfo contains information about an accepted update message UpdateAcceptanceInfo { - // the event ID of the WorkflowExecutionUpdateAcceptedEvent - int64 event_id = 1; + // the event ID of the WorkflowExecutionUpdateAcceptedEvent + int64 event_id = 1; } // UpdateCompletionInfo contains information about a completed update message UpdateCompletionInfo { - // the event ID of the WorkflowExecutionUpdateCompletedEvent - int64 event_id = 1; + // the event ID of the WorkflowExecutionUpdateCompletedEvent + int64 event_id = 1; - // the ID of the event batch containing the event_id above - int64 event_batch_id = 2; + // the ID of the event batch containing the event_id above + int64 event_batch_id = 2; } // UpdateInfo is the persistent state of a single update message UpdateInfo { - oneof value { - // update has been accepted and this is the acceptance metadata - UpdateAcceptanceInfo acceptance = 1; - // update has been completed and this is the completion metadata - UpdateCompletionInfo completion = 2; - // update has been admitted and this is the admission metadata - UpdateAdmissionInfo admission = 3; - } - - VersionedTransition last_update_versioned_transition = 4; + oneof value { + // update has been accepted and this is the acceptance metadata + UpdateAcceptanceInfo acceptance = 1; + // update has been completed and this is the completion metadata + UpdateCompletionInfo completion = 2; + // update has been 
admitted and this is the admission metadata + UpdateAdmissionInfo admission = 3; + } + + VersionedTransition last_update_versioned_transition = 4; } diff --git a/proto/internal/temporal/server/api/persistence/v1/workflow_mutable_state.proto b/proto/internal/temporal/server/api/persistence/v1/workflow_mutable_state.proto index d52ca747e6..127a2b8ca0 100644 --- a/proto/internal/temporal/server/api/persistence/v1/workflow_mutable_state.proto +++ b/proto/internal/temporal/server/api/persistence/v1/workflow_mutable_state.proto @@ -1,67 +1,67 @@ syntax = "proto3"; package temporal.server.api.persistence.v1; -option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; import "temporal/api/history/v1/message.proto"; +import "temporal/server/api/persistence/v1/chasm.proto"; import "temporal/server/api/persistence/v1/executions.proto"; import "temporal/server/api/persistence/v1/hsm.proto"; -import "temporal/server/api/persistence/v1/chasm.proto"; import "temporal/server/api/persistence/v1/update.proto"; -message WorkflowMutableState{ - map activity_infos = 1; - map timer_infos = 2; - map child_execution_infos = 3; - map request_cancel_infos = 4; - map signal_infos = 5; - map chasm_nodes = 12; - repeated string signal_requested_ids = 6; - WorkflowExecutionInfo execution_info = 7; - WorkflowExecutionState execution_state = 8; - int64 next_event_id = 9; - repeated temporal.api.history.v1.HistoryEvent buffered_events = 10; - Checksum checksum = 11; -} +option go_package = "go.temporal.io/server/api/persistence/v1;persistence"; -message WorkflowMutableStateMutation{ +message WorkflowMutableState { + map activity_infos = 1; + map timer_infos = 2; + map child_execution_infos = 3; + map request_cancel_infos = 4; + map signal_infos = 5; + map chasm_nodes = 12; + repeated string signal_requested_ids = 6; + WorkflowExecutionInfo execution_info = 7; + WorkflowExecutionState execution_state = 8; + int64 next_event_id = 9; + repeated temporal.api.history.v1.HistoryEvent 
buffered_events = 10; + Checksum checksum = 11; +} - message StateMachineNodeMutation{ - StateMachinePath path = 1; - bytes data = 2; - VersionedTransition initial_versioned_transition = 3; - VersionedTransition last_update_versioned_transition = 4; - } +message WorkflowMutableStateMutation { + message StateMachineNodeMutation { + StateMachinePath path = 1; + bytes data = 2; + VersionedTransition initial_versioned_transition = 3; + VersionedTransition last_update_versioned_transition = 4; + } - // The following updated_* fields are computed based on the - // lastUpdateVersionedTransition field of each sub state machine. - map updated_activity_infos = 1; - map updated_timer_infos = 2; - map updated_child_execution_infos = 3; - map updated_request_cancel_infos = 4; - map updated_signal_infos = 5; - map updated_update_infos = 6; - repeated StateMachineNodeMutation updated_sub_state_machines = 7; - map updated_chasm_nodes = 19; + // The following updated_* fields are computed based on the + // lastUpdateVersionedTransition field of each sub state machine. + map updated_activity_infos = 1; + map updated_timer_infos = 2; + map updated_child_execution_infos = 3; + map updated_request_cancel_infos = 4; + map updated_signal_infos = 5; + map updated_update_infos = 6; + repeated StateMachineNodeMutation updated_sub_state_machines = 7; + map updated_chasm_nodes = 19; - reserved 8; - reserved 9; - reserved 10; - reserved 11; - reserved 12; - reserved 13; - reserved 14; + reserved 8; + reserved 9; + reserved 10; + reserved 11; + reserved 12; + reserved 13; + reserved 14; - repeated string signal_requested_ids = 15; - // Partial WorkflowExecutionInfo. Some fields, such as - // update_infos and sub_state_machines_by_type, are not populated here. - // Instead, only diffs are synced in the deleted_* and updated_* fields above. 
- WorkflowExecutionInfo execution_info = 16; - WorkflowExecutionState execution_state = 17; + repeated string signal_requested_ids = 15; + // Partial WorkflowExecutionInfo. Some fields, such as + // update_infos and sub_state_machines_by_type, are not populated here. + // Instead, only diffs are synced in the deleted_* and updated_* fields above. + WorkflowExecutionInfo execution_info = 16; + WorkflowExecutionState execution_state = 17; - repeated StateMachineTombstoneBatch sub_state_machine_tombstone_batches = 18; + repeated StateMachineTombstoneBatch sub_state_machine_tombstone_batches = 18; - // TODO: uncomment buffered_events field when we are ready to replicate - // mutable state as well. - // repeated temporal.api.history.v1.HistoryEvent buffered_events = 20; +// TODO: uncomment buffered_events field when we are ready to replicate +// mutable state as well. +// repeated temporal.api.history.v1.HistoryEvent buffered_events = 20; } diff --git a/proto/internal/temporal/server/api/replication/v1/message.proto b/proto/internal/temporal/server/api/replication/v1/message.proto index de1b5e07fc..83d4566c59 100644 --- a/proto/internal/temporal/server/api/replication/v1/message.proto +++ b/proto/internal/temporal/server/api/replication/v1/message.proto @@ -2,11 +2,12 @@ syntax = "proto3"; package temporal.server.api.replication.v1; -option go_package = "go.temporal.io/server/api/replication/v1;repication"; - -import "google/protobuf/timestamp.proto"; import "google/protobuf/duration.proto"; - +import "google/protobuf/timestamp.proto"; +import "temporal/api/common/v1/message.proto"; +import "temporal/api/failure/v1/message.proto"; +import "temporal/api/namespace/v1/message.proto"; +import "temporal/api/replication/v1/message.proto"; import "temporal/server/api/enums/v1/replication.proto"; import "temporal/server/api/enums/v1/task.proto"; import "temporal/server/api/history/v1/message.proto"; @@ -14,258 +15,255 @@ import 
"temporal/server/api/persistence/v1/executions.proto"; import "temporal/server/api/persistence/v1/hsm.proto"; import "temporal/server/api/persistence/v1/task_queues.proto"; import "temporal/server/api/persistence/v1/workflow_mutable_state.proto"; - -import "temporal/api/common/v1/message.proto"; -import "temporal/api/namespace/v1/message.proto"; -import "temporal/api/replication/v1/message.proto"; -import "temporal/api/failure/v1/message.proto"; import "temporal/server/api/workflow/v1/message.proto"; +option go_package = "go.temporal.io/server/api/replication/v1;repication"; + message ReplicationTask { - temporal.server.api.enums.v1.ReplicationTaskType task_type = 1; - int64 source_task_id = 2; - reserved 4; - reserved 7; - oneof attributes { - NamespaceTaskAttributes namespace_task_attributes = 3; - SyncShardStatusTaskAttributes sync_shard_status_task_attributes = 5; - SyncActivityTaskAttributes sync_activity_task_attributes = 6; - HistoryTaskAttributes history_task_attributes = 8; - SyncWorkflowStateTaskAttributes sync_workflow_state_task_attributes = 10; - TaskQueueUserDataAttributes task_queue_user_data_attributes = 11; - SyncHSMAttributes sync_hsm_attributes = 14; - BackfillHistoryTaskAttributes backfill_history_task_attributes = 16; - VerifyVersionedTransitionTaskAttributes verify_versioned_transition_task_attributes = 18; - SyncVersionedTransitionTaskAttributes sync_versioned_transition_task_attributes = 19; - } - // All attributes should be deprecated and replaced by this field. - // The task_type + data provide more flexibility in future use cases. 
- temporal.api.common.v1.DataBlob data = 12; - google.protobuf.Timestamp visibility_time = 9; - temporal.server.api.enums.v1.TaskPriority priority = 13; - temporal.server.api.persistence.v1.VersionedTransition versioned_transition = 15; - temporal.server.api.persistence.v1.ReplicationTaskInfo raw_task_info = 17; + temporal.server.api.enums.v1.ReplicationTaskType task_type = 1; + int64 source_task_id = 2; + reserved 4; + reserved 7; + oneof attributes { + NamespaceTaskAttributes namespace_task_attributes = 3; + SyncShardStatusTaskAttributes sync_shard_status_task_attributes = 5; + SyncActivityTaskAttributes sync_activity_task_attributes = 6; + HistoryTaskAttributes history_task_attributes = 8; + SyncWorkflowStateTaskAttributes sync_workflow_state_task_attributes = 10; + TaskQueueUserDataAttributes task_queue_user_data_attributes = 11; + SyncHSMAttributes sync_hsm_attributes = 14; + BackfillHistoryTaskAttributes backfill_history_task_attributes = 16; + VerifyVersionedTransitionTaskAttributes verify_versioned_transition_task_attributes = 18; + SyncVersionedTransitionTaskAttributes sync_versioned_transition_task_attributes = 19; + } + // All attributes should be deprecated and replaced by this field. + // The task_type + data provide more flexibility in future use cases. + temporal.api.common.v1.DataBlob data = 12; + google.protobuf.Timestamp visibility_time = 9; + temporal.server.api.enums.v1.TaskPriority priority = 13; + temporal.server.api.persistence.v1.VersionedTransition versioned_transition = 15; + temporal.server.api.persistence.v1.ReplicationTaskInfo raw_task_info = 17; } message ReplicationToken { - int32 shard_id = 1; - // lastRetrievedMessageId is where the next fetch should begin with. - int64 last_retrieved_message_id = 2; - // lastProcessedMessageId is the last messageId that is processed on the passive side. - // This can be different than lastRetrievedMessageId if passive side supports prefetching messages. 
- int64 last_processed_message_id = 3; - // The VisibilityTime of last processed ReplicationTask - google.protobuf.Timestamp last_processed_visibility_time = 4; + int32 shard_id = 1; + // lastRetrievedMessageId is where the next fetch should begin with. + int64 last_retrieved_message_id = 2; + // lastProcessedMessageId is the last messageId that is processed on the passive side. + // This can be different than lastRetrievedMessageId if passive side supports prefetching messages. + int64 last_processed_message_id = 3; + // The VisibilityTime of last processed ReplicationTask + google.protobuf.Timestamp last_processed_visibility_time = 4; } message SyncShardStatus { - google.protobuf.Timestamp status_time = 1; + google.protobuf.Timestamp status_time = 1; } message SyncReplicationState { - // deprecated in favor of using ReplicationState object - int64 inclusive_low_watermark = 1; - // deprecated in favor of using ReplicationState object - google.protobuf.Timestamp inclusive_low_watermark_time = 2; - ReplicationState high_priority_state = 3; - ReplicationState low_priority_state = 4; + // deprecated in favor of using ReplicationState object + int64 inclusive_low_watermark = 1; + // deprecated in favor of using ReplicationState object + google.protobuf.Timestamp inclusive_low_watermark_time = 2; + ReplicationState high_priority_state = 3; + ReplicationState low_priority_state = 4; } message ReplicationState { - int64 inclusive_low_watermark = 1; - google.protobuf.Timestamp inclusive_low_watermark_time = 2; - temporal.server.api.enums.v1.ReplicationFlowControlCommand flow_control_command = 3; + int64 inclusive_low_watermark = 1; + google.protobuf.Timestamp inclusive_low_watermark_time = 2; + temporal.server.api.enums.v1.ReplicationFlowControlCommand flow_control_command = 3; } message ReplicationMessages { - repeated ReplicationTask replication_tasks = 1; - // This can be different than the last taskId in the above list, because sender can decide to skip tasks (e.g. 
for completed workflows). - int64 last_retrieved_message_id = 2; - // Hint for flow control. - bool has_more = 3; - SyncShardStatus sync_shard_status = 4; + repeated ReplicationTask replication_tasks = 1; + // This can be different than the last taskId in the above list, because sender can decide to skip tasks (e.g. for completed workflows). + int64 last_retrieved_message_id = 2; + // Hint for flow control. + bool has_more = 3; + SyncShardStatus sync_shard_status = 4; } message WorkflowReplicationMessages { - repeated ReplicationTask replication_tasks = 1; - // This can be different than the last taskId in the above list, because sender can decide to skip tasks (e.g. for completed workflows). - int64 exclusive_high_watermark = 2; - google.protobuf.Timestamp exclusive_high_watermark_time = 3; - temporal.server.api.enums.v1.TaskPriority priority = 4; + repeated ReplicationTask replication_tasks = 1; + // This can be different than the last taskId in the above list, because sender can decide to skip tasks (e.g. for completed workflows). 
+ int64 exclusive_high_watermark = 2; + google.protobuf.Timestamp exclusive_high_watermark_time = 3; + temporal.server.api.enums.v1.TaskPriority priority = 4; } // TODO: Deprecate this definition, it only used by the deprecated replication DLQ v1 logic message ReplicationTaskInfo { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - temporal.server.api.enums.v1.TaskType task_type = 4; - int64 task_id = 5; - int64 version = 6; - int64 first_event_id = 7; - int64 next_event_id = 8; - int64 scheduled_event_id = 9; - temporal.server.api.enums.v1.TaskPriority priority = 10; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + temporal.server.api.enums.v1.TaskType task_type = 4; + int64 task_id = 5; + int64 version = 6; + int64 first_event_id = 7; + int64 next_event_id = 8; + int64 scheduled_event_id = 9; + temporal.server.api.enums.v1.TaskPriority priority = 10; } message NamespaceTaskAttributes { - temporal.server.api.enums.v1.NamespaceOperation namespace_operation = 1; - string id = 2; - temporal.api.namespace.v1.NamespaceInfo info = 3; - temporal.api.namespace.v1.NamespaceConfig config = 4; - temporal.api.replication.v1.NamespaceReplicationConfig replication_config = 5; - int64 config_version = 6; - int64 failover_version = 7; - repeated temporal.api.replication.v1.FailoverStatus failover_history = 8; + temporal.server.api.enums.v1.NamespaceOperation namespace_operation = 1; + string id = 2; + temporal.api.namespace.v1.NamespaceInfo info = 3; + temporal.api.namespace.v1.NamespaceConfig config = 4; + temporal.api.replication.v1.NamespaceReplicationConfig replication_config = 5; + int64 config_version = 6; + int64 failover_version = 7; + repeated temporal.api.replication.v1.FailoverStatus failover_history = 8; } message SyncShardStatusTaskAttributes { - string source_cluster = 1; - int32 shard_id = 2; - google.protobuf.Timestamp status_time = 3; + string source_cluster = 1; + int32 shard_id = 2; + google.protobuf.Timestamp 
status_time = 3; } message SyncActivityTaskAttributes { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - int64 version = 4; - int64 scheduled_event_id = 5; - google.protobuf.Timestamp scheduled_time = 6; - int64 started_event_id = 7; - google.protobuf.Timestamp started_time = 8; - google.protobuf.Timestamp last_heartbeat_time = 9; - temporal.api.common.v1.Payloads details = 10; - int32 attempt = 11; - temporal.api.failure.v1.Failure last_failure = 12; - string last_worker_identity = 13; - temporal.server.api.history.v1.VersionHistory version_history = 14; - temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 15; - // build ID of the worker who received this activity last time - string last_started_build_id = 16; - // workflows redirect_counter value when this activity started last time - int64 last_started_redirect_counter = 17; - // The first time the activity was scheduled. - google.protobuf.Timestamp first_scheduled_time = 18; - // The last time an activity attempt completion was recorded by the server. - google.protobuf.Timestamp last_attempt_complete_time = 19; - // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. - // It monotonically increments when the activity options are changed. - int32 stamp = 20; - // Flag indicating whether the activity is currently paused. - bool paused = 21; - // Retry policy for the activity. It needs to be replicated now, since the activity properties can be updated. 
- google.protobuf.Duration retry_initial_interval = 22; - google.protobuf.Duration retry_maximum_interval = 23; - int32 retry_maximum_attempts = 24; - double retry_backoff_coefficient = 25; - int64 start_version = 26; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + int64 version = 4; + int64 scheduled_event_id = 5; + google.protobuf.Timestamp scheduled_time = 6; + int64 started_event_id = 7; + google.protobuf.Timestamp started_time = 8; + google.protobuf.Timestamp last_heartbeat_time = 9; + temporal.api.common.v1.Payloads details = 10; + int32 attempt = 11; + temporal.api.failure.v1.Failure last_failure = 12; + string last_worker_identity = 13; + temporal.server.api.history.v1.VersionHistory version_history = 14; + temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 15; + // build ID of the worker who received this activity last time + string last_started_build_id = 16; + // workflows redirect_counter value when this activity started last time + int64 last_started_redirect_counter = 17; + // The first time the activity was scheduled. + google.protobuf.Timestamp first_scheduled_time = 18; + // The last time an activity attempt completion was recorded by the server. + google.protobuf.Timestamp last_attempt_complete_time = 19; + // Stamp represents the internal “version” of the activity options and can/will be changed with Activity API. + // It monotonically increments when the activity options are changed. + int32 stamp = 20; + // Flag indicating whether the activity is currently paused. + bool paused = 21; + // Retry policy for the activity. It needs to be replicated now, since the activity properties can be updated. 
+ google.protobuf.Duration retry_initial_interval = 22; + google.protobuf.Duration retry_maximum_interval = 23; + int32 retry_maximum_attempts = 24; + double retry_backoff_coefficient = 25; + int64 start_version = 26; } message HistoryTaskAttributes { - reserved 1; - string namespace_id = 2; - string workflow_id = 3; - string run_id = 4; - repeated temporal.server.api.history.v1.VersionHistoryItem version_history_items = 5; + reserved 1; + string namespace_id = 2; + string workflow_id = 3; + string run_id = 4; + repeated temporal.server.api.history.v1.VersionHistoryItem version_history_items = 5; - // to be deprecated in favor of using events_batches - temporal.api.common.v1.DataBlob events = 6; - // New run events does not need version history since there is no prior events. - temporal.api.common.v1.DataBlob new_run_events = 7; - temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 8; - string new_run_id = 9; - repeated temporal.api.common.v1.DataBlob events_batches = 10; + // to be deprecated in favor of using events_batches + temporal.api.common.v1.DataBlob events = 6; + // New run events does not need version history since there is no prior events. 
+ temporal.api.common.v1.DataBlob new_run_events = 7; + temporal.server.api.workflow.v1.BaseExecutionInfo base_execution_info = 8; + string new_run_id = 9; + repeated temporal.api.common.v1.DataBlob events_batches = 10; } message SyncWorkflowStateTaskAttributes { - temporal.server.api.persistence.v1.WorkflowMutableState workflow_state = 1; - bool is_force_replication = 2; - bool is_close_transfer_task_acked = 3; + temporal.server.api.persistence.v1.WorkflowMutableState workflow_state = 1; + bool is_force_replication = 2; + bool is_close_transfer_task_acked = 3; } message TaskQueueUserDataAttributes { - string namespace_id = 1; - string task_queue_name = 2; - temporal.server.api.persistence.v1.TaskQueueUserData user_data = 3; + string namespace_id = 1; + string task_queue_name = 2; + temporal.server.api.persistence.v1.TaskQueueUserData user_data = 3; } message SyncHSMAttributes { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - temporal.server.api.history.v1.VersionHistory version_history = 4; - temporal.server.api.persistence.v1.StateMachineNode state_machine_node = 5; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + temporal.server.api.history.v1.VersionHistory version_history = 4; + temporal.server.api.persistence.v1.StateMachineNode state_machine_node = 5; } message BackfillHistoryTaskAttributes { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; - repeated temporal.server.api.history.v1.VersionHistoryItem event_version_history = 5; - repeated temporal.api.common.v1.DataBlob event_batches = 6; - NewRunInfo new_run_info = 7; + repeated temporal.server.api.history.v1.VersionHistoryItem event_version_history = 5; + repeated temporal.api.common.v1.DataBlob event_batches = 6; + NewRunInfo new_run_info = 7; } message NewRunInfo { - string run_id = 1; - temporal.api.common.v1.DataBlob event_batch = 2; + string run_id = 
1; + temporal.api.common.v1.DataBlob event_batch = 2; } message SyncWorkflowStateMutationAttributes { - temporal.server.api.persistence.v1.VersionedTransition exclusive_start_versioned_transition = 1; - temporal.server.api.persistence.v1.WorkflowMutableStateMutation state_mutation = 2; + temporal.server.api.persistence.v1.VersionedTransition exclusive_start_versioned_transition = 1; + temporal.server.api.persistence.v1.WorkflowMutableStateMutation state_mutation = 2; } message SyncWorkflowStateSnapshotAttributes { - temporal.server.api.persistence.v1.WorkflowMutableState state = 1; + temporal.server.api.persistence.v1.WorkflowMutableState state = 1; } message VerifyVersionedTransitionTaskAttributes { - string namespace_id = 1; - string workflow_id = 2; - string run_id = 3; - int64 next_event_id = 4; - repeated temporal.server.api.history.v1.VersionHistoryItem event_version_history = 5; - string new_run_id = 6; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 7; + string namespace_id = 1; + string workflow_id = 2; + string run_id = 3; + int64 next_event_id = 4; + repeated temporal.server.api.history.v1.VersionHistoryItem event_version_history = 5; + string new_run_id = 6; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 7; } message SyncVersionedTransitionTaskAttributes { - reserved 1; - reserved 2; - reserved 3; - reserved 4; - VersionedTransitionArtifact versioned_transition_artifact = 5; - string namespace_id = 6; - string workflow_id = 7; - string run_id = 8; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 9; + reserved 1; + reserved 2; + reserved 3; + reserved 4; + VersionedTransitionArtifact versioned_transition_artifact = 5; + string namespace_id = 6; + string workflow_id = 7; + string run_id = 8; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 9; } message VersionedTransitionArtifact { - oneof state_attributes 
{ - SyncWorkflowStateMutationAttributes sync_workflow_state_mutation_attributes = 1; - SyncWorkflowStateSnapshotAttributes sync_workflow_state_snapshot_attributes = 2; - } - repeated temporal.api.common.v1.DataBlob event_batches = 3; - NewRunInfo new_run_info = 4; - bool is_first_sync = 5; - bool is_close_transfer_task_acked = 6; - bool is_force_replication = 7; + oneof state_attributes { + SyncWorkflowStateMutationAttributes sync_workflow_state_mutation_attributes = 1; + SyncWorkflowStateSnapshotAttributes sync_workflow_state_snapshot_attributes = 2; + } + repeated temporal.api.common.v1.DataBlob event_batches = 3; + NewRunInfo new_run_info = 4; + bool is_first_sync = 5; + bool is_close_transfer_task_acked = 6; + bool is_force_replication = 7; } // MigrationExecutionInfo is only used in unit tests for validation compatibility. // Remove it after v1.30 is released. message MigrationExecutionInfo { - // The proto json name of this field needs to be "workflowId", - // to be backward compatibility with commonpb.WorkflowExecution, - // which is what used to be used in migration workflow's activity - // input/output. - string business_id = 1 [json_name = "workflowId"]; - string run_id = 2; - // (-- api-linter: core::0141::forbidden-types=disabled --) - uint32 archetype_id = 3; + // The proto json name of this field needs to be "workflowId", + // to be backward compatibility with commonpb.WorkflowExecution, + // which is what used to be used in migration workflow's activity + // input/output. 
+ string business_id = 1 [json_name = "workflowId"]; + string run_id = 2; + // (-- api-linter: core::0141::forbidden-types=disabled --) + uint32 archetype_id = 3; } diff --git a/proto/internal/temporal/server/api/routing/v1/extension.proto b/proto/internal/temporal/server/api/routing/v1/extension.proto index c06e44e34f..6464fb7fde 100644 --- a/proto/internal/temporal/server/api/routing/v1/extension.proto +++ b/proto/internal/temporal/server/api/routing/v1/extension.proto @@ -2,17 +2,19 @@ syntax = "proto3"; package temporal.server.api.routing.v1; -option go_package = "go.temporal.io/server/api/routing/v1;routing"; - import "google/protobuf/descriptor.proto"; -extend google.protobuf.MethodOptions { optional RoutingOptions routing = 7234; } +option go_package = "go.temporal.io/server/api/routing/v1;routing"; + +extend google.protobuf.MethodOptions { + optional RoutingOptions routing = 7234; +} message RoutingOptions { - // Requests will be routed to a random shard. - bool random = 1; - // Requests may specify how to obtain the namespace ID. Defaults to the "namespace_id" field. - string namespace_id = 2; - // Request will be routed by resolving the namespace ID and business ID to a given shard. - string business_id = 3; + // Requests will be routed to a random shard. + bool random = 1; + // Requests may specify how to obtain the namespace ID. Defaults to the "namespace_id" field. + string namespace_id = 2; + // Request will be routed by resolving the namespace ID and business ID to a given shard. 
+ string business_id = 3; } diff --git a/proto/internal/temporal/server/api/schedule/v1/message.proto b/proto/internal/temporal/server/api/schedule/v1/message.proto index 91e7a2ed0a..71b9792293 100644 --- a/proto/internal/temporal/server/api/schedule/v1/message.proto +++ b/proto/internal/temporal/server/api/schedule/v1/message.proto @@ -2,8 +2,7 @@ syntax = "proto3"; package temporal.server.api.schedule.v1; -option go_package = "go.temporal.io/server/api/schedule/v1;schedule"; - +import "google/protobuf/timestamp.proto"; import "temporal/api/common/v1/message.proto"; import "temporal/api/enums/v1/schedule.proto"; import "temporal/api/enums/v1/workflow.proto"; @@ -11,156 +10,155 @@ import "temporal/api/failure/v1/message.proto"; import "temporal/api/schedule/v1/message.proto"; import "temporal/api/workflowservice/v1/request_response.proto"; -import "google/protobuf/timestamp.proto"; +option go_package = "go.temporal.io/server/api/schedule/v1;schedule"; message BufferedStart { - // Nominal (pre-jitter) and Actual (post-jitter) time of action - google.protobuf.Timestamp nominal_time = 1; - google.protobuf.Timestamp actual_time = 2; - // Desired time is usually nil, which should be interpreted as == actual time, but for starts - // that are blocked behind another action, it is set to the close time of the previous action - // for more meaningful metrics. - google.protobuf.Timestamp desired_time = 5; - // Overridden overlap policy - temporal.api.enums.v1.ScheduleOverlapPolicy overlap_policy = 3; - // Trigger-immediately or backfill - bool manual = 4; - // An ID generated when the action is buffered for deduplication during - // execution. Only used by the CHASM scheduler (otherwise left empty). - string request_id = 6; - // Initially 0. Once a BufferedStart is ready to execute (overlap policies - // are resolved), its attempt count is set to 1. If a BufferedStart fails - // execution, its attempt count here is incremented. 
Only used by the CHASM - // scheduler (otherwise left empty). - int64 attempt = 7; - // If a BufferedStart is rate limited or needs to backoff while retrying, - // this time will be set, and the start will be held in the buffer until - // the backoff time has passed. Only used by the CHASM scheduler (otherwise - // ignored). - google.protobuf.Timestamp backoff_time = 8; - // The precomputed workflow ID that should be used (as-is) when executing - // this start. Only used by the CHASM scheduler (otherwise ignored). - string workflow_id = 9; - // The run ID of the started workflow. Populated when the workflow is - // successfully started. Only used by the CHASM scheduler. - string run_id = 10; - // The actual time the workflow was started. Populated when the workflow is - // successfully started. Only used by the CHASM scheduler. - google.protobuf.Timestamp start_time = 11; - // Populated when the workflow execution completes. Presence indicates the - // action is complete and retained for history. Only used by the CHASM scheduler. - CompletedResult completed = 12; - // True when a running BufferedStart is known to have a Nexus callback - // attached. False when a BufferedStart originated from a migrated V1 - // workflow. Only used by CHASM scheduler, for migration from V1. - bool has_callback = 13; + // Nominal (pre-jitter) and Actual (post-jitter) time of action + google.protobuf.Timestamp nominal_time = 1; + google.protobuf.Timestamp actual_time = 2; + // Desired time is usually nil, which should be interpreted as == actual time, but for starts + // that are blocked behind another action, it is set to the close time of the previous action + // for more meaningful metrics. + google.protobuf.Timestamp desired_time = 5; + // Overridden overlap policy + temporal.api.enums.v1.ScheduleOverlapPolicy overlap_policy = 3; + // Trigger-immediately or backfill + bool manual = 4; + // An ID generated when the action is buffered for deduplication during + // execution. 
Only used by the CHASM scheduler (otherwise left empty). + string request_id = 6; + // Initially 0. Once a BufferedStart is ready to execute (overlap policies + // are resolved), its attempt count is set to 1. If a BufferedStart fails + // execution, its attempt count here is incremented. Only used by the CHASM + // scheduler (otherwise left empty). + int64 attempt = 7; + // If a BufferedStart is rate limited or needs to backoff while retrying, + // this time will be set, and the start will be held in the buffer until + // the backoff time has passed. Only used by the CHASM scheduler (otherwise + // ignored). + google.protobuf.Timestamp backoff_time = 8; + // The precomputed workflow ID that should be used (as-is) when executing + // this start. Only used by the CHASM scheduler (otherwise ignored). + string workflow_id = 9; + // The run ID of the started workflow. Populated when the workflow is + // successfully started. Only used by the CHASM scheduler. + string run_id = 10; + // The actual time the workflow was started. Populated when the workflow is + // successfully started. Only used by the CHASM scheduler. + google.protobuf.Timestamp start_time = 11; + // Populated when the workflow execution completes. Presence indicates the + // action is complete and retained for history. Only used by the CHASM scheduler. + CompletedResult completed = 12; + // True when a running BufferedStart is known to have a Nexus callback + // attached. False when a BufferedStart originated from a migrated V1 + // workflow. Only used by CHASM scheduler, for migration from V1. + bool has_callback = 13; } // Result when a workflow execution has completed. // Only used by the CHASM scheduler. message CompletedResult { - // The final status of the workflow execution. - temporal.api.enums.v1.WorkflowExecutionStatus status = 1; - // The time the workflow closed. - google.protobuf.Timestamp close_time = 2; + // The final status of the workflow execution. 
+ temporal.api.enums.v1.WorkflowExecutionStatus status = 1; + // The time the workflow closed. + google.protobuf.Timestamp close_time = 2; } message InternalState { - string namespace = 1; - string namespace_id = 2; - string schedule_id = 8; + string namespace = 1; + string namespace_id = 2; + string schedule_id = 8; - google.protobuf.Timestamp last_processed_time = 3; - repeated BufferedStart buffered_starts = 4; - repeated temporal.api.schedule.v1.BackfillRequest ongoing_backfills = 10; + google.protobuf.Timestamp last_processed_time = 3; + repeated BufferedStart buffered_starts = 4; + repeated temporal.api.schedule.v1.BackfillRequest ongoing_backfills = 10; - // last completion/failure - temporal.api.common.v1.Payloads last_completion_result = 5; - temporal.api.failure.v1.Failure continued_failure = 6; + // last completion/failure + temporal.api.common.v1.Payloads last_completion_result = 5; + temporal.api.failure.v1.Failure continued_failure = 6; - // conflict token is implemented as simple sequence number - int64 conflict_token = 7; + // conflict token is implemented as simple sequence number + int64 conflict_token = 7; - bool need_refresh = 9; + bool need_refresh = 9; - bool pending_migration = 11; + bool pending_migration = 11; } message StartScheduleArgs { - temporal.api.schedule.v1.Schedule schedule = 1; - temporal.api.schedule.v1.ScheduleInfo info = 2; - temporal.api.schedule.v1.SchedulePatch initial_patch = 3; - InternalState state = 4; + temporal.api.schedule.v1.Schedule schedule = 1; + temporal.api.schedule.v1.ScheduleInfo info = 2; + temporal.api.schedule.v1.SchedulePatch initial_patch = 3; + InternalState state = 4; } message FullUpdateRequest { - temporal.api.schedule.v1.Schedule schedule = 1; - int64 conflict_token = 2; - temporal.api.common.v1.SearchAttributes search_attributes = 3; + temporal.api.schedule.v1.Schedule schedule = 1; + int64 conflict_token = 2; + temporal.api.common.v1.SearchAttributes search_attributes = 3; } message 
DescribeResponse { - temporal.api.schedule.v1.Schedule schedule = 1; - temporal.api.schedule.v1.ScheduleInfo info = 2; - int64 conflict_token = 3; + temporal.api.schedule.v1.Schedule schedule = 1; + temporal.api.schedule.v1.ScheduleInfo info = 2; + int64 conflict_token = 3; } message WatchWorkflowRequest { - // Note: this will be sent to the activity with empty execution.run_id, and - // the run id that we started in first_execution_run_id. - temporal.api.common.v1.WorkflowExecution execution = 3; - string first_execution_run_id = 4; - bool long_poll = 5; + // Note: this will be sent to the activity with empty execution.run_id, and + // the run id that we started in first_execution_run_id. + temporal.api.common.v1.WorkflowExecution execution = 3; + string first_execution_run_id = 4; + bool long_poll = 5; } message WatchWorkflowResponse { - temporal.api.enums.v1.WorkflowExecutionStatus status = 1; - oneof result_failure { - temporal.api.common.v1.Payloads result = 2; - temporal.api.failure.v1.Failure failure = 3; - } - // Timestamp of close event - google.protobuf.Timestamp close_time = 4; + temporal.api.enums.v1.WorkflowExecutionStatus status = 1; + oneof result_failure { + temporal.api.common.v1.Payloads result = 2; + temporal.api.failure.v1.Failure failure = 3; + } + // Timestamp of close event + google.protobuf.Timestamp close_time = 4; } message StartWorkflowRequest { - temporal.api.workflowservice.v1.StartWorkflowExecutionRequest request = 2; - reserved 3, 4, 5; - bool completed_rate_limit_sleep = 6; + temporal.api.workflowservice.v1.StartWorkflowExecutionRequest request = 2; + reserved 3, 4, 5; + bool completed_rate_limit_sleep = 6; } message StartWorkflowResponse { - string run_id = 1; - google.protobuf.Timestamp real_start_time = 2; + string run_id = 1; + google.protobuf.Timestamp real_start_time = 2; } message CancelWorkflowRequest { - string request_id = 3; - string identity = 4; - // Note: run id in execution is first execution run id - 
temporal.api.common.v1.WorkflowExecution execution = 5; - string reason = 6; + string request_id = 3; + string identity = 4; + // Note: run id in execution is first execution run id + temporal.api.common.v1.WorkflowExecution execution = 5; + string reason = 6; } message TerminateWorkflowRequest { - string request_id = 3; - string identity = 4; - // Note: run id in execution is first execution run id - temporal.api.common.v1.WorkflowExecution execution = 5; - string reason = 6; + string request_id = 3; + string identity = 4; + // Note: run id in execution is first execution run id + temporal.api.common.v1.WorkflowExecution execution = 5; + string reason = 6; } message NextTimeCache { - // workflow logic version (invalidate when changed) - int64 version = 1; - // start time that the results were calculated from - google.protobuf.Timestamp start_time = 2; - // next_times and nominal_times are a series of timestamp pairs, encoded as a nanosecond - // offset from start_time. next_times has one value for each time in the cache. - // nominal_times may have up to the same number of values, but it may also be shorter (or - // empty), if the corresponding nominal time is equal to the next time. - repeated int64 next_times = 3; - repeated int64 nominal_times = 4; - bool completed = 5; + // workflow logic version (invalidate when changed) + int64 version = 1; + // start time that the results were calculated from + google.protobuf.Timestamp start_time = 2; + // next_times and nominal_times are a series of timestamp pairs, encoded as a nanosecond + // offset from start_time. next_times has one value for each time in the cache. + // nominal_times may have up to the same number of values, but it may also be shorter (or + // empty), if the corresponding nominal time is equal to the next time. 
+ repeated int64 next_times = 3; + repeated int64 nominal_times = 4; + bool completed = 5; } - diff --git a/proto/internal/temporal/server/api/taskqueue/v1/message.proto b/proto/internal/temporal/server/api/taskqueue/v1/message.proto index 9570654193..77dfe38c2f 100644 --- a/proto/internal/temporal/server/api/taskqueue/v1/message.proto +++ b/proto/internal/temporal/server/api/taskqueue/v1/message.proto @@ -2,133 +2,132 @@ syntax = "proto3"; package temporal.server.api.taskqueue.v1; -option go_package = "go.temporal.io/server/api/taskqueue/v1;taskqueue"; - import "google/protobuf/empty.proto"; import "google/protobuf/timestamp.proto"; - import "temporal/api/deployment/v1/message.proto"; import "temporal/api/enums/v1/task_queue.proto"; import "temporal/api/enums/v1/workflow.proto"; import "temporal/api/taskqueue/v1/message.proto"; -import "temporal/server/api/enums/v1/task.proto"; import "temporal/server/api/deployment/v1/message.proto"; +import "temporal/server/api/enums/v1/task.proto"; + +option go_package = "go.temporal.io/server/api/taskqueue/v1;taskqueue"; // TaskVersionDirective controls how matching should direct a task. message TaskVersionDirective { - // Default (if build_id is not present) is "unversioned": - // Use the unversioned task queue, even if the task queue has versioning data. - // Absent value means the task is the non-starting task of an unversioned execution so it should remain unversioned. - // Deprecated. Use deployment_version. - oneof build_id { - // If use_assignment_rules is present, matching should use the assignment rules - // to determine the build ID. - // WV1: the task should be assigned the default version for the task queue. [cleanup-old-wv] - google.protobuf.Empty use_assignment_rules = 1; - - // This means the task is already assigned to `build_id` - // WV1: If assigned_build_id is present, use the default version in the compatible set - // containing this build ID. 
[cleanup-old-wv] - string assigned_build_id = 2; - } - - // Workflow's effective behavior when the task is scheduled. - temporal.api.enums.v1.VersioningBehavior behavior = 3; - // Workflow's effective deployment when the task is scheduled. - // Deprecated. Use deployment_version. - temporal.api.deployment.v1.Deployment deployment = 4; - // Workflow's effective deployment version when the task is scheduled. - temporal.server.api.deployment.v1.WorkerDeploymentVersion deployment_version = 5; - // Counter copied from the workflow execution's WorkflowExecutionVersioningInfo - // during enqueue time. - int64 revision_number = 6; + // Default (if build_id is not present) is "unversioned": + // Use the unversioned task queue, even if the task queue has versioning data. + // Absent value means the task is the non-starting task of an unversioned execution so it should remain unversioned. + // Deprecated. Use deployment_version. + oneof build_id { + // If use_assignment_rules is present, matching should use the assignment rules + // to determine the build ID. + // WV1: the task should be assigned the default version for the task queue. [cleanup-old-wv] + google.protobuf.Empty use_assignment_rules = 1; + + // This means the task is already assigned to `build_id` + // WV1: If assigned_build_id is present, use the default version in the compatible set + // containing this build ID. [cleanup-old-wv] + string assigned_build_id = 2; + } + + // Workflow's effective behavior when the task is scheduled. + temporal.api.enums.v1.VersioningBehavior behavior = 3; + // Workflow's effective deployment when the task is scheduled. + // Deprecated. Use deployment_version. + temporal.api.deployment.v1.Deployment deployment = 4; + // Workflow's effective deployment version when the task is scheduled. + temporal.server.api.deployment.v1.WorkerDeploymentVersion deployment_version = 5; + // Counter copied from the workflow execution's WorkflowExecutionVersioningInfo + // during enqueue time. 
+ int64 revision_number = 6; } message FairLevel { - int64 task_pass = 1; - int64 task_id = 2; + int64 task_pass = 1; + int64 task_id = 2; } message InternalTaskQueueStatus { - int64 read_level = 1; - FairLevel fair_read_level = 7; - int64 ack_level = 2; - FairLevel fair_ack_level = 8; - temporal.api.taskqueue.v1.TaskIdBlock task_id_block = 3; - int64 loaded_tasks = 4; - int64 approximate_backlog_count = 5; - int64 max_read_level = 6; - FairLevel fair_max_read_level = 9; - // Draining means that this status is from a queue that is being drained to - // migrate from v1 to v2 tasks persistence (or backwards). - bool draining = 10; - // BacklogDrained means this queue has an empty backlog at the time this status - // was generated. This is inherently racy — new tasks may arrive after this - // check. Consumers must use version-based validation (see scaleManager) to - // ensure correctness. - bool backlog_drained = 11; + int64 read_level = 1; + FairLevel fair_read_level = 7; + int64 ack_level = 2; + FairLevel fair_ack_level = 8; + temporal.api.taskqueue.v1.TaskIdBlock task_id_block = 3; + int64 loaded_tasks = 4; + int64 approximate_backlog_count = 5; + int64 max_read_level = 6; + FairLevel fair_max_read_level = 9; + // Draining means that this status is from a queue that is being drained to + // migrate from v1 to v2 tasks persistence (or backwards). + bool draining = 10; + // BacklogDrained means this queue has an empty backlog at the time this status + // was generated. This is inherently racy — new tasks may arrive after this + // check. Consumers must use version-based validation (see scaleManager) to + // ensure correctness. + bool backlog_drained = 11; } message TaskQueueVersionInfoInternal { - PhysicalTaskQueueInfo physical_task_queue_info = 2; + PhysicalTaskQueueInfo physical_task_queue_info = 2; } message PhysicalTaskQueueInfo { - // Unversioned workers (with `useVersioning=false`) are reported in unversioned result even if they set a Build ID. 
- repeated temporal.api.taskqueue.v1.PollerInfo pollers = 1; - repeated InternalTaskQueueStatus internal_task_queue_status = 3; - temporal.api.taskqueue.v1.TaskQueueStats task_queue_stats = 2; - // (-- api-linter: core::0140::prepositions=disabled - // aip.dev/not-precedent: "by" is used to clarify the keys. --) - map task_queue_stats_by_priority_key = 4; + // Unversioned workers (with `useVersioning=false`) are reported in unversioned result even if they set a Build ID. + repeated temporal.api.taskqueue.v1.PollerInfo pollers = 1; + repeated InternalTaskQueueStatus internal_task_queue_status = 3; + temporal.api.taskqueue.v1.TaskQueueStats task_queue_stats = 2; + // (-- api-linter: core::0140::prepositions=disabled + // aip.dev/not-precedent: "by" is used to clarify the keys. --) + map task_queue_stats_by_priority_key = 4; } // Represents a normal or sticky partition of a task queue. message TaskQueuePartition { - // This is the user-facing name for this task queue - string task_queue = 1; - temporal.api.enums.v1.TaskQueueType task_queue_type = 2; - // Absent means normal root partition (normal_partition_id=0) - oneof partition_id { - int32 normal_partition_id = 3; - string sticky_name = 4; - } + // This is the user-facing name for this task queue + string task_queue = 1; + temporal.api.enums.v1.TaskQueueType task_queue_type = 2; + // Absent means normal root partition (normal_partition_id=0) + oneof partition_id { + int32 normal_partition_id = 3; + string sticky_name = 4; + } } // Information about redirect intention sent by Matching to History in Record*TaskStarted calls. // Deprecated. message BuildIdRedirectInfo { - // build ID asked by History in the directive or the one calculated based on the assignment rules. - // this is the source of the redirect rule chain applied. 
(the target of the redirect rule chain is - // the poller's build ID reported in WorkerVersionCapabilities) - string assigned_build_id = 1; + // build ID asked by History in the directive or the one calculated based on the assignment rules. + // this is the source of the redirect rule chain applied. (the target of the redirect rule chain is + // the poller's build ID reported in WorkerVersionCapabilities) + string assigned_build_id = 1; } // Information about task forwarding from one partition to its parent. message TaskForwardInfo { - // RPC name of the partition forwarded the task. - // In case of multiple hops, this is the source partition of the last hop. - string source_partition = 1; - temporal.server.api.enums.v1.TaskSource task_source = 2; - // The partition where the task was initially forwarded from. - // Unlike source_partition which gets overwritten at each hop, origin_partition - // persists across all forwarding hops. - string origin_partition = 6; - // For tasks that are forwarded, we should keep the original creation time that comes from the - // source partition. Used for dispatch latency metrics. - google.protobuf.Timestamp create_time = 7; - // Redirect info is not present for Query and Nexus tasks. Versioning decisions for activity/workflow - // tasks are made at the source partition and sent to the parent partition in this message so that parent partition - // does not have to make versioning decision again. For Query/Nexus tasks, this works differently as the child's - // versioning decision is ignored and the parent partition makes a fresh decision. - // Deprecated. [cleanup-old-wv] - BuildIdRedirectInfo redirect_info = 3; - // Build ID that should be used to dispatch the task to. Ignored in Query and Nexus tasks. - // Deprecated. [cleanup-old-wv] - string dispatch_build_id = 4; - // Only used for old versioning. [cleanup-old-wv] - // Deprecated. 
[cleanup-old-wv] - string dispatch_version_set = 5; + // RPC name of the partition forwarded the task. + // In case of multiple hops, this is the source partition of the last hop. + string source_partition = 1; + temporal.server.api.enums.v1.TaskSource task_source = 2; + // The partition where the task was initially forwarded from. + // Unlike source_partition which gets overwritten at each hop, origin_partition + // persists across all forwarding hops. + string origin_partition = 6; + // For tasks that are forwarded, we should keep the original creation time that comes from the + // source partition. Used for dispatch latency metrics. + google.protobuf.Timestamp create_time = 7; + // Redirect info is not present for Query and Nexus tasks. Versioning decisions for activity/workflow + // tasks are made at the source partition and sent to the parent partition in this message so that parent partition + // does not have to make versioning decision again. For Query/Nexus tasks, this works differently as the child's + // versioning decision is ignored and the parent partition makes a fresh decision. + // Deprecated. [cleanup-old-wv] + BuildIdRedirectInfo redirect_info = 3; + // Build ID that should be used to dispatch the task to. Ignored in Query and Nexus tasks. + // Deprecated. [cleanup-old-wv] + string dispatch_build_id = 4; + // Only used for old versioning. [cleanup-old-wv] + // Deprecated. [cleanup-old-wv] + string dispatch_version_set = 5; } // EphemeralData is data that we want to propagate among task queue partitions, but is not persisted. @@ -136,23 +135,23 @@ message TaskForwardInfo { // task queue family (all queues with the same name, across types), ephemeral data applies only to // one type at a time. message EphemeralData { - message ByVersion { - // Key for this data. Data for the unversioned queue has no version field present. - // All following fields are data associated with this versioned queue. 
- temporal.server.api.deployment.v1.WorkerDeploymentVersion version = 1; - - // This is a bit field of priority levels that have "significant" backlog (defined by - // the server configuration). Priority key k corresponds to 1<= MS NextEventID.", tag.WorkflowNextEventID(mutableState.GetNextEventID()), ) diff --git a/service/history/queues/executable.go b/service/history/queues/executable.go index a50ae23113..5231da819b 100644 --- a/service/history/queues/executable.go +++ b/service/history/queues/executable.go @@ -85,12 +85,6 @@ var ( dependencyTaskNotCompletedReschedulePolicy = common.CreateDependencyTaskNotCompletedReschedulePolicy() ) -var defaultExecutableMetricsTags = []metrics.Tag{ - metrics.NamespaceUnknownTag(), - metrics.TaskTypeTag("__unknown__"), - metrics.OperationTag("__unknown__"), -} - const ( // resubmitMaxAttempts is the max number of attempts we may skip rescheduler when a task is Nacked. // check the comment in shouldResubmitOnNack() for more details @@ -112,19 +106,21 @@ type ( sync.Mutex state ctasks.State - executor Executor - scheduler Scheduler - rescheduler Rescheduler - priorityAssigner PriorityAssigner - timeSource clock.TimeSource - namespaceRegistry namespace.Registry - clusterMetadata cluster.Metadata - chasmRegistry *chasm.Registry - taskTypeTagProvider TaskTypeTagProvider - logger log.Logger - metricsHandler metrics.Handler - tracer trace.Tracer - dlqWriter *DLQWriter + executor Executor + scheduler Scheduler + rescheduler Rescheduler + priorityAssigner PriorityAssigner + timeSource clock.TimeSource + namespaceRegistry namespace.Registry + clusterMetadata cluster.Metadata + chasmRegistry *chasm.Registry + taskTypeTagProvider TaskTypeTagProvider + logger log.Logger + baseMetricsHandler metrics.Handler + defaultMetricsHandler metrics.Handler + chasmMetricsHandler metrics.Handler // contains archetype tag + tracer trace.Tracer + dlqWriter *DLQWriter readerID int64 attempt int @@ -211,13 +207,7 @@ func NewExecutable( return 
tasks.Tags(task) }, ), - metricsHandler: metricsHandler.WithTags(estimateTaskMetricTags( - task, - namespaceRegistry, - clusterMetadata.GetCurrentClusterName(), - chasmRegistry, - taskTypeTagProvider, - )...), + baseMetricsHandler: metricsHandler, tracer: tracer, dlqWriter: params.DLQWriter, dlqEnabled: params.DLQEnabled, @@ -225,10 +215,11 @@ func NewExecutable( dlqInternalErrors: params.DLQInternalErrors, dlqErrorPattern: params.DLQErrorPattern, } + e.refreshMetricsHandlers(nil) e.priority = priorityAssigner.Assign(e) loadTime := util.MaxTime(timeSource.Now(), task.GetKey().FireTime) - metrics.TaskLoadLatency.With(e.metricsHandler).Record( + metrics.TaskLoadLatency.With(e.chasmMetricsHandler).Record( loadTime.Sub(task.GetVisibilityTime()), metrics.QueueReaderIDTag(readerID), ) @@ -304,14 +295,7 @@ func (e *executableImpl) Execute() (retErr error) { // we need to guess the metrics tags here as we don't know which execution logic // is actually used which is upto the executor implementation - e.metricsHandler = e.metricsHandler.WithTags( - estimateTaskMetricTags( - e.GetTask(), - e.namespaceRegistry, - e.clusterMetadata.GetCurrentClusterName(), - e.chasmRegistry, - e.taskTypeTagProvider, - )...) 
+ e.refreshMetricsHandlers(nil) } attemptUserLatency := time.Duration(0) @@ -322,17 +306,17 @@ func (e *executableImpl) Execute() (retErr error) { attemptLatency := e.timeSource.Now().Sub(startTime) e.attemptNoUserLatency = attemptLatency - attemptUserLatency // emit total attempt latency so that we know how much time a task will occpy a worker goroutine - metrics.TaskProcessingLatency.With(e.metricsHandler).Record(attemptLatency) + metrics.TaskProcessingLatency.With(e.chasmMetricsHandler).Record(attemptLatency) if persistenceDuration, ok := metrics.ContextCounterGet(ctx, metrics.TaskPersistenceLatency.Name()); ok { attemptNoPersistence := attemptLatency - time.Duration(persistenceDuration) - metrics.TaskProcessingNoPersistenceLatency.With(e.metricsHandler).Record(attemptNoPersistence) + metrics.TaskProcessingNoPersistenceLatency.With(e.chasmMetricsHandler).Record(attemptNoPersistence) } - priorityTaggedProvider := e.metricsHandler.WithTags(metrics.TaskPriorityTag(e.priority.String())) + priorityTaggedProvider := e.chasmMetricsHandler.WithTags(metrics.TaskPriorityTag(e.priority.String())) metrics.TaskRequests.With(priorityTaggedProvider).Record(1) metrics.TaskScheduleLatency.With(priorityTaggedProvider).Record(e.scheduleLatency) - metrics.OperationCounter.With(e.metricsHandler).Record(1) + metrics.OperationCounter.With(e.defaultMetricsHandler).Record(1) if retErr == nil { e.inMemoryNoUserLatency += e.scheduleLatency + e.attemptNoUserLatency @@ -359,7 +343,7 @@ func (e *executableImpl) Execute() (retErr error) { } resp := e.executor.Execute(ctx, e) - e.metricsHandler = e.metricsHandler.WithTags(resp.ExecutionMetricTags...) 
+ e.refreshMetricsHandlers(resp.ExecutionMetricTags) if resp.ExecutedAsActive != e.lastActiveness { // namespace did a failover, @@ -388,10 +372,10 @@ func (e *executableImpl) writeToDLQ(ctx context.Context) error { e.lastActiveness, ) if err != nil { - metrics.TaskDLQFailures.With(e.metricsHandler).Record(1) + metrics.TaskDLQFailures.With(e.chasmMetricsHandler).Record(1) e.logger.Error("Failed to write task to DLQ", tag.Error(err)) } - metrics.TaskDLQSendLatency.With(e.metricsHandler).Record(e.timeSource.Now().Sub(start)) + metrics.TaskDLQSendLatency.With(e.chasmMetricsHandler).Record(e.timeSource.Now().Sub(start)) return err } @@ -409,7 +393,7 @@ func (e *executableImpl) isInvalidTaskError(err error) bool { // The task is stale and is safe to be dropped. // Even though ErrStaleReference is castable to serviceerror.NotFound, we give this error special treatment // because we're interested in the metric. - metrics.TaskSkipped.With(e.metricsHandler).Record(1) + metrics.TaskSkipped.With(e.chasmMetricsHandler).Record(1) e.logger.Info("Skipped task due to stale reference", tag.Error(err)) return true } @@ -424,7 +408,7 @@ func (e *executableImpl) isInvalidTaskError(err error) bool { } if err == consts.ErrTaskVersionMismatch { - metrics.TaskVersionMisMatch.With(e.metricsHandler).Record(1) + metrics.TaskVersionMisMatch.With(e.chasmMetricsHandler).Record(1) return true } @@ -433,7 +417,7 @@ func (e *executableImpl) isInvalidTaskError(err error) bool { func (e *executableImpl) isSafeToDropError(err error) bool { if err == consts.ErrTaskDiscarded { - metrics.TaskDiscarded.With(e.metricsHandler).Record(1) + metrics.TaskDiscarded.With(e.chasmMetricsHandler).Record(1) return true } @@ -462,7 +446,7 @@ func (e *executableImpl) isExpectedRetryableError(err error) (isRetryable bool, e.resourceExhaustedCount++ } - metrics.TaskThrottledCounter.With(e.metricsHandler).Record( + metrics.TaskThrottledCounter.With(e.chasmMetricsHandler).Record( 1, 
metrics.ResourceExhaustedCauseTag(resourceExhaustedErr.Cause)) return true, err } @@ -471,22 +455,22 @@ func (e *executableImpl) isExpectedRetryableError(err error) (isRetryable bool, if _, ok := err.(*serviceerror.NamespaceNotActive); ok { // error is expected when there's namespace failover, // so don't count it into task failures. - metrics.TaskNotActiveCounter.With(e.metricsHandler).Record(1) + metrics.TaskNotActiveCounter.With(e.chasmMetricsHandler).Record(1) return true, err } if err == consts.ErrDependencyTaskNotCompleted { - metrics.TasksDependencyTaskNotCompleted.With(e.metricsHandler).Record(1) + metrics.TasksDependencyTaskNotCompleted.With(e.chasmMetricsHandler).Record(1) return true, err } if err == consts.ErrTaskRetry { - metrics.TaskStandbyRetryCounter.With(e.metricsHandler).Record(1) + metrics.TaskStandbyRetryCounter.With(e.chasmMetricsHandler).Record(1) return true, err } if err.Error() == consts.ErrNamespaceHandover.Error() { - metrics.TaskNamespaceHandoverCounter.With(e.metricsHandler).Record(1) + metrics.TaskNamespaceHandoverCounter.With(e.chasmMetricsHandler).Record(1) return true, consts.ErrNamespaceHandover } @@ -505,7 +489,7 @@ func (e *executableImpl) isUnexpectedNonRetryableError(err error) bool { isInternalError := common.IsInternalError(err) if isInternalError { - metrics.TaskInternalErrorCounter.With(e.metricsHandler).Record(1) + metrics.TaskInternalErrorCounter.With(e.chasmMetricsHandler).Record(1) // Only DQL/drop when configured to shouldDLQ := e.dlqInternalErrors() return shouldDLQ @@ -553,7 +537,7 @@ func (e *executableImpl) HandleErr(err error) (retErr error) { // Unexpected errors handled below e.unexpectedErrorAttempts++ - metrics.TaskFailures.With(e.metricsHandler).Record(1) + metrics.TaskFailures.With(e.chasmMetricsHandler).Record(1) logger := log.With(e.logger, tag.Error(err), tag.ErrorType(err), @@ -572,12 +556,12 @@ func (e *executableImpl) HandleErr(err error) (retErr error) { // Terminal errors are likely due to data 
corruption. // Drop the task by returning nil so that task will be marked as completed, // or send it to the DLQ if that is enabled. - metrics.TaskCorruptionCounter.With(e.metricsHandler).Record(1) + metrics.TaskCorruptionCounter.With(e.chasmMetricsHandler).Record(1) if e.dlqEnabled() { // Keep this message in sync with the log line mentioned in Investigation section of docs/admin/dlq.md e.logger.Error("Marking task as terminally failed, will send to DLQ", tag.Error(err), tag.ErrorType(err)) e.terminalFailureCause = err // <- Execute() examines this attribute on the next attempt. - metrics.TaskTerminalFailures.With(e.metricsHandler).Record(1) + metrics.TaskTerminalFailures.With(e.chasmMetricsHandler).Record(1) return fmt.Errorf("%w: %v", ErrTerminalTaskFailure, err) } e.logger.Error("Dropping task due to terminal error", tag.Error(err), tag.ErrorType(err)) @@ -590,7 +574,7 @@ func (e *executableImpl) HandleErr(err error) (retErr error) { e.logger.Error("Marking task as terminally failed, will send to DLQ. Maximum number of attempts with unexpected errors", tag.UnexpectedErrorAttempts(int32(e.unexpectedErrorAttempts)), tag.Error(err)) e.terminalFailureCause = err // <- Execute() examines this attribute on the next attempt. 
- metrics.TaskTerminalFailures.With(e.metricsHandler).Record(1) + metrics.TaskTerminalFailures.With(e.chasmMetricsHandler).Record(1) return fmt.Errorf("%w: %w", ErrTerminalTaskFailure, e.terminalFailureCause) } @@ -616,7 +600,7 @@ func (e *executableImpl) matchDLQErrorPattern(err error) error { tag.Error(err), tag.ErrorType(err)) e.terminalFailureCause = err - metrics.TaskTerminalFailures.With(e.metricsHandler).Record(1) + metrics.TaskTerminalFailures.With(e.chasmMetricsHandler).Record(1) return fmt.Errorf("%w: %v", ErrTerminalTaskFailure, err) } @@ -669,9 +653,9 @@ func (e *executableImpl) Ack() { return } - metrics.TaskAttempt.With(e.metricsHandler).Record(int64(e.attempt)) + metrics.TaskAttempt.With(e.chasmMetricsHandler).Record(int64(e.attempt)) - priorityTaggedProvider := e.metricsHandler.WithTags(metrics.TaskPriorityTag(e.priority.String())) + priorityTaggedProvider := e.chasmMetricsHandler.WithTags(metrics.TaskPriorityTag(e.priority.String())) metrics.TaskLatency.With(priorityTaggedProvider).Record(e.inMemoryNoUserLatency) metrics.TaskQueueLatency.With(priorityTaggedProvider.WithTags(metrics.QueueReaderIDTag(e.readerID))). Record(time.Since(e.GetVisibilityTime())) @@ -824,11 +808,37 @@ func (e *executableImpl) incAttempt() { e.attempt++ if e.attempt > taskCriticalLogMetricAttempts { - metrics.TaskAttempt.With(e.metricsHandler).Record(int64(e.attempt)) + metrics.TaskAttempt.With(e.chasmMetricsHandler).Record(int64(e.attempt)) + } +} + +func (e *executableImpl) refreshMetricsHandlers(executionMetricTags []metrics.Tag) { + sharedTags := taskBaseMetricTagsWithoutArchetype( + e.GetTask(), + e.namespaceRegistry, + e.clusterMetadata.GetCurrentClusterName(), + e.chasmRegistry, + e.taskTypeTagProvider, + ) + if len(executionMetricTags) > 0 { + sharedTags = append(sharedTags, executionMetricTags...) } + e.defaultMetricsHandler = e.baseMetricsHandler.WithTags(sharedTags...) 
+ e.chasmMetricsHandler = e.defaultMetricsHandler.WithTags(getArchetypeTag(e.GetTask(), e.chasmRegistry)) } -func estimateTaskMetricTags( +func taskBaseMetricTags( + task tasks.Task, + namespaceRegistry namespace.Registry, + currentClusterName string, + chasmRegistry *chasm.Registry, + taskTypeTagProvider TaskTypeTagProvider, +) []metrics.Tag { + tags := taskBaseMetricTagsWithoutArchetype(task, namespaceRegistry, currentClusterName, chasmRegistry, taskTypeTagProvider) + return append(tags, getArchetypeTag(task, chasmRegistry)) +} + +func taskBaseMetricTagsWithoutArchetype( task tasks.Task, namespaceRegistry namespace.Registry, currentClusterName string, @@ -853,6 +863,15 @@ func estimateTaskMetricTags( } } +func getArchetypeTag(task tasks.Task, chasmRegistry *chasm.Registry) metrics.Tag { + if t, ok := task.(tasks.HasArchetypeID); ok { + if name, ok := chasmRegistry.ArchetypeDisplayName(t.GetArchetypeID()); ok { + return metrics.ArchetypeTag(name) + } + } + return metrics.ArchetypeTag(chasm.WorkflowComponentName) +} + // CircuitBreakerExecutable wraps Executable with a circuit breaker. // If the executable returns DestinationDownError, it will signal the circuit breaker // of failure, and return the inner error. 
diff --git a/service/history/queues/executable_factory.go b/service/history/queues/executable_factory.go index 6cc47a7c14..072ad826d0 100644 --- a/service/history/queues/executable_factory.go +++ b/service/history/queues/executable_factory.go @@ -75,7 +75,7 @@ func NewExecutableFactory( chasmRegistry: chasmRegistry, taskTypeTagProvider: taskTypeTagProvider, logger: logger, - metricsHandler: metricsHandler.WithTags(defaultExecutableMetricsTags...), + metricsHandler: metricsHandler, tracer: tracer, dlqWriter: dlqWriter, dlqEnabled: dlqEnabled, diff --git a/service/history/queues/metrics_test.go b/service/history/queues/metrics_test.go index 6644905685..2ec817b85b 100644 --- a/service/history/queues/metrics_test.go +++ b/service/history/queues/metrics_test.go @@ -4,6 +4,9 @@ import ( "testing" "github.com/stretchr/testify/assert" + "go.temporal.io/server/chasm" + "go.temporal.io/server/common/log" + "go.temporal.io/server/common/metrics" "go.temporal.io/server/service/history/tasks" ) @@ -13,3 +16,19 @@ func TestGetArchivalTaskTypeTagValue(t *testing.T) { unknownTask := &tasks.CloseExecutionTask{} assert.Equal(t, unknownTask.GetType().String(), GetArchivalTaskTypeTagValue(unknownTask)) } + +func TestGetArchetypeTag(t *testing.T) { + registry := chasm.NewRegistry(log.NewTestLogger()) + + t.Run("legacy task without HasArchetypeID defaults to workflow", func(t *testing.T) { + task := &tasks.ActivityTask{} + tag := getArchetypeTag(task, registry) + assert.Equal(t, metrics.ArchetypeTag(chasm.WorkflowComponentName), tag) + }) + + t.Run("HasArchetypeID task with unregistered ID defaults to workflow", func(t *testing.T) { + task := &tasks.ChasmTaskPure{ArchetypeID: 9999} + tag := getArchetypeTag(task, registry) + assert.Equal(t, metrics.ArchetypeTag(chasm.WorkflowComponentName), tag) + }) +} diff --git a/service/history/queues/scheduler.go b/service/history/queues/scheduler.go index 7b33d4f8c9..9edcff1759 100644 --- a/service/history/queues/scheduler.go +++ 
b/service/history/queues/scheduler.go @@ -266,7 +266,7 @@ func NewRateLimitedScheduler( } taskMetricsTagsFn := func(e Executable) []metrics.Tag { return append( - estimateTaskMetricTags(e.GetTask(), namespaceRegistry, currentClusterName, chasmRegistry, GetTaskTypeTagValue), + taskBaseMetricTags(e.GetTask(), namespaceRegistry, currentClusterName, chasmRegistry, GetTaskTypeTagValue), metrics.TaskPriorityTag(e.GetPriority().String()), ) } diff --git a/service/history/queues/speculative_workflow_task_timeout_queue.go b/service/history/queues/speculative_workflow_task_timeout_queue.go index 749612f8f1..fe5c92c894 100644 --- a/service/history/queues/speculative_workflow_task_timeout_queue.go +++ b/service/history/queues/speculative_workflow_task_timeout_queue.go @@ -90,7 +90,7 @@ func (q SpeculativeWorkflowTaskTimeoutQueue) NotifyNewTasks(ts []tasks.Task) { q.chasmRegistry, GetTaskTypeTagValue, q.logger, - q.metricsHandler.WithTags(defaultExecutableMetricsTags...), + q.metricsHandler, q.tracer, ), wttt) q.timeoutQueue.Add(executable) diff --git a/service/history/timer_queue_active_task_executor_test.go b/service/history/timer_queue_active_task_executor_test.go index 36a5f7bd53..a8af08a052 100644 --- a/service/history/timer_queue_active_task_executor_test.go +++ b/service/history/timer_queue_active_task_executor_test.go @@ -2044,7 +2044,11 @@ func (s *timerQueueActiveTaskExecutorSuite) TestExecuteChasmPureTimerTask_Execut } // Mock the CHASM tree and execute interface. - mockEach := &chasm.MockNodePureTask{} + mockEach := &chasm.MockNodePureTask{ + HandleExecutePureTask: func(_ context.Context, _ chasm.TaskAttributes, _ any) (bool, error) { + return true, nil + }, + } chasmTree := historyi.NewMockChasmTree(s.controller) chasmTree.EXPECT().EachPureTask(gomock.Any(), gomock.Any()). 
Times(1).Do( diff --git a/service/history/workflow/mutable_state_impl.go b/service/history/workflow/mutable_state_impl.go index 7d271229bf..7c6aa34e54 100644 --- a/service/history/workflow/mutable_state_impl.go +++ b/service/history/workflow/mutable_state_impl.go @@ -44,6 +44,7 @@ import ( "go.temporal.io/server/common/definition" "go.temporal.io/server/common/enums" "go.temporal.io/server/common/failure" + "go.temporal.io/server/common/headers" "go.temporal.io/server/common/log" "go.temporal.io/server/common/log/tag" "go.temporal.io/server/common/metrics" @@ -412,7 +413,7 @@ func NewMutableState( s, chasm.DefaultPathEncoder, logger, - shard.GetMetricsHandler(), + shard.GetMetricsHandler().WithTags(metrics.NamespaceTag(namespaceName)), ) } @@ -560,7 +561,7 @@ func NewMutableStateFromDB( mutableState, chasm.DefaultPathEncoder, mutableState.logger, // this logger is tagged with execution key. - shard.GetMetricsHandler(), + shard.GetMetricsHandler().WithTags(metrics.NamespaceTag(namespaceEntry.Name().String())), ) if err != nil { return nil, err @@ -7093,6 +7094,27 @@ func (ms *MutableStateImpl) closeTransaction( return closeTransactionResult{}, err } + // Stamp events with the caller's principal. Only do this on the active + // cluster — standby (passive) replays events that were already stamped by + // the active side, and we must not overwrite those principals. + if transactionPolicy == historyi.TransactionPolicyActive { + principal := headers.GetPrincipal(ctx) + for _, we := range workflowEventsSeq { + for _, event := range we.Events { + // Skip events that already have a principal. Those are previously + // buffered events (e.g., signals) that were stamped when originally + // created and are now being flushed into history by a different caller + // (e.g., the worker completing a workflow task). 
+ if event.Principal == nil { + event.Principal = principal + } + } + } + for _, event := range bufferEvents { + event.Principal = principal + } + } + // CloseTransaction() on chasmTree may update execution state & status, // so must be called before closeTransactionUpdateTransitionHistory(). chasmNodesMutation, err := ms.chasmTree.CloseTransaction() diff --git a/service/history/workflow/mutable_state_impl_test.go b/service/history/workflow/mutable_state_impl_test.go index 2fdaace570..8abffdfa19 100644 --- a/service/history/workflow/mutable_state_impl_test.go +++ b/service/history/workflow/mutable_state_impl_test.go @@ -38,6 +38,7 @@ import ( "go.temporal.io/server/common/definition" "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/failure" + "go.temporal.io/server/common/headers" "go.temporal.io/server/common/log" "go.temporal.io/server/common/namespace" "go.temporal.io/server/common/payloads" @@ -6149,3 +6150,142 @@ func (s *mutableStateSuite) TestSetContextMetadata() { s.True(ok) s.Equal(taskQueue, tq) } + +func (s *mutableStateSuite) TestCloseTransaction_PrincipalStamped() { + for _, tc := range []struct { + name string + policy historyi.TransactionPolicy + }{ + {"Active", historyi.TransactionPolicyActive}, + {"Passive", historyi.TransactionPolicyPassive}, + } { + s.Run(tc.name, func() { + namespaceEntry := tests.GlobalNamespaceEntry + s.mockEventsCache.EXPECT().PutEvent(gomock.Any(), gomock.Any()).AnyTimes() + + dbState := s.buildWorkflowMutableState() + dbState.BufferedEvents = nil + + var err error + s.mutableState, err = NewMutableStateFromDB(s.mockShard, s.mockEventsCache, s.logger, namespaceEntry, dbState, 123) + s.NoError(err) + err = s.mutableState.UpdateCurrentVersion(namespaceEntry.FailoverVersion(tests.WorkflowID), false) + s.NoError(err) + + // Complete the workflow task to generate events in workflowEventsSeq. 
+ workflowTaskInfo := s.mutableState.GetStartedWorkflowTask() + _, err = s.mutableState.AddWorkflowTaskCompletedEvent( + workflowTaskInfo, + &workflowservice.RespondWorkflowTaskCompletedRequest{}, + workflowTaskCompletionLimits, + ) + s.NoError(err) + + // Close the transaction with a principal in context. + principal := &commonpb.Principal{Type: "user", Name: "alice"} + ctx := headers.SetPrincipal(context.Background(), principal) + _, eventsSeq, err := s.mutableState.CloseTransactionAsMutation(ctx, tc.policy) + s.NoError(err) + + s.NotEmpty(eventsSeq) + for _, we := range eventsSeq { + for _, event := range we.Events { + if tc.policy == historyi.TransactionPolicyActive { + // Active: all events should be stamped with the caller's principal. + s.Equal("user", event.Principal.GetType(), "event %s should have principal type 'user'", event.EventType) + s.Equal("alice", event.Principal.GetName(), "event %s should have principal name 'alice'", event.EventType) + } else { + // Passive: events must not be stamped + s.Nil(event.Principal, "event %s should not have principal stamped in passive mode", event.EventType) + } + } + } + }) + } +} + +func (s *mutableStateSuite) TestCloseTransaction_PrincipalPreserved() { + namespaceEntry := tests.GlobalNamespaceEntry + s.mockEventsCache.EXPECT().PutEvent(gomock.Any(), gomock.Any()).AnyTimes() + + dbState := s.buildWorkflowMutableState() + + var err error + s.mutableState, err = NewMutableStateFromDB(s.mockShard, s.mockEventsCache, s.logger, namespaceEntry, dbState, 123) + s.NoError(err) + err = s.mutableState.UpdateCurrentVersion(namespaceEntry.FailoverVersion(tests.WorkflowID), false) + s.NoError(err) + + s.mockShard.Resource.ClusterMetadata.EXPECT().GetCurrentClusterName().Return(cluster.TestCurrentClusterName).AnyTimes() + + // Transaction 1: First signal arrives while a workflow task is started. + // The signal gets buffered and stamped with alice's principal. 
+ _, err = s.mutableState.AddWorkflowExecutionSignaledEvent( + "signal-from-alice", + &commonpb.Payloads{}, + "alice-identity", + &commonpb.Header{}, + nil, + nil, + ) + s.NoError(err) + + aliceCtx := headers.SetPrincipal(context.Background(), &commonpb.Principal{Type: "user", Name: "alice"}) + mutation, _, err := s.mutableState.CloseTransactionAsMutation(aliceCtx, historyi.TransactionPolicyActive) + s.NoError(err) + s.Len(mutation.NewBufferedEvents, 1) + s.Equal("alice", mutation.NewBufferedEvents[0].Principal.GetName()) + + // Transaction 2: Second signal arrives from a different caller. + // It gets buffered and stamped with bob's principal. Alice's buffered + // signal must not be overwritten. + _, err = s.mutableState.AddWorkflowExecutionSignaledEvent( + "signal-from-bob", + &commonpb.Payloads{}, + "bob-identity", + &commonpb.Header{}, + nil, + nil, + ) + s.NoError(err) + + bobCtx := headers.SetPrincipal(context.Background(), &commonpb.Principal{Type: "user", Name: "bob"}) + mutation, _, err = s.mutableState.CloseTransactionAsMutation(bobCtx, historyi.TransactionPolicyActive) + s.NoError(err) + s.Len(mutation.NewBufferedEvents, 1) + s.Equal("bob", mutation.NewBufferedEvents[0].Principal.GetName()) + + // Transaction 3: A worker completes the workflow task. Both buffered + // signals are flushed into history. Each should retain its original + // principal. 
+ workflowTaskInfo := s.mutableState.GetStartedWorkflowTask() + _, err = s.mutableState.AddWorkflowTaskCompletedEvent( + workflowTaskInfo, + &workflowservice.RespondWorkflowTaskCompletedRequest{}, + workflowTaskCompletionLimits, + ) + s.NoError(err) + + workerCtx := headers.SetPrincipal(context.Background(), &commonpb.Principal{Type: "worker", Name: "worker-1"}) + _, eventsSeq, err := s.mutableState.CloseTransactionAsMutation(workerCtx, historyi.TransactionPolicyActive) + s.NoError(err) + + s.NotEmpty(eventsSeq) + principalBySignalName := map[string]string{} + foundWorkerEvent := false + for _, we := range eventsSeq { + for _, event := range we.Events { + if event.EventType == enumspb.EVENT_TYPE_WORKFLOW_EXECUTION_SIGNALED { + signalName := event.GetWorkflowExecutionSignaledEventAttributes().GetSignalName() + principalBySignalName[signalName] = event.Principal.GetName() + } else { + foundWorkerEvent = true + s.Equal("worker", event.Principal.GetType(), "event %s should have worker's principal type", event.EventType) + s.Equal("worker-1", event.Principal.GetName(), "event %s should have worker's principal name", event.EventType) + } + } + } + s.True(foundWorkerEvent, "expected to find non-signal events in workflowEventsSeq") + s.Equal("alice", principalBySignalName["signal-from-alice"], "alice's signal should retain her principal") + s.Equal("bob", principalBySignalName["signal-from-bob"], "bob's signal should retain his principal") +} diff --git a/service/matching/backlog_manager_test.go b/service/matching/backlog_manager_test.go index b0f1e2089e..9327723452 100644 --- a/service/matching/backlog_manager_test.go +++ b/service/matching/backlog_manager_test.go @@ -7,6 +7,7 @@ import ( "maps" "math" "math/rand" + "slices" "sync" "sync/atomic" "testing" @@ -18,6 +19,7 @@ import ( persistencespb "go.temporal.io/server/api/persistence/v1" "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/metrics" + "go.temporal.io/server/common/persistence" 
"go.temporal.io/server/common/primitives/timestamp" testutil "go.temporal.io/server/common/testing" "go.temporal.io/server/common/testing/testlogger" @@ -355,6 +357,119 @@ func (s *BacklogManagerTestSuite) TestApproximateBacklogCount_NotIncrementedBySp "backlog count should not be incremented") } +func (s *BacklogManagerTestSuite) TestSkipExpiredTasks_AllExpiredThenValid() { + s.testSkipExpiredTasks(10, 0, 33, 3) +} + +func (s *BacklogManagerTestSuite) TestSkipExpiredTasks_ValidExpiredValid() { + s.testSkipExpiredTasks(10, 3, 33, 3) +} + +// testSkipExpiredTasks verifies that the task reader correctly skips over expired tasks +// in the DB and advances the ack level past them. +// expiredPattern is: # valid, # expired, # valid, # expired, ... +func (s *BacklogManagerTestSuite) testSkipExpiredTasks(batchSize int, numValidExpired ...int) { + if !s.newMatcher { + s.T().Skip("not compatible with classic backlog manager") + } + + s.cfgcli.OverrideValue(dynamicconfig.MatchingGetTasksBatchSize.Key(), batchSize) + + // expand 1, 3, 2 -> {false, true, true, true, false, false} + var expiredPattern []bool + var isExpired bool + for _, num := range numValidExpired { + expiredPattern = append(expiredPattern, slices.Repeat([]bool{isExpired}, num)...) + isExpired = !isExpired + } + + // Pre-populate the DB with tasks before starting the backlog manager. + // This simulates tasks that were written and then expired before reading. 
+ ctx := context.Background() + queueInfo := &persistencespb.TaskQueueInfo{ + Name: "test-queue", + TaskType: enumspb.TASK_QUEUE_TYPE_WORKFLOW, + Kind: enumspb.TASK_QUEUE_KIND_NORMAL, + LastUpdateTime: timestamp.TimeNowPtrUtc(), + // start with ack level at zero + } + _, err := s.taskMgr.CreateTaskQueue(ctx, &persistence.CreateTaskQueueRequest{ + RangeID: 1, + TaskQueueInfo: queueInfo, + }) + s.Require().NoError(err) + + var dbTasks []*persistencespb.AllocatedTaskInfo + numValid := 0 + for i, expired := range expiredPattern { + id := int64(i + 1) + task := &persistencespb.AllocatedTaskInfo{ + TaskId: id, + Data: &persistencespb.TaskInfo{ + CreateTime: timestamp.TimeNowPtrUtcAddSeconds(-3600), + }, + } + if expired { + task.Data.ExpiryTime = timestamp.TimeNowPtrUtcAddSeconds(-60) + } else { + task.Data.ExpiryTime = timestamp.TimeNowPtrUtcAddSeconds(3600) + numValid++ + } + if s.fairness { + task.TaskPass = id * 1000 // spread out pass numbers + } + dbTasks = append(dbTasks, task) + } + _, err = s.taskMgr.CreateTasks(ctx, &persistence.CreateTasksRequest{ + TaskQueueInfo: &persistence.PersistedTaskQueueInfo{Data: queueInfo, RangeID: 1}, + Tasks: dbTasks, + }) + s.Require().NoError(err) + + // Capture tasks delivered to the matcher. + var mu sync.Mutex + var delivered []*internalTask + s.ptqMgr.EXPECT().AddSpooledTask(gomock.Any()).DoAndReturn(func(t *internalTask) error { + mu.Lock() + defer mu.Unlock() + delivered = append(delivered, t) + return nil + }).AnyTimes() + + // Start backlog manager. + s.blm.Start() + defer s.blm.Stop() + s.Require().NoError(s.blm.WaitUntilInitialized(context.Background())) + + // Wait for all valid tasks to be delivered. + s.Require().Eventually(func() bool { + mu.Lock() + defer mu.Unlock() + return len(delivered) >= numValid + }, 2*time.Second, 10*time.Millisecond, "timed out waiting for valid tasks to be delivered") + + // Complete the delivered tasks. 
+ mu.Lock() + tasks := slices.Clone(delivered) + mu.Unlock() + for _, t := range tasks { + t.finish(nil, true) + } + + // Verify the ack level advances past all tasks (expired + valid). + lastID := int64(len(expiredPattern)) + s.Eventually(func() bool { + db := s.blm.getDB() + db.Lock() + defer db.Unlock() + if s.fairness { + ackLevel := fairLevelFromProto(db.subqueues[subqueueZero].FairAckLevel) + return !ackLevel.less(fairLevel{pass: lastID * 1000, id: lastID}) + } + return db.subqueues[subqueueZero].AckLevel >= lastID + }, 2*time.Second, 10*time.Millisecond, "ack level did not advance past all tasks") +} + func totalApproximateBacklogCount(c backlogManager) (total int64) { for _, stats := range c.BacklogStatsByPriority() { total += stats.ApproximateBacklogCount diff --git a/service/matching/fair_task_reader.go b/service/matching/fair_task_reader.go index 3e56558705..a7e6480f82 100644 --- a/service/matching/fair_task_reader.go +++ b/service/matching/fair_task_reader.go @@ -3,7 +3,6 @@ package matching import ( "context" "errors" - "slices" "sync" "time" @@ -284,20 +283,11 @@ func (tr *fairTaskReader) readTaskBatch(readLevel fairLevel, loadedTasks int) er mode = mergeReadToEnd } - // filter out expired - // TODO(fairness): if we have _only_ expired tasks, and we filter them out here, we won't move - // the ack level and delete them. maybe we should put them in outstandingTasks as pre-acked. - tasks := slices.DeleteFunc(res.Tasks, func(t *persistencespb.AllocatedTaskInfo) bool { - if IsTaskExpired(t) { - metrics.ExpiredTasksPerTaskQueueCounter.With(tr.backlogMgr.metricsHandler).Record(1, metrics.TaskExpireStageReadTag) - return true - } - return false - }) - // Note: even if (especially if) len(tasks) == 0, we should go through the mergeTasks logic - // to update atEnd and the backlog size estimate. - tr.mergeTasks(tasks, mode) + // to update atEnd and the backlog size estimate. 
Expired tasks are passed through to + // mergeTasksLocked where they'll be added as pre-acked (nil) entries so they advance the + // ack level and get GC'd. + tr.mergeTasks(res.Tasks, mode) return nil } @@ -491,16 +481,31 @@ func (tr *fairTaskReader) mergeTasksLocked(tasks []*persistencespb.AllocatedTask tr.evictedAcks.PopMax() } - internalTasks := make([]*internalTask, len(tasks)) - for i, t := range tasks { + var hasExpired bool + internalTasks := make([]*internalTask, 0, len(tasks)) + for _, t := range tasks { level := fairLevelFromAllocatedTask(t) - internalTasks[i] = newInternalTaskFromBacklog(t, tr.completeTask) - tr.backlogMgr.setPriority(internalTasks[i]) + if IsTaskExpired(t) { + // Expired tasks are added as pre-acked (nil) so they participate in + // readLevel calculation above and advance ackLevel + get GC'd below. + tr.outstandingTasks.Put(level, nil) + metrics.ExpiredTasksPerTaskQueueCounter.With(tr.backlogMgr.metricsHandler).Record(1, metrics.TaskExpireStageReadTag) + hasExpired = true + continue + } + task := newInternalTaskFromBacklog(t, tr.completeTask) + tr.backlogMgr.setPriority(task) // After we get to this point, we must eventually call task.finish or // task.finishForwarded, which will call tr.completeTask. - tr.outstandingTasks.Put(level, internalTasks[i]) + tr.outstandingTasks.Put(level, task) tr.loadedTasks++ tr.backlogAge.record(t.Data.CreateTime, 1) + internalTasks = append(internalTasks, task) + } + + if hasExpired { + // Advance ack level past any expired tasks we just added as pre-acked. 
+ tr.advanceAckLevelLocked() } // Update atEnd: diff --git a/service/matching/matching_engine.go b/service/matching/matching_engine.go index 86226ada4d..58e6b83f2f 100644 --- a/service/matching/matching_engine.go +++ b/service/matching/matching_engine.go @@ -148,7 +148,7 @@ type ( timeSource clock.TimeSource visibilityManager manager.VisibilityManager nexusEndpointClient *nexusEndpointClient - nexusEndpointsOwnershipLostCh chan struct{} + nexusEndpointsOwnershipLostCh atomic.Value // stores chan struct{} saMapperProvider searchattribute.MapperProvider saProvider searchattribute.Provider metricsHandler metrics.Handler @@ -271,29 +271,29 @@ func NewEngine( ) Engine { scopedMetricsHandler := metricsHandler.WithTags(metrics.OperationTag(metrics.MatchingEngineScope)) e := &matchingEngineImpl{ - status: common.DaemonStatusInitialized, - taskManager: taskManager, - fairTaskManager: fairTaskManager, - historyClient: historyClient, - matchingRawClient: matchingRawClient, - tokenSerializer: tasktoken.NewSerializer(), - workerDeploymentClient: workerDeploymentClient, - historySerializer: historySerializer, - logger: log.With(logger, tag.ComponentMatchingEngine), - throttledLogger: log.With(throttledLogger, tag.ComponentMatchingEngine), - namespaceRegistry: namespaceRegistry, - hostInfoProvider: hostInfoProvider, - serviceResolver: resolver, - membershipChangedCh: make(chan *membership.ChangedEvent, 1), // allow one signal to be buffered while we're working - clusterMeta: clusterMeta, - timeSource: clock.NewRealTimeSource(), // No need to mock this at the moment - visibilityManager: visibilityManager, - nexusEndpointClient: newEndpointClient(config.NexusEndpointsRefreshInterval, nexusEndpointManager), - nexusEndpointsOwnershipLostCh: make(chan struct{}), - saProvider: saProvider, - saMapperProvider: saMapperProvider, - metricsHandler: scopedMetricsHandler, - partitions: make(map[tqid.PartitionKey]taskQueuePartitionManager), + status: common.DaemonStatusInitialized, + 
taskManager: taskManager, + fairTaskManager: fairTaskManager, + historyClient: historyClient, + matchingRawClient: matchingRawClient, + tokenSerializer: tasktoken.NewSerializer(), + workerDeploymentClient: workerDeploymentClient, + historySerializer: historySerializer, + logger: log.With(logger, tag.ComponentMatchingEngine), + throttledLogger: log.With(throttledLogger, tag.ComponentMatchingEngine), + namespaceRegistry: namespaceRegistry, + hostInfoProvider: hostInfoProvider, + serviceResolver: resolver, + membershipChangedCh: make(chan *membership.ChangedEvent, 1), // allow one signal to be buffered while we're working + clusterMeta: clusterMeta, + timeSource: clock.NewRealTimeSource(), // No need to mock this at the moment + visibilityManager: visibilityManager, + nexusEndpointClient: newEndpointClient(config.NexusEndpointsRefreshInterval, nexusEndpointManager), + // nexusEndpointsOwnershipLostCh initialized below + saProvider: saProvider, + saMapperProvider: saMapperProvider, + metricsHandler: scopedMetricsHandler, + partitions: make(map[tqid.PartitionKey]taskQueuePartitionManager), gaugeMetrics: gaugeMetrics{ loadedTaskQueueFamilyCount: make(map[taskQueueCounterKey]int), loadedTaskQueueCount: make(map[taskQueueCounterKey]int), @@ -312,6 +312,7 @@ func NewEngine( userDataUpdateBatchers: collection.NewSyncMap[namespace.ID, *stream_batcher.Batcher[*userDataUpdate, error]](), rateLimiter: rateLimiter, } + e.nexusEndpointsOwnershipLostCh.Store(make(chan struct{})) e.reachabilityCache = newReachabilityCache( metrics.NoopMetricsHandler, visibilityManager, @@ -2100,7 +2101,7 @@ func (e *matchingEngineImpl) SyncDeploymentUserData( deploymentData.UnversionedRampData = vd } - } else if idx := worker_versioning.FindDeploymentVersion(deploymentData, vd.GetVersion()); idx >= 0 { + } else if idx := worker_versioning.FindOldDeploymentVersion(deploymentData, vd.GetVersion()); idx >= 0 { old := deploymentData.Versions[idx] if 
old.GetRoutingUpdateTime().AsTime().After(vd.GetRoutingUpdateTime().AsTime()) { continue @@ -2122,19 +2123,17 @@ func (e *matchingEngineImpl) SyncDeploymentUserData( clearVersionFromRoutingConfig(workerDeploymentData, nil, vd) } } else if v := req.GetForgetVersion(); v != nil { - if idx := worker_versioning.FindDeploymentVersion(deploymentData, v); idx >= 0 { + // Go through the new and old deployment data format for this deployment and remove the version if present. + workerDeploymentData := deploymentData.GetDeploymentsData()[v.GetDeploymentName()] + deleted := removeDeploymentVersions( + deploymentData, + v.GetDeploymentName(), + workerDeploymentData, + []string{v.GetBuildId()}, + /* removeOldFormat */ true, + ) + if deleted { changed = true - deploymentData.Versions = append(deploymentData.Versions[:idx], deploymentData.Versions[idx+1:]...) - - // Go through the new deployment data format for this deployment and remove the version if present. - workerDeploymentData := deploymentData.GetDeploymentsData()[v.GetDeploymentName()] - _ = removeDeploymentVersions( - deploymentData, - v.GetDeploymentName(), - workerDeploymentData, - []string{v.GetBuildId()}, - /* removeOldFormat */ false, - ) } } else { @@ -2751,7 +2750,7 @@ func (e *matchingEngineImpl) ListNexusEndpoints(ctx context.Context, request *ma func (e *matchingEngineImpl) checkNexusEndpointsOwnership() (bool, <-chan struct{}, error) { // Get the channel before checking the condition to prevent the channel from being closed while we're running this // check. 
- ch := e.nexusEndpointsOwnershipLostCh + ch := e.nexusEndpointsOwnershipLostCh.Load().(chan struct{}) //nolint:revive // type is always chan struct{} self := e.hostInfoProvider.HostInfo().Identity() owner, err := e.serviceResolver.Lookup(nexusEndpointsTablePartitionRoutingKey) if err != nil { @@ -2769,8 +2768,7 @@ func (e *matchingEngineImpl) notifyNexusEndpointsOwnershipChange() { return } if !isOwner { - close(e.nexusEndpointsOwnershipLostCh) - e.nexusEndpointsOwnershipLostCh = make(chan struct{}) + close(e.nexusEndpointsOwnershipLostCh.Swap(make(chan struct{})).(chan struct{})) //nolint:revive // type is always chan struct{} } e.nexusEndpointClient.notifyOwnershipChanged(isOwner) } @@ -3650,15 +3648,15 @@ func removeDeploymentVersions( buildIDs []string, removeOldFormat bool, ) bool { - if workerDeploymentData == nil { + if workerDeploymentData == nil && !removeOldFormat { return false } changed := false deletedInNew := false for _, buildID := range buildIDs { - if _, exists := workerDeploymentData.Versions[buildID]; exists { - delete(workerDeploymentData.Versions, buildID) + if _, exists := workerDeploymentData.GetVersions()[buildID]; exists { + delete(workerDeploymentData.GetVersions(), buildID) deletedInNew = true changed = true } @@ -3677,7 +3675,7 @@ func removeDeploymentVersions( } // Only remove the deployment entry if versions were actually deleted from the new-format map. 
- if deletedInNew && len(workerDeploymentData.Versions) == 0 { + if workerDeploymentData != nil && deletedInNew && len(workerDeploymentData.GetVersions()) == 0 { delete(deploymentData.GetDeploymentsData(), deploymentName) } return changed diff --git a/service/matching/matching_engine_test.go b/service/matching/matching_engine_test.go index 1bd6f5a64a..a82aac7529 100644 --- a/service/matching/matching_engine_test.go +++ b/service/matching/matching_engine_test.go @@ -240,7 +240,7 @@ func newMatchingEngine( mockVisibilityManager manager.VisibilityManager, mockHostInfoProvider membership.HostInfoProvider, mockServiceResolver membership.ServiceResolver, nexusEndpointManager persistence.NexusEndpointManager, ) *matchingEngineImpl { - return &matchingEngineImpl{ + e := &matchingEngineImpl{ taskManager: taskMgr, fairTaskManager: fairTaskMgr, historyClient: mockHistoryClient, @@ -251,23 +251,24 @@ func newMatchingEngine( loadedTaskQueuePartitionCount: make(map[taskQueueCounterKey]int), loadedPhysicalTaskQueueCount: make(map[taskQueueCounterKey]int), }, - queryResults: collection.NewSyncMap[string, chan *queryResult](), - logger: logger, - throttledLogger: log.ThrottledLogger(logger), - metricsHandler: metrics.NoopMetricsHandler, - matchingRawClient: mockMatchingClient, - tokenSerializer: tasktoken.NewSerializer(), - config: config, - namespaceRegistry: mockNamespaceCache, - hostInfoProvider: mockHostInfoProvider, - serviceResolver: mockServiceResolver, - membershipChangedCh: make(chan *membership.ChangedEvent, 1), - clusterMeta: clustertest.NewMetadataForTest(cluster.NewTestClusterMetadataConfig(false, true)), - timeSource: clock.NewRealTimeSource(), - visibilityManager: mockVisibilityManager, - nexusEndpointClient: newEndpointClient(config.NexusEndpointsRefreshInterval, nexusEndpointManager), - nexusEndpointsOwnershipLostCh: make(chan struct{}), - } + queryResults: collection.NewSyncMap[string, chan *queryResult](), + logger: logger, + throttledLogger: 
log.ThrottledLogger(logger), + metricsHandler: metrics.NoopMetricsHandler, + matchingRawClient: mockMatchingClient, + tokenSerializer: tasktoken.NewSerializer(), + config: config, + namespaceRegistry: mockNamespaceCache, + hostInfoProvider: mockHostInfoProvider, + serviceResolver: mockServiceResolver, + membershipChangedCh: make(chan *membership.ChangedEvent, 1), + clusterMeta: clustertest.NewMetadataForTest(cluster.NewTestClusterMetadataConfig(false, true)), + timeSource: clock.NewRealTimeSource(), + visibilityManager: mockVisibilityManager, + nexusEndpointClient: newEndpointClient(config.NexusEndpointsRefreshInterval, nexusEndpointManager), + } + e.nexusEndpointsOwnershipLostCh.Store(make(chan struct{})) + return e } func (s *matchingEngineSuite) newPartitionManager(prtn tqid.Partition, config *Config) taskQueuePartitionManager { @@ -1230,7 +1231,7 @@ func (s *matchingEngineSuite) TestSyncMatchActivities() { assert.EqualValues(collect, 0, s.taskManager.getTaskCount(dbq)) }, 2*time.Second, 100*time.Millisecond) - syncCtr := scope.Snapshot().Counters()["test.sync_throttle_count+namespace="+matchingTestNamespace+",namespace_state=active,operation=TaskQueueMgr,partition=0,service_name=matching,task_type=Activity,taskqueue=makeToast,worker_version=__unversioned__"] + syncCtr := scope.Snapshot().Counters()["test.sync_throttle_count+namespace="+matchingTestNamespace+",namespace_state=active,operation=TaskQueueMgr,partition=0,service_name=matching,task_type=Activity,taskqueue=makeToast,worker_build_id=,worker_deployment_name=,worker_version=__unversioned__"] s.Equal(1, int(syncCtr.Value())) // Check times zero rps is set = throttle counter expectedRange := int64((taskCount + 1) / 30) // Due to conflicts some ids are skipped and more real ranges are used. 
@@ -3579,7 +3580,7 @@ func (s *matchingEngineSuite) TestCheckNexusEndpointsOwnership() { } func (s *matchingEngineSuite) TestNotifyNexusEndpointsOwnershipLost() { - ch := s.matchingEngine.nexusEndpointsOwnershipLostCh + ch := s.matchingEngine.nexusEndpointsOwnershipLostCh.Load().(chan struct{}) //nolint:revive // type is always chan struct{} s.matchingEngine.notifyNexusEndpointsOwnershipChange() select { case <-ch: diff --git a/service/matching/physical_task_queue_manager.go b/service/matching/physical_task_queue_manager.go index 30e886ed45..173b5b1257 100644 --- a/service/matching/physical_task_queue_manager.go +++ b/service/matching/physical_task_queue_manager.go @@ -85,12 +85,11 @@ type ( clusterMeta cluster.Metadata metricsHandler metrics.Handler // namespace/taskqueue tagged metric scope // pollerHistory stores poller which poll from this taskqueue in last few minutes - pollerHistory *pollerHistory - currentPolls atomic.Int64 - taskValidator taskValidator - deploymentRegistrationCh chan struct{} - deploymentVersionRegistered bool - pollerScalingRateLimiter quotas.RateLimiter + pollerHistory *pollerHistory + currentPolls atomic.Int64 + taskValidator taskValidator + deploymentRegistrationCh chan struct{} + pollerScalingRateLimiter quotas.RateLimiter taskTrackerLock sync.RWMutex tasksAdded map[priorityKey]*taskTracker @@ -134,7 +133,10 @@ func newPhysicalTaskQueueManager( buildIDTag := tag.WorkerVersion(versionTagValue) taggedMetricsHandler := partitionMgr.metricsHandler.WithTags( metrics.OperationTag(metrics.MatchingTaskQueueMgrScope), - metrics.WorkerVersionTag(versionTagValue, config.BreakdownMetricsByBuildID())) + metrics.WorkerVersionTag(versionTagValue, config.BreakdownMetricsByBuildID()), + metrics.WorkerDeploymentNameTag(queue.Version().Deployment().GetSeriesName(), config.BreakdownMetricsByBuildID()), + metrics.WorkerDeploymentBuildIDTag(queue.Version().Deployment().GetBuildId(), config.BreakdownMetricsByBuildID()), + ) tqCtx, tqCancel := 
context.WithCancel(partitionMgr.callerInfoContext(context.Background())) @@ -713,6 +715,19 @@ func (c *physicalTaskQueueManagerImpl) ensureRegisteredInDeploymentVersion( return errMissingDeploymentVersion } + userData, _, err := c.partitionMgr.getPerTypeUserData() + if err != nil { + return err + } + + deploymentData := userData.GetDeploymentData() + if worker_versioning.HasDeploymentVersion(deploymentData, worker_versioning.DeploymentVersionFromDeployment(workerDeployment)) { + // already registered in user data, we can assume the workflow is running. + // TODO: consider replication scenarios where user data is replicated before + // the deployment workflow. + return nil + } + select { case <-ctx.Done(): return ctx.Err() @@ -730,17 +745,13 @@ func (c *physicalTaskQueueManagerImpl) ensureRegisteredInDeploymentVersion( } }() - if c.deploymentVersionRegistered { - // deployment version already registered - return nil - } - - userData, _, err := c.partitionMgr.GetUserDataManager().GetUserData() + // Recheck user data in case it was updated in the meantime while we were waiting for the lock. + userData, _, err = c.partitionMgr.getPerTypeUserData() if err != nil { return err } - deploymentData := userData.GetData().GetPerType()[int32(c.queue.TaskType())].GetDeploymentData() + deploymentData = userData.GetDeploymentData() if worker_versioning.HasDeploymentVersion(deploymentData, worker_versioning.DeploymentVersionFromDeployment(workerDeployment)) { // already registered in user data, we can assume the workflow is running. // TODO: consider replication scenarios where user data is replicated before @@ -790,11 +801,11 @@ func (c *physicalTaskQueueManagerImpl) ensureRegisteredInDeploymentVersion( // the deployment workflow will register itself in this task queue's user data. // wait for it to propagate here. 
for { - userData, userDataChanged, err := c.partitionMgr.GetUserDataManager().GetUserData() + userData, userDataChanged, err := c.partitionMgr.getPerTypeUserData() if err != nil { return err } - deploymentData := userData.GetData().GetPerType()[int32(c.queue.TaskType())].GetDeploymentData() + deploymentData := userData.GetDeploymentData() if worker_versioning.HasDeploymentVersion(deploymentData, worker_versioning.DeploymentVersionFromDeployment(workerDeployment)) { break } @@ -806,7 +817,6 @@ func (c *physicalTaskQueueManagerImpl) ensureRegisteredInDeploymentVersion( } } - c.deploymentVersionRegistered = true return nil } diff --git a/service/matching/task_queue_partition_manager.go b/service/matching/task_queue_partition_manager.go index c9dad7107d..47da025e35 100644 --- a/service/matching/task_queue_partition_manager.go +++ b/service/matching/task_queue_partition_manager.go @@ -7,6 +7,7 @@ import ( "maps" "math" "math/bits" + "strings" "sync" "time" @@ -1241,8 +1242,11 @@ func (pm *taskQueuePartitionManagerImpl) fetchAndEmitLogicalBacklogMetrics(ctx c pqInfo := vInfo.GetPhysicalTaskQueueInfo() + deploymentName, buildID := parseDeploymentFromVersionKey(versionKey) versionHandler := pm.metricsHandler.WithTags( metrics.WorkerVersionTag(versionKey, pm.config.BreakdownMetricsByBuildID()), + metrics.WorkerDeploymentNameTag(deploymentName, pm.config.BreakdownMetricsByBuildID()), + metrics.WorkerDeploymentBuildIDTag(buildID, pm.config.BreakdownMetricsByBuildID()), ) // Per-priority backlog count @@ -1278,8 +1282,11 @@ func (pm *taskQueuePartitionManagerImpl) emitZeroLogicalBacklogForQueue(version if !pm.config.BreakdownMetricsByTaskQueue() || !pm.config.BreakdownMetricsByPartition() { return } + deploymentName, buildID := parseDeploymentFromVersionKey(version.MetricsTagValue()) handler := pm.metricsHandler.WithTags( metrics.WorkerVersionTag(version.MetricsTagValue(), pm.config.BreakdownMetricsByBuildID()), + metrics.WorkerDeploymentNameTag(deploymentName, 
pm.config.BreakdownMetricsByBuildID()), + metrics.WorkerDeploymentBuildIDTag(buildID, pm.config.BreakdownMetricsByBuildID()), ) for pri := range pq.GetStatsByPriority(false) { metrics.ApproximateBacklogCount.With(handler).Record(0, metrics.MatchingTaskPriorityTag(pri)) @@ -1287,6 +1294,18 @@ func (pm *taskQueuePartitionManagerImpl) emitZeroLogicalBacklogForQueue(version metrics.ApproximateBacklogAgeSeconds.With(handler).Record(0) } +// parseDeploymentFromVersionKey extracts the deployment name and build ID from a version key +// string used as the map key in DescribeTaskQueuePartitionResponse.VersionsInfoInternal. +// The key format is "deploymentName:buildId" for V3 deployment-based versions, or empty for +// unversioned queues. Returns empty strings when the delimiter is not found (unversioned or +// V2 version-set keys). +func parseDeploymentFromVersionKey(versionKey string) (deploymentName, buildID string) { + if name, id, found := strings.Cut(versionKey, worker_versioning.WorkerDeploymentVersionDelimiter); found { + return name, id + } + return "", "" +} + func (pm *taskQueuePartitionManagerImpl) ephemeralDataChanged(data *taskqueuespb.EphemeralData) { // for now, only sticky partitions act on ephemeral data, normal partitions ignore it. if pm.partition.Kind() != enumspb.TASK_QUEUE_KIND_STICKY { diff --git a/service/matching/workers/worker_query_engine.go b/service/matching/workers/worker_query_engine.go index c5e468d0fe..ec23a929a9 100644 --- a/service/matching/workers/worker_query_engine.go +++ b/service/matching/workers/worker_query_engine.go @@ -64,12 +64,16 @@ Example query: Different fields can support different operators. 
- string fields (e.g., WorkerIdentity, HostName, TaskQueue, DeploymentName, BuildId, SdkName, SdkVersion): - starts_with, not starts_with + =, !=, starts_with, not starts_with, IS NULL, IS NOT NULL - time fields (e.g., StartTime, HeartbeatTime): - =, !=, >, >=, <, <=, between + =, !=, >, >=, <, <=, between, IS NULL, IS NOT NULL - metric fields (e.g., total_sticky_cache_hit): =, !=, >, >=, <, <= +For string fields, IS NULL matches workers where the field is empty, and IS NOT NULL matches +workers where the field is non-empty. For time fields, IS NULL matches workers where the +timestamp is not set. + Returns the list of workers for which the query matches the worker heartbeat, or an error, Errors are: - the query is invalid. @@ -219,7 +223,7 @@ func (w *workerQueryEngine) evaluateExpression(expr sqlparser.Expr) (bool, error case *sqlparser.RangeCond: return w.evaluateRange(e) case *sqlparser.IsExpr: - return false, serviceerror.NewInvalidArgumentf("%s: 'is' expression", notSupportedErrMessage) + return w.evaluateIsExpr(e) case *sqlparser.NotExpr: return false, serviceerror.NewInvalidArgumentf("%s: 'not' expression", notSupportedErrMessage) case *sqlparser.FuncExpr: @@ -254,6 +258,42 @@ func (w *workerQueryEngine) evaluateOr(expr *sqlparser.OrExpr) (bool, error) { return w.evaluateExpression(expr.Right) } +func (w *workerQueryEngine) evaluateIsExpr(expr *sqlparser.IsExpr) (bool, error) { + if expr == nil { + return false, serviceerror.NewInvalidArgumentf("IsExpr input expression cannot be nil") + } + + colNameExpr, ok := expr.Expr.(*sqlparser.ColName) + if !ok { + return false, serviceerror.NewInvalidArgumentf("invalid filter name: %s", sqlparser.String(expr.Expr)) + } + colName := strings.ReplaceAll(sqlparser.String(colNameExpr), "`", "") + + if expr.Operator != sqlparser.IsNullStr && expr.Operator != sqlparser.IsNotNullStr { + return false, serviceerror.NewInvalidArgumentf( + "%s: 'is' operator %q is not supported; only IS NULL and IS NOT NULL are supported", + 
notSupportedErrMessage, expr.Operator) + } + + isNull := expr.Operator == sqlparser.IsNullStr + + if propertyFunc, ok := propertyMapFuncs[colName]; ok { + isEmpty := propertyFunc(w.currentWorker) == "" + return isEmpty == isNull, nil + } + + switch colName { + case workerStartTimeColName, workerHeartbeatTimeColName: + timeValue, err := w.getTimeValue(colName) + if err != nil { + return false, err + } + return timeValue.IsZero() == isNull, nil + default: + return false, serviceerror.NewInvalidArgumentf("unknown or unsupported worker heartbeat search field: %s", colName) + } +} + func (w *workerQueryEngine) evaluateComparison(expr *sqlparser.ComparisonExpr) (bool, error) { if expr == nil { return false, serviceerror.NewInvalidArgumentf("ComparisonExpr input expression cannot be nil") diff --git a/service/matching/workers/worker_query_engine_test.go b/service/matching/workers/worker_query_engine_test.go index 0e7e255384..2c0ea7cffc 100644 --- a/service/matching/workers/worker_query_engine_test.go +++ b/service/matching/workers/worker_query_engine_test.go @@ -5,6 +5,7 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" deploymentpb "go.temporal.io/api/deployment/v1" enumspb "go.temporal.io/api/enums/v1" workerpb "go.temporal.io/api/worker/v1" @@ -348,3 +349,176 @@ func TestActivityInfoMatchEvaluator_SupportedTimeFields(t *testing.T) { }) } } + +func TestWorkerQueryEngine_IsNullString(t *testing.T) { + hbWithDeployment := &workerpb.WorkerHeartbeat{ + TaskQueue: "task_queue", + DeploymentVersion: &deploymentpb.WorkerDeploymentVersion{ + DeploymentName: "my-deployment", + }, + } + hbWithoutDeployment := &workerpb.WorkerHeartbeat{ + TaskQueue: "task_queue", + } + + engine, err := newWorkerQueryEngine("nsID", fmt.Sprintf("%s IS NULL", workerDeploymentNameColName)) + require.NoError(t, err) + + match, err := engine.EvaluateWorker(hbWithoutDeployment) + require.NoError(t, err) + assert.True(t, match, "IS NULL should match when 
DeploymentName is empty") + + match, err = engine.EvaluateWorker(hbWithDeployment) + require.NoError(t, err) + assert.False(t, match, "IS NULL should not match when DeploymentName is set") +} + +func TestWorkerQueryEngine_IsNotNullString(t *testing.T) { + hbWithDeployment := &workerpb.WorkerHeartbeat{ + TaskQueue: "task_queue", + DeploymentVersion: &deploymentpb.WorkerDeploymentVersion{ + DeploymentName: "my-deployment", + }, + } + hbWithoutDeployment := &workerpb.WorkerHeartbeat{ + TaskQueue: "task_queue", + } + + engine, err := newWorkerQueryEngine("nsID", fmt.Sprintf("%s IS NOT NULL", workerDeploymentNameColName)) + require.NoError(t, err) + + match, err := engine.EvaluateWorker(hbWithDeployment) + require.NoError(t, err) + assert.True(t, match, "IS NOT NULL should match when DeploymentName is set") + + match, err = engine.EvaluateWorker(hbWithoutDeployment) + require.NoError(t, err) + assert.False(t, match, "IS NOT NULL should not match when DeploymentName is empty") +} + +func TestWorkerQueryEngine_IsNullTime(t *testing.T) { + startTimeStr := "2023-10-26T14:30:00Z" + startTime, err := sqlquery.ConvertToTime(fmt.Sprintf("'%s'", startTimeStr)) + require.NoError(t, err) + + hbWithTime := &workerpb.WorkerHeartbeat{ + TaskQueue: "task_queue", + StartTime: timestamppb.New(startTime), + HeartbeatTime: timestamppb.New(startTime), + } + hbWithoutTime := &workerpb.WorkerHeartbeat{ + TaskQueue: "task_queue", + } + + for _, colName := range []string{workerStartTimeColName, workerHeartbeatTimeColName} { + engine, err := newWorkerQueryEngine("nsID", fmt.Sprintf("%s IS NULL", colName)) + require.NoError(t, err) + + t.Run(fmt.Sprintf("%s matches when not set", colName), func(t *testing.T) { + match, err := engine.EvaluateWorker(hbWithoutTime) + require.NoError(t, err) + assert.True(t, match) + }) + + t.Run(fmt.Sprintf("%s does not match when set", colName), func(t *testing.T) { + match, err := engine.EvaluateWorker(hbWithTime) + require.NoError(t, err) + assert.False(t, 
match) + }) + } +} + +func TestWorkerQueryEngine_IsNotNullTime(t *testing.T) { + startTimeStr := "2023-10-26T14:30:00Z" + startTime, err := sqlquery.ConvertToTime(fmt.Sprintf("'%s'", startTimeStr)) + require.NoError(t, err) + + hbWithTime := &workerpb.WorkerHeartbeat{ + TaskQueue: "task_queue", + StartTime: timestamppb.New(startTime), + HeartbeatTime: timestamppb.New(startTime), + } + hbWithoutTime := &workerpb.WorkerHeartbeat{ + TaskQueue: "task_queue", + } + + for _, colName := range []string{workerStartTimeColName, workerHeartbeatTimeColName} { + engine, err := newWorkerQueryEngine("nsID", fmt.Sprintf("%s IS NOT NULL", colName)) + require.NoError(t, err) + + t.Run(fmt.Sprintf("%s matches when set", colName), func(t *testing.T) { + match, err := engine.EvaluateWorker(hbWithTime) + require.NoError(t, err) + assert.True(t, match) + }) + + t.Run(fmt.Sprintf("%s does not match when not set", colName), func(t *testing.T) { + match, err := engine.EvaluateWorker(hbWithoutTime) + require.NoError(t, err) + assert.False(t, match) + }) + } +} + +func TestWorkerQueryEngine_IsNullComposite(t *testing.T) { + hb := &workerpb.WorkerHeartbeat{ + TaskQueue: "task_queue", + SdkName: "temporal-go", + } + + tests := []struct { + name string + query string + expectedMatch bool + }{ + { + name: "IS NOT NULL AND equality, both match", + query: fmt.Sprintf("%s IS NOT NULL AND %s = 'task_queue'", workerSdkNameColName, workerTaskQueueColName), + expectedMatch: true, + }, + { + name: "IS NULL OR equality, right matches", + query: fmt.Sprintf("%s IS NULL OR %s = 'task_queue'", workerSdkNameColName, workerTaskQueueColName), + expectedMatch: true, + }, + { + name: "IS NULL AND equality, left fails", + query: fmt.Sprintf("%s IS NULL AND %s = 'task_queue'", workerSdkNameColName, workerTaskQueueColName), + expectedMatch: false, + }, + { + name: "IS NULL with empty field in composite", + query: fmt.Sprintf("%s IS NULL AND %s IS NOT NULL", workerDeploymentNameColName, workerSdkNameColName), + 
expectedMatch: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + engine, err := newWorkerQueryEngine("nsID", tt.query) + require.NoError(t, err) + match, err := engine.EvaluateWorker(hb) + require.NoError(t, err) + assert.Equal(t, tt.expectedMatch, match) + }) + } +} + +func TestWorkerQueryEngine_IsNullRejectsUnknownColumn(t *testing.T) { + hb := &workerpb.WorkerHeartbeat{TaskQueue: "task_queue"} + engine, err := newWorkerQueryEngine("nsID", "UnknownField IS NULL") + require.NoError(t, err) + _, err = engine.EvaluateWorker(hb) + require.Error(t, err) + assert.Contains(t, err.Error(), "unknown or unsupported") +} + +// Only IS NULL and IS NOT NULL are supported; other IS operators (e.g., IS TRUE) should be rejected. +func TestWorkerQueryEngine_IsExprRejectsUnsupportedOperators(t *testing.T) { + hb := &workerpb.WorkerHeartbeat{TaskQueue: "task_queue"} + engine, err := newWorkerQueryEngine("nsID", fmt.Sprintf("%s IS TRUE", workerTaskQueueColName)) + require.NoError(t, err) + _, err = engine.EvaluateWorker(hb) + require.Error(t, err) + assert.Contains(t, err.Error(), "not supported") +} diff --git a/service/worker/batcher/activities.go b/service/worker/batcher/activities.go index 7cd1f5445e..2262da2597 100644 --- a/service/worker/batcher/activities.go +++ b/service/worker/batcher/activities.go @@ -266,8 +266,19 @@ type activities struct { concurrency dynamicconfig.IntPropertyFnWithNamespaceFilter } -func (a *activities) checkNamespaceID(namespaceID string) error { - if namespaceID != a.namespaceID.String() { +// checkNamespace validates that batchParams targets the worker's own namespace. +// The NamespaceId, Request.Namespace (if set), and AdminRequest.Namespace (if set) +// must all agree with the worker's bound namespace. This prevents cross-namespace +// escalation via the privileged internal-frontend connection (NoopClaimMapper → RoleAdmin). 
+func (a *activities) checkNamespace(batchParams *batchspb.BatchOperationInput) error { + if batchParams.NamespaceId != a.namespaceID.String() { + return errNamespaceMismatch + } + ns := a.namespace.String() + if req := batchParams.GetRequest(); req != nil && req.GetNamespace() != ns { + return errNamespaceMismatch + } + if req := batchParams.GetAdminRequest(); req != nil && req.GetNamespace() != ns { return errNamespaceMismatch } return nil @@ -280,14 +291,15 @@ func (a *activities) BatchActivityWithProtobuf(ctx context.Context, batchParams hbd := HeartBeatDetails{} metricsHandler := a.MetricsHandler.WithTags(metrics.OperationTag(metrics.BatcherScope), metrics.NamespaceIDTag(batchParams.NamespaceId)) - if err := a.checkNamespaceID(batchParams.NamespaceId); err != nil { + if err := a.checkNamespace(batchParams); err != nil { metrics.BatcherOperationFailures.With(metricsHandler).Record(1) logger.Error("Failed to run batch operation due to namespace mismatch", tag.Error(err)) return hbd, err } + ns := a.namespace.String() sdkClient := a.ClientFactory.NewClient(sdkclient.Options{ - Namespace: a.namespace.String(), + Namespace: ns, DataConverter: sdk.PreferProtoDataConverter, }) startOver := true @@ -299,19 +311,16 @@ func (a *activities) BatchActivityWithProtobuf(ctx context.Context, batchParams } } - // Get namespace and query based on request type (public vs admin) - var ns string + // Get executions based on request type (public vs admin). 
var visibilityQuery string var executions []*commonpb.WorkflowExecution if batchParams.AdminRequest != nil { ctx = headers.SetCallerType(ctx, headers.CallerTypePreemptable) adminReq := batchParams.AdminRequest - ns = adminReq.Namespace visibilityQuery = adminReq.GetVisibilityQuery() executions = adminReq.GetExecutions() } else { - ns = batchParams.Request.Namespace visibilityQuery = a.adjustQueryBatchTypeEnum(batchParams.Request.VisibilityQuery, batchParams.BatchType) executions = batchParams.Request.Executions } @@ -482,7 +491,7 @@ func (a *activities) startTaskProcessor( } else { // Old fields //nolint:staticcheck // SA1019: worker versioning v0.31 - eventId, err = getResetEventIDByType(ctx, operation.ResetOperation.ResetType, batchOperation.Request.Namespace, executionInfo.Execution, frontendClient, logger) + eventId, err = getResetEventIDByType(ctx, operation.ResetOperation.ResetType, namespace, executionInfo.Execution, frontendClient, logger) //nolint:staticcheck // SA1019: worker versioning v0.31 resetReapplyType = operation.ResetOperation.ResetReapplyType } diff --git a/service/worker/batcher/activities_namespace_test.go b/service/worker/batcher/activities_namespace_test.go new file mode 100644 index 0000000000..833d9bc653 --- /dev/null +++ b/service/worker/batcher/activities_namespace_test.go @@ -0,0 +1,156 @@ +package batcher + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + batchpb "go.temporal.io/api/batch/v1" + commonpb "go.temporal.io/api/common/v1" + workflowpb "go.temporal.io/api/workflow/v1" + "go.temporal.io/api/workflowservice/v1" + "go.temporal.io/sdk/testsuite" + "go.temporal.io/server/api/adminservice/v1" + batchspb "go.temporal.io/server/api/batch/v1" + "go.temporal.io/server/common/dynamicconfig" + "go.temporal.io/server/common/log" + "go.temporal.io/server/common/metrics" + "go.temporal.io/server/common/namespace" + "go.temporal.io/server/common/testing/mockapi/workflowservicemock/v1" + "go.uber.org/mock/gomock" 
+ "golang.org/x/time/rate" +) + +// These tests guard against cross-namespace escalation via the batcher activity. +// The per-NS worker's frontendClient dials internal-frontend (NoopClaimMapper → +// RoleAdmin), so the activity MUST NOT forward a user-controlled namespace string +// to frontendClient calls. The namespace for all downstream operations must be +// the worker's bound namespace. + +const ( + boundNSName = "bound-ns" + boundNSID = "bound-ns-id" + otherNSName = "other-ns" +) + +func newBoundActivities(frontend workflowservice.WorkflowServiceClient) *activities { + return &activities{ + activityDeps: activityDeps{ + MetricsHandler: metrics.NoopMetricsHandler, + Logger: log.NewTestLogger(), + FrontendClient: frontend, + }, + namespace: namespace.Name(boundNSName), + namespaceID: namespace.ID(boundNSID), + rps: dynamicconfig.GetIntPropertyFnFilteredByNamespace(50), + concurrency: dynamicconfig.GetIntPropertyFnFilteredByNamespace(1), + } +} + +// TestBatchActivityWithProtobuf_RejectsMismatchedRequestNamespace verifies that +// BatchActivityWithProtobuf rejects a request whose Request.Namespace differs +// from the worker's bound namespace, even when NamespaceId is valid. +// This blocks the cross-namespace attack where an attacker submits a valid +// NamespaceId for their own namespace but sets Request.Namespace to a victim's. 
+func TestBatchActivityWithProtobuf_RejectsMismatchedRequestNamespace(t *testing.T) { + ts := testsuite.WorkflowTestSuite{} + env := ts.NewTestActivityEnvironment() + a := newBoundActivities(nil) + env.RegisterActivity(a.BatchActivityWithProtobuf) + + input := &batchspb.BatchOperationInput{ + NamespaceId: boundNSID, // ID check passes; name check must catch the mismatch + Request: &workflowservice.StartBatchOperationRequest{ + Namespace: otherNSName, // mismatched — must be rejected + Operation: &workflowservice.StartBatchOperationRequest_SignalOperation{ + SignalOperation: &batchpb.BatchOperationSignal{Signal: "s"}, + }, + Executions: []*commonpb.WorkflowExecution{{WorkflowId: "w"}}, + }, + } + + _, err := env.ExecuteActivity(a.BatchActivityWithProtobuf, input) + require.Error(t, err) + require.ErrorContains(t, err, errNamespaceMismatch.Error()) +} + +// TestBatchActivityWithProtobuf_RejectsMismatchedAdminRequestNamespace verifies +// that the same namespace mismatch check applies to admin batch requests. +func TestBatchActivityWithProtobuf_RejectsMismatchedAdminRequestNamespace(t *testing.T) { + ts := testsuite.WorkflowTestSuite{} + env := ts.NewTestActivityEnvironment() + a := newBoundActivities(nil) + env.RegisterActivity(a.BatchActivityWithProtobuf) + + input := &batchspb.BatchOperationInput{ + NamespaceId: boundNSID, + AdminRequest: &adminservice.StartAdminBatchOperationRequest{ + Namespace: otherNSName, // mismatched — must be rejected + Executions: []*commonpb.WorkflowExecution{{WorkflowId: "w"}}, + }, + } + + _, err := env.ExecuteActivity(a.BatchActivityWithProtobuf, input) + require.Error(t, err) + require.ErrorContains(t, err, errNamespaceMismatch.Error()) +} + +// TestStartTaskProcessor_UsesWorkerBoundNamespaceForSignal verifies that when +// BatchActivityWithProtobuf dispatches via startTaskProcessor, the namespace +// delivered to frontendClient.SignalWorkflowExecution is the worker's bound +// namespace. 
This is a belt-and-suspenders check: even if the early validation +// above is ever relaxed, the namespace used in operations stays worker-bound. +func TestStartTaskProcessor_UsesWorkerBoundNamespaceForSignal(t *testing.T) { + r := require.New(t) + ctrl := gomock.NewController(t) + mockFE := workflowservicemock.NewMockWorkflowServiceClient(ctrl) + + var captured *workflowservice.SignalWorkflowExecutionRequest + mockFE.EXPECT(). + SignalWorkflowExecution(gomock.Any(), gomock.Any()). + DoAndReturn(func(_ context.Context, req *workflowservice.SignalWorkflowExecutionRequest, _ ...any) (*workflowservice.SignalWorkflowExecutionResponse, error) { + captured = req + return &workflowservice.SignalWorkflowExecutionResponse{}, nil + }) + + a := newBoundActivities(mockFE) + + // Simulate the namespace that BatchActivityWithProtobuf derives — + // with the fix this is always a.namespace.String(). + ns := a.namespace.String() + batchOp := &batchspb.BatchOperationInput{ + NamespaceId: boundNSID, + Request: &workflowservice.StartBatchOperationRequest{ + Namespace: ns, + Operation: &workflowservice.StartBatchOperationRequest_SignalOperation{ + SignalOperation: &batchpb.BatchOperationSignal{Signal: "s"}, + }, + }, + } + + taskCh := make(chan task, 1) + respCh := make(chan taskResponse, 1) + taskCh <- task{ + executionInfo: &workflowpb.WorkflowExecutionInfo{ + Execution: &commonpb.WorkflowExecution{WorkflowId: "w"}, + }, + page: &page{}, + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + defer close(done) + a.startTaskProcessor(ctx, batchOp, ns, taskCh, respCh, + rate.NewLimiter(rate.Inf, 1), nil, mockFE, + metrics.NoopMetricsHandler, log.NewTestLogger()) + }() + + <-respCh + cancel() + <-done + + r.NotNil(captured) + r.Equal(boundNSName, captured.Namespace, + "frontendClient.SignalWorkflowExecution must receive the worker's bound namespace") +} diff --git a/service/worker/batcher/activities_test.go 
b/service/worker/batcher/activities_test.go index d7cac8bfe4..65ea595503 100644 --- a/service/worker/batcher/activities_test.go +++ b/service/worker/batcher/activities_test.go @@ -10,6 +10,7 @@ import ( "unicode" "github.com/stretchr/testify/suite" + batchpb "go.temporal.io/api/batch/v1" commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" historypb "go.temporal.io/api/history/v1" @@ -22,6 +23,7 @@ import ( "go.temporal.io/server/api/historyservice/v1" "go.temporal.io/server/api/historyservicemock/v1" "go.temporal.io/server/common/log" + "go.temporal.io/server/common/metrics" "go.temporal.io/server/common/primitives/timestamp" "go.temporal.io/server/common/testing/mockapi/workflowservicemock/v1" "go.uber.org/mock/gomock" @@ -565,6 +567,74 @@ func (s *activitiesSuite) TestIsNonRetryableError() { } } +// TestStartTaskProcessor_SignalUsesWorkerNamespace verifies that startTaskProcessor uses +// the worker's authoritative namespace (passed as the namespace argument) for operations, +// not the user-controlled namespace from batchOperation.Request.Namespace. +// This guards against a regression introduced in PR #8144 where batchParams.Request.Namespace +// (user-controlled) was used instead of a.namespace.String() (server-trusted). 
+func (s *activitiesSuite) TestStartTaskProcessor_SignalUsesWorkerNamespace() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + a := &activities{ + activityDeps: activityDeps{ + FrontendClient: s.mockFrontendClient, + Logger: log.NewTestLogger(), + MetricsHandler: metrics.NoopMetricsHandler, + }, + } + + workerNamespace := "trusted-namespace" + requestNamespace := "untrusted-namespace" // intentionally different + + batchOperation := &batchspb.BatchOperationInput{ + NamespaceId: "some-namespace-id", + Request: &workflowservice.StartBatchOperationRequest{ + Namespace: requestNamespace, + Operation: &workflowservice.StartBatchOperationRequest_SignalOperation{ + SignalOperation: &batchpb.BatchOperationSignal{ + Signal: "test-signal", + }, + }, + }, + } + + testPage := &page{ + executionInfos: []*workflowpb.WorkflowExecutionInfo{ + { + Execution: &commonpb.WorkflowExecution{ + WorkflowId: "test-workflow-id", + RunId: "test-run-id", + }, + }, + }, + } + testTask := task{ + executionInfo: testPage.executionInfos[0], + attempts: 1, + page: testPage, + } + + taskCh := make(chan task, 1) + respCh := make(chan taskResponse, 1) + limiter := rate.NewLimiter(rate.Limit(100), 1) + + // The signal must be executed with the worker's trusted namespace, not the user-supplied one. + s.mockFrontendClient.EXPECT(). + SignalWorkflowExecution(gomock.Any(), gomock.Any()). 
+ DoAndReturn(func(_ context.Context, req *workflowservice.SignalWorkflowExecutionRequest, _ ...any) (*workflowservice.SignalWorkflowExecutionResponse, error) { + s.Equal(workerNamespace, req.Namespace, "must use worker namespace, not request namespace") + return &workflowservice.SignalWorkflowExecutionResponse{}, nil + }) + + taskCh <- testTask + + go a.startTaskProcessor(ctx, batchOperation, workerNamespace, taskCh, respCh, limiter, nil, s.mockFrontendClient, metrics.NoopMetricsHandler, log.NewTestLogger()) + + resp := <-respCh + s.NoError(resp.err) +} + func (s *activitiesSuite) TestProcessAdminTask_UnknownOperation() { ctx := context.Background() diff --git a/service/worker/scheduler/activities.go b/service/worker/scheduler/activities.go index 1b17767846..471faafbfb 100644 --- a/service/worker/scheduler/activities.go +++ b/service/worker/scheduler/activities.go @@ -375,6 +375,13 @@ func (r responseBuilder) makeResponse(result *commonpb.Payloads, failure *failur } func (a *activities) MigrateScheduleToChasm(ctx context.Context, req *schedulerpb.CreateFromMigrationStateRequest) error { + if req.GetNamespaceId() != a.namespaceID.String() { + return temporal.NewNonRetryableApplicationError( + fmt.Sprintf("MigrateScheduleToChasm: request namespace ID %q does not match activity namespace ID %q", req.GetNamespaceId(), a.namespaceID), + "namespace_mismatch", + nil, + ) + } _, err := a.SchedulerClient.CreateFromMigrationState(ctx, req) if err != nil { // Treat "already exists" as success (idempotency). 
diff --git a/service/worker/scheduler/activities_test.go b/service/worker/scheduler/activities_test.go index 5fd2b8419b..c667033b36 100644 --- a/service/worker/scheduler/activities_test.go +++ b/service/worker/scheduler/activities_test.go @@ -10,6 +10,7 @@ import ( schedulerpb "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb/v1" "go.temporal.io/server/common/log" "go.temporal.io/server/common/metrics" + "go.temporal.io/server/common/namespace" "google.golang.org/grpc" ) @@ -26,21 +27,26 @@ func (m *mockSchedulerClient) CreateFromMigrationState( return &schedulerpb.CreateFromMigrationStateResponse{}, m.migrateErr } -func newTestActivities(client schedulerpb.SchedulerServiceClient) *activities { +func newTestActivities(client schedulerpb.SchedulerServiceClient, nsID namespace.ID) *activities { return &activities{ activityDeps: activityDeps{ Logger: log.NewNoopLogger(), SchedulerClient: client, MetricsHandler: metrics.NoopMetricsHandler, }, + namespaceID: nsID, } } +const testNamespaceID = "test-namespace-id" + func TestMigrateScheduleToChasm_Success(t *testing.T) { client := &mockSchedulerClient{} - a := newTestActivities(client) + a := newTestActivities(client, testNamespaceID) - err := a.MigrateScheduleToChasm(context.Background(), &schedulerpb.CreateFromMigrationStateRequest{}) + err := a.MigrateScheduleToChasm(context.Background(), &schedulerpb.CreateFromMigrationStateRequest{ + NamespaceId: testNamespaceID, + }) require.NoError(t, err) } @@ -48,9 +54,11 @@ func TestMigrateScheduleToChasm_AlreadyExists(t *testing.T) { client := &mockSchedulerClient{ migrateErr: serviceerror.NewAlreadyExistsf("schedule %q is already registered", "test-schedule"), } - a := newTestActivities(client) + a := newTestActivities(client, testNamespaceID) - err := a.MigrateScheduleToChasm(context.Background(), &schedulerpb.CreateFromMigrationStateRequest{}) + err := a.MigrateScheduleToChasm(context.Background(), &schedulerpb.CreateFromMigrationStateRequest{ + NamespaceId: 
testNamespaceID, + }) require.NoError(t, err, "already-exists should be treated as success") } @@ -58,9 +66,11 @@ func TestMigrateScheduleToChasm_SentinelBlocked(t *testing.T) { client := &mockSchedulerClient{ migrateErr: serviceerror.NewUnavailable("schedule is a sentinel; please retry after sentinel expires"), } - a := newTestActivities(client) + a := newTestActivities(client, testNamespaceID) - err := a.MigrateScheduleToChasm(context.Background(), &schedulerpb.CreateFromMigrationStateRequest{}) + err := a.MigrateScheduleToChasm(context.Background(), &schedulerpb.CreateFromMigrationStateRequest{ + NamespaceId: testNamespaceID, + }) require.Error(t, err) require.Contains(t, err.Error(), "blocked by sentinel") } @@ -69,9 +79,24 @@ func TestMigrateScheduleToChasm_OtherError(t *testing.T) { client := &mockSchedulerClient{ migrateErr: errors.New("some transient error"), } - a := newTestActivities(client) + a := newTestActivities(client, testNamespaceID) - err := a.MigrateScheduleToChasm(context.Background(), &schedulerpb.CreateFromMigrationStateRequest{}) + err := a.MigrateScheduleToChasm(context.Background(), &schedulerpb.CreateFromMigrationStateRequest{ + NamespaceId: testNamespaceID, + }) require.Error(t, err) require.Contains(t, err.Error(), "MigrateScheduleToChasm") } + +func TestMigrateScheduleToChasm_NamespaceMismatch(t *testing.T) { + client := &mockSchedulerClient{} + a := newTestActivities(client, testNamespaceID) + + err := a.MigrateScheduleToChasm(context.Background(), &schedulerpb.CreateFromMigrationStateRequest{ + NamespaceId: "different-namespace-id", + }) + require.Error(t, err) + require.Contains(t, err.Error(), "namespace_mismatch") + require.Contains(t, err.Error(), "different-namespace-id") + require.Contains(t, err.Error(), testNamespaceID) +} diff --git a/service/worker/workerdeployment/client.go b/service/worker/workerdeployment/client.go index 2355978153..4145d67320 100644 --- a/service/worker/workerdeployment/client.go +++ 
b/service/worker/workerdeployment/client.go @@ -34,7 +34,6 @@ import ( "go.temporal.io/server/common/searchattribute/sadefs" "go.temporal.io/server/common/testing/testhooks" "go.temporal.io/server/common/worker_versioning" - "google.golang.org/protobuf/types/known/timestamppb" ) type Client interface { @@ -1680,19 +1679,6 @@ func (d *ClientImpl) makeVersionWorkflowArgs( return &deploymentspb.WorkerDeploymentVersionWorkflowArgs{ NamespaceName: namespaceEntry.Name().String(), NamespaceId: namespaceEntry.ID().String(), - VersionState: &deploymentspb.VersionLocalState{ - Version: &deploymentspb.WorkerDeploymentVersion{ - DeploymentName: deploymentName, - BuildId: buildID, - }, - CreateTime: timestamppb.Now(), - RoutingUpdateTime: nil, - CurrentSinceTime: nil, // not current - RampingSinceTime: nil, // not ramping - RampPercentage: 0, // not ramping - DrainageInfo: &deploymentpb.VersionDrainageInfo{}, // not draining or drained - Metadata: nil, - SyncBatchSize: d.getSyncBatchSize(), - }, + VersionState: makeNewVersionState(deploymentName, buildID, time.Now(), d.getSyncBatchSize()), } } diff --git a/service/worker/workerdeployment/util.go b/service/worker/workerdeployment/util.go index c54f03f40f..92fbec9400 100644 --- a/service/worker/workerdeployment/util.go +++ b/service/worker/workerdeployment/util.go @@ -30,6 +30,7 @@ import ( "go.temporal.io/server/service/history/api" "go.temporal.io/server/service/history/consts" update2 "go.temporal.io/server/service/history/workflow/update" + "google.golang.org/protobuf/types/known/timestamppb" ) const ( @@ -426,3 +427,20 @@ func buildSearchAttributes() *commonpb.SearchAttributes { searchattribute.AddSearchAttribute(&sa, sadefs.TemporalNamespaceDivision, payload.EncodeString(WorkerDeploymentNamespaceDivision)) return sa } + +func makeNewVersionState(deploymentName, buildID string, createTime time.Time, syncBatchSize int32) *deploymentspb.VersionLocalState { + return &deploymentspb.VersionLocalState{ + Version: 
&deploymentspb.WorkerDeploymentVersion{ + DeploymentName: deploymentName, + BuildId: buildID, + }, + CreateTime: timestamppb.New(createTime), + RoutingUpdateTime: nil, + CurrentSinceTime: nil, // not current + RampingSinceTime: nil, // not ramping + RampPercentage: 0, // not ramping + DrainageInfo: &deploymentpb.VersionDrainageInfo{}, // not draining or drained + Metadata: nil, + SyncBatchSize: syncBatchSize, + } +} diff --git a/service/worker/workerdeployment/version_workflow.go b/service/worker/workerdeployment/version_workflow.go index abd67c69e4..cb6549d7e8 100644 --- a/service/worker/workerdeployment/version_workflow.go +++ b/service/worker/workerdeployment/version_workflow.go @@ -52,7 +52,7 @@ type ( // Track if async propagations are in progress (prevents CaN) asyncPropagationsInProgress int // When true, all the ongoing propagations should cancel themselves - // Deprecated. With version data revision number, we don't need to cancel propagations anymore. + // Used when delete happens while there are ongoing propagations. cancelPropagations bool // workflowVersion is set at workflow start based on the dynamic config of the worker // that completes the first task. It remains constant for the lifetime of the run and @@ -152,6 +152,13 @@ func (d *VersionWorkflowRunner) listenToSignals(ctx workflow.Context) { c.Receive(ctx, nil) // No payload needed + if d.deleteVersion { + // This is only possible if delete happened between the time that history checked for version presence and before the signal arrived to the workflow. + // Note that generally History is supposed to not allow VersioningOverrides for deleted versions, but this race condition is possible. + // We should drop the signal in this case to be consistent with the case where the same signal arrived after this workflow is closed. + // In that case signal fails completely and the error is ignored by History. 
+ return + } // Only reactivate if DRAINED or INACTIVE if d.VersionState.Status == enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINED || d.VersionState.Status == enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE { @@ -163,8 +170,6 @@ func (d *VersionWorkflowRunner) listenToSignals(ctx workflow.Context) { LastCheckedTime: timestamppb.New(workflow.Now(ctx)), } - d.deleteVersion = false // Clear deletion flag if set - // Use existing function to update status and sync to task queues d.updateVersionStatusAfterDrainageStatusChange(ctx, enumspb.VERSION_DRAINAGE_STATUS_DRAINING) d.syncSummary(ctx) // Notify parent deployment workflow of the status change @@ -273,8 +278,8 @@ func (d *VersionWorkflowRunner) run(ctx workflow.Context) error { return (d.deleteVersion && d.asyncPropagationsInProgress == 0) || // version is deleted -> it's ok to drop all signals and updates. // There is no pending signal or update, but the state is dirty or forceCaN is requested: (!d.signalHandler.signalSelector.HasPending() && d.signalHandler.processingSignals == 0 && workflow.AllHandlersFinished(ctx) && - // And there is a force CaN or a propagated state change - (d.forceCAN || (d.stateChanged && d.asyncPropagationsInProgress == 0))) + // And there is a force CaN or a propagated state change or history got too large + (d.forceCAN || (d.stateChanged && d.asyncPropagationsInProgress == 0) || workflow.GetInfo(ctx).GetContinueAsNewSuggested())) }) if err != nil { return err @@ -433,8 +438,13 @@ func (d *VersionWorkflowRunner) handleDeleteVersion(ctx workflow.Context, args * if args.AsyncPropagation { d.deleteVersion = true - if d.hasMinVersion(VersionDataRevisionNumber) { - d.syncTaskQueuesAsync(ctx, nil, true) + if workflow.GetVersion(ctx, "serialDelete", workflow.DefaultVersion, 1) == workflow.DefaultVersion { + if d.hasMinVersion(VersionDataRevisionNumber) { + d.syncTaskQueuesAsync(ctx, nil, true) + } else { + d.asyncPropagationsInProgress++ + workflow.Go(ctx, d.deleteVersionFromTaskQueuesAsync) 
+ } } else { d.asyncPropagationsInProgress++ workflow.Go(ctx, d.deleteVersionFromTaskQueuesAsync) @@ -575,16 +585,29 @@ func (d *VersionWorkflowRunner) handleRegisterWorker(ctx workflow.Context, args err = workflow.Await(ctx, func() bool { return d.asyncPropagationsInProgress == 0 }) + if err != nil { + return err + } } - if err != nil { - return err - } + if d.deleteVersion { - // In case it was marked as deleted we make it undeleted - d.deleteVersion = false - if withRevisionNumbers { - // If we're changing the version data, we need to increment the revision number - d.GetVersionState().RevisionNumber++ + if workflow.GetVersion(ctx, "awaitSerialDelete", workflow.DefaultVersion, 1) == workflow.DefaultVersion { + // In case it was marked as deleted we make it undeleted + d.deleteVersion = false + if withRevisionNumbers { + // If we're changing the version data, we need to increment the revision number + d.GetVersionState().RevisionNumber++ + } + } else { + // In case this version just got deleted, we wait until it finished propagating delete to all task queues before reviving it. + // This is because the deleted flag propagation is not protected by revision number and if done parallel to other propagations, it can cause a race condition. + err = workflow.Await(ctx, func() bool { + return d.asyncPropagationsInProgress == 0 + }) + if err != nil { + return err + } + d.reviveDeleted(ctx) } } @@ -616,6 +639,14 @@ func (d *VersionWorkflowRunner) handleRegisterWorker(ctx workflow.Context, args return err } +func (d *VersionWorkflowRunner) reviveDeleted(ctx workflow.Context) { + // Resetting state to get rid of the info from the past life. 
+ state := makeNewVersionState(d.VersionState.Version.DeploymentName, d.VersionState.Version.BuildId, workflow.Now(ctx), d.VersionState.SyncBatchSize) + state.Status = enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE + d.VersionState = state + d.deleteVersion = false +} + func (d *VersionWorkflowRunner) syncRegisteredTaskQueueOld(ctx workflow.Context, args *deploymentspb.RegisterWorkerInVersionArgs) error { // initial data var data *deploymentspb.DeploymentVersionData diff --git a/service/worker/workerdeployment/version_workflow_test.go b/service/worker/workerdeployment/version_workflow_test.go index 2b7696142e..b33346bc8a 100644 --- a/service/worker/workerdeployment/version_workflow_test.go +++ b/service/worker/workerdeployment/version_workflow_test.go @@ -3,6 +3,7 @@ package workerdeployment import ( "context" "fmt" + "sync/atomic" "testing" "time" @@ -29,7 +30,7 @@ type VersionWorkflowSuite struct { workflowVersion DeploymentWorkflowVersion } -func TestVersionWorkflowSuiteV2(t *testing.T) { +func TestVersionWorkflowSuite(t *testing.T) { t.Parallel() suite.Run(t, &VersionWorkflowSuite{workflowVersion: VersionDataRevisionNumber}) } @@ -990,9 +991,9 @@ func (s *VersionWorkflowSuite) Test_DeleteVersion_AsyncPropagation_BlocksWorkerR s.True(workerRegistrationCompleted, "worker registration should have completed") } -// Test_RegisterWorker_IncrementsRevisionNumber_WhenRevivingDeletedVersion tests that the revision number -// is incremented when a worker registers on a version that was previously deleted -func (s *VersionWorkflowSuite) Test_RegisterWorker_IncrementsRevisionNumber_WhenRevivingDeletedVersion() { +// Test_RegisterWorker_ResetRevisionNumber_WhenRevivingDeletedVersion tests that the revision number +// is reset to 0 when a worker registers on a version that was previously deleted +func (s *VersionWorkflowSuite) Test_RegisterWorker_ResetRevisionNumber_WhenRevivingDeletedVersion() { s.skipBeforeVersion(VersionDataRevisionNumber) tv := testvars.New(s.T()) @@ 
-1010,7 +1011,7 @@ func (s *VersionWorkflowSuite) Test_RegisterWorker_IncrementsRevisionNumber_When // Mock delete and register propagation s.env.OnActivity(a.SyncDeploymentVersionUserData, mock.Anything, mock.Anything).Return( func(ctx context.Context, req *deploymentspb.SyncDeploymentVersionUserDataRequest) (*deploymentspb.SyncDeploymentVersionUserDataResponse, error) { - if req.UpsertVersionData != nil && req.UpsertVersionData.Deleted { + if req.GetForgetVersion() { // This is the delete call s.Equal(int64(6), req.UpsertVersionData.RevisionNumber, "Revision number should be incremented from 5 to 6 on delete") return &deploymentspb.SyncDeploymentVersionUserDataResponse{ @@ -1020,7 +1021,7 @@ func (s *VersionWorkflowSuite) Test_RegisterWorker_IncrementsRevisionNumber_When // This is a register worker propagation call s.NotNil(req.UpsertVersionData, "UpsertVersionData should be present for registration") s.False(req.UpsertVersionData.Deleted, "Deleted should be false after revival") - s.Equal(int64(7), req.UpsertVersionData.RevisionNumber, "Revision number should be incremented from 6 to 7 on revival") + s.Equal(int64(0), req.UpsertVersionData.RevisionNumber, "Revision number should be reset to 0 on revival") return &deploymentspb.SyncDeploymentVersionUserDataResponse{ TaskQueueMaxVersions: map[string]int64{newTaskQueueName: 1}, }, nil @@ -1071,6 +1072,34 @@ func (s *VersionWorkflowSuite) Test_RegisterWorker_IncrementsRevisionNumber_When OnAccept: func() {}, OnComplete: func(result any, err error) { s.Require().NoError(err) + + // Capture state after revive + queryResp := &deploymentspb.QueryDescribeVersionResponse{} + val, err := s.env.QueryWorkflow(QueryDescribeVersion) + s.Require().NoError(err) + err = val.Get(queryResp) + s.Require().NoError(err) + stateAfterRevive := queryResp.VersionState + + // Verify that status is reset to INACTIVE + s.Equal(enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE, stateAfterRevive.Status) + + // Verify that timing fields are 
reset + s.Nil(stateAfterRevive.CurrentSinceTime, "CurrentSinceTime should be reset") + s.Nil(stateAfterRevive.RampingSinceTime, "RampingSinceTime should be reset") + s.Nil(stateAfterRevive.RoutingUpdateTime, "RoutingUpdateTime should be reset") + s.Nil(stateAfterRevive.FirstActivationTime, "FirstActivationTime should be reset") + s.Nil(stateAfterRevive.LastCurrentTime, "LastCurrentTime should be reset") + s.Nil(stateAfterRevive.LastDeactivationTime, "LastDeactivationTime should be reset") + + // Verify that ramp percentage is reset + s.InDelta(float32(0), stateAfterRevive.RampPercentage, 0) + + // Verify that drainage info is reset (drainage info is set to an empty struct and not nil) + s.Equal((&deploymentpb.VersionDrainageInfo{}).String(), stateAfterRevive.DrainageInfo.String(), "DrainageInfo should be reset") + + // Verify that metadata is reset + s.Nil(stateAfterRevive.Metadata, "Metadata should be reset") }, }, registerArgs) }, 50*time.Millisecond) @@ -1090,7 +1119,9 @@ func (s *VersionWorkflowSuite) Test_RegisterWorker_IncrementsRevisionNumber_When }, }, }, - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE, + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINED, + DrainageInfo: &deploymentpb.VersionDrainageInfo{Status: enumspb.VERSION_DRAINAGE_STATUS_DRAINED}, + Metadata: &deploymentpb.VersionMetadata{}, RevisionNumber: 5, SyncBatchSize: int32(s.workerDeploymentClient.getSyncBatchSize()), StartedDeploymentWorkflow: true, @@ -1193,21 +1224,21 @@ func (s *VersionWorkflowSuite) Test_MultipleSyncStates_BlocksCaNUntilAllComplete taskQueueName := tv.TaskQueue().Name - syncCallCount := 0 + var syncCallCount atomic.Int32 s.env.OnActivity(a.SyncDeploymentVersionUserData, mock.Anything, mock.Anything).Return( func(ctx context.Context, req *deploymentspb.SyncDeploymentVersionUserDataRequest) (*deploymentspb.SyncDeploymentVersionUserDataResponse, error) { - syncCallCount++ + count := syncCallCount.Add(1) return 
&deploymentspb.SyncDeploymentVersionUserDataResponse{ - TaskQueueMaxVersions: map[string]int64{taskQueueName: int64(syncCallCount * 10)}, + TaskQueueMaxVersions: map[string]int64{taskQueueName: int64(count * 10)}, }, nil }, ).Maybe() // Mock propagation check with delay to simulate slow propagation - propagationCheckCount := 0 + var propagationCheckCount atomic.Int32 s.env.OnActivity(a.CheckWorkerDeploymentUserDataPropagation, mock.Anything, mock.Anything). Return(func(ctx context.Context, req *deploymentspb.CheckWorkerDeploymentUserDataPropagationRequest) error { - propagationCheckCount++ + propagationCheckCount.Add(1) return nil }). After(100 * time.Millisecond). @@ -1289,7 +1320,7 @@ func (s *VersionWorkflowSuite) Test_MultipleSyncStates_BlocksCaNUntilAllComplete s.True(s.env.IsWorkflowCompleted()) // Both propagations should have completed before CaN - s.Equal(2, propagationCheckCount, "Both propagations should complete before CaN") + s.Equal(2, int(propagationCheckCount.Load()), "Both propagations should complete before CaN") } // Test_SyncState_And_RegisterWorker_ConcurrentPropagations tests concurrent async propagations @@ -1305,10 +1336,10 @@ func (s *VersionWorkflowSuite) Test_SyncState_And_RegisterWorker_ConcurrentPropa taskQueueName := tv.TaskQueue().Name newTaskQueueName := tv.TaskQueue().Name + "_new" - syncActivityCalls := 0 + var syncActivityCalls atomic.Int32 s.env.OnActivity(a.SyncDeploymentVersionUserData, mock.Anything, mock.Anything).Return( func(ctx context.Context, req *deploymentspb.SyncDeploymentVersionUserDataRequest) (*deploymentspb.SyncDeploymentVersionUserDataResponse, error) { - syncActivityCalls++ + syncActivityCalls.Add(1) if req.UpdateRoutingConfig != nil && len(req.Sync) == 1 && req.Sync[0].Name == taskQueueName { // This is the SyncState propagation return &deploymentspb.SyncDeploymentVersionUserDataResponse{ @@ -1322,10 +1353,10 @@ func (s *VersionWorkflowSuite) Test_SyncState_And_RegisterWorker_ConcurrentPropa }, ).Maybe() - 
propagationChecks := 0 + var propagationChecks atomic.Int32 s.env.OnActivity(a.CheckWorkerDeploymentUserDataPropagation, mock.Anything, mock.Anything). Return(func(ctx context.Context, req *deploymentspb.CheckWorkerDeploymentUserDataPropagationRequest) error { - propagationChecks++ + propagationChecks.Add(1) return nil }). After(50 * time.Millisecond). @@ -1408,9 +1439,9 @@ func (s *VersionWorkflowSuite) Test_SyncState_And_RegisterWorker_ConcurrentPropa s.True(s.env.IsWorkflowCompleted()) // Both syncs should complete - s.GreaterOrEqual(syncActivityCalls, 2, "Both propagations should complete before CaN") + s.GreaterOrEqual(int(syncActivityCalls.Load()), 2, "Both propagations should complete before CaN") // Only syncVersionState should wait for propagation - s.GreaterOrEqual(propagationChecks, 1, "Both propagations should complete before CaN") + s.GreaterOrEqual(int(propagationChecks.Load()), 1, "Both propagations should complete before CaN") } // Test_SyncState_SignalsPropagationComplete_WithCorrectRevisionNumber tests that the deployment @@ -2237,3 +2268,176 @@ func (s *VersionWorkflowSuite) skipFromVersion(version DeploymentWorkflowVersion s.T().Skipf("test supports version older than %v", version) } } + +// Test_ReactivateVersion_FromDrained tests that a drained version can be reactivated +// via the ReactivateVersionSignal and properly resets its state +func (s *VersionWorkflowSuite) Test_ReactivateVersion_FromDrained() { + tv := testvars.New(s.T()) + now := timestamppb.New(time.Now()) + + var a *VersionActivities + s.env.RegisterActivity(a.StartWorkerDeploymentWorkflow) + s.env.OnActivity(a.StartWorkerDeploymentWorkflow, mock.Anything, mock.Anything).Return(nil).Maybe() + + // Mock SyncDeploymentVersionUserData for reactivation + s.env.OnActivity(a.SyncDeploymentVersionUserData, mock.Anything, mock.Anything).Return( + &deploymentspb.SyncDeploymentVersionUserDataResponse{}, nil, + ).Maybe() + + // Mock external signal to deployment workflow + 
s.env.OnSignalExternalWorkflow(mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Maybe() + + // Schedule reactivation signal + s.env.RegisterDelayedCallback(func() { + // Send reactivation signal + s.env.SignalWorkflow(ReactivateVersionSignalName, nil) + + // Wait a bit, then query to verify state + s.env.RegisterDelayedCallback(func() { + queryResp := &deploymentspb.QueryDescribeVersionResponse{} + val, err := s.env.QueryWorkflow(QueryDescribeVersion) + s.Require().NoError(err) + err = val.Get(queryResp) + s.Require().NoError(err) + + // Verify that status is DRAINING after reactivation + s.Equal(enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, queryResp.VersionState.Status) + + // Verify drainage info is set up for monitoring + s.NotNil(queryResp.VersionState.DrainageInfo) + s.Equal(enumspb.VERSION_DRAINAGE_STATUS_DRAINING, queryResp.VersionState.DrainageInfo.Status) + s.NotNil(queryResp.VersionState.DrainageInfo.LastChangedTime) + s.NotNil(queryResp.VersionState.DrainageInfo.LastCheckedTime) + }, 10*time.Millisecond) + }, 10*time.Millisecond) + + // Start workflow with DRAINED status + s.env.ExecuteWorkflow(WorkerDeploymentVersionWorkflowType, &deploymentspb.WorkerDeploymentVersionWorkflowArgs{ + NamespaceName: tv.NamespaceName().String(), + NamespaceId: tv.NamespaceID().String(), + VersionState: &deploymentspb.VersionLocalState{ + Version: &deploymentspb.WorkerDeploymentVersion{ + DeploymentName: tv.DeploymentSeries(), + BuildId: tv.BuildID(), + }, + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINED, + DrainageInfo: &deploymentpb.VersionDrainageInfo{ + Status: enumspb.VERSION_DRAINAGE_STATUS_DRAINED, + LastChangedTime: now, + LastCheckedTime: now, + }, + SyncBatchSize: int32(s.workerDeploymentClient.getSyncBatchSize()), + StartedDeploymentWorkflow: true, + }, + }) + + s.True(s.env.IsWorkflowCompleted()) +} + +// Test_ReactivateVersion_FromInactive tests that an inactive version can be reactivated +func (s 
*VersionWorkflowSuite) Test_ReactivateVersion_FromInactive() { + tv := testvars.New(s.T()) + + var a *VersionActivities + s.env.RegisterActivity(a.StartWorkerDeploymentWorkflow) + s.env.OnActivity(a.StartWorkerDeploymentWorkflow, mock.Anything, mock.Anything).Return(nil).Maybe() + + // Mock SyncDeploymentVersionUserData for reactivation + s.env.OnActivity(a.SyncDeploymentVersionUserData, mock.Anything, mock.Anything).Return( + &deploymentspb.SyncDeploymentVersionUserDataResponse{}, nil, + ).Maybe() + + // Mock external signal to deployment workflow + s.env.OnSignalExternalWorkflow(mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Maybe() + + // Schedule reactivation signal + s.env.RegisterDelayedCallback(func() { + // Send reactivation signal + s.env.SignalWorkflow(ReactivateVersionSignalName, nil) + + // Wait a bit, then query to verify state + s.env.RegisterDelayedCallback(func() { + queryResp := &deploymentspb.QueryDescribeVersionResponse{} + val, err := s.env.QueryWorkflow(QueryDescribeVersion) + s.Require().NoError(err) + err = val.Get(queryResp) + s.Require().NoError(err) + + // Verify that status is DRAINING after reactivation + s.Equal(enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, queryResp.VersionState.Status) + + // Verify drainage info is set up + s.NotNil(queryResp.VersionState.DrainageInfo) + s.Equal(enumspb.VERSION_DRAINAGE_STATUS_DRAINING, queryResp.VersionState.DrainageInfo.Status) + }, 10*time.Millisecond) + }, 10*time.Millisecond) + + // Start workflow with INACTIVE status + s.env.ExecuteWorkflow(WorkerDeploymentVersionWorkflowType, &deploymentspb.WorkerDeploymentVersionWorkflowArgs{ + NamespaceName: tv.NamespaceName().String(), + NamespaceId: tv.NamespaceID().String(), + VersionState: &deploymentspb.VersionLocalState{ + Version: &deploymentspb.WorkerDeploymentVersion{ + DeploymentName: tv.DeploymentSeries(), + BuildId: tv.BuildID(), + }, + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE, + 
SyncBatchSize:             int32(s.workerDeploymentClient.getSyncBatchSize()), +			StartedDeploymentWorkflow: true, +		}, +	}) + +	s.True(s.env.IsWorkflowCompleted()) +} + +// Test_ReactivateVersion_IgnoredWhenNotDrainedOrInactive tests that reactivation signal is ignored +// when the version is already CURRENT (i.e. neither DRAINED nor INACTIVE) +func (s *VersionWorkflowSuite) Test_ReactivateVersion_IgnoredWhenNotDrainedOrInactive() { + tv := testvars.New(s.T()) + now := timestamppb.New(time.Now()) + + var a *VersionActivities + s.env.RegisterActivity(a.StartWorkerDeploymentWorkflow) + s.env.OnActivity(a.StartWorkerDeploymentWorkflow, mock.Anything, mock.Anything).Return(nil).Maybe() + + // No mocks for SyncDeploymentVersionUserData since reactivation should be ignored + + // Schedule reactivation signal + s.env.RegisterDelayedCallback(func() { + // Send reactivation signal + s.env.SignalWorkflow(ReactivateVersionSignalName, nil) + + // Wait a bit, then query to verify state hasn't changed + s.env.RegisterDelayedCallback(func() { + queryResp := &deploymentspb.QueryDescribeVersionResponse{} + val, err := s.env.QueryWorkflow(QueryDescribeVersion) + s.Require().NoError(err) + err = val.Get(queryResp) + s.Require().NoError(err) + + // Verify that status remains CURRENT + s.Equal(enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, queryResp.VersionState.Status) + + // Verify no drainage info is set + s.Nil(queryResp.VersionState.DrainageInfo) + }, 10*time.Millisecond) + }, 10*time.Millisecond) + + // Start workflow with CURRENT status + s.env.ExecuteWorkflow(WorkerDeploymentVersionWorkflowType, &deploymentspb.WorkerDeploymentVersionWorkflowArgs{ + NamespaceName: tv.NamespaceName().String(), + NamespaceId:   tv.NamespaceID().String(), + VersionState: &deploymentspb.VersionLocalState{ + Version: &deploymentspb.WorkerDeploymentVersion{ + DeploymentName: tv.DeploymentSeries(), + BuildId:        tv.BuildID(), + }, + Status:           enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + CurrentSinceTime: now, + SyncBatchSize:
int32(s.workerDeploymentClient.getSyncBatchSize()), + StartedDeploymentWorkflow: true, + }, + }) + + s.True(s.env.IsWorkflowCompleted()) +} diff --git a/service/worker/workerdeployment/workflow.go b/service/worker/workerdeployment/workflow.go index e8f24c6eb8..e0c1218556 100644 --- a/service/worker/workerdeployment/workflow.go +++ b/service/worker/workerdeployment/workflow.go @@ -345,7 +345,7 @@ func (d *WorkflowRunner) run(ctx workflow.Context) error { canContinue := d.deleteDeployment || // deployment is deleted -> it's ok to drop all signals and updates. // There is no pending signal or update, but the state is dirty or forceCaN is requested: (!d.signalHandler.signalSelector.HasPending() && d.signalHandler.processingSignals == 0 && workflow.AllHandlersFinished(ctx) && - (d.forceCAN || d.stateChanged)) + (d.forceCAN || d.stateChanged || workflow.GetInfo(ctx).GetContinueAsNewSuggested())) // TODO(carlydf): remove verbose logging if canContinue { diff --git a/service/worker/workerdeployment/workflow_test.go b/service/worker/workerdeployment/workflow_test.go index 9b3fea8c84..a4943c18da 100644 --- a/service/worker/workerdeployment/workflow_test.go +++ b/service/worker/workerdeployment/workflow_test.go @@ -30,7 +30,7 @@ type WorkerDeploymentSuite struct { workflowVersion DeploymentWorkflowVersion } -func TestWorkerDeploymentSuiteV2(t *testing.T) { +func TestWorkerDeploymentSuite(t *testing.T) { t.Parallel() suite.Run(t, &WorkerDeploymentSuite{workflowVersion: VersionDataRevisionNumber}) } diff --git a/tests/activity_api_batch_reset_test.go b/tests/activity_api_batch_reset_test.go index 767f855196..c9391d7cbd 100644 --- a/tests/activity_api_batch_reset_test.go +++ b/tests/activity_api_batch_reset_test.go @@ -1,7 +1,6 @@ package tests import ( - "context" "fmt" "testing" "time" @@ -9,7 +8,6 @@ import ( "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/stretchr/testify/suite" 
"github.com/temporalio/sqlparser" batchpb "go.temporal.io/api/batch/v1" commonpb "go.temporal.io/api/common/v1" @@ -17,51 +15,50 @@ import ( "go.temporal.io/api/workflowservice/v1" sdkclient "go.temporal.io/sdk/client" "go.temporal.io/server/common/searchattribute/sadefs" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/tests/testcore" "google.golang.org/grpc/codes" ) -type ActivityApiBatchResetClientTestSuite struct { - testcore.FunctionalTestBase +type ActivityAPIBatchResetClientTestSuite struct { + parallelsuite.Suite[*ActivityAPIBatchResetClientTestSuite] } -func TestActivityApiBatchResetClientTestSuite(t *testing.T) { - s := new(ActivityApiBatchResetClientTestSuite) - suite.Run(t, s) +func TestActivityAPIBatchResetClientTestSuite(t *testing.T) { + parallelsuite.Run(t, &ActivityAPIBatchResetClientTestSuite{}) } -func (s *ActivityApiBatchResetClientTestSuite) createWorkflow(ctx context.Context, workflowFn WorkflowFunction) sdkclient.WorkflowRun { +func (s *ActivityAPIBatchResetClientTestSuite) createBatchResetWorkflow(env *testcore.TestEnv, workflowFn WorkflowFunction) sdkclient.WorkflowRun { workflowOptions := sdkclient.StartWorkflowOptions{ - ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), - TaskQueue: s.TaskQueue(), + ID: testcore.RandomizeStr("wf_id"), + TaskQueue: env.WorkerTaskQueue(), } - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + workflowRun, err := env.SdkClient().ExecuteWorkflow(env.Context(), workflowOptions, workflowFn) s.NoError(err) s.NotNil(workflowRun) return workflowRun } -func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success() { - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() +func (s *ActivityAPIBatchResetClientTestSuite) TestActivityBatchReset_Success() { + env := testcore.NewEnv(s.T()) internalWorkflow := newInternalWorkflow() - s.SdkWorker().RegisterWorkflow(internalWorkflow.WorkflowFunc) - 
s.SdkWorker().RegisterActivity(internalWorkflow.ActivityFunc) + env.SdkWorker().RegisterWorkflow(internalWorkflow.WorkflowFunc) + env.SdkWorker().RegisterActivity(internalWorkflow.ActivityFunc) - workflowRun1 := s.createWorkflow(ctx, internalWorkflow.WorkflowFunc) - workflowRun2 := s.createWorkflow(ctx, internalWorkflow.WorkflowFunc) + workflowRun1 := s.createBatchResetWorkflow(env, internalWorkflow.WorkflowFunc) + workflowRun2 := s.createBatchResetWorkflow(env, internalWorkflow.WorkflowFunc) // wait for activity to start in both workflows s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun1.GetID(), workflowRun1.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun1.GetID(), workflowRun1.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.Positive(t, internalWorkflow.startedActivityCount.Load()) - description, err = s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun2.GetID(), workflowRun2.GetRunID()) + description, err = env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun2.GetID(), workflowRun2.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.Positive(t, internalWorkflow.startedActivityCount.Load()) @@ -69,24 +66,24 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success() // pause activities in both workflows pauseRequest := &workflowservice.PauseActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{}, Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, } pauseRequest.Execution.WorkflowId = workflowRun1.GetID() - resp, err := s.FrontendClient().PauseActivity(ctx, pauseRequest) + resp, err := env.FrontendClient().PauseActivity(env.Context(), pauseRequest) s.NoError(err) s.NotNil(resp) 
pauseRequest.Execution.WorkflowId = workflowRun2.GetID() - resp, err = s.FrontendClient().PauseActivity(ctx, pauseRequest) + resp, err = env.FrontendClient().PauseActivity(env.Context(), pauseRequest) s.NoError(err) s.NotNil(resp) // wait for activities to be paused s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun1.GetID(), workflowRun1.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun1.GetID(), workflowRun1.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.True(t, description.PendingActivities[0].Paused) @@ -103,8 +100,8 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success() query := fmt.Sprintf("(WorkflowType='%s' AND %s)", workflowTypeName, resetCause) s.EventuallyWithT(func(t *assert.CollectT) { - listResp, err = s.FrontendClient().ListWorkflowExecutions(ctx, &workflowservice.ListWorkflowExecutionsRequest{ - Namespace: s.Namespace().String(), + listResp, err = env.FrontendClient().ListWorkflowExecutions(env.Context(), &workflowservice.ListWorkflowExecutionsRequest{ + Namespace: env.Namespace().String(), PageSize: 10, Query: query, }) @@ -115,8 +112,8 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success() }, 5*time.Second, 500*time.Millisecond) // reset the activities in both workflows with batch reset - _, err = s.SdkClient().WorkflowService().StartBatchOperation(context.Background(), &workflowservice.StartBatchOperationRequest{ - Namespace: s.Namespace().String(), + _, err = env.SdkClient().WorkflowService().StartBatchOperation(env.Context(), &workflowservice.StartBatchOperationRequest{ + Namespace: env.Namespace().String(), Operation: &workflowservice.StartBatchOperationRequest_ResetActivitiesOperation{ ResetActivitiesOperation: &batchpb.BatchOperationResetActivities{ Activity: &batchpb.BatchOperationResetActivities_Type{Type: 
activityTypeName}, @@ -131,13 +128,13 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success() // make sure activities are restarted and still paused s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun1.GetID(), workflowRun1.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun1.GetID(), workflowRun1.GetRunID()) require.NoError(t, err) require.Len(t, description.PendingActivities, 1) require.True(t, description.PendingActivities[0].Paused) require.Equal(t, int32(1), description.PendingActivities[0].Attempt) - description, err = s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun2.GetID(), workflowRun2.GetRunID()) + description, err = env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun2.GetID(), workflowRun2.GetRunID()) require.NoError(t, err) require.Len(t, description.PendingActivities, 1) require.True(t, description.PendingActivities[0].Paused) @@ -148,8 +145,8 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success() internalWorkflow.letActivitySucceed.Store(true) // reset the activities in both workflows with batch reset - _, err = s.SdkClient().WorkflowService().StartBatchOperation(context.Background(), &workflowservice.StartBatchOperationRequest{ - Namespace: s.Namespace().String(), + _, err = env.SdkClient().WorkflowService().StartBatchOperation(env.Context(), &workflowservice.StartBatchOperationRequest{ + Namespace: env.Namespace().String(), Operation: &workflowservice.StartBatchOperationRequest_ResetActivitiesOperation{ ResetActivitiesOperation: &batchpb.BatchOperationResetActivities{ Activity: &batchpb.BatchOperationResetActivities_Type{Type: activityTypeName}, @@ -163,33 +160,32 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success() s.NoError(err) var out string - err = workflowRun1.Get(ctx, &out) + err = workflowRun1.Get(env.Context(), 
&out) s.NoError(err) - err = workflowRun2.Get(ctx, &out) + err = workflowRun2.Get(env.Context(), &out) s.NoError(err) } -func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success_Protobuf() { - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() +func (s *ActivityAPIBatchResetClientTestSuite) TestActivityBatchReset_Success_Protobuf() { + env := testcore.NewEnv(s.T()) internalWorkflow := newInternalWorkflow() - s.SdkWorker().RegisterWorkflow(internalWorkflow.WorkflowFunc) - s.SdkWorker().RegisterActivity(internalWorkflow.ActivityFunc) + env.SdkWorker().RegisterWorkflow(internalWorkflow.WorkflowFunc) + env.SdkWorker().RegisterActivity(internalWorkflow.ActivityFunc) - workflowRun1 := s.createWorkflow(ctx, internalWorkflow.WorkflowFunc) - workflowRun2 := s.createWorkflow(ctx, internalWorkflow.WorkflowFunc) + workflowRun1 := s.createBatchResetWorkflow(env, internalWorkflow.WorkflowFunc) + workflowRun2 := s.createBatchResetWorkflow(env, internalWorkflow.WorkflowFunc) // wait for activity to start in both workflows s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun1.GetID(), workflowRun1.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun1.GetID(), workflowRun1.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.Positive(t, internalWorkflow.startedActivityCount.Load()) - description, err = s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun2.GetID(), workflowRun2.GetRunID()) + description, err = env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun2.GetID(), workflowRun2.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.Positive(t, internalWorkflow.startedActivityCount.Load()) @@ -197,24 +193,24 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success_Pr // 
pause activities in both workflows pauseRequest := &workflowservice.PauseActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{}, Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, } pauseRequest.Execution.WorkflowId = workflowRun1.GetID() - resp, err := s.FrontendClient().PauseActivity(ctx, pauseRequest) + resp, err := env.FrontendClient().PauseActivity(env.Context(), pauseRequest) s.NoError(err) s.NotNil(resp) pauseRequest.Execution.WorkflowId = workflowRun2.GetID() - resp, err = s.FrontendClient().PauseActivity(ctx, pauseRequest) + resp, err = env.FrontendClient().PauseActivity(env.Context(), pauseRequest) s.NoError(err) s.NotNil(resp) // wait for activities to be paused s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun1.GetID(), workflowRun1.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun1.GetID(), workflowRun1.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.True(t, description.PendingActivities[0].Paused) @@ -231,8 +227,8 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success_Pr query := fmt.Sprintf("(WorkflowType='%s' AND %s)", workflowTypeName, resetCause) s.EventuallyWithT(func(t *assert.CollectT) { - listResp, err = s.FrontendClient().ListWorkflowExecutions(ctx, &workflowservice.ListWorkflowExecutionsRequest{ - Namespace: s.Namespace().String(), + listResp, err = env.FrontendClient().ListWorkflowExecutions(env.Context(), &workflowservice.ListWorkflowExecutionsRequest{ + Namespace: env.Namespace().String(), PageSize: 10, Query: query, }) @@ -243,8 +239,8 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success_Pr }, 5*time.Second, 500*time.Millisecond) // reset the activities in both workflows with batch reset - _, err = 
s.SdkClient().WorkflowService().StartBatchOperation(context.Background(), &workflowservice.StartBatchOperationRequest{ - Namespace: s.Namespace().String(), + _, err = env.SdkClient().WorkflowService().StartBatchOperation(env.Context(), &workflowservice.StartBatchOperationRequest{ + Namespace: env.Namespace().String(), Operation: &workflowservice.StartBatchOperationRequest_ResetActivitiesOperation{ ResetActivitiesOperation: &batchpb.BatchOperationResetActivities{ Activity: &batchpb.BatchOperationResetActivities_Type{Type: activityTypeName}, @@ -259,13 +255,13 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success_Pr // make sure activities are restarted and still paused s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun1.GetID(), workflowRun1.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun1.GetID(), workflowRun1.GetRunID()) require.NoError(t, err) require.Len(t, description.PendingActivities, 1) require.True(t, description.PendingActivities[0].Paused) require.Equal(t, int32(1), description.PendingActivities[0].Attempt) - description, err = s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun2.GetID(), workflowRun2.GetRunID()) + description, err = env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun2.GetID(), workflowRun2.GetRunID()) require.NoError(t, err) require.Len(t, description.PendingActivities, 1) require.True(t, description.PendingActivities[0].Paused) @@ -276,8 +272,8 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success_Pr internalWorkflow.letActivitySucceed.Store(true) // reset the activities in both workflows with batch reset - _, err = s.SdkClient().WorkflowService().StartBatchOperation(context.Background(), &workflowservice.StartBatchOperationRequest{ - Namespace: s.Namespace().String(), + _, err = 
env.SdkClient().WorkflowService().StartBatchOperation(env.Context(), &workflowservice.StartBatchOperationRequest{ + Namespace: env.Namespace().String(), Operation: &workflowservice.StartBatchOperationRequest_ResetActivitiesOperation{ ResetActivitiesOperation: &batchpb.BatchOperationResetActivities{ Activity: &batchpb.BatchOperationResetActivities_Type{Type: activityTypeName}, @@ -291,33 +287,32 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Success_Pr s.NoError(err) var out string - err = workflowRun1.Get(ctx, &out) + err = workflowRun1.Get(env.Context(), &out) s.NoError(err) - err = workflowRun2.Get(ctx, &out) + err = workflowRun2.Get(env.Context(), &out) s.NoError(err) } -func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_DontResetAttempts() { - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() +func (s *ActivityAPIBatchResetClientTestSuite) TestActivityBatchReset_DontResetAttempts() { + env := testcore.NewEnv(s.T()) internalWorkflow := newInternalWorkflow() - s.SdkWorker().RegisterWorkflow(internalWorkflow.WorkflowFunc) - s.SdkWorker().RegisterActivity(internalWorkflow.ActivityFunc) + env.SdkWorker().RegisterWorkflow(internalWorkflow.WorkflowFunc) + env.SdkWorker().RegisterActivity(internalWorkflow.ActivityFunc) - workflowRun1 := s.createWorkflow(ctx, internalWorkflow.WorkflowFunc) - workflowRun2 := s.createWorkflow(ctx, internalWorkflow.WorkflowFunc) + workflowRun1 := s.createBatchResetWorkflow(env, internalWorkflow.WorkflowFunc) + workflowRun2 := s.createBatchResetWorkflow(env, internalWorkflow.WorkflowFunc) // wait for activity to start in both workflows s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun1.GetID(), workflowRun1.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun1.GetID(), workflowRun1.GetRunID()) require.NoError(t, err) require.Len(t, 
description.GetPendingActivities(), 1) require.Positive(t, internalWorkflow.startedActivityCount.Load()) - description, err = s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun2.GetID(), workflowRun2.GetRunID()) + description, err = env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun2.GetID(), workflowRun2.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.Positive(t, internalWorkflow.startedActivityCount.Load()) @@ -325,24 +320,24 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_DontResetA // pause activities in both workflows pauseRequest := &workflowservice.PauseActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{}, Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, } pauseRequest.Execution.WorkflowId = workflowRun1.GetID() - resp, err := s.FrontendClient().PauseActivity(ctx, pauseRequest) + resp, err := env.FrontendClient().PauseActivity(env.Context(), pauseRequest) s.NoError(err) s.NotNil(resp) pauseRequest.Execution.WorkflowId = workflowRun2.GetID() - resp, err = s.FrontendClient().PauseActivity(ctx, pauseRequest) + resp, err = env.FrontendClient().PauseActivity(env.Context(), pauseRequest) s.NoError(err) s.NotNil(resp) // wait for activities to be paused s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun1.GetID(), workflowRun1.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun1.GetID(), workflowRun1.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.True(t, description.PendingActivities[0].Paused) @@ -359,8 +354,8 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_DontResetA query := fmt.Sprintf("(WorkflowType='%s' AND %s)", workflowTypeName, resetCause) 
s.EventuallyWithT(func(t *assert.CollectT) { - listResp, err = s.FrontendClient().ListWorkflowExecutions(ctx, &workflowservice.ListWorkflowExecutionsRequest{ - Namespace: s.Namespace().String(), + listResp, err = env.FrontendClient().ListWorkflowExecutions(env.Context(), &workflowservice.ListWorkflowExecutionsRequest{ + Namespace: env.Namespace().String(), PageSize: 10, Query: query, }) @@ -371,8 +366,8 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_DontResetA }, 5*time.Second, 500*time.Millisecond) // reset the activities in both workflows with batch reset - _, err = s.SdkClient().WorkflowService().StartBatchOperation(context.Background(), &workflowservice.StartBatchOperationRequest{ - Namespace: s.Namespace().String(), + _, err = env.SdkClient().WorkflowService().StartBatchOperation(env.Context(), &workflowservice.StartBatchOperationRequest{ + Namespace: env.Namespace().String(), Operation: &workflowservice.StartBatchOperationRequest_ResetActivitiesOperation{ ResetActivitiesOperation: &batchpb.BatchOperationResetActivities{ Activity: &batchpb.BatchOperationResetActivities_Type{Type: activityTypeName}, @@ -388,12 +383,12 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_DontResetA // make sure activities are restarted and still paused s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun1.GetID(), workflowRun1.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun1.GetID(), workflowRun1.GetRunID()) require.NoError(t, err) require.Len(t, description.PendingActivities, 1) require.NotEqual(t, int32(1), description.PendingActivities[0].Attempt) - description, err = s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun2.GetID(), workflowRun2.GetRunID()) + description, err = env.SdkClient().DescribeWorkflowExecution(env.Context(), workflowRun2.GetID(), workflowRun2.GetRunID()) require.NoError(t, err) 
require.Len(t, description.PendingActivities, 1) require.NotEqual(t, int32(1), description.PendingActivities[0].Attempt) @@ -403,8 +398,8 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_DontResetA internalWorkflow.letActivitySucceed.Store(true) // reset the activities in both workflows with batch reset - _, err = s.SdkClient().WorkflowService().StartBatchOperation(context.Background(), &workflowservice.StartBatchOperationRequest{ - Namespace: s.Namespace().String(), + _, err = env.SdkClient().WorkflowService().StartBatchOperation(env.Context(), &workflowservice.StartBatchOperationRequest{ + Namespace: env.Namespace().String(), Operation: &workflowservice.StartBatchOperationRequest_ResetActivitiesOperation{ ResetActivitiesOperation: &batchpb.BatchOperationResetActivities{ Activity: &batchpb.BatchOperationResetActivities_Type{Type: activityTypeName}, @@ -418,17 +413,19 @@ func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_DontResetA s.NoError(err) var out string - err = workflowRun1.Get(ctx, &out) + err = workflowRun1.Get(env.Context(), &out) s.NoError(err) - err = workflowRun2.Get(ctx, &out) + err = workflowRun2.Get(env.Context(), &out) s.NoError(err) } -func (s *ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Failed() { +func (s *ActivityAPIBatchResetClientTestSuite) TestActivityBatchReset_Failed() { + env := testcore.NewEnv(s.T()) + // neither activity type not "match all" is provided - _, err := s.SdkClient().WorkflowService().StartBatchOperation(context.Background(), &workflowservice.StartBatchOperationRequest{ - Namespace: s.Namespace().String(), + _, err := env.SdkClient().WorkflowService().StartBatchOperation(env.Context(), &workflowservice.StartBatchOperationRequest{ + Namespace: env.Namespace().String(), Operation: &workflowservice.StartBatchOperationRequest_ResetActivitiesOperation{ ResetActivitiesOperation: &batchpb.BatchOperationResetActivities{}, }, @@ -441,8 +438,8 @@ func (s 
*ActivityApiBatchResetClientTestSuite) TestActivityBatchReset_Failed() { s.ErrorAs(err, new(*serviceerror.InvalidArgument)) // neither activity type not "match all" is provided - _, err = s.SdkClient().WorkflowService().StartBatchOperation(context.Background(), &workflowservice.StartBatchOperationRequest{ - Namespace: s.Namespace().String(), + _, err = env.SdkClient().WorkflowService().StartBatchOperation(env.Context(), &workflowservice.StartBatchOperationRequest{ + Namespace: env.Namespace().String(), Operation: &workflowservice.StartBatchOperationRequest_ResetActivitiesOperation{ ResetActivitiesOperation: &batchpb.BatchOperationResetActivities{ Activity: &batchpb.BatchOperationResetActivities_Type{Type: ""}, diff --git a/tests/activity_api_batch_unpause_test.go b/tests/activity_api_batch_unpause_test.go index c46fa20ed1..97d6dbb7d5 100644 --- a/tests/activity_api_batch_unpause_test.go +++ b/tests/activity_api_batch_unpause_test.go @@ -15,6 +15,7 @@ import ( "github.com/temporalio/sqlparser" batchpb "go.temporal.io/api/batch/v1" commonpb "go.temporal.io/api/common/v1" + enumspb "go.temporal.io/api/enums/v1" "go.temporal.io/api/serviceerror" "go.temporal.io/api/workflowservice/v1" sdkclient "go.temporal.io/sdk/client" @@ -228,3 +229,84 @@ func (s *ActivityApiBatchUnpauseClientTestSuite) TestActivityBatchUnpause_Failed s.Equal(codes.InvalidArgument, serviceerror.ToStatus(err).Code()) s.ErrorAs(err, new(*serviceerror.InvalidArgument)) } + +// TestBatchTerminate_NamespaceIsolation verifies that a batch terminate operation +// scoped to the primary namespace does not affect workflows in a separate namespace. +// This is an end-to-end complement to the unit-level checkNamespace tests: it +// exercises the full path from StartBatchOperation through the batcher worker. 
+func (s *ActivityApiBatchUnpauseClientTestSuite) TestBatchTerminate_NamespaceIsolation() { + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + // Register a uniquely-named workflow type to avoid interference from parallel tests. + wfTypeName := testcore.RandomizeStr("isolation-wf") + sleepWorkflow := func(ctx workflow.Context) error { + return workflow.Sleep(ctx, 24*time.Hour) + } + s.SdkWorker().RegisterWorkflowWithOptions(sleepWorkflow, workflow.RegisterOptions{Name: wfTypeName}) + + // Start two workflows in the primary namespace (worker is registered and will execute them). + startWf := func(client sdkclient.Client, taskQueue string) sdkclient.WorkflowRun { + run, err := client.ExecuteWorkflow(ctx, sdkclient.StartWorkflowOptions{ + ID: testcore.RandomizeStr("wf"), + TaskQueue: taskQueue, + }, wfTypeName) + s.NoError(err) + return run + } + primaryRun1 := startWf(s.SdkClient(), s.TaskQueue()) + primaryRun2 := startWf(s.SdkClient(), s.TaskQueue()) + + // Create a client for the external namespace and start two workflows there. + // No worker polls this task queue in the external namespace, so these workflows + // will remain in RUNNING state without executing. + extClient, err := sdkclient.Dial(sdkclient.Options{ + HostPort: s.FrontendGRPCAddress(), + Namespace: s.ExternalNamespace().String(), + }) + s.NoError(err) + defer extClient.Close() + extRun1 := startWf(extClient, s.TaskQueue()) + extRun2 := startWf(extClient, s.TaskQueue()) + + // Wait for both primary-namespace workflows to be indexed in visibility before + // submitting the batch, which uses a visibility query to find its targets. 
+ s.EventuallyWithT(func(t *assert.CollectT) { + resp, err := s.FrontendClient().ListWorkflowExecutions(ctx, &workflowservice.ListWorkflowExecutionsRequest{ + Namespace: s.Namespace().String(), + Query: fmt.Sprintf("WorkflowType='%s'", wfTypeName), + PageSize: 10, + }) + require.NoError(t, err) + require.Len(t, resp.GetExecutions(), 2) + }, 10*time.Second, 500*time.Millisecond) + + // Batch-terminate all workflows of this type in the primary namespace only. + _, err = s.SdkClient().WorkflowService().StartBatchOperation(ctx, &workflowservice.StartBatchOperationRequest{ + Namespace: s.Namespace().String(), + VisibilityQuery: fmt.Sprintf("WorkflowType='%s'", wfTypeName), + JobId: uuid.NewString(), + Reason: "namespace-isolation-test", + Operation: &workflowservice.StartBatchOperationRequest_TerminationOperation{ + TerminationOperation: &batchpb.BatchOperationTermination{}, + }, + }) + s.NoError(err) + + // Primary-namespace workflows must reach TERMINATED status. + s.EventuallyWithT(func(t *assert.CollectT) { + for _, run := range []sdkclient.WorkflowRun{primaryRun1, primaryRun2} { + desc, err := s.SdkClient().DescribeWorkflowExecution(ctx, run.GetID(), run.GetRunID()) + require.NoError(t, err) + require.Equal(t, enumspb.WORKFLOW_EXECUTION_STATUS_TERMINATED, desc.WorkflowExecutionInfo.Status) + } + }, 10*time.Second, 500*time.Millisecond) + + // External-namespace workflows must remain RUNNING — the batch must not cross namespace boundaries. 
+ for _, run := range []sdkclient.WorkflowRun{extRun1, extRun2} { + desc, err := extClient.DescribeWorkflowExecution(ctx, run.GetID(), run.GetRunID()) + s.NoError(err) + s.Equal(enumspb.WORKFLOW_EXECUTION_STATUS_RUNNING, desc.WorkflowExecutionInfo.Status, + "batch terminate in primary namespace must not affect external namespace workflows") + } +} diff --git a/tests/activity_api_pause_test.go b/tests/activity_api_pause_test.go index 286542d4d5..797918657f 100644 --- a/tests/activity_api_pause_test.go +++ b/tests/activity_api_pause_test.go @@ -15,622 +15,628 @@ import ( sdkclient "go.temporal.io/sdk/client" "go.temporal.io/sdk/temporal" "go.temporal.io/sdk/workflow" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/common/util" "go.temporal.io/server/tests/testcore" ) -func TestActivityApiPauseClientTestSuite(t *testing.T) { - t.Parallel() - t.Run("TestActivityPauseApi_WhileRunning", func(t *testing.T) { - s := testcore.NewEnv(t, testcore.WithSdkWorker()) - - initialRetryInterval := 1 * time.Second - scheduleToCloseTimeout := 30 * time.Minute - startToCloseTimeout := 15 * time.Minute - activityRetryPolicy := &temporal.RetryPolicy{ - InitialInterval: initialRetryInterval, - BackoffCoefficient: 1, - } - makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { - return func(ctx workflow.Context) error { - var ret string - err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ - ActivityID: "activity-id", - DisableEagerExecution: true, - StartToCloseTimeout: startToCloseTimeout, - ScheduleToCloseTimeout: scheduleToCloseTimeout, - RetryPolicy: activityRetryPolicy, - }), activityFunction).Get(ctx, &ret) - return err - } - } - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() +type ActivityAPIPauseClientTestSuite struct { + parallelsuite.Suite[*ActivityAPIPauseClientTestSuite] +} - activityPausedCn := make(chan struct{}) - var 
startedActivityCount atomic.Int32 - activityErr := errors.New("bad-luck-please-retry") +func TestActivityAPIPauseClientTestSuite(t *testing.T) { + parallelsuite.Run(t, &ActivityAPIPauseClientTestSuite{}) +} - activityFunction := func() (string, error) { - startedActivityCount.Add(1) - if startedActivityCount.Load() == 1 { - s.WaitForChannel(ctx, activityPausedCn) - return "", activityErr - } - return "done!", nil +func (s *ActivityAPIPauseClientTestSuite) TestActivityPauseApi_WhileRunning() { + env := testcore.NewEnv(s.T(), testcore.WithSdkWorker()) + + initialRetryInterval := 1 * time.Second + scheduleToCloseTimeout := 30 * time.Minute + startToCloseTimeout := 15 * time.Minute + activityRetryPolicy := &temporal.RetryPolicy{ + InitialInterval: initialRetryInterval, + BackoffCoefficient: 1, + } + makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { + return func(ctx workflow.Context) error { + var ret string + err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + ActivityID: "activity-id", + DisableEagerExecution: true, + StartToCloseTimeout: startToCloseTimeout, + ScheduleToCloseTimeout: scheduleToCloseTimeout, + RetryPolicy: activityRetryPolicy, + }), activityFunction).Get(ctx, &ret) + return err } + } - workflowFn := makeWorkflowFunc(activityFunction) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) - - workflowOptions := sdkclient.StartWorkflowOptions{ - ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), - TaskQueue: s.WorkerTaskQueue(), - } - - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) - s.NoError(err) - - // wait for activity to start - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) 
- require.Len(t, description.PendingActivities, 1) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 5*time.Second, 500*time.Millisecond) - - // pause activity - testIdentity := "test-identity" - testReason := "test-reason" - pauseRequest := &workflowservice.PauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, - Identity: testIdentity, - Reason: testReason, - } - resp, err := s.FrontendClient().PauseActivity(ctx, pauseRequest) - s.NoError(err) - s.NotNil(resp) - - // make sure activity is paused on server while running on worker - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.PendingActivities, 1) - require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_PAUSE_REQUESTED, description.PendingActivities[0].State) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 5*time.Second, 500*time.Millisecond) - - // unblock the activity - activityPausedCn <- struct{}{} - // make sure activity is paused on server and completed on the worker - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.PendingActivities, 1) - require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_PAUSED, description.PendingActivities[0].State) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 5*time.Second, 500*time.Millisecond) - - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - s.NoError(err) - s.Len(description.PendingActivities, 1) - s.True(description.PendingActivities[0].Paused) - - // wait long enough for activity to retry if pause is 
not working - // Note: because activity is retried we expect the attempts to be incremented - err = util.InterruptibleSleep(ctx, 2*time.Second) - s.NoError(err) - - // make sure activity is not completed, and was not retried - description, err = s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - s.NoError(err) - s.Len(description.PendingActivities, 1) - s.True(description.PendingActivities[0].Paused) - s.Equal(int32(2), description.PendingActivities[0].Attempt) - s.NotNil(description.PendingActivities[0].LastFailure) - s.Equal(activityErr.Error(), description.PendingActivities[0].LastFailure.Message) - s.NotNil(description.PendingActivities[0].PauseInfo) - s.NotNil(description.PendingActivities[0].PauseInfo.GetManual()) - s.Equal(testIdentity, description.PendingActivities[0].PauseInfo.GetManual().Identity) - s.Equal(testReason, description.PendingActivities[0].PauseInfo.GetManual().Reason) - - // unpause the activity - unpauseRequest := &workflowservice.UnpauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, - } - unpauseResp, err := s.FrontendClient().UnpauseActivity(ctx, unpauseRequest) - s.NoError(err) - s.NotNil(unpauseResp) - - var out string - err = workflowRun.Get(ctx, &out) - - s.NoError(err) - }) - - t.Run("TestActivityPauseApi_IncreaseAttemptsOnFailure", func(t *testing.T) { - /* - * 1. Run an activity that runs forever - * 2. Pause the activity - * 3. Send a failure signal to the activity - * 4. Validate activity failed - * 5. 
Validate number of activity attempts increased - */ - s := testcore.NewEnv(t, testcore.WithSdkWorker()) - - initialRetryInterval := 1 * time.Second - scheduleToCloseTimeout := 30 * time.Minute - startToCloseTimeout := 15 * time.Minute - activityRetryPolicy := &temporal.RetryPolicy{ - InitialInterval: initialRetryInterval, - BackoffCoefficient: 1, - } - makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { - return func(ctx workflow.Context) error { - var ret string - err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ - ActivityID: "activity-id", - DisableEagerExecution: true, - StartToCloseTimeout: startToCloseTimeout, - ScheduleToCloseTimeout: scheduleToCloseTimeout, - RetryPolicy: activityRetryPolicy, - }), activityFunction).Get(ctx, &ret) - return err - } - } + activityPausedCn := make(chan struct{}) + var startedActivityCount atomic.Int32 + activityErr := errors.New("bad-luck-please-retry") - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - var startedActivityCount atomic.Int32 - activityPausedCn := make(chan struct{}) - activityErr := errors.New("activity-failed-while-paused") - var shouldSucceed atomic.Bool - - activityFunction := func() (string, error) { - startedActivityCount.Add(1) - if startedActivityCount.Load() == 1 { - s.WaitForChannel(ctx, activityPausedCn) - return "", activityErr - } - if shouldSucceed.Load() { - return "done!", nil - } + activityFunction := func() (string, error) { + startedActivityCount.Add(1) + if startedActivityCount.Load() == 1 { + env.WaitForChannel(ctx, activityPausedCn) return "", activityErr } + return "done!", nil + } + + workflowFn := makeWorkflowFunc(activityFunction) + + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) + + workflowOptions := sdkclient.StartWorkflowOptions{ + ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), + TaskQueue: env.WorkerTaskQueue(), 
+ } + + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + s.NoError(err) + + // wait for activity to start + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.PendingActivities, 1) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 5*time.Second, 500*time.Millisecond) + + // pause activity + testIdentity := "test-identity" + testReason := "test-reason" + pauseRequest := &workflowservice.PauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, + Identity: testIdentity, + Reason: testReason, + } + resp, err := env.FrontendClient().PauseActivity(ctx, pauseRequest) + s.NoError(err) + s.NotNil(resp) + + // make sure activity is paused on server while running on worker + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.PendingActivities, 1) + require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_PAUSE_REQUESTED, description.PendingActivities[0].State) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 5*time.Second, 500*time.Millisecond) + + // unblock the activity + activityPausedCn <- struct{}{} + // make sure activity is paused on server and completed on the worker + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.PendingActivities, 1) + require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_PAUSED, description.PendingActivities[0].State) + require.Equal(t, int32(1), 
startedActivityCount.Load()) + }, 5*time.Second, 500*time.Millisecond) + + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + s.NoError(err) + s.Len(description.PendingActivities, 1) + s.True(description.PendingActivities[0].Paused) + + // wait long enough for activity to retry if pause is not working + // Note: because activity is retried we expect the attempts to be incremented + err = util.InterruptibleSleep(ctx, 2*time.Second) + s.NoError(err) + + // make sure activity is not completed, and was not retried + description, err = env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + s.NoError(err) + s.Len(description.PendingActivities, 1) + s.True(description.PendingActivities[0].Paused) + s.Equal(int32(2), description.PendingActivities[0].Attempt) + s.NotNil(description.PendingActivities[0].LastFailure) + s.Equal(activityErr.Error(), description.PendingActivities[0].LastFailure.Message) + s.NotNil(description.PendingActivities[0].PauseInfo) + s.NotNil(description.PendingActivities[0].PauseInfo.GetManual()) + s.Equal(testIdentity, description.PendingActivities[0].PauseInfo.GetManual().Identity) + s.Equal(testReason, description.PendingActivities[0].PauseInfo.GetManual().Reason) + + // unpause the activity + unpauseRequest := &workflowservice.UnpauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, + } + unpauseResp, err := env.FrontendClient().UnpauseActivity(ctx, unpauseRequest) + s.NoError(err) + s.NotNil(unpauseResp) + + var out string + err = workflowRun.Get(ctx, &out) + + s.NoError(err) +} - workflowFn := makeWorkflowFunc(activityFunction) - - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) - - workflowOptions := sdkclient.StartWorkflowOptions{ - ID: 
testcore.RandomizeStr("wf_id-" + s.T().Name()), - TaskQueue: s.WorkerTaskQueue(), - } - - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) - s.NoError(err) - - // wait for activity to start - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.PendingActivities, 1) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 5*time.Second, 500*time.Millisecond) - - // pause activity - testIdentity := "test-identity" - testReason := "test-reason" - pauseRequest := &workflowservice.PauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, - Identity: testIdentity, - Reason: testReason, - } - resp, err := s.FrontendClient().PauseActivity(ctx, pauseRequest) - s.NoError(err) - s.NotNil(resp) - - // make sure activity is paused on server while running on worker - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.PendingActivities, 1) - require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_PAUSE_REQUESTED, description.PendingActivities[0].State) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 5*time.Second, 500*time.Millisecond) - - // End the activity - activityPausedCn <- struct{}{} - - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.NotNil(t, description) - require.Len(t, description.PendingActivities, 1) - require.True(t, description.PendingActivities[0].Paused) - require.Equal(t, int32(2), 
description.PendingActivities[0].Attempt) - require.NotNil(t, description.PendingActivities[0].LastFailure) - require.NotNil(t, description.PendingActivities[0].PauseInfo) - require.NotNil(t, description.PendingActivities[0].PauseInfo.GetManual()) - require.Equal(t, testIdentity, description.PendingActivities[0].PauseInfo.GetManual().Identity) - require.Equal(t, testReason, description.PendingActivities[0].PauseInfo.GetManual().Reason) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 5*time.Second, 500*time.Millisecond) - - // Let the workflow finish gracefully - // set the flag to make activity succeed on next attempt - shouldSucceed.Store(true) - - // unpause the activity - unpauseRequest := &workflowservice.UnpauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, - } - unpauseResp, err := s.FrontendClient().UnpauseActivity(ctx, unpauseRequest) - s.NoError(err) - s.NotNil(unpauseResp) - - // wait for activity to complete - s.EventuallyWithT(func(t *assert.CollectT) { - require.Equal(t, int32(2), startedActivityCount.Load()) - }, 5*time.Second, 100*time.Millisecond) - - var out string - err = workflowRun.Get(ctx, &out) - - s.NoError(err) - }) - - t.Run("TestActivityPauseApi_WhileWaiting", func(t *testing.T) { - // In this case, pause happens when activity is in retry state. - // Make sure that activity is paused and then unpaused. - // Also check that activity will not be retried while unpaused. 
- s := testcore.NewEnv(t, testcore.WithSdkWorker()) - - initialRetryInterval := 1 * time.Second - scheduleToCloseTimeout := 30 * time.Minute - startToCloseTimeout := 15 * time.Minute - activityRetryPolicy := &temporal.RetryPolicy{ - InitialInterval: initialRetryInterval, - BackoffCoefficient: 1, - } - makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { - return func(ctx workflow.Context) error { - var ret string - err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ - ActivityID: "activity-id", - DisableEagerExecution: true, - StartToCloseTimeout: startToCloseTimeout, - ScheduleToCloseTimeout: scheduleToCloseTimeout, - RetryPolicy: activityRetryPolicy, - }), activityFunction).Get(ctx, &ret) - return err - } - } - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - var startedActivityCount atomic.Int32 - - activityFunction := func() (string, error) { - startedActivityCount.Add(1) - if startedActivityCount.Load() == 1 { - activityErr := errors.New("bad-luck-please-retry") - return "", activityErr - } - return "done!", nil +func (s *ActivityAPIPauseClientTestSuite) TestActivityPauseApi_IncreaseAttemptsOnFailure() { + /* + * 1. Run an activity that runs forever + * 2. Pause the activity + * 3. Send a failure signal to the activity + * 4. Validate activity failed + * 5. 
Validate number of activity attempts increased + */ + env := testcore.NewEnv(s.T(), testcore.WithSdkWorker()) + + initialRetryInterval := 1 * time.Second + scheduleToCloseTimeout := 30 * time.Minute + startToCloseTimeout := 15 * time.Minute + activityRetryPolicy := &temporal.RetryPolicy{ + InitialInterval: initialRetryInterval, + BackoffCoefficient: 1, + } + makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { + return func(ctx workflow.Context) error { + var ret string + err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + ActivityID: "activity-id", + DisableEagerExecution: true, + StartToCloseTimeout: startToCloseTimeout, + ScheduleToCloseTimeout: scheduleToCloseTimeout, + RetryPolicy: activityRetryPolicy, + }), activityFunction).Get(ctx, &ret) + return err } + } - workflowFn := makeWorkflowFunc(activityFunction) - - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - workflowOptions := sdkclient.StartWorkflowOptions{ - ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), - TaskQueue: s.WorkerTaskQueue(), - } + var startedActivityCount atomic.Int32 + activityPausedCn := make(chan struct{}) + activityErr := errors.New("activity-failed-while-paused") + var shouldSucceed atomic.Bool - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) - s.NoError(err) - - // wait for activity to start - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.PendingActivities, 1) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 5*time.Second, 100*time.Millisecond) - - // pause activity - testIdentity := "test-identity" - testReason := "test-reason" - pauseRequest := 
&workflowservice.PauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, - Identity: testIdentity, - Reason: testReason, - } - resp, err := s.FrontendClient().PauseActivity(ctx, pauseRequest) - s.NoError(err) - s.NotNil(resp) - - // wait long enough for activity to retry if pause is not working - require.NoError(t, util.InterruptibleSleep(ctx, 2*time.Second)) - - // make sure activity is not completed, and was not retried - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - s.NoError(err) - s.Len(description.PendingActivities, 1) - s.True(description.PendingActivities[0].Paused) - s.Equal(int32(2), description.PendingActivities[0].Attempt) - s.NotNil(description.PendingActivities[0].PauseInfo) - s.NotNil(description.PendingActivities[0].PauseInfo.GetManual()) - s.Equal(testIdentity, description.PendingActivities[0].PauseInfo.GetManual().Identity) - s.Equal(testReason, description.PendingActivities[0].PauseInfo.GetManual().Reason) - - // unpause the activity - unpauseRequest := &workflowservice.UnpauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, - } - unpauseResp, err := s.FrontendClient().UnpauseActivity(ctx, unpauseRequest) - s.NoError(err) - s.NotNil(unpauseResp) - - // wait for activity to complete - s.EventuallyWithT(func(t *assert.CollectT) { - require.Equal(t, int32(2), startedActivityCount.Load()) - }, 5*time.Second, 100*time.Millisecond) - - var out string - err = workflowRun.Get(ctx, &out) - - s.NoError(err) - }) - - t.Run("TestActivityPauseApi_WhileRetryNoWait", func(t *testing.T) { - // In this case, pause can happen when activity is in retry state. 
- // Make sure that activity is paused and then unpaused. - // Also tests noWait flag. - s := testcore.NewEnv(t, testcore.WithSdkWorker()) - - initialRetryInterval := 30 * time.Second - scheduleToCloseTimeout := 30 * time.Minute - startToCloseTimeout := 15 * time.Minute - activityRetryPolicy := &temporal.RetryPolicy{ - InitialInterval: initialRetryInterval, - BackoffCoefficient: 1, + activityFunction := func() (string, error) { + startedActivityCount.Add(1) + if startedActivityCount.Load() == 1 { + env.WaitForChannel(ctx, activityPausedCn) + return "", activityErr } - makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { - return func(ctx workflow.Context) error { - var ret string - err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ - ActivityID: "activity-id", - DisableEagerExecution: true, - StartToCloseTimeout: startToCloseTimeout, - ScheduleToCloseTimeout: scheduleToCloseTimeout, - RetryPolicy: activityRetryPolicy, - }), activityFunction).Get(ctx, &ret) - return err - } + if shouldSucceed.Load() { + return "done!", nil } + return "", activityErr + } + + workflowFn := makeWorkflowFunc(activityFunction) + + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) + + workflowOptions := sdkclient.StartWorkflowOptions{ + ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), + TaskQueue: env.WorkerTaskQueue(), + } + + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + s.NoError(err) + + // wait for activity to start + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.PendingActivities, 1) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 5*time.Second, 500*time.Millisecond) + + // pause activity + testIdentity := "test-identity" + testReason := 
"test-reason" + pauseRequest := &workflowservice.PauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, + Identity: testIdentity, + Reason: testReason, + } + resp, err := env.FrontendClient().PauseActivity(ctx, pauseRequest) + s.NoError(err) + s.NotNil(resp) + + // make sure activity is paused on server while running on worker + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.PendingActivities, 1) + require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_PAUSE_REQUESTED, description.PendingActivities[0].State) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 5*time.Second, 500*time.Millisecond) + + // End the activity + activityPausedCn <- struct{}{} + + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.NotNil(t, description) + require.Len(t, description.PendingActivities, 1) + require.True(t, description.PendingActivities[0].Paused) + require.Equal(t, int32(2), description.PendingActivities[0].Attempt) + require.NotNil(t, description.PendingActivities[0].LastFailure) + require.NotNil(t, description.PendingActivities[0].PauseInfo) + require.NotNil(t, description.PendingActivities[0].PauseInfo.GetManual()) + require.Equal(t, testIdentity, description.PendingActivities[0].PauseInfo.GetManual().Identity) + require.Equal(t, testReason, description.PendingActivities[0].PauseInfo.GetManual().Reason) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 5*time.Second, 500*time.Millisecond) + + // Let the workflow finish gracefully + // set the flag to make activity succeed on next attempt + 
shouldSucceed.Store(true) + + // unpause the activity + unpauseRequest := &workflowservice.UnpauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, + } + unpauseResp, err := env.FrontendClient().UnpauseActivity(ctx, unpauseRequest) + s.NoError(err) + s.NotNil(unpauseResp) + + // wait for activity to complete + s.EventuallyWithT(func(t *assert.CollectT) { + require.Equal(t, int32(2), startedActivityCount.Load()) + }, 5*time.Second, 100*time.Millisecond) + + var out string + err = workflowRun.Get(ctx, &out) + + s.NoError(err) +} - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - var startedActivityCount atomic.Int32 - - activityFunction := func() (string, error) { - startedActivityCount.Add(1) - if startedActivityCount.Load() == 1 { - activityErr := errors.New("bad-luck-please-retry") - return "", activityErr - } - return "done!", nil +func (s *ActivityAPIPauseClientTestSuite) TestActivityPauseApi_WhileWaiting() { + // In this case, pause happens when activity is in retry state. + // Make sure that activity is paused and then unpaused. + // Also check that activity will not be retried while unpaused. 
+ env := testcore.NewEnv(s.T(), testcore.WithSdkWorker()) + + initialRetryInterval := 1 * time.Second + scheduleToCloseTimeout := 30 * time.Minute + startToCloseTimeout := 15 * time.Minute + activityRetryPolicy := &temporal.RetryPolicy{ + InitialInterval: initialRetryInterval, + BackoffCoefficient: 1, + } + makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { + return func(ctx workflow.Context) error { + var ret string + err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + ActivityID: "activity-id", + DisableEagerExecution: true, + StartToCloseTimeout: startToCloseTimeout, + ScheduleToCloseTimeout: scheduleToCloseTimeout, + RetryPolicy: activityRetryPolicy, + }), activityFunction).Get(ctx, &ret) + return err } + } - workflowFn := makeWorkflowFunc(activityFunction) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + var startedActivityCount atomic.Int32 - workflowOptions := sdkclient.StartWorkflowOptions{ - ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), - TaskQueue: s.WorkerTaskQueue(), + activityFunction := func() (string, error) { + startedActivityCount.Add(1) + if startedActivityCount.Load() == 1 { + activityErr := errors.New("bad-luck-please-retry") + return "", activityErr } + return "done!", nil + } + + workflowFn := makeWorkflowFunc(activityFunction) + + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) + + workflowOptions := sdkclient.StartWorkflowOptions{ + ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), + TaskQueue: env.WorkerTaskQueue(), + } + + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + s.NoError(err) + + // wait for activity to start + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, 
workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.PendingActivities, 1) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 5*time.Second, 100*time.Millisecond) + + // pause activity + testIdentity := "test-identity" + testReason := "test-reason" + pauseRequest := &workflowservice.PauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, + Identity: testIdentity, + Reason: testReason, + } + resp, err := env.FrontendClient().PauseActivity(ctx, pauseRequest) + s.NoError(err) + s.NotNil(resp) + + // wait long enough for activity to retry if pause is not working + s.NoError(util.InterruptibleSleep(ctx, 2*time.Second)) + + // make sure activity is not completed, and was not retried + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + s.NoError(err) + s.Len(description.PendingActivities, 1) + s.True(description.PendingActivities[0].Paused) + s.Equal(int32(2), description.PendingActivities[0].Attempt) + s.NotNil(description.PendingActivities[0].PauseInfo) + s.NotNil(description.PendingActivities[0].PauseInfo.GetManual()) + s.Equal(testIdentity, description.PendingActivities[0].PauseInfo.GetManual().Identity) + s.Equal(testReason, description.PendingActivities[0].PauseInfo.GetManual().Reason) + + // unpause the activity + unpauseRequest := &workflowservice.UnpauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, + } + unpauseResp, err := env.FrontendClient().UnpauseActivity(ctx, unpauseRequest) + s.NoError(err) + s.NotNil(unpauseResp) + + // wait for activity to complete + s.EventuallyWithT(func(t *assert.CollectT) { + require.Equal(t, 
int32(2), startedActivityCount.Load()) + }, 5*time.Second, 100*time.Millisecond) + + var out string + err = workflowRun.Get(ctx, &out) + + s.NoError(err) +} - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) - s.NoError(err) - - // wait for activity to start - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.GetPendingActivities(), 1) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 5*time.Second, 100*time.Millisecond) - - // pause activity - pauseRequest := &workflowservice.PauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, - } - resp, err := s.FrontendClient().PauseActivity(ctx, pauseRequest) - s.NoError(err) - s.NotNil(resp) - - // unpause the activity - unpauseRequest := &workflowservice.UnpauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, - } - unpauseResp, err := s.FrontendClient().UnpauseActivity(ctx, unpauseRequest) - s.NoError(err) - s.NotNil(unpauseResp) - - // wait for activity to complete. 
It should happen immediately since noWait is set - s.EventuallyWithT(func(t *assert.CollectT) { - require.Equal(t, int32(2), startedActivityCount.Load()) - }, 2*time.Second, 100*time.Millisecond) - - var out string - err = workflowRun.Get(ctx, &out) - - s.NoError(err) - }) - - t.Run("TestActivityPauseApi_WithReset", func(t *testing.T) { - // pause/unpause the activity with reset option and noWait flag - s := testcore.NewEnv(t, testcore.WithSdkWorker()) - - initialRetryInterval := 1 * time.Second - scheduleToCloseTimeout := 30 * time.Minute - startToCloseTimeout := 15 * time.Minute - activityRetryPolicy := &temporal.RetryPolicy{ - InitialInterval: initialRetryInterval, - BackoffCoefficient: 1, - } - makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { - return func(ctx workflow.Context) error { - var ret string - err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ - ActivityID: "activity-id", - DisableEagerExecution: true, - StartToCloseTimeout: startToCloseTimeout, - ScheduleToCloseTimeout: scheduleToCloseTimeout, - RetryPolicy: activityRetryPolicy, - }), activityFunction).Get(ctx, &ret) - return err - } +func (s *ActivityAPIPauseClientTestSuite) TestActivityPauseApi_WhileRetryNoWait() { + // In this case, pause can happen when activity is in retry state. + // Make sure that activity is paused and then unpaused. + // Also tests noWait flag. 
+ env := testcore.NewEnv(s.T(), testcore.WithSdkWorker()) + + initialRetryInterval := 30 * time.Second + scheduleToCloseTimeout := 30 * time.Minute + startToCloseTimeout := 15 * time.Minute + activityRetryPolicy := &temporal.RetryPolicy{ + InitialInterval: initialRetryInterval, + BackoffCoefficient: 1, + } + makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { + return func(ctx workflow.Context) error { + var ret string + err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + ActivityID: "activity-id", + DisableEagerExecution: true, + StartToCloseTimeout: startToCloseTimeout, + ScheduleToCloseTimeout: scheduleToCloseTimeout, + RetryPolicy: activityRetryPolicy, + }), activityFunction).Get(ctx, &ret) + return err } + } - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - var startedActivityCount atomic.Int32 - activityWasReset := false - activityCompleteCn := make(chan struct{}) + var startedActivityCount atomic.Int32 - activityFunction := func() (string, error) { - startedActivityCount.Add(1) + activityFunction := func() (string, error) { + startedActivityCount.Add(1) + if startedActivityCount.Load() == 1 { + activityErr := errors.New("bad-luck-please-retry") + return "", activityErr + } + return "done!", nil + } + + workflowFn := makeWorkflowFunc(activityFunction) + + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) + + workflowOptions := sdkclient.StartWorkflowOptions{ + ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), + TaskQueue: env.WorkerTaskQueue(), + } + + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + s.NoError(err) + + // wait for activity to start + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, 
workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.GetPendingActivities(), 1) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 5*time.Second, 100*time.Millisecond) + + // pause activity + pauseRequest := &workflowservice.PauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, + } + resp, err := env.FrontendClient().PauseActivity(ctx, pauseRequest) + s.NoError(err) + s.NotNil(resp) + + // unpause the activity + unpauseRequest := &workflowservice.UnpauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, + } + unpauseResp, err := env.FrontendClient().UnpauseActivity(ctx, unpauseRequest) + s.NoError(err) + s.NotNil(unpauseResp) + + // wait for activity to complete. 
It should happen immediately since noWait is set + s.EventuallyWithT(func(t *assert.CollectT) { + require.Equal(t, int32(2), startedActivityCount.Load()) + }, 2*time.Second, 100*time.Millisecond) + + var out string + err = workflowRun.Get(ctx, &out) + + s.NoError(err) +} - if !activityWasReset { - activityErr := errors.New("bad-luck-please-retry") - return "", activityErr - } - s.WaitForChannel(ctx, activityCompleteCn) - return "done!", nil +func (s *ActivityAPIPauseClientTestSuite) TestActivityPauseApi_WithReset() { + // pause/unpause the activity with reset option and noWait flag + env := testcore.NewEnv(s.T(), testcore.WithSdkWorker()) + + initialRetryInterval := 1 * time.Second + scheduleToCloseTimeout := 30 * time.Minute + startToCloseTimeout := 15 * time.Minute + activityRetryPolicy := &temporal.RetryPolicy{ + InitialInterval: initialRetryInterval, + BackoffCoefficient: 1, + } + makeWorkflowFunc := func(activityFunction ActivityFunctions) WorkflowFunction { + return func(ctx workflow.Context) error { + var ret string + err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + ActivityID: "activity-id", + DisableEagerExecution: true, + StartToCloseTimeout: startToCloseTimeout, + ScheduleToCloseTimeout: scheduleToCloseTimeout, + RetryPolicy: activityRetryPolicy, + }), activityFunction).Get(ctx, &ret) + return err } + } - workflowFn := makeWorkflowFunc(activityFunction) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + var startedActivityCount atomic.Int32 + activityWasReset := false + activityCompleteCn := make(chan struct{}) - workflowOptions := sdkclient.StartWorkflowOptions{ - ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), - TaskQueue: s.WorkerTaskQueue(), - } + activityFunction := func() (string, error) { + startedActivityCount.Add(1) - workflowRun, err := 
s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) - s.NoError(err) - - // wait for activity to start/fail few times - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.GetPendingActivities(), 1) - require.Greater(t, startedActivityCount.Load(), int32(1)) - }, 5*time.Second, 100*time.Millisecond) - - // pause activity - pauseRequest := &workflowservice.PauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, - } - resp, err := s.FrontendClient().PauseActivity(ctx, pauseRequest) - s.NoError(err) - s.NotNil(resp) - - // wait for activity to be in paused state and waiting for retry - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.GetPendingActivities(), 1) - require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_PAUSED, description.PendingActivities[0].State) - // also verify that the number of attempts was not reset - require.Greater(t, description.PendingActivities[0].Attempt, int32(1)) - }, 5*time.Second, 100*time.Millisecond) - - activityWasReset = true - - // unpause the activity with reset, and set noWait flag - unpauseRequest := &workflowservice.UnpauseActivityRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, - ResetAttempts: true, + if !activityWasReset { + activityErr := errors.New("bad-luck-please-retry") + return "", activityErr } - unpauseResp, err := s.FrontendClient().UnpauseActivity(ctx, unpauseRequest) - s.NoError(err) - 
s.NotNil(unpauseResp) - - // wait for activity to be running - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.GetPendingActivities(), 1) - require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_STARTED, description.PendingActivities[0].State) - // also verify that the number of attempts was reset - require.Equal(t, int32(1), description.PendingActivities[0].Attempt) - }, 5*time.Second, 100*time.Millisecond) - - // let activity finish - activityCompleteCn <- struct{}{} - - // wait for workflow to finish - var out string - err = workflowRun.Get(ctx, &out) - - s.NoError(err) - }) + env.WaitForChannel(ctx, activityCompleteCn) + return "done!", nil + } + + workflowFn := makeWorkflowFunc(activityFunction) + + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) + + workflowOptions := sdkclient.StartWorkflowOptions{ + ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), + TaskQueue: env.WorkerTaskQueue(), + } + + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + s.NoError(err) + + // wait for activity to start/fail few times + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.GetPendingActivities(), 1) + require.Greater(t, startedActivityCount.Load(), int32(1)) + }, 5*time.Second, 100*time.Millisecond) + + // pause activity + pauseRequest := &workflowservice.PauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, + } + resp, err := env.FrontendClient().PauseActivity(ctx, pauseRequest) + s.NoError(err) + s.NotNil(resp) 
+ + // wait for activity to be in paused state and waiting for retry + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.GetPendingActivities(), 1) + require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_PAUSED, description.PendingActivities[0].State) + // also verify that the number of attempts was not reset + require.Greater(t, description.PendingActivities[0].Attempt, int32(1)) + }, 5*time.Second, 100*time.Millisecond) + + activityWasReset = true + + // unpause the activity with reset, and set noWait flag + unpauseRequest := &workflowservice.UnpauseActivityRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, + ResetAttempts: true, + } + unpauseResp, err := env.FrontendClient().UnpauseActivity(ctx, unpauseRequest) + s.NoError(err) + s.NotNil(unpauseResp) + + // wait for activity to be running + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.GetPendingActivities(), 1) + require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_STARTED, description.PendingActivities[0].State) + // also verify that the number of attempts was reset + require.Equal(t, int32(1), description.PendingActivities[0].Attempt) + }, 5*time.Second, 100*time.Millisecond) + + // let activity finish + activityCompleteCn <- struct{}{} + + // wait for workflow to finish + var out string + err = workflowRun.Get(ctx, &out) + + s.NoError(err) } diff --git a/tests/activity_api_reset_test.go b/tests/activity_api_reset_test.go index 180c80a4f1..0e96bfa001 100644 --- a/tests/activity_api_reset_test.go +++ b/tests/activity_api_reset_test.go @@ -33,7 
+33,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/stretchr/testify/suite" commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" "go.temporal.io/api/workflowservice/v1" @@ -42,51 +41,28 @@ import ( "go.temporal.io/sdk/temporal" "go.temporal.io/sdk/workflow" "go.temporal.io/server/common/payloads" - "go.temporal.io/server/common/testing/testvars" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/common/util" "go.temporal.io/server/tests/testcore" ) type ActivityApiResetClientTestSuite struct { - testcore.FunctionalTestBase - tv *testvars.TestVars - initialRetryInterval time.Duration - scheduleToCloseTimeout time.Duration - startToCloseTimeout time.Duration - - activityRetryPolicy *temporal.RetryPolicy + parallelsuite.Suite[*ActivityApiResetClientTestSuite] } func TestActivityApiResetClientTestSuite(t *testing.T) { - s := new(ActivityApiResetClientTestSuite) - suite.Run(t, s) + parallelsuite.Run(t, &ActivityApiResetClientTestSuite{}) } -func (s *ActivityApiResetClientTestSuite) SetupTest() { - s.FunctionalTestBase.SetupTest() - - s.tv = testvars.New(s.T()).WithTaskQueue(s.TaskQueue()).WithNamespaceName(s.Namespace()) - - s.initialRetryInterval = 1 * time.Second - s.scheduleToCloseTimeout = 30 * time.Minute - s.startToCloseTimeout = 15 * time.Minute - - s.activityRetryPolicy = &temporal.RetryPolicy{ - InitialInterval: s.initialRetryInterval, - BackoffCoefficient: 1, - } -} - -func (s *ActivityApiResetClientTestSuite) makeWorkflowFunc(activityFunction ActivityFunctions) WorkflowFunction { +func (s *ActivityApiResetClientTestSuite) makeWorkflowFunc(activityFunction ActivityFunctions, retryPolicy *temporal.RetryPolicy) WorkflowFunction { return func(ctx workflow.Context) error { - var ret string err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ ActivityID: "activity-id", DisableEagerExecution: true, - StartToCloseTimeout: 
s.startToCloseTimeout, - ScheduleToCloseTimeout: s.scheduleToCloseTimeout, - RetryPolicy: s.activityRetryPolicy, + StartToCloseTimeout: 15 * time.Minute, + ScheduleToCloseTimeout: 30 * time.Minute, + RetryPolicy: retryPolicy, }), activityFunction).Get(ctx, &ret) return err } @@ -94,6 +70,8 @@ func (s *ActivityApiResetClientTestSuite) makeWorkflowFunc(activityFunction Acti func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_AfterRetry() { // activity reset is called after multiple attempts, + env := testcore.NewEnv(s.T()) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() @@ -109,40 +87,43 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_AfterRetry() { return "", activityErr } - s.WaitForChannel(ctx, activityCompleteCh) + env.WaitForChannel(ctx, activityCompleteCh) return "done!", nil } - workflowFn := s.makeWorkflowFunc(activityFunction) + workflowFn := s.makeWorkflowFunc(activityFunction, &temporal.RetryPolicy{ + InitialInterval: 1 * time.Second, + BackoffCoefficient: 1, + }) - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) wfId := testcore.RandomizeStr("wfid-" + s.T().Name()) workflowOptions := sdkclient.StartWorkflowOptions{ ID: wfId, - TaskQueue: s.TaskQueue(), + TaskQueue: env.WorkerTaskQueue(), } - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) s.NoError(err) // wait for activity to start/fail few times s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.Len(t, 
description.GetPendingActivities(), 1) require.Greater(t, startedActivityCount.Load(), int32(1)) }, 5*time.Second, 200*time.Millisecond) resetRequest := &workflowservice.ResetActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowRun.GetID(), }, Activity: &workflowservice.ResetActivityRequest_Id{Id: "activity-id"}, } - resp, err := s.FrontendClient().ResetActivity(ctx, resetRequest) + resp, err := env.FrontendClient().ResetActivity(ctx, resetRequest) s.NoError(err) s.NotNil(resp) @@ -150,7 +131,7 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_AfterRetry() { // wait for activity to be running s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_STARTED, description.PendingActivities[0].State) @@ -170,6 +151,8 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_AfterRetry() { func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_WhileRunning() { // activity reset is called while activity is running + env := testcore.NewEnv(s.T()) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() @@ -177,39 +160,42 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_WhileRunning() { var startedActivityCount atomic.Int32 activityFunction := func() (string, error) { startedActivityCount.Add(1) - s.WaitForChannel(ctx, activityCompleteCh) + env.WaitForChannel(ctx, activityCompleteCh) return "done!", nil } - workflowFn := s.makeWorkflowFunc(activityFunction) + workflowFn := s.makeWorkflowFunc(activityFunction, &temporal.RetryPolicy{ + InitialInterval: 1 * 
time.Second, + BackoffCoefficient: 1, + }) - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) workflowOptions := sdkclient.StartWorkflowOptions{ - ID: s.tv.WorkflowID(), - TaskQueue: s.TaskQueue(), + ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), + TaskQueue: env.WorkerTaskQueue(), } - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) s.NoError(err) // wait for activity to start s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_STARTED, description.PendingActivities[0].State) }, 5*time.Second, 200*time.Millisecond) resetRequest := &workflowservice.ResetActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowRun.GetID(), }, Activity: &workflowservice.ResetActivityRequest_Id{Id: "activity-id"}, } - resp, err := s.FrontendClient().ResetActivity(ctx, resetRequest) + resp, err := env.FrontendClient().ResetActivity(ctx, resetRequest) s.NoError(err) s.NotNil(resp) @@ -218,7 +204,7 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_WhileRunning() { // check if workflow and activity are still running s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), 
workflowRun.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_STARTED, description.PendingActivities[0].State) @@ -240,11 +226,7 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_WhileRunning() { func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_InRetry() { // reset is called while activity is in retry - s.initialRetryInterval = 1 * time.Minute - s.activityRetryPolicy = &temporal.RetryPolicy{ - InitialInterval: s.initialRetryInterval, - BackoffCoefficient: 1, - } + env := testcore.NewEnv(s.T()) ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() @@ -260,27 +242,30 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_InRetry() { return "", activityErr } - s.WaitForChannel(ctx, activityCompleteCh) + env.WaitForChannel(ctx, activityCompleteCh) return "done!", nil } - workflowFn := s.makeWorkflowFunc(activityFunction) + workflowFn := s.makeWorkflowFunc(activityFunction, &temporal.RetryPolicy{ + InitialInterval: 1 * time.Minute, + BackoffCoefficient: 1, + }) - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) wfId := testcore.RandomizeStr("wf_id-" + s.T().Name()) workflowOptions := sdkclient.StartWorkflowOptions{ ID: wfId, - TaskQueue: s.TaskQueue(), + TaskQueue: env.WorkerTaskQueue(), } - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) s.NoError(err) // wait for activity to start, fail and wait for retry s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, 
workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.Len(t, description.PendingActivities, 1) require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_SCHEDULED, description.PendingActivities[0].State) @@ -288,19 +273,19 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_InRetry() { }, 5*time.Second, 200*time.Millisecond) resetRequest := &workflowservice.ResetActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowRun.GetID(), }, Activity: &workflowservice.ResetActivityRequest_Id{Id: "activity-id"}, } - resp, err := s.FrontendClient().ResetActivity(ctx, resetRequest) + resp, err := env.FrontendClient().ResetActivity(ctx, resetRequest) s.NoError(err) s.NotNil(resp) // wait for activity to start. Wait time is shorter than original retry interval s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.Len(t, description.GetPendingActivities(), 1) require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_STARTED, description.PendingActivities[0].State) @@ -320,11 +305,7 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_InRetry() { func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_KeepPaused() { // reset is called while activity is in retry - s.initialRetryInterval = 1 * time.Minute - s.activityRetryPolicy = &temporal.RetryPolicy{ - InitialInterval: s.initialRetryInterval, - BackoffCoefficient: 1, - } + env := testcore.NewEnv(s.T()) ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() @@ -341,27 +322,30 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_KeepPaused() { return "", activityErr } - s.WaitForChannel(ctx, activityCompleteCh) 
+ env.WaitForChannel(ctx, activityCompleteCh) return "done!", nil } - workflowFn := s.makeWorkflowFunc(activityFunction) + workflowFn := s.makeWorkflowFunc(activityFunction, &temporal.RetryPolicy{ + InitialInterval: 1 * time.Minute, + BackoffCoefficient: 1, + }) - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) wfId := testcore.RandomizeStr("wf_id-" + s.T().Name()) workflowOptions := sdkclient.StartWorkflowOptions{ ID: wfId, - TaskQueue: s.TaskQueue(), + TaskQueue: env.WorkerTaskQueue(), } - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) s.NoError(err) // wait for activity to start, fail few times and wait for retry s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.Len(t, description.PendingActivities, 1) require.Equal(t, enumspb.PENDING_ACTIVITY_STATE_SCHEDULED, description.PendingActivities[0].State) @@ -370,19 +354,19 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_KeepPaused() { // pause the activity pauseRequest := &workflowservice.PauseActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowRun.GetID(), }, Activity: &workflowservice.PauseActivityRequest_Id{Id: "activity-id"}, } - pauseResp, err := s.FrontendClient().PauseActivity(ctx, pauseRequest) + pauseResp, err := env.FrontendClient().PauseActivity(ctx, pauseRequest) s.NoError(err) s.NotNil(pauseResp) // verify that activity is paused s.EventuallyWithT(func(t *assert.CollectT) { 
- description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.NotNil(t, description) require.Len(t, description.GetPendingActivities(), 1) @@ -394,20 +378,20 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_KeepPaused() { // reset the activity, while keeping it paused resetRequest := &workflowservice.ResetActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowRun.GetID(), }, Activity: &workflowservice.ResetActivityRequest_Id{Id: "activity-id"}, KeepPaused: true, } - resp, err := s.FrontendClient().ResetActivity(ctx, resetRequest) + resp, err := env.FrontendClient().ResetActivity(ctx, resetRequest) s.NoError(err) s.NotNil(resp) // verify that activity is still paused, and reset s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.NotNil(t, description) require.Len(t, description.GetPendingActivities(), 1) @@ -421,13 +405,13 @@ func (s *ActivityApiResetClientTestSuite) TestActivityResetApi_KeepPaused() { // unpause the activity unpauseRequest := &workflowservice.UnpauseActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowRun.GetID(), }, Activity: &workflowservice.UnpauseActivityRequest_Id{Id: "activity-id"}, } - unpauseResp, err := s.FrontendClient().UnpauseActivity(ctx, unpauseRequest) + unpauseResp, err := env.FrontendClient().UnpauseActivity(ctx, unpauseRequest) s.NoError(err) s.NotNil(unpauseResp) @@ -457,6 +441,12 
@@ func (s *ActivityApiResetClientTestSuite) TestActivityReset_HeartbeatDetails() { // 2. First invocation of activity sets heartbeat details and fails upon request. // 3. Second invocation triggers waits to be triggered, and then send new heartbeat until requested to finish. // 6. Once workflow completes -- we're done. + env := testcore.NewEnv(s.T()) + + activityRetryPolicy := &temporal.RetryPolicy{ + InitialInterval: 1 * time.Second, + BackoffCoefficient: 1, + } activityCompleteCh := make(chan struct{}) var activityIteration atomic.Int32 @@ -472,7 +462,7 @@ func (s *ActivityApiResetClientTestSuite) TestActivityReset_HeartbeatDetails() { return "", errors.New("bad-luck-please-retry") } // not the first iteration - s.WaitForChannel(ctx, activityCompleteCh) + env.WaitForChannel(ctx, activityCompleteCh) for activityShouldFinish.Load() == false { activity.RecordHeartbeat(ctx, "second") time.Sleep(time.Second) //nolint:forbidigo @@ -486,25 +476,25 @@ func (s *ActivityApiResetClientTestSuite) TestActivityReset_HeartbeatDetails() { err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ ActivityID: activityId, DisableEagerExecution: true, - StartToCloseTimeout: s.startToCloseTimeout, - ScheduleToCloseTimeout: s.scheduleToCloseTimeout, - RetryPolicy: s.activityRetryPolicy, + StartToCloseTimeout: 15 * time.Minute, + ScheduleToCloseTimeout: 30 * time.Minute, + RetryPolicy: activityRetryPolicy, }), activityFn).Get(ctx, &ret) return ret, err } - s.SdkWorker().RegisterActivity(activityFn) - s.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFn) + env.SdkWorker().RegisterWorkflow(workflowFn) wfId := "functional-test-heartbeat-details-after-reset" workflowOptions := sdkclient.StartWorkflowOptions{ ID: wfId, - TaskQueue: s.TaskQueue(), + TaskQueue: env.WorkerTaskQueue(), WorkflowRunTimeout: 20 * time.Second, } ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - 
workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) s.NoError(err) s.NotNil(workflowRun) @@ -513,7 +503,7 @@ func (s *ActivityApiResetClientTestSuite) TestActivityReset_HeartbeatDetails() { // make sure activity is running and sending heartbeats s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.Len(t, description.PendingActivities, 1) requirePayload(t, "first", description.PendingActivities[0].GetHeartbeatDetails()) @@ -522,7 +512,7 @@ func (s *ActivityApiResetClientTestSuite) TestActivityReset_HeartbeatDetails() { // reset the activity, with heartbeats resetRequest := &workflowservice.ResetActivityRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowRun.GetID(), }, @@ -530,7 +520,7 @@ func (s *ActivityApiResetClientTestSuite) TestActivityReset_HeartbeatDetails() { ResetHeartbeat: true, } - resp, err := s.FrontendClient().ResetActivity(ctx, resetRequest) + resp, err := env.FrontendClient().ResetActivity(ctx, resetRequest) s.NoError(err) s.NotNil(resp) @@ -539,7 +529,7 @@ func (s *ActivityApiResetClientTestSuite) TestActivityReset_HeartbeatDetails() { // wait for activity to fail and retried s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.Len(t, description.PendingActivities, 1) ap := description.PendingActivities[0] @@ -555,7 +545,7 @@ func (s 
*ActivityApiResetClientTestSuite) TestActivityReset_HeartbeatDetails() { // make sure activity is running and sending heartbeats s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) require.NoError(t, err) require.Equal(t, int32(1), activityIteration.Load()) require.Len(t, description.PendingActivities, 1) diff --git a/tests/activity_api_update_test.go b/tests/activity_api_update_test.go index 0e1a85a80c..13fd9a5e1b 100644 --- a/tests/activity_api_update_test.go +++ b/tests/activity_api_update_test.go @@ -16,6 +16,7 @@ import ( sdkclient "go.temporal.io/sdk/client" "go.temporal.io/sdk/temporal" "go.temporal.io/sdk/workflow" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/tests/testcore" "google.golang.org/protobuf/types/known/durationpb" "google.golang.org/protobuf/types/known/fieldmaskpb" @@ -56,367 +57,372 @@ func makeActivityUpdateWorkflowFunc( } } -func TestActivityApiUpdateClientTestSuite(t *testing.T) { - t.Parallel() - t.Run("TestActivityUpdateApi_ChangeRetryInterval", func(t *testing.T) { - s := testcore.NewEnv(t, testcore.WithSdkWorker()) +type ActivityAPIUpdateClientTestSuite struct { + parallelsuite.Suite[*ActivityAPIUpdateClientTestSuite] +} + +func TestActivityAPIUpdateClientTestSuite(t *testing.T) { + parallelsuite.Run(t, &ActivityAPIUpdateClientTestSuite{}) +} - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() +func (s *ActivityAPIUpdateClientTestSuite) TestActivityUpdateApi_ChangeRetryInterval() { + env := testcore.NewEnv(s.T()) - activityUpdated := make(chan struct{}) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - var startedActivityCount atomic.Int32 - activityFunction := func() (string, error) { - 
startedActivityCount.Add(1) - if startedActivityCount.Load() == 1 { - activityErr := errors.New("bad-luck-please-retry") + activityUpdated := make(chan struct{}) - return "", activityErr - } + var startedActivityCount atomic.Int32 + activityFunction := func() (string, error) { + startedActivityCount.Add(1) + if startedActivityCount.Load() == 1 { + activityErr := errors.New("bad-luck-please-retry") - s.WaitForChannel(ctx, activityUpdated) - return "done!", nil + return "", activityErr } - scheduleToCloseTimeout := 30 * time.Minute - retryTimeout := 10 * time.Minute - workflowFn := makeActivityUpdateWorkflowFunc(activityFunction, scheduleToCloseTimeout, retryTimeout) - - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + env.WaitForChannel(ctx, activityUpdated) + return "done!", nil + } - workflowOptions := sdkclient.StartWorkflowOptions{ - ID: activityUpdateWorkflowID, - TaskQueue: s.WorkerTaskQueue(), - } + scheduleToCloseTimeout := 30 * time.Minute + retryTimeout := 10 * time.Minute + workflowFn := makeActivityUpdateWorkflowFunc(activityFunction, scheduleToCloseTimeout, retryTimeout) - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) - s.NoError(err) + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.GetPendingActivities(), 1) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 10*time.Second, 500*time.Millisecond) + workflowOptions := sdkclient.StartWorkflowOptions{ + ID: activityUpdateWorkflowID, + TaskQueue: env.WorkerTaskQueue(), + } - updateRequest := &workflowservice.UpdateActivityOptionsRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - 
}, - Activity: &workflowservice.UpdateActivityOptionsRequest_Id{Id: "activity-id"}, - ActivityOptions: &activitypb.ActivityOptions{ - RetryPolicy: &commonpb.RetryPolicy{ - InitialInterval: durationpb.New(1 * time.Second), - }, + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + s.NoError(err) + + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.GetPendingActivities(), 1) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 10*time.Second, 500*time.Millisecond) + + updateRequest := &workflowservice.UpdateActivityOptionsRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.UpdateActivityOptionsRequest_Id{Id: "activity-id"}, + ActivityOptions: &activitypb.ActivityOptions{ + RetryPolicy: &commonpb.RetryPolicy{ + InitialInterval: durationpb.New(1 * time.Second), }, - UpdateMask: &fieldmaskpb.FieldMask{Paths: []string{"retry_policy.initial_interval"}}, - } - resp, err := s.FrontendClient().UpdateActivityOptions(ctx, updateRequest) - s.NoError(err) - s.NotNil(resp) - - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - s.NoError(err) - s.Len(description.PendingActivities, 1) - - activityUpdated <- struct{}{} - - s.EventuallyWithT(func(t *assert.CollectT) { - description, err = s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Empty(t, description.GetPendingActivities()) - require.Equal(t, int32(2), startedActivityCount.Load()) - }, 3*time.Second, 100*time.Millisecond) - - var out string - err = workflowRun.Get(ctx, &out) - - s.NoError(err) - }) - - t.Run("TestActivityUpdateApi_ChangeScheduleToClose", func(t *testing.T) { - s 
:= testcore.NewEnv(t, testcore.WithSdkWorker()) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - var startedActivityCount atomic.Int32 - activityFunction := func() (string, error) { - startedActivityCount.Add(1) - if startedActivityCount.Load() == 1 { - activityErr := errors.New("bad-luck-please-retry") - return "", activityErr - } - return "done!", nil - } + }, + UpdateMask: &fieldmaskpb.FieldMask{Paths: []string{"retry_policy.initial_interval"}}, + } + resp, err := env.FrontendClient().UpdateActivityOptions(ctx, updateRequest) + s.NoError(err) + s.NotNil(resp) - scheduleToCloseTimeout := 30 * time.Minute - retryTimeout := 10 * time.Minute + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + s.NoError(err) + s.Len(description.PendingActivities, 1) - workflowFn := makeActivityUpdateWorkflowFunc(activityFunction, scheduleToCloseTimeout, retryTimeout) + activityUpdated <- struct{}{} - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + s.EventuallyWithT(func(t *assert.CollectT) { + description, err = env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Empty(t, description.GetPendingActivities()) + require.Equal(t, int32(2), startedActivityCount.Load()) + }, 3*time.Second, 100*time.Millisecond) - workflowOptions := sdkclient.StartWorkflowOptions{ - ID: activityUpdateWorkflowID, - TaskQueue: s.WorkerTaskQueue(), - } + var out string + err = workflowRun.Get(ctx, &out) - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) - s.NoError(err) + s.NoError(err) +} - // wait for activity to start (and fail) - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, 
description.GetPendingActivities(), 1) - require.Equal(t, int32(1), startedActivityCount.Load()) +func (s *ActivityAPIUpdateClientTestSuite) TestActivityUpdateApi_ChangeScheduleToClose() { + env := testcore.NewEnv(s.T()) - }, 2*time.Second, 200*time.Millisecond) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - // update schedule_to_close_timeout - updateRequest := &workflowservice.UpdateActivityOptionsRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.UpdateActivityOptionsRequest_Id{Id: "activity-id"}, - ActivityOptions: &activitypb.ActivityOptions{ - ScheduleToCloseTimeout: durationpb.New(1 * time.Second), - }, - UpdateMask: &fieldmaskpb.FieldMask{Paths: []string{"schedule_to_close_timeout"}}, - } - resp, err := s.FrontendClient().UpdateActivityOptions(ctx, updateRequest) - s.NoError(err) - s.NotNil(resp) - - // activity should fail immediately - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Empty(t, description.GetPendingActivities()) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 2*time.Second, 200*time.Millisecond) - - var out string - err = workflowRun.Get(ctx, &out) - var activityError *temporal.ActivityError - s.ErrorAs(err, &activityError) - // SCHEDULE_TO_CLOSE timeout now returns RETRY_STATE_TIMEOUT instead of RETRY_STATE_NON_RETRYABLE_FAILURE - s.Equal(enumspb.RETRY_STATE_TIMEOUT, activityError.RetryState()) - var timeoutError *temporal.TimeoutError - s.ErrorAs(activityError, &timeoutError) - s.Equal(enumspb.TIMEOUT_TYPE_SCHEDULE_TO_CLOSE, timeoutError.TimeoutType()) - s.Equal(int32(1), startedActivityCount.Load()) - }) - - t.Run("TestActivityUpdateApi_ChangeScheduleToCloseAndRetry", func(t *testing.T) { - // change both schedule to close and 
retry policy - // initial values are chosen in such a way that activity will fail due to schedule to close timeout - // we change schedule to close to a longer value and retry policy to a shorter value - // after that activity should succeed - s := testcore.NewEnv(t, testcore.WithSdkWorker()) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - var startedActivityCount atomic.Int32 - activityFunction := func() (string, error) { - startedActivityCount.Add(1) - if startedActivityCount.Load() == 1 { - activityErr := errors.New("bad-luck-please-retry") - - return "", activityErr - } - return "done!", nil + var startedActivityCount atomic.Int32 + activityFunction := func() (string, error) { + startedActivityCount.Add(1) + if startedActivityCount.Load() == 1 { + activityErr := errors.New("bad-luck-please-retry") + return "", activityErr } + return "done!", nil + } - // make scheduleToClose shorter than retry 2nd retry interval - scheduleToCloseTimeout := 8 * time.Second - retryInterval := 5 * time.Second + scheduleToCloseTimeout := 30 * time.Minute + retryTimeout := 10 * time.Minute - workflowFn := makeActivityUpdateWorkflowFunc( - activityFunction, scheduleToCloseTimeout, retryInterval) + workflowFn := makeActivityUpdateWorkflowFunc(activityFunction, scheduleToCloseTimeout, retryTimeout) - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) - workflowOptions := sdkclient.StartWorkflowOptions{ - ID: activityUpdateWorkflowID, - TaskQueue: s.WorkerTaskQueue(), - } + workflowOptions := sdkclient.StartWorkflowOptions{ + ID: activityUpdateWorkflowID, + TaskQueue: env.WorkerTaskQueue(), + } - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) - s.NoError(err) - - // wait for activity to start (and fail) - s.EventuallyWithT(func(t *assert.CollectT) { - 
require.NotZero(t, startedActivityCount.Load()) - }, 2*time.Second, 200*time.Millisecond) - - // update schedule_to_close_timeout, make it longer - // also update retry policy interval, make it shorter - newScheduleToCloseTimeout := 10 * time.Second - updateRequest := &workflowservice.UpdateActivityOptionsRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.UpdateActivityOptionsRequest_Id{Id: "activity-id"}, - ActivityOptions: &activitypb.ActivityOptions{ - ScheduleToCloseTimeout: durationpb.New(newScheduleToCloseTimeout), - RetryPolicy: &commonpb.RetryPolicy{ - InitialInterval: durationpb.New(1 * time.Second), - }, - }, - UpdateMask: &fieldmaskpb.FieldMask{Paths: []string{"schedule_to_close_timeout", "retry_policy.initial_interval"}}, - } + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + s.NoError(err) + + // wait for activity to start (and fail) + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.GetPendingActivities(), 1) + require.Equal(t, int32(1), startedActivityCount.Load()) + + }, 2*time.Second, 200*time.Millisecond) + + // update schedule_to_close_timeout + updateRequest := &workflowservice.UpdateActivityOptionsRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.UpdateActivityOptionsRequest_Id{Id: "activity-id"}, + ActivityOptions: &activitypb.ActivityOptions{ + ScheduleToCloseTimeout: durationpb.New(1 * time.Second), + }, + UpdateMask: &fieldmaskpb.FieldMask{Paths: []string{"schedule_to_close_timeout"}}, + } + resp, err := env.FrontendClient().UpdateActivityOptions(ctx, updateRequest) + s.NoError(err) + s.NotNil(resp) + + // activity should fail 
immediately + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Empty(t, description.GetPendingActivities()) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 2*time.Second, 200*time.Millisecond) + + var out string + err = workflowRun.Get(ctx, &out) + var activityError *temporal.ActivityError + s.ErrorAs(err, &activityError) + // SCHEDULE_TO_CLOSE timeout now returns RETRY_STATE_TIMEOUT instead of RETRY_STATE_NON_RETRYABLE_FAILURE + s.Equal(enumspb.RETRY_STATE_TIMEOUT, activityError.RetryState()) + var timeoutError *temporal.TimeoutError + s.ErrorAs(activityError, &timeoutError) + s.Equal(enumspb.TIMEOUT_TYPE_SCHEDULE_TO_CLOSE, timeoutError.TimeoutType()) + s.Equal(int32(1), startedActivityCount.Load()) +} - resp, err := s.FrontendClient().UpdateActivityOptions(ctx, updateRequest) - s.NoError(err) - s.NotNil(resp) - // check that the update was successful - s.Equal(int64(newScheduleToCloseTimeout.Seconds()), resp.GetActivityOptions().ScheduleToCloseTimeout.GetSeconds()) - // check that field we didn't update is the same - s.Equal(int64(scheduleToCloseTimeout.Seconds()), resp.GetActivityOptions().StartToCloseTimeout.GetSeconds()) - - // now activity should succeed - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Empty(t, description.GetPendingActivities()) - require.Equal(t, int32(2), startedActivityCount.Load()) - }, 5*time.Second, 200*time.Millisecond) - - var out string - err = workflowRun.Get(ctx, &out) - s.NoError(err) - }) - - t.Run("TestActivityUpdateApi_ResetDefaultOptions", func(t *testing.T) { - // plan: - // 1. start the workflow, wait for activity to start and fail, - // 2. update activity options to change retry policy maximum attempts - // 3. 
reset activity options to default, verify that retry policy is reset to default - // 4. update activity options again, this time change schedule to close timeout and retry policy initial interval - // 5. let activity finish, verify that it finished with updated options - s := testcore.NewEnv(t, testcore.WithSdkWorker()) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - activityUpdated := make(chan struct{}) - - var startedActivityCount atomic.Int32 - activityFunction := func() (string, error) { - startedActivityCount.Add(1) - if startedActivityCount.Load() == 1 { - activityErr := errors.New("bad-luck-please-retry") - - return "", activityErr - } - - s.WaitForChannel(ctx, activityUpdated) - return "done!", nil - } +func (s *ActivityAPIUpdateClientTestSuite) TestActivityUpdateApi_ChangeScheduleToCloseAndRetry() { + // change both schedule to close and retry policy + // initial values are chosen in such a way that activity will fail due to schedule to close timeout + // we change schedule to close to a longer value and retry policy to a shorter value + // after that activity should succeed + env := testcore.NewEnv(s.T()) - scheduleToCloseTimeout := 30 * time.Minute - retryTimeout := 10 * time.Minute - workflowFn := makeActivityUpdateWorkflowFunc(activityFunction, scheduleToCloseTimeout, retryTimeout) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - s.SdkWorker().RegisterWorkflow(workflowFn) - s.SdkWorker().RegisterActivity(activityFunction) + var startedActivityCount atomic.Int32 + activityFunction := func() (string, error) { + startedActivityCount.Add(1) + if startedActivityCount.Load() == 1 { + activityErr := errors.New("bad-luck-please-retry") - workflowOptions := sdkclient.StartWorkflowOptions{ - ID: activityUpdateWorkflowID, - TaskQueue: s.WorkerTaskQueue(), + return "", activityErr } + return "done!", nil + } - workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, 
workflowOptions, workflowFn) - s.NoError(err) - - // wait for activity to start (and fail) - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.GetPendingActivities(), 1) - require.Equal(t, int32(1), startedActivityCount.Load()) - }, 10*time.Second, 500*time.Millisecond) - - // update activity options, set retry policy to 1000 attempts - updateRequest := &workflowservice.UpdateActivityOptionsRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowRun.GetID(), - }, - Activity: &workflowservice.UpdateActivityOptionsRequest_Id{Id: "activity-id"}, - ActivityOptions: &activitypb.ActivityOptions{ - RetryPolicy: &commonpb.RetryPolicy{ - MaximumAttempts: 1000, - }, - }, - UpdateMask: &fieldmaskpb.FieldMask{Paths: []string{"retry_policy.maximum_attempts"}}, - } - resp, err := s.FrontendClient().UpdateActivityOptions(ctx, updateRequest) - s.NoError(err) - s.NotNil(resp) - - // check that the update was successful - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.PendingActivities, 1) - require.Equal(t, int32(1000), description.PendingActivities[0].GetActivityOptions().GetRetryPolicy().GetMaximumAttempts()) - }, 3*time.Second, 200*time.Millisecond) - - // reset activity options to default - updateRequest.ActivityOptions = nil - updateRequest.UpdateMask = &fieldmaskpb.FieldMask{Paths: []string{}} - updateRequest.RestoreOriginal = true - resp, err = s.FrontendClient().UpdateActivityOptions(ctx, updateRequest) - s.NoError(err) - s.NotNil(resp) - - // check that the update was successful - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, 
workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Len(t, description.PendingActivities, 1) - require.Equal(t, int32(defaultMaximumAttempts), description.PendingActivities[0].GetActivityOptions().GetRetryPolicy().GetMaximumAttempts()) - }, 3*time.Second, 200*time.Millisecond) - - // update activity options again, this time set retry interval to 1 second - newScheduleToCloseTimeout := 10 * time.Second - updateRequest.ActivityOptions = &activitypb.ActivityOptions{ + // make scheduleToClose shorter than retry 2nd retry interval + scheduleToCloseTimeout := 8 * time.Second + retryInterval := 5 * time.Second + + workflowFn := makeActivityUpdateWorkflowFunc( + activityFunction, scheduleToCloseTimeout, retryInterval) + + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) + + workflowOptions := sdkclient.StartWorkflowOptions{ + ID: activityUpdateWorkflowID, + TaskQueue: env.WorkerTaskQueue(), + } + + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + s.NoError(err) + + // wait for activity to start (and fail) + s.EventuallyWithT(func(t *assert.CollectT) { + require.NotZero(t, startedActivityCount.Load()) + }, 2*time.Second, 200*time.Millisecond) + + // update schedule_to_close_timeout, make it longer + // also update retry policy interval, make it shorter + newScheduleToCloseTimeout := 10 * time.Second + updateRequest := &workflowservice.UpdateActivityOptionsRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.UpdateActivityOptionsRequest_Id{Id: "activity-id"}, + ActivityOptions: &activitypb.ActivityOptions{ ScheduleToCloseTimeout: durationpb.New(newScheduleToCloseTimeout), RetryPolicy: &commonpb.RetryPolicy{ InitialInterval: durationpb.New(1 * time.Second), }, + }, + UpdateMask: &fieldmaskpb.FieldMask{Paths: []string{"schedule_to_close_timeout", 
"retry_policy.initial_interval"}}, + } + + resp, err := env.FrontendClient().UpdateActivityOptions(ctx, updateRequest) + s.NoError(err) + s.NotNil(resp) + // check that the update was successful + s.Equal(int64(newScheduleToCloseTimeout.Seconds()), resp.GetActivityOptions().ScheduleToCloseTimeout.GetSeconds()) + // check that field we didn't update is the same + s.Equal(int64(scheduleToCloseTimeout.Seconds()), resp.GetActivityOptions().StartToCloseTimeout.GetSeconds()) + + // now activity should succeed + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Empty(t, description.GetPendingActivities()) + require.Equal(t, int32(2), startedActivityCount.Load()) + }, 5*time.Second, 200*time.Millisecond) + + var out string + err = workflowRun.Get(ctx, &out) + s.NoError(err) +} + +func (s *ActivityAPIUpdateClientTestSuite) TestActivityUpdateApi_ResetDefaultOptions() { + // plan: + // 1. start the workflow, wait for activity to start and fail, + // 2. update activity options to change retry policy maximum attempts + // 3. reset activity options to default, verify that retry policy is reset to default + // 4. update activity options again, this time change schedule to close timeout and retry policy initial interval + // 5. 
let activity finish, verify that it finished with updated options + env := testcore.NewEnv(s.T()) + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + activityUpdated := make(chan struct{}) + + var startedActivityCount atomic.Int32 + activityFunction := func() (string, error) { + startedActivityCount.Add(1) + if startedActivityCount.Load() == 1 { + activityErr := errors.New("bad-luck-please-retry") + + return "", activityErr } - updateRequest.UpdateMask = &fieldmaskpb.FieldMask{Paths: []string{"schedule_to_close_timeout", "retry_policy.initial_interval"}} - updateRequest.RestoreOriginal = false - resp, err = s.FrontendClient().UpdateActivityOptions(ctx, updateRequest) - s.NoError(err) - s.NotNil(resp) - - // let activity finish - activityUpdated <- struct{}{} - - // wait for activity to finish - s.EventuallyWithT(func(t *assert.CollectT) { - description, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) - require.NoError(t, err) - require.Empty(t, description.GetPendingActivities()) - require.Equal(t, int32(2), startedActivityCount.Load()) - }, 3*time.Second, 100*time.Millisecond) - - var out string - err = workflowRun.Get(ctx, &out) - - s.NoError(err) - }) + + env.WaitForChannel(ctx, activityUpdated) + return "done!", nil + } + + scheduleToCloseTimeout := 30 * time.Minute + retryTimeout := 10 * time.Minute + workflowFn := makeActivityUpdateWorkflowFunc(activityFunction, scheduleToCloseTimeout, retryTimeout) + + env.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterActivity(activityFunction) + + workflowOptions := sdkclient.StartWorkflowOptions{ + ID: activityUpdateWorkflowID, + TaskQueue: env.WorkerTaskQueue(), + } + + workflowRun, err := env.SdkClient().ExecuteWorkflow(ctx, workflowOptions, workflowFn) + s.NoError(err) + + // wait for activity to start (and fail) + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := 
env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.GetPendingActivities(), 1) + require.Equal(t, int32(1), startedActivityCount.Load()) + }, 10*time.Second, 500*time.Millisecond) + + // update activity options, set retry policy to 1000 attempts + updateRequest := &workflowservice.UpdateActivityOptionsRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowRun.GetID(), + }, + Activity: &workflowservice.UpdateActivityOptionsRequest_Id{Id: "activity-id"}, + ActivityOptions: &activitypb.ActivityOptions{ + RetryPolicy: &commonpb.RetryPolicy{ + MaximumAttempts: 1000, + }, + }, + UpdateMask: &fieldmaskpb.FieldMask{Paths: []string{"retry_policy.maximum_attempts"}}, + } + resp, err := env.FrontendClient().UpdateActivityOptions(ctx, updateRequest) + s.NoError(err) + s.NotNil(resp) + + // check that the update was successful + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.PendingActivities, 1) + require.Equal(t, int32(1000), description.PendingActivities[0].GetActivityOptions().GetRetryPolicy().GetMaximumAttempts()) + }, 3*time.Second, 200*time.Millisecond) + + // reset activity options to default + updateRequest.ActivityOptions = nil + updateRequest.UpdateMask = &fieldmaskpb.FieldMask{Paths: []string{}} + updateRequest.RestoreOriginal = true + resp, err = env.FrontendClient().UpdateActivityOptions(ctx, updateRequest) + s.NoError(err) + s.NotNil(resp) + + // check that the update was successful + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Len(t, description.PendingActivities, 1) + require.Equal(t, 
int32(defaultMaximumAttempts), description.PendingActivities[0].GetActivityOptions().GetRetryPolicy().GetMaximumAttempts()) + }, 3*time.Second, 200*time.Millisecond) + + // update activity options again, this time set retry interval to 1 second + newScheduleToCloseTimeout := 10 * time.Second + updateRequest.ActivityOptions = &activitypb.ActivityOptions{ + ScheduleToCloseTimeout: durationpb.New(newScheduleToCloseTimeout), + RetryPolicy: &commonpb.RetryPolicy{ + InitialInterval: durationpb.New(1 * time.Second), + }, + } + updateRequest.UpdateMask = &fieldmaskpb.FieldMask{Paths: []string{"schedule_to_close_timeout", "retry_policy.initial_interval"}} + updateRequest.RestoreOriginal = false + resp, err = env.FrontendClient().UpdateActivityOptions(ctx, updateRequest) + s.NoError(err) + s.NotNil(resp) + + // let activity finish + activityUpdated <- struct{}{} + + // wait for activity to finish + s.EventuallyWithT(func(t *assert.CollectT) { + description, err := env.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) + require.NoError(t, err) + require.Empty(t, description.GetPendingActivities()) + require.Equal(t, int32(2), startedActivityCount.Load()) + }, 3*time.Second, 100*time.Millisecond) + + var out string + err = workflowRun.Get(ctx, &out) + + s.NoError(err) } diff --git a/tests/activity_test.go b/tests/activity_test.go index 60cce2c9db..036af5dd38 100644 --- a/tests/activity_test.go +++ b/tests/activity_test.go @@ -908,6 +908,8 @@ func (s *ActivityTestSuite) TestActivityHeartBeatWorkflow_Timeout() { } func (s *ActivityTestSuite) TestTryActivityCancellationFromWorkflow() { + ctx := testcore.NewContext() + id := "functional-activity-cancellation-test" wt := "functional-activity-cancellation-test-type" tl := "functional-activity-cancellation-test-taskqueue" @@ -930,7 +932,7 @@ func (s *ActivityTestSuite) TestTryActivityCancellationFromWorkflow() { Identity: identity, } - we, err0 := 
s.FrontendClient().StartWorkflowExecution(testcore.NewContext(), request) + we, err0 := s.FrontendClient().StartWorkflowExecution(ctx, request) s.NoError(err0) s.Logger.Info("StartWorkflowExecution: response", tag.WorkflowRunID(we.GetRunId())) @@ -1045,7 +1047,12 @@ func (s *ActivityTestSuite) TestTryActivityCancellationFromWorkflow() { s.True(err == nil || errors.Is(err, testcore.ErrNoTasks)) s.Logger.Info("Waiting for cancel to complete.", tag.WorkflowRunID(we.RunId)) - <-cancelCh + select { + case <-cancelCh: + case <-ctx.Done(): + s.Fail("Test timed out for activity cancellation", ctx.Err()) + return + } s.True(activityCanceled, "Activity was not cancelled.") s.Logger.Info("Activity cancelled.", tag.WorkflowRunID(we.RunId)) } diff --git a/tests/admin_test.go b/tests/admin_test.go index a045aa2203..1f026494a5 100644 --- a/tests/admin_test.go +++ b/tests/admin_test.go @@ -6,7 +6,6 @@ import ( "time" "github.com/google/uuid" - "github.com/stretchr/testify/suite" commonpb "go.temporal.io/api/common/v1" sdkclient "go.temporal.io/sdk/client" "go.temporal.io/sdk/workflow" @@ -15,43 +14,37 @@ import ( "go.temporal.io/server/chasm" "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/primitives/timestamp" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/common/testing/testvars" "go.temporal.io/server/tests/testcore" ) type AdminTestSuite struct { - testcore.FunctionalTestBase - testContext context.Context + parallelsuite.Suite[*AdminTestSuite] } -func TestAdminTestSuite(t *testing.T) { - t.Parallel() - suite.Run(t, new(AdminTestSuite)) +func TestAdminRebuildMutableState_ChasmDisabled(t *testing.T) { + parallelsuite.Run(t, &AdminTestSuite{}, false) } -func (s *AdminTestSuite) SetupSuite() { - // Call parent setup to initialize the test cluster - s.FunctionalTestBase.SetupSuite() - s.testContext = context.Background() +func TestAdminRebuildMutableState_ChasmEnabled(t *testing.T) { + parallelsuite.Run(t, &AdminTestSuite{}, 
true) } -func (s *AdminTestSuite) TestAdminRebuildMutableState_ChasmDisabled() { - rebuildMutableStateWorkflowHelper(s, false) -} - -func (s *AdminTestSuite) TestAdminRebuildMutableState_ChasmEnabled() { - cleanup := s.OverrideDynamicConfig(dynamicconfig.EnableChasm, true) - defer cleanup() +func (s *AdminTestSuite) TestAdminRebuildMutableState(testWithChasm bool) { + var opts []testcore.TestOption + if testWithChasm { + opts = append(opts, testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true)) + } + env := testcore.NewEnv(s.T(), opts...) - configValues := s.GetTestCluster().Host().DcClient().GetValue(dynamicconfig.EnableChasm.Key()) - s.NotEmpty(configValues, "EnableChasm config should be set") - configValue, _ := configValues[0].Value.(bool) - s.True(configValue, "EnableChasm config should be true") - rebuildMutableStateWorkflowHelper(s, true) -} + if testWithChasm { + configValues := env.GetTestCluster().Host().DcClient().GetValue(dynamicconfig.EnableChasm.Key()) + s.NotEmpty(configValues, "EnableChasm config should be set") + configValue, _ := configValues[0].Value.(bool) + s.True(configValue, "EnableChasm config should be true") + } -// common test helper -func rebuildMutableStateWorkflowHelper(s *AdminTestSuite, testWithChasm bool) { tv := testvars.New(s.T()) workflowFn := func(ctx workflow.Context) error { var randomUUID string @@ -65,18 +58,18 @@ func rebuildMutableStateWorkflowHelper(s *AdminTestSuite, testWithChasm bool) { return nil } - s.SdkWorker().RegisterWorkflow(workflowFn) + env.SdkWorker().RegisterWorkflow(workflowFn) workflowID := tv.Any().String() workflowOptions := sdkclient.StartWorkflowOptions{ ID: workflowID, - TaskQueue: s.TaskQueue(), + TaskQueue: env.WorkerTaskQueue(), WorkflowRunTimeout: 20 * time.Second, } - ctx, cancel := context.WithTimeout(s.testContext, 30*time.Second) + ctx, cancel := context.WithTimeout(env.Context(), 30*time.Second) defer cancel() - workflowRun, err := s.SdkClient().ExecuteWorkflow(s.testContext, 
workflowOptions, workflowFn) + workflowRun, err := env.SdkClient().ExecuteWorkflow(env.Context(), workflowOptions, workflowFn) s.NoError(err) runID := workflowRun.GetRunID() @@ -92,8 +85,8 @@ func rebuildMutableStateWorkflowHelper(s *AdminTestSuite, testWithChasm bool) { var response1 *adminservice.DescribeMutableStateResponse for { - response1, err = s.AdminClient().DescribeMutableState(ctx, &adminservice.DescribeMutableStateRequest{ - Namespace: s.Namespace().String(), + response1, err = env.AdminClient().DescribeMutableState(ctx, &adminservice.DescribeMutableStateRequest{ + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowID, RunId: runID, @@ -112,8 +105,8 @@ func rebuildMutableStateWorkflowHelper(s *AdminTestSuite, testWithChasm bool) { time.Sleep(20 * time.Millisecond) //nolint:forbidigo } - _, err = s.AdminClient().RebuildMutableState(ctx, &adminservice.RebuildMutableStateRequest{ - Namespace: s.Namespace().String(), + _, err = env.AdminClient().RebuildMutableState(ctx, &adminservice.RebuildMutableStateRequest{ + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowID, RunId: runID, @@ -121,8 +114,8 @@ func rebuildMutableStateWorkflowHelper(s *AdminTestSuite, testWithChasm bool) { }) s.NoError(err) - response2, err := s.AdminClient().DescribeMutableState(ctx, &adminservice.DescribeMutableStateRequest{ - Namespace: s.Namespace().String(), + response2, err := env.AdminClient().DescribeMutableState(ctx, &adminservice.DescribeMutableStateRequest{ + Namespace: env.Namespace().String(), Execution: &commonpb.WorkflowExecution{ WorkflowId: workflowID, RunId: runID, diff --git a/tests/client_misc_test.go b/tests/client_misc_test.go index 3b98c76d72..4f86345620 100644 --- a/tests/client_misc_test.go +++ b/tests/client_misc_test.go @@ -829,6 +829,7 @@ func (s *ClientMiscTestSuite) Test_BufferedQuery() { // wait until first wf task started wfStarted.Wait() + describeErrCh := 
make(chan error, 1) go func() { // sleep 2s to make sure DescribeMutableState is called after QueryWorkflow time.Sleep(2 * time.Second) //nolint:forbidigo @@ -841,7 +842,7 @@ func (s *ClientMiscTestSuite) Test_BufferedQuery() { }, Archetype: chasm.WorkflowArchetype, }) - s.Assert().NoError(err) + describeErrCh <- err }() // this query will be buffered in mutable state because workflow task is in-flight. @@ -855,6 +856,7 @@ func (s *ClientMiscTestSuite) Test_BufferedQuery() { err = workflowRun.Get(ctx, nil) s.NoError(err) + s.NoError(<-describeErrCh) // assert on test goroutine after workflow completes } func (s *ClientMiscTestSuite) assertHistory(wid, rid string, expected []enumspb.EventType) { diff --git a/tests/links_test.go b/tests/links_test.go index 84bc344198..210808219b 100644 --- a/tests/links_test.go +++ b/tests/links_test.go @@ -6,22 +6,22 @@ import ( "time" "github.com/google/uuid" - "github.com/stretchr/testify/suite" commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" taskqueuepb "go.temporal.io/api/taskqueue/v1" "go.temporal.io/api/workflowservice/v1" "go.temporal.io/sdk/client" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/common/testing/protorequire" "go.temporal.io/server/tests/testcore" ) type LinksSuite struct { - testcore.FunctionalTestBase + parallelsuite.Suite[*LinksSuite] } func TestLinksTestSuite(t *testing.T) { - suite.Run(t, new(LinksSuite)) + parallelsuite.Run(t, &LinksSuite{}) } var links = []*commonpb.Link{ @@ -37,9 +37,10 @@ var links = []*commonpb.Link{ } func (s *LinksSuite) TestTerminateWorkflow_LinksAttachedToEvent() { + env := testcore.NewEnv(s.T()) ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) defer cancel() - run, err := s.SdkClient().ExecuteWorkflow( + run, err := env.SdkClient().ExecuteWorkflow( ctx, client.StartWorkflowOptions{ TaskQueue: "dont-care", @@ -49,8 +50,8 @@ func (s *LinksSuite) TestTerminateWorkflow_LinksAttachedToEvent() { 
s.NoError(err) // TODO(bergundy): Use SdkClient if and when it exposes links on TerminateWorkflow. - _, err = s.FrontendClient().TerminateWorkflowExecution(ctx, &workflowservice.TerminateWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), + _, err = env.FrontendClient().TerminateWorkflowExecution(ctx, &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), WorkflowExecution: &commonpb.WorkflowExecution{ WorkflowId: run.GetID(), }, @@ -59,16 +60,17 @@ func (s *LinksSuite) TestTerminateWorkflow_LinksAttachedToEvent() { }) s.NoError(err) - history := s.SdkClient().GetWorkflowHistory(ctx, run.GetID(), "", false, enumspb.HISTORY_EVENT_FILTER_TYPE_CLOSE_EVENT) + history := env.SdkClient().GetWorkflowHistory(ctx, run.GetID(), "", false, enumspb.HISTORY_EVENT_FILTER_TYPE_CLOSE_EVENT) event, err := history.Next() s.NoError(err) protorequire.ProtoSliceEqual(s.T(), links, event.Links) } func (s *LinksSuite) TestRequestCancelWorkflow_LinksAttachedToEvent() { + env := testcore.NewEnv(s.T()) ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) defer cancel() - run, err := s.SdkClient().ExecuteWorkflow( + run, err := env.SdkClient().ExecuteWorkflow( ctx, client.StartWorkflowOptions{ TaskQueue: "dont-care", @@ -78,8 +80,8 @@ func (s *LinksSuite) TestRequestCancelWorkflow_LinksAttachedToEvent() { s.NoError(err) // TODO(bergundy): Use SdkClient if and when it exposes links on CancelWorkflow. 
- _, err = s.FrontendClient().RequestCancelWorkflowExecution(ctx, &workflowservice.RequestCancelWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), + _, err = env.FrontendClient().RequestCancelWorkflowExecution(ctx, &workflowservice.RequestCancelWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), WorkflowExecution: &commonpb.WorkflowExecution{ WorkflowId: run.GetID(), }, @@ -88,7 +90,7 @@ func (s *LinksSuite) TestRequestCancelWorkflow_LinksAttachedToEvent() { }) s.NoError(err) - history := s.SdkClient().GetWorkflowHistory(ctx, run.GetID(), "", false, enumspb.HISTORY_EVENT_FILTER_TYPE_ALL_EVENT) + history := env.SdkClient().GetWorkflowHistory(ctx, run.GetID(), "", false, enumspb.HISTORY_EVENT_FILTER_TYPE_ALL_EVENT) foundEvent := false for history.HasNext() { event, err := history.Next() @@ -103,9 +105,10 @@ func (s *LinksSuite) TestRequestCancelWorkflow_LinksAttachedToEvent() { } func (s *LinksSuite) TestSignalWorkflowExecution_LinksAttachedToEvent() { + env := testcore.NewEnv(s.T()) ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) defer cancel() - run, err := s.SdkClient().ExecuteWorkflow( + run, err := env.SdkClient().ExecuteWorkflow( ctx, client.StartWorkflowOptions{ TaskQueue: "dont-care", @@ -115,8 +118,8 @@ func (s *LinksSuite) TestSignalWorkflowExecution_LinksAttachedToEvent() { s.NoError(err) // TODO(bergundy): Use SdkClient if and when it exposes links on SignalWorkflow. 
- _, err = s.FrontendClient().SignalWorkflowExecution(ctx, &workflowservice.SignalWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), + _, err = env.FrontendClient().SignalWorkflowExecution(ctx, &workflowservice.SignalWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), WorkflowExecution: &commonpb.WorkflowExecution{ WorkflowId: run.GetID(), }, @@ -127,7 +130,7 @@ func (s *LinksSuite) TestSignalWorkflowExecution_LinksAttachedToEvent() { }) s.NoError(err) - history := s.SdkClient().GetWorkflowHistory(ctx, run.GetID(), "", false, enumspb.HISTORY_EVENT_FILTER_TYPE_ALL_EVENT) + history := env.SdkClient().GetWorkflowHistory(ctx, run.GetID(), "", false, enumspb.HISTORY_EVENT_FILTER_TYPE_ALL_EVENT) foundEvent := false for history.HasNext() { event, err := history.Next() @@ -142,6 +145,7 @@ func (s *LinksSuite) TestSignalWorkflowExecution_LinksAttachedToEvent() { } func (s *LinksSuite) TestSignalWithStartWorkflowExecution_LinksAttachedToRelevantEvents() { + env := testcore.NewEnv(s.T()) ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) defer cancel() @@ -149,7 +153,7 @@ func (s *LinksSuite) TestSignalWithStartWorkflowExecution_LinksAttachedToRelevan // TODO(bergundy): Use SdkClient if and when it exposes links on SignalWithStartWorkflow. request := &workflowservice.SignalWithStartWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), WorkflowId: workflowID, WorkflowType: &commonpb.WorkflowType{ Name: "dont-care", @@ -162,15 +166,15 @@ func (s *LinksSuite) TestSignalWithStartWorkflowExecution_LinksAttachedToRelevan RequestId: uuid.NewString(), Links: links, } - _, err := s.FrontendClient().SignalWithStartWorkflowExecution(ctx, request) + _, err := env.FrontendClient().SignalWithStartWorkflowExecution(ctx, request) s.NoError(err) // Send a second request and verify that the new signal has links attached to it too. 
request.RequestId = uuid.NewString() - _, err = s.FrontendClient().SignalWithStartWorkflowExecution(ctx, request) + _, err = env.FrontendClient().SignalWithStartWorkflowExecution(ctx, request) s.NoError(err) - history := s.SdkClient().GetWorkflowHistory(ctx, workflowID, "", false, enumspb.HISTORY_EVENT_FILTER_TYPE_ALL_EVENT) + history := env.SdkClient().GetWorkflowHistory(ctx, workflowID, "", false, enumspb.HISTORY_EVENT_FILTER_TYPE_ALL_EVENT) foundStartEvent := false foundFirstSignal := false foundSecondSignal := false diff --git a/tests/max_buffered_event_test.go b/tests/max_buffered_event_test.go index d85ecac537..4062e0c6b4 100644 --- a/tests/max_buffered_event_test.go +++ b/tests/max_buffered_event_test.go @@ -15,12 +15,20 @@ import ( "go.temporal.io/server/common" "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/payloads" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/tests/testcore" ) +type MaxBufferedEventSuite struct { + parallelsuite.Suite[*MaxBufferedEventSuite] +} + func TestMaxBufferedEventSuite(t *testing.T) { - t.Parallel() - commonOpts := []testcore.TestOption{ + parallelsuite.Run(t, &MaxBufferedEventSuite{}) +} + +func (s *MaxBufferedEventSuite) opts() []testcore.TestOption { + return []testcore.TestOption{ testcore.WithSdkWorker(), // Set MaximumBufferedEventsSizeInBytes high so we don't hit that limit. testcore.WithDynamicConfig(dynamicconfig.MaximumBufferedEventsSizeInBytes, 10*1024*1024), // 10MB @@ -28,205 +36,205 @@ func TestMaxBufferedEventSuite(t *testing.T) { // Set MutableStateSizeLimitError low so buffered events exhaust mutable state size. testcore.WithDynamicConfig(dynamicconfig.MutableStateSizeLimitError, 410*1024), // 410KB } +} - t.Run("MaxBufferedEventsLimit", func(t *testing.T) { - s := testcore.NewEnv(t, commonOpts...) - - /* - This test starts a workflow, and block its workflow task, then sending - signals to it which will be buffered. 
The default max buffered event - count limit is 100. When the test sends 101 signal, the blocked workflow - task will be forced to close. - */ - closeStartChanOnce := sync.Once{} - waitStartChan := make(chan struct{}) - waitSignalChan := make(chan struct{}) - - localActivityFn := func(ctx context.Context) error { - // notify that workflow task has started - closeStartChanOnce.Do(func() { - close(waitStartChan) - }) - - // block workflow task so all signals will be buffered. - <-waitSignalChan - return nil - } +func (s *MaxBufferedEventSuite) TestMaxBufferedEventsLimit() { + env := testcore.NewEnv(s.T(), s.opts()...) + + /* + This test starts a workflow, and block its workflow task, then sending + signals to it which will be buffered. The default max buffered event + count limit is 100. When the test sends 101 signal, the blocked workflow + task will be forced to close. + */ + closeStartChanOnce := sync.Once{} + waitStartChan := make(chan struct{}) + waitSignalChan := make(chan struct{}) + + localActivityFn := func(ctx context.Context) error { + // notify that workflow task has started + closeStartChanOnce.Do(func() { + close(waitStartChan) + }) + + // block workflow task so all signals will be buffered. 
+ <-waitSignalChan + return nil + } - workflowFn := func(ctx workflow.Context) (int, error) { - ctx1 := workflow.WithLocalActivityOptions(ctx, workflow.LocalActivityOptions{ - StartToCloseTimeout: 20 * time.Second, - RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, - }) - f1 := workflow.ExecuteLocalActivity(ctx1, localActivityFn) - if err := f1.Get(ctx, nil); err != nil { - return 0, err - } - - sigCh := workflow.GetSignalChannel(ctx, "test-signal") - - sigCount := 0 - for sigCh.ReceiveAsync(nil) { - sigCount++ - } - return sigCount, nil + workflowFn := func(ctx workflow.Context) (int, error) { + ctx1 := workflow.WithLocalActivityOptions(ctx, workflow.LocalActivityOptions{ + StartToCloseTimeout: 20 * time.Second, + RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, + }) + f1 := workflow.ExecuteLocalActivity(ctx1, localActivityFn) + if err := f1.Get(ctx, nil); err != nil { + return 0, err } - s.SdkWorker().RegisterWorkflow(workflowFn) + sigCh := workflow.GetSignalChannel(ctx, "test-signal") - testCtx, cancel := context.WithTimeout(context.Background(), time.Second*20) - defer cancel() - - wid := "test-max-buffered-events-limit" - wf1, err1 := s.SdkClient().ExecuteWorkflow(testCtx, client.StartWorkflowOptions{ - ID: wid, - TaskQueue: s.WorkerTaskQueue(), - WorkflowTaskTimeout: time.Second * 20, - }, workflowFn) + sigCount := 0 + for sigCh.ReceiveAsync(nil) { + sigCount++ + } + return sigCount, nil + } - s.NoError(err1) + env.SdkWorker().RegisterWorkflow(workflowFn) - // block until workflow task started - <-waitStartChan + testCtx, cancel := context.WithTimeout(context.Background(), time.Second*20) + defer cancel() - // now send 100 signals, all of them will be buffered - for i := range 100 { - err := s.SdkClient().SignalWorkflow(testCtx, wid, "", "test-signal", i) - s.NoError(err) - } + wid := "test-max-buffered-events-limit" + wf1, err1 := env.SdkClient().ExecuteWorkflow(testCtx, client.StartWorkflowOptions{ + ID: wid, + TaskQueue: 
env.WorkerTaskQueue(), + WorkflowTaskTimeout: time.Second * 20, + }, workflowFn) - // send 101 signal, this will fail the started workflow task - err := s.SdkClient().SignalWorkflow(testCtx, wid, "", "test-signal", 100) - s.NoError(err) + s.NoError(err1) - // unblock goroutine that runs local activity - close(waitSignalChan) + // block until workflow task started + <-waitStartChan - var sigCount int - err = wf1.Get(testCtx, &sigCount) + // now send 100 signals, all of them will be buffered + for i := range 100 { + err := env.SdkClient().SignalWorkflow(testCtx, wid, "", "test-signal", i) s.NoError(err) - s.Equal(101, sigCount) - - historyEvents := s.GetHistory(s.Namespace().String(), &commonpb.WorkflowExecution{WorkflowId: wf1.GetID()}) - // Not using historyrequire here because history is not deterministic. - var failedCause enumspb.WorkflowTaskFailedCause - var failedCount int - for _, evt := range historyEvents { - if evt.GetEventType() == enumspb.EVENT_TYPE_WORKFLOW_TASK_FAILED { - failedCause = evt.GetWorkflowTaskFailedEventAttributes().Cause - failedCount++ - } - } - s.Equal(enumspb.WORKFLOW_TASK_FAILED_CAUSE_FORCE_CLOSE_COMMAND, failedCause) - s.Equal(1, failedCount) - }) - - t.Run("BufferedEventsMutableStateSizeLimit", func(t *testing.T) { - s := testcore.NewEnv(t, commonOpts...) - - /* - This test starts a workflow, and blocks its workflow task, then sends - signals to it which will be buffered. The test is configured with - MaximumBufferedEventsSizeInBytes set to 10MB (high) and MutableStateSizeLimitError - set to 410KB (low). Each signal has a 100KB payload. The first three signals - succeed, and the fourth signal causes the mutable state size to exceed the limit, - resulting in workflow termination. 
- */ - closeStartChanOnce := sync.Once{} - waitStartChan := make(chan struct{}) - waitSignalChan := make(chan struct{}) - - localActivityFn := func(ctx context.Context) error { - // notify that workflow task has started - closeStartChanOnce.Do(func() { - close(waitStartChan) - }) - - // block workflow task so all signals will be buffered. - <-waitSignalChan - return nil - } + } - workflowFn := func(ctx workflow.Context) (int, error) { - ctx1 := workflow.WithLocalActivityOptions(ctx, workflow.LocalActivityOptions{ - StartToCloseTimeout: 20 * time.Second, - RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, - }) - f1 := workflow.ExecuteLocalActivity(ctx1, localActivityFn) - if err := f1.Get(ctx, nil); err != nil { - return 0, err - } - - sigCh := workflow.GetSignalChannel(ctx, "test-signal") - - sigCount := 0 - for sigCh.ReceiveAsync(nil) { - sigCount++ - } - return sigCount, nil + // send 101 signal, this will fail the started workflow task + err := env.SdkClient().SignalWorkflow(testCtx, wid, "", "test-signal", 100) + s.NoError(err) + + // unblock goroutine that runs local activity + close(waitSignalChan) + + var sigCount int + err = wf1.Get(testCtx, &sigCount) + s.NoError(err) + s.Equal(101, sigCount) + + historyEvents := env.GetHistory(env.Namespace().String(), &commonpb.WorkflowExecution{WorkflowId: wf1.GetID()}) + // Not using historyrequire here because history is not deterministic. 
+ var failedCause enumspb.WorkflowTaskFailedCause + var failedCount int + for _, evt := range historyEvents { + if evt.GetEventType() == enumspb.EVENT_TYPE_WORKFLOW_TASK_FAILED { + failedCause = evt.GetWorkflowTaskFailedEventAttributes().Cause + failedCount++ } + } + s.Equal(enumspb.WORKFLOW_TASK_FAILED_CAUSE_FORCE_CLOSE_COMMAND, failedCause) + s.Equal(1, failedCount) +} - s.SdkWorker().RegisterWorkflow(workflowFn) - - testCtx, cancel := context.WithTimeout(context.Background(), 40*time.Second) - defer cancel() - - wid := "test-max-buffered-events-limit" - wf1, err1 := s.SdkClient().ExecuteWorkflow(testCtx, client.StartWorkflowOptions{ - ID: wid, - TaskQueue: s.WorkerTaskQueue(), - WorkflowTaskTimeout: time.Second * 20, - }, workflowFn) +func (s *MaxBufferedEventSuite) TestBufferedEventsMutableStateSizeLimit() { + env := testcore.NewEnv(s.T(), s.opts()...) + + /* + This test starts a workflow, and blocks its workflow task, then sends + signals to it which will be buffered. The test is configured with + MaximumBufferedEventsSizeInBytes set to 10MB (high) and MutableStateSizeLimitError + set to 410KB (low). Each signal has a 100KB payload. The first three signals + succeed, and the fourth signal causes the mutable state size to exceed the limit, + resulting in workflow termination. + */ + closeStartChanOnce := sync.Once{} + waitStartChan := make(chan struct{}) + waitSignalChan := make(chan struct{}) + + localActivityFn := func(ctx context.Context) error { + // notify that workflow task has started + closeStartChanOnce.Do(func() { + close(waitStartChan) + }) + + // block workflow task so all signals will be buffered. 
+ <-waitSignalChan + return nil + } - s.NoError(err1) + workflowFn := func(ctx workflow.Context) (int, error) { + ctx1 := workflow.WithLocalActivityOptions(ctx, workflow.LocalActivityOptions{ + StartToCloseTimeout: 20 * time.Second, + RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, + }) + f1 := workflow.ExecuteLocalActivity(ctx1, localActivityFn) + if err := f1.Get(ctx, nil); err != nil { + return 0, err + } - // block until workflow task started - <-waitStartChan + sigCh := workflow.GetSignalChannel(ctx, "test-signal") - // now send signals with 100KB payload each, which will be buffered - buf := make([]byte, 100*1024) // 100KB - // fill the slice with random data to make sure the - // encoder does not zero out the data - _, err := rand.Read(buf) - s.NoError(err) - largePayload := payloads.EncodeBytes(buf) - - // Send signals until mutable state size limit is exceeded. - // With 410KB limit and 100KB payloads, the first 3 signals succeed but the 4th exceeds the limit. - // First three signals should succeed. 
- for i := range 3 { - err = s.SdkClient().SignalWorkflow(testCtx, wid, "", "test-signal", largePayload) - s.NoError(err, "Signal %d should succeed", i+1) + sigCount := 0 + for sigCh.ReceiveAsync(nil) { + sigCount++ } + return sigCount, nil + } - // Fourth signal should fail due to mutable state size limit - err = s.SdkClient().SignalWorkflow(testCtx, wid, "", "test-signal", largePayload) - s.Error(err, "Fourth signal should fail due to mutable state size limit") - s.Contains(err.Error(), "mutable state size exceeds limit", "Expected mutable state size limit error") - - // unblock goroutine that runs local activity - close(waitSignalChan) - - var sigCount int - err = wf1.Get(testCtx, &sigCount) - // The workflow should be terminated, so we expect an error - s.Error(err) - - historyEvents := s.GetHistory(s.Namespace().String(), &commonpb.WorkflowExecution{WorkflowId: wf1.GetID()}) - - // Verify that the workflow was terminated due to mutable state size limit - var terminated bool - var terminationReason string - for _, evt := range historyEvents { - if evt.GetEventType() == enumspb.EVENT_TYPE_WORKFLOW_EXECUTION_TERMINATED { - terminated = true - attrs := evt.GetWorkflowExecutionTerminatedEventAttributes() - terminationReason = attrs.GetReason() - break - } + env.SdkWorker().RegisterWorkflow(workflowFn) + + testCtx, cancel := context.WithTimeout(context.Background(), 40*time.Second) + defer cancel() + + wid := "test-max-buffered-events-limit" + wf1, err1 := env.SdkClient().ExecuteWorkflow(testCtx, client.StartWorkflowOptions{ + ID: wid, + TaskQueue: env.WorkerTaskQueue(), + WorkflowTaskTimeout: time.Second * 20, + }, workflowFn) + + s.NoError(err1) + + // block until workflow task started + <-waitStartChan + + // now send signals with 100KB payload each, which will be buffered + buf := make([]byte, 100*1024) // 100KB + // fill the slice with random data to make sure the + // encoder does not zero out the data + _, err := rand.Read(buf) + s.NoError(err) + largePayload 
:= payloads.EncodeBytes(buf) + + // Send signals until mutable state size limit is exceeded. + // With 410KB limit and 100KB payloads, the first 3 signals succeed but the 4th exceeds the limit. + // First three signals should succeed. + for i := range 3 { + err = env.SdkClient().SignalWorkflow(testCtx, wid, "", "test-signal", largePayload) + s.NoError(err, "Signal %d should succeed", i+1) + } + + // Fourth signal should fail due to mutable state size limit + err = env.SdkClient().SignalWorkflow(testCtx, wid, "", "test-signal", largePayload) + s.Error(err, "Fourth signal should fail due to mutable state size limit") + s.Contains(err.Error(), "mutable state size exceeds limit", "Expected mutable state size limit error") + + // unblock goroutine that runs local activity + close(waitSignalChan) + + var sigCount int + err = wf1.Get(testCtx, &sigCount) + // The workflow should be terminated, so we expect an error + s.Error(err) + + historyEvents := env.GetHistory(env.Namespace().String(), &commonpb.WorkflowExecution{WorkflowId: wf1.GetID()}) + + // Verify that the workflow was terminated due to mutable state size limit + var terminated bool + var terminationReason string + for _, evt := range historyEvents { + if evt.GetEventType() == enumspb.EVENT_TYPE_WORKFLOW_EXECUTION_TERMINATED { + terminated = true + attrs := evt.GetWorkflowExecutionTerminatedEventAttributes() + terminationReason = attrs.GetReason() + break } - s.True(terminated, "Expected workflow to be terminated") - s.Equal(common.FailureReasonMutableStateSizeExceedsLimit, terminationReason, - "Expected workflow to be terminated due to mutable state size limit") - }) + } + s.True(terminated, "Expected workflow to be terminated") + s.Equal(common.FailureReasonMutableStateSizeExceedsLimit, terminationReason, + "Expected workflow to be terminated due to mutable state size limit") } diff --git a/tests/nexus_api_test.go b/tests/nexus_api_test.go index 57d96fcd97..5faeabdece 100644 --- a/tests/nexus_api_test.go +++ 
b/tests/nexus_api_test.go @@ -131,6 +131,7 @@ func (s *NexusApiTestSuite) TestNexusStartOperation_Outcomes() { s.Equal("http://localhost/callback", start.Callback) s.Equal("request-id", start.RequestId) s.Equal("value", res.Request.Header["key"]) + s.NotContains(res.Request.Header, "temporal-nexus-failure-support") s.Len(start.GetLinks(), 1) s.Equal(callerNexusLink.URL.String(), start.Links[0].GetUrl()) s.Equal(callerNexusLink.Type, start.Links[0].Type) @@ -286,7 +287,7 @@ func (s *NexusApiTestSuite) TestNexusStartOperation_Outcomes() { pollerErrCh := s.nexusTaskPoller(ctx, tc.endpoint.Spec.Target.GetWorker().TaskQueue, tc.handler) eventuallyTick := 500 * time.Millisecond - header := nexus.Header{"key": "value"} + header := nexus.Header{"key": "value", "temporal-nexus-failure-support": "true"} if tc.timeout > 0 { eventuallyTick = tc.timeout + (100 * time.Millisecond) header[nexus.HeaderRequestTimeout] = tc.timeout.String() diff --git a/tests/nexus_endpoint_test.go b/tests/nexus_endpoint_test.go index 9e06880703..4a5a709c82 100644 --- a/tests/nexus_endpoint_test.go +++ b/tests/nexus_endpoint_test.go @@ -1,6 +1,7 @@ package tests import ( + "context" "fmt" "strings" "testing" @@ -55,21 +56,22 @@ func (s *CommonSuite) TestListOrdering() { // create some endpoints numEndpoints := 40 // minimum number of endpoints to test, there may be more in DB from other tests + ctx := testcore.NewContext() for range numEndpoints { - s.createNexusEndpoint(testcore.RandomizeStr("test-endpoint-name")) + s.createNexusEndpoint(ctx, testcore.RandomizeStr("test-endpoint-name")) } tableVersion := initialTableVersion + int64(numEndpoints) // list from persistence manager level persistence := s.GetTestCluster().TestBase().NexusEndpointManager - persistenceResp1, err := persistence.ListNexusEndpoints(testcore.NewContext(), &p.ListNexusEndpointsRequest{ + persistenceResp1, err := persistence.ListNexusEndpoints(ctx, &p.ListNexusEndpointsRequest{ LastKnownTableVersion: tableVersion, PageSize: 
numEndpoints / 2, }) s.NoError(err) s.Len(persistenceResp1.Entries, numEndpoints/2) s.NotNil(persistenceResp1.NextPageToken) - persistenceResp2, err := persistence.ListNexusEndpoints(testcore.NewContext(), &p.ListNexusEndpointsRequest{ + persistenceResp2, err := persistence.ListNexusEndpoints(ctx, &p.ListNexusEndpointsRequest{ LastKnownTableVersion: tableVersion, PageSize: numEndpoints / 2, NextPageToken: persistenceResp1.NextPageToken, @@ -79,14 +81,14 @@ func (s *CommonSuite) TestListOrdering() { // list from matching level matchingClient := s.GetTestCluster().MatchingClient() - matchingResp1, err := matchingClient.ListNexusEndpoints(testcore.NewContext(), &matchingservice.ListNexusEndpointsRequest{ + matchingResp1, err := matchingClient.ListNexusEndpoints(ctx, &matchingservice.ListNexusEndpointsRequest{ LastKnownTableVersion: tableVersion, PageSize: int32(numEndpoints / 2), }) s.NoError(err) s.Len(matchingResp1.Entries, numEndpoints/2) s.NotNil(matchingResp1.NextPageToken) - matchingResp2, err := matchingClient.ListNexusEndpoints(testcore.NewContext(), &matchingservice.ListNexusEndpointsRequest{ + matchingResp2, err := matchingClient.ListNexusEndpoints(ctx, &matchingservice.ListNexusEndpointsRequest{ LastKnownTableVersion: tableVersion, PageSize: int32(numEndpoints / 2), NextPageToken: matchingResp1.NextPageToken, @@ -95,13 +97,13 @@ func (s *CommonSuite) TestListOrdering() { s.Len(matchingResp2.Entries, numEndpoints/2) // list from operator level - operatorResp1, err := s.OperatorClient().ListNexusEndpoints(testcore.NewContext(), &operatorservice.ListNexusEndpointsRequest{ + operatorResp1, err := s.OperatorClient().ListNexusEndpoints(ctx, &operatorservice.ListNexusEndpointsRequest{ PageSize: int32(numEndpoints / 2), }) s.NoError(err) s.Len(operatorResp1.Endpoints, numEndpoints/2) s.NotNil(operatorResp1.NextPageToken) - operatorResp2, err := s.OperatorClient().ListNexusEndpoints(testcore.NewContext(), &operatorservice.ListNexusEndpointsRequest{ + operatorResp2, 
err := s.OperatorClient().ListNexusEndpoints(ctx, &operatorservice.ListNexusEndpointsRequest{ PageSize: int32(numEndpoints / 2), NextPageToken: operatorResp1.NextPageToken, }) @@ -123,8 +125,9 @@ type MatchingSuite struct { } func (s *MatchingSuite) TestCreate() { + ctx := testcore.NewContext() endpointName := testcore.RandomizedNexusEndpoint(s.T().Name()) - entry := s.createNexusEndpoint(endpointName) + entry := s.createNexusEndpoint(ctx, endpointName) s.Equal(int64(1), entry.Version) s.NotNil(entry.Endpoint.Clock) s.NotNil(entry.Endpoint.CreatedTime) @@ -132,7 +135,7 @@ func (s *MatchingSuite) TestCreate() { s.Equal(entry.Endpoint.Spec.Name, endpointName) s.Equal(entry.Endpoint.Spec.Target.GetWorker().NamespaceId, s.NamespaceID().String()) - _, err := s.GetTestCluster().MatchingClient().CreateNexusEndpoint(testcore.NewContext(), &matchingservice.CreateNexusEndpointRequest{ + _, err := s.GetTestCluster().MatchingClient().CreateNexusEndpoint(ctx, &matchingservice.CreateNexusEndpointRequest{ Spec: &persistencespb.NexusEndpointSpec{ Name: endpointName, Target: &persistencespb.NexusEndpointTarget{ @@ -152,7 +155,8 @@ func (s *MatchingSuite) TestCreate() { func (s *MatchingSuite) TestUpdate() { endpointName := testcore.RandomizedNexusEndpoint(s.T().Name()) updatedName := testcore.RandomizedNexusEndpoint(s.T().Name() + "-updated") - endpoint := s.createNexusEndpoint(endpointName) + ctx := testcore.NewContext() + endpoint := s.createNexusEndpoint(ctx, endpointName) type testcase struct { name string request *matchingservice.UpdateNexusEndpointRequest @@ -241,7 +245,8 @@ func (s *MatchingSuite) TestUpdate() { func (s *MatchingSuite) TestDelete() { endpointName := testcore.RandomizedNexusEndpoint(s.T().Name()) - endpoint := s.createNexusEndpoint(endpointName) + ctx := testcore.NewContext() + endpoint := s.createNexusEndpoint(ctx, endpointName) type testcase struct { name string endpointID string @@ -279,15 +284,16 @@ func (s *MatchingSuite) TestDelete() { } func (s 
*MatchingSuite) TestList() { + ctx := testcore.NewContext() // initialize some endpoints - s.createNexusEndpoint("list-test-endpoint0") - s.createNexusEndpoint("list-test-endpoint1") - s.createNexusEndpoint("list-test-endpoint2") + s.createNexusEndpoint(ctx, "list-test-endpoint0") + s.createNexusEndpoint(ctx, "list-test-endpoint1") + s.createNexusEndpoint(ctx, "list-test-endpoint2") // get expected table version and endpoints for the course of the tests matchingClient := s.GetTestCluster().MatchingClient() resp, err := matchingClient.ListNexusEndpoints( - testcore.NewContext(), + ctx, &matchingservice.ListNexusEndpointsRequest{ PageSize: 100, LastKnownTableVersion: 0, @@ -406,16 +412,21 @@ func (s *MatchingSuite) TestList() { for _, tc := range testCases { s.T().Run(tc.name, func(t *testing.T) { + ctx := testcore.NewContext() listReqDone := make(chan struct{}) go func() { defer close(listReqDone) - resp, err := matchingClient.ListNexusEndpoints(testcore.NewContext(), tc.request) //nolint:revive + resp, err := matchingClient.ListNexusEndpoints(ctx, tc.request) //nolint:revive tc.assertion(resp, err) }() if tc.request.Wait && tc.request.NextPageToken == nil && tc.request.LastKnownTableVersion != 0 { - s.createNexusEndpoint("new-endpoint") + s.createNexusEndpoint(ctx, "new-endpoint") + } + select { + case <-listReqDone: + case <-ctx.Done(): + s.Fail("timed out waiting for list nexus endpoints request to complete") } - <-listReqDone }) } } @@ -726,7 +737,8 @@ func (s *OperatorSuite) TestCreate() { func (s *OperatorSuite) TestUpdate() { endpointName := testcore.RandomizedNexusEndpoint(s.T().Name()) updatedName := testcore.RandomizedNexusEndpoint(s.T().Name() + "-updated") - endpoint := s.createNexusEndpoint(endpointName) + ctx := testcore.NewContext() + endpoint := s.createNexusEndpoint(ctx, endpointName) type testcase struct { name string request *operatorservice.UpdateNexusEndpointRequest @@ -813,7 +825,8 @@ func (s *OperatorSuite) TestUpdate() { } func (s 
*OperatorSuite) TestDelete() { - endpoint := s.createNexusEndpoint("endpoint-to-delete-operator") + ctx := testcore.NewContext() + endpoint := s.createNexusEndpoint(ctx, "endpoint-to-delete-operator") type testcase struct { name string serviceId string @@ -851,18 +864,19 @@ func (s *OperatorSuite) TestDelete() { } func (s *OperatorSuite) TestList() { + ctx := testcore.NewContext() // initialize some endpoints - s.createNexusEndpoint("operator-list-test-service0") - s.createNexusEndpoint("operator-list-test-service1") - entryToFilter := s.createNexusEndpoint("operator-list-test-service2") + s.createNexusEndpoint(ctx, "operator-list-test-service0") + s.createNexusEndpoint(ctx, "operator-list-test-service1") + entryToFilter := s.createNexusEndpoint(ctx, "operator-list-test-service2") // get ordered endpoints for the course of the tests - resp, err := s.OperatorClient().ListNexusEndpoints(testcore.NewContext(), &operatorservice.ListNexusEndpointsRequest{}) + resp, err := s.OperatorClient().ListNexusEndpoints(ctx, &operatorservice.ListNexusEndpointsRequest{}) s.NoError(err) s.NotNil(resp) endpointsOrdered := resp.Endpoints - resp, err = s.OperatorClient().ListNexusEndpoints(testcore.NewContext(), &operatorservice.ListNexusEndpointsRequest{PageSize: 2}) + resp, err = s.OperatorClient().ListNexusEndpoints(ctx, &operatorservice.ListNexusEndpointsRequest{PageSize: 2}) s.NoError(err) s.NotNil(resp) nextPageToken := resp.NextPageToken @@ -954,7 +968,8 @@ func (s *OperatorSuite) TestList() { func (s *OperatorSuite) TestGet() { endpointName := testcore.RandomizedNexusEndpoint(s.T().Name()) - endpoint := s.createNexusEndpoint(endpointName) + ctx := testcore.NewContext() + endpoint := s.createNexusEndpoint(ctx, endpointName) type testcase struct { name string @@ -1009,9 +1024,9 @@ func (s *NexusEndpointFunctionalSuite) defaultTaskQueue() *taskqueuepb.TaskQueue return &taskqueuepb.TaskQueue{Name: name, Kind: enumspb.TASK_QUEUE_KIND_NORMAL} } -func (s *NexusEndpointFunctionalSuite) 
createNexusEndpoint(name string) *persistencespb.NexusEndpointEntry { +func (s *NexusEndpointFunctionalSuite) createNexusEndpoint(ctx context.Context, name string) *persistencespb.NexusEndpointEntry { resp, err := s.GetTestCluster().MatchingClient().CreateNexusEndpoint( - testcore.NewContext(), + ctx, &matchingservice.CreateNexusEndpointRequest{ Spec: &persistencespb.NexusEndpointSpec{ Name: name, diff --git a/tests/nexus_workflow_test.go b/tests/nexus_workflow_test.go index 93ce8b1cd7..4acf8dd67f 100644 --- a/tests/nexus_workflow_test.go +++ b/tests/nexus_workflow_test.go @@ -69,7 +69,7 @@ func (s *NexusWorkflowTestSuite) TestNexusOperationCancelation() { h := nexustest.Handler{ OnStartOperation: func(ctx context.Context, service, operation string, input *nexus.LazyValue, options nexus.StartOperationOptions) (nexus.HandlerStartOperationResult[any], error) { if service != "service" { - return nil, nexus.HandlerErrorf(nexus.HandlerErrorTypeBadRequest, `expected service to equal "service"`) + return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeBadRequest, `expected service to equal "service"`) } return &nexus.HandlerStartOperationResultAsync{OperationToken: "test"}, nil }, @@ -77,7 +77,7 @@ func (s *NexusWorkflowTestSuite) TestNexusOperationCancelation() { if !firstCancelSeen { // Fail cancel request once to test NexusOperationCancelRequestFailed event is recorded and request is retried. 
firstCancelSeen = true - return nexus.HandlerErrorf(nexus.HandlerErrorTypeBadRequest, "intentional non-retyrable cancel error for test") + return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeBadRequest, "intentional non-retryable cancel error for test") } return nil }, @@ -2057,17 +2057,17 @@ func (s *NexusWorkflowTestSuite) TestNexusSyncOperationErrorRehydration() { op := nexus.NewSyncOperation("op", func(ctx context.Context, outcome string, soo nexus.StartOperationOptions) (nexus.NoValue, error) { switch outcome { case "fail-handler-internal": - return nil, nexus.HandlerErrorf(nexus.HandlerErrorTypeInternal, "intentional internal error") + return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "intentional internal error") case "fail-handler-app-error": return nil, temporal.NewApplicationError("app error", "TestError", "details") case "fail-handler-bad-request": - return nil, nexus.HandlerErrorf(nexus.HandlerErrorTypeBadRequest, "bad request") + return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeBadRequest, "bad request") case "fail-operation": - return nil, nexus.NewOperationFailedError("some error") + return nil, nexus.NewOperationFailedErrorf("some error") case "fail-operation-app-error": return nil, temporal.NewNonRetryableApplicationError("app error", "TestError", nil, "details") } - return nil, nexus.HandlerErrorf(nexus.HandlerErrorTypeBadRequest, "unexpected outcome: %s", outcome) + return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeBadRequest, "unexpected outcome: %s", outcome) }) s.NoError(svc.Register(op)) @@ -2095,9 +2095,7 @@ func (s *NexusWorkflowTestSuite) TestNexusSyncOperationErrorRehydration() { var handlerErr *nexus.HandlerError require.ErrorAs(t, pendingErr, &handlerErr) require.Equal(t, nexus.HandlerErrorTypeInternal, handlerErr.Type) - var appErr *temporal.ApplicationError - require.ErrorAs(t, handlerErr.Cause, &appErr) - require.Equal(t, "intentional internal error", appErr.Message()) + require.Equal(t, "intentional 
internal error", handlerErr.Message) }, }, { @@ -2125,9 +2123,7 @@ func (s *NexusWorkflowTestSuite) TestNexusSyncOperationErrorRehydration() { var handlerErr *nexus.HandlerError require.ErrorAs(t, opErr, &handlerErr) require.Equal(t, nexus.HandlerErrorTypeBadRequest, handlerErr.Type) - var appErr *temporal.ApplicationError - require.ErrorAs(t, handlerErr.Cause, &appErr) - require.Equal(t, "bad request", appErr.Message()) + require.Equal(t, "bad request", handlerErr.Message) }, }, { diff --git a/tests/nil_search_attribute_test.go b/tests/nil_search_attribute_test.go new file mode 100644 index 0000000000..760377507a --- /dev/null +++ b/tests/nil_search_attribute_test.go @@ -0,0 +1,375 @@ +package tests + +import ( + "testing" + "time" + + "github.com/google/uuid" + commonpb "go.temporal.io/api/common/v1" + enumspb "go.temporal.io/api/enums/v1" + historypb "go.temporal.io/api/history/v1" + taskqueuepb "go.temporal.io/api/taskqueue/v1" + "go.temporal.io/api/workflowservice/v1" + "go.temporal.io/server/common/payload" + "go.temporal.io/server/tests/testcore" + "google.golang.org/protobuf/types/known/durationpb" +) + +func TestWorkflowStart_NilSearchAttributesFiltered(t *testing.T) { + s := testcore.NewEnv(t) + workflowID := "nil-sa-filter-" + uuid.NewString() + workflowType := &commonpb.WorkflowType{Name: "nil-sa-filter-workflow-type"} + taskQueue := &taskqueuepb.TaskQueue{Name: s.Tv().TaskQueue().Name, Kind: enumspb.TASK_QUEUE_KIND_NORMAL} + + nilPayload, err := payload.Encode(nil) + s.Require().NoError(err) + validPayload := payload.EncodeString("valid-value") + + searchAttributes := &commonpb.SearchAttributes{ + IndexedFields: map[string]*commonpb.Payload{ + "CustomKeywordField": validPayload, + "CustomTextField": nilPayload, + }, + } + + request := &workflowservice.StartWorkflowExecutionRequest{ + RequestId: uuid.NewString(), + Namespace: s.Namespace().String(), + WorkflowId: workflowID, + WorkflowType: workflowType, + TaskQueue: taskQueue, + WorkflowRunTimeout: 
durationpb.New(10 * time.Second), + WorkflowTaskTimeout: durationpb.New(1 * time.Second), + Identity: "test-identity", + SearchAttributes: searchAttributes, + } + + we, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(), request) + s.Require().NoError(err) + s.Require().NotNil(we.GetRunId()) + + historyResp, err := s.FrontendClient().GetWorkflowExecutionHistory(testcore.NewContext(), &workflowservice.GetWorkflowExecutionHistoryRequest{ + Namespace: s.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowID, + RunId: we.GetRunId(), + }, + }) + s.Require().NoError(err) + + var startedEvent *historypb.HistoryEvent + for _, event := range historyResp.History.Events { + if event.EventType == enumspb.EVENT_TYPE_WORKFLOW_EXECUTION_STARTED { + startedEvent = event + break + } + } + + s.Require().NotNil(startedEvent) + attrs := startedEvent.GetWorkflowExecutionStartedEventAttributes() + s.Require().NotNil(attrs) + + if attrs.SearchAttributes != nil { + s.NotNil(attrs.SearchAttributes.IndexedFields["CustomKeywordField"]) + _, hasNilKey := attrs.SearchAttributes.IndexedFields["CustomTextField"] + s.False(hasNilKey, "nil search attribute key should be filtered out from history event") + } + + _, err = s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(), &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: s.Namespace().String(), + WorkflowExecution: &commonpb.WorkflowExecution{WorkflowId: workflowID}, + Reason: "test cleanup", + }) + s.Require().NoError(err) +} + +func TestWorkflowStart_AllNilSearchAttributesFiltered(t *testing.T) { + s := testcore.NewEnv(t) + workflowID := "nil-sa-filter-all-" + uuid.NewString() + workflowType := &commonpb.WorkflowType{Name: "nil-sa-filter-workflow-type"} + taskQueue := &taskqueuepb.TaskQueue{Name: s.Tv().TaskQueue().Name, Kind: enumspb.TASK_QUEUE_KIND_NORMAL} + + nilPayload, err := payload.Encode(nil) + s.Require().NoError(err) + + searchAttributes := 
&commonpb.SearchAttributes{ + IndexedFields: map[string]*commonpb.Payload{ + "CustomKeywordField": nilPayload, + "CustomTextField": nilPayload, + }, + } + + request := &workflowservice.StartWorkflowExecutionRequest{ + RequestId: uuid.NewString(), + Namespace: s.Namespace().String(), + WorkflowId: workflowID, + WorkflowType: workflowType, + TaskQueue: taskQueue, + WorkflowRunTimeout: durationpb.New(10 * time.Second), + WorkflowTaskTimeout: durationpb.New(1 * time.Second), + Identity: "test-identity", + SearchAttributes: searchAttributes, + } + + we, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(), request) + s.Require().NoError(err) + s.Require().NotNil(we.GetRunId()) + + historyResp, err := s.FrontendClient().GetWorkflowExecutionHistory(testcore.NewContext(), &workflowservice.GetWorkflowExecutionHistoryRequest{ + Namespace: s.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowID, + RunId: we.GetRunId(), + }, + }) + s.Require().NoError(err) + + var startedEvent *historypb.HistoryEvent + for _, event := range historyResp.History.Events { + if event.EventType == enumspb.EVENT_TYPE_WORKFLOW_EXECUTION_STARTED { + startedEvent = event + break + } + } + + s.Require().NotNil(startedEvent) + attrs := startedEvent.GetWorkflowExecutionStartedEventAttributes() + s.Require().NotNil(attrs) + s.Nil(attrs.SearchAttributes, "SearchAttributes should be nil when all values are nil") + + _, err = s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(), &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: s.Namespace().String(), + WorkflowExecution: &commonpb.WorkflowExecution{WorkflowId: workflowID}, + Reason: "test cleanup", + }) + s.Require().NoError(err) +} + +func TestDescribeWorkflow_NilSearchAttributesNotVisible(t *testing.T) { + s := testcore.NewEnv(t) + workflowID := "nil-sa-filter-describe-" + uuid.NewString() + workflowType := &commonpb.WorkflowType{Name: "nil-sa-filter-workflow-type"} + 
taskQueue := &taskqueuepb.TaskQueue{Name: s.Tv().TaskQueue().Name, Kind: enumspb.TASK_QUEUE_KIND_NORMAL} + + nilPayload, err := payload.Encode(nil) + s.Require().NoError(err) + validPayload := payload.EncodeString("valid-value") + + searchAttributes := &commonpb.SearchAttributes{ + IndexedFields: map[string]*commonpb.Payload{ + "CustomKeywordField": validPayload, + "CustomTextField": nilPayload, + }, + } + + request := &workflowservice.StartWorkflowExecutionRequest{ + RequestId: uuid.NewString(), + Namespace: s.Namespace().String(), + WorkflowId: workflowID, + WorkflowType: workflowType, + TaskQueue: taskQueue, + WorkflowRunTimeout: durationpb.New(10 * time.Second), + WorkflowTaskTimeout: durationpb.New(1 * time.Second), + Identity: "test-identity", + SearchAttributes: searchAttributes, + } + + we, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(), request) + s.Require().NoError(err) + s.Require().NotNil(we.GetRunId()) + + descResp, err := s.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(), &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: s.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowID, + RunId: we.GetRunId(), + }, + }) + s.Require().NoError(err) + s.Require().NotNil(descResp) + + if descResp.WorkflowExecutionInfo.SearchAttributes != nil { + s.NotNil(descResp.WorkflowExecutionInfo.SearchAttributes.IndexedFields["CustomKeywordField"]) + _, hasNilKey := descResp.WorkflowExecutionInfo.SearchAttributes.IndexedFields["CustomTextField"] + s.False(hasNilKey, "nil search attribute key should not be visible in mutable state") + } + + _, err = s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(), &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: s.Namespace().String(), + WorkflowExecution: &commonpb.WorkflowExecution{WorkflowId: workflowID}, + Reason: "test cleanup", + }) + s.Require().NoError(err) +} + +func TestWorkflowStart_NilMemoFiltered(t 
*testing.T) { + s := testcore.NewEnv(t) + workflowID := "nil-memo-filter-" + uuid.NewString() + workflowType := &commonpb.WorkflowType{Name: "nil-memo-filter-workflow-type"} + taskQueue := &taskqueuepb.TaskQueue{Name: s.Tv().TaskQueue().Name, Kind: enumspb.TASK_QUEUE_KIND_NORMAL} + + nilPayload, err := payload.Encode(nil) + s.Require().NoError(err) + validPayload := payload.EncodeString("valid-value") + + memo := &commonpb.Memo{ + Fields: map[string]*commonpb.Payload{ + "ValidKey": validPayload, + "NilKey": nilPayload, + }, + } + + request := &workflowservice.StartWorkflowExecutionRequest{ + RequestId: uuid.NewString(), + Namespace: s.Namespace().String(), + WorkflowId: workflowID, + WorkflowType: workflowType, + TaskQueue: taskQueue, + WorkflowRunTimeout: durationpb.New(10 * time.Second), + WorkflowTaskTimeout: durationpb.New(1 * time.Second), + Identity: "test-identity", + Memo: memo, + } + + we, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(), request) + s.Require().NoError(err) + s.Require().NotNil(we.GetRunId()) + + historyResp, err := s.FrontendClient().GetWorkflowExecutionHistory(testcore.NewContext(), &workflowservice.GetWorkflowExecutionHistoryRequest{ + Namespace: s.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: workflowID, + RunId: we.GetRunId(), + }, + }) + s.Require().NoError(err) + + var startedEvent *historypb.HistoryEvent + for _, event := range historyResp.History.Events { + if event.EventType == enumspb.EVENT_TYPE_WORKFLOW_EXECUTION_STARTED { + startedEvent = event + break + } + } + + s.Require().NotNil(startedEvent) + attrs := startedEvent.GetWorkflowExecutionStartedEventAttributes() + s.Require().NotNil(attrs) + + if attrs.Memo != nil { + s.NotNil(attrs.Memo.Fields["ValidKey"]) + _, hasNilKey := attrs.Memo.Fields["NilKey"] + s.False(hasNilKey, "nil memo key should be filtered out from history event") + } + + _, _ = s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(), 
&workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: s.Namespace().String(), WorkflowExecution: &commonpb.WorkflowExecution{WorkflowId: workflowID}, Reason: "test cleanup", + }) +} + +func TestWorkflowStart_AllNilMemoFiltered(t *testing.T) { + s := testcore.NewEnv(t) + workflowID := "nil-memo-filter-all-" + uuid.NewString() + workflowType := &commonpb.WorkflowType{Name: "nil-memo-filter-workflow-type"} + taskQueue := &taskqueuepb.TaskQueue{Name: s.Tv().TaskQueue().Name, Kind: enumspb.TASK_QUEUE_KIND_NORMAL} + + nilPayload, err := payload.Encode(nil) + s.Require().NoError(err) + + memo := &commonpb.Memo{ + Fields: map[string]*commonpb.Payload{ + "NilKey1": nilPayload, + "NilKey2": nilPayload, + }, + } + + request := &workflowservice.StartWorkflowExecutionRequest{ + RequestId: uuid.NewString(), + Namespace: s.Namespace().String(), + WorkflowId: workflowID, + WorkflowType: workflowType, + TaskQueue: taskQueue, + WorkflowRunTimeout: durationpb.New(10 * time.Second), + WorkflowTaskTimeout: durationpb.New(1 * time.Second), + Identity: "test-identity", + Memo: memo, + } + + we, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(), request) + s.Require().NoError(err) + s.Require().NotNil(we.GetRunId()) + + historyResp, err := s.FrontendClient().GetWorkflowExecutionHistory(testcore.NewContext(), &workflowservice.GetWorkflowExecutionHistoryRequest{ + Namespace: s.Namespace().String(), + Execution: &commonpb.WorkflowExecution{WorkflowId: workflowID, RunId: we.GetRunId()}, + }) + s.Require().NoError(err) + + var startedEvent *historypb.HistoryEvent + for _, event := range historyResp.History.Events { + if event.EventType == enumspb.EVENT_TYPE_WORKFLOW_EXECUTION_STARTED { + startedEvent = event + break + } + } + + s.Require().NotNil(startedEvent) + attrs := startedEvent.GetWorkflowExecutionStartedEventAttributes() + s.Require().NotNil(attrs) + s.Nil(attrs.Memo, "Memo should be nil when all values are nil") + + _, _ = 
s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(), &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: s.Namespace().String(), WorkflowExecution: &commonpb.WorkflowExecution{WorkflowId: workflowID}, Reason: "test cleanup", + }) +} + +func TestDescribeWorkflow_NilMemoNotVisible(t *testing.T) { + s := testcore.NewEnv(t) + workflowID := "nil-memo-filter-describe-" + uuid.NewString() + workflowType := &commonpb.WorkflowType{Name: "nil-memo-filter-workflow-type"} + taskQueue := &taskqueuepb.TaskQueue{Name: s.Tv().TaskQueue().Name, Kind: enumspb.TASK_QUEUE_KIND_NORMAL} + + nilPayload, err := payload.Encode(nil) + s.Require().NoError(err) + validPayload := payload.EncodeString("valid-value") + + memo := &commonpb.Memo{ + Fields: map[string]*commonpb.Payload{ + "ValidKey": validPayload, + "NilKey": nilPayload, + }, + } + + request := &workflowservice.StartWorkflowExecutionRequest{ + RequestId: uuid.NewString(), + Namespace: s.Namespace().String(), + WorkflowId: workflowID, + WorkflowType: workflowType, + TaskQueue: taskQueue, + WorkflowRunTimeout: durationpb.New(10 * time.Second), + WorkflowTaskTimeout: durationpb.New(1 * time.Second), + Identity: "test-identity", + Memo: memo, + } + + we, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(), request) + s.Require().NoError(err) + s.Require().NotNil(we.GetRunId()) + + descResp, err := s.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(), &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: s.Namespace().String(), + Execution: &commonpb.WorkflowExecution{WorkflowId: workflowID, RunId: we.GetRunId()}, + }) + s.Require().NoError(err) + s.Require().NotNil(descResp) + + if descResp.WorkflowExecutionInfo.Memo != nil { + s.NotNil(descResp.WorkflowExecutionInfo.Memo.Fields["ValidKey"]) + _, hasNilKey := descResp.WorkflowExecutionInfo.Memo.Fields["NilKey"] + s.False(hasNilKey, "nil memo key should not be visible in mutable state / describe") + } + + _, _ = 
s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(), &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: s.Namespace().String(), WorkflowExecution: &commonpb.WorkflowExecution{WorkflowId: workflowID}, Reason: "test cleanup", + }) +} diff --git a/tests/premature_eos_test.go b/tests/premature_eos_test.go index a85c169842..9f72fafefa 100644 --- a/tests/premature_eos_test.go +++ b/tests/premature_eos_test.go @@ -8,9 +8,18 @@ import ( commonpb "go.temporal.io/api/common/v1" historypb "go.temporal.io/api/history/v1" "go.temporal.io/api/workflowservice/v1" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/tests/testcore" ) +type PrematureEosTestSuite struct { + parallelsuite.Suite[*PrematureEosTestSuite] +} + +func TestPrematureEosTestSuite(t *testing.T) { + parallelsuite.Run(t, &PrematureEosTestSuite{}) +} + // Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination demonstrates the // "premature end of stream" bug in a scenario mimicking SDK workflow cache eviction: // the SDK uses GetWorkflowExecutionHistory (not PollWorkflowTaskQueue) to replay @@ -34,7 +43,7 @@ import ( // returns events 8 and 9; assembled history has 9 events (no premature EOS). // // This test asserts the FIXED behavior. -func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) { +func (s *PrematureEosTestSuite) Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination() { // MaximumPageSize controls the number of DB event batches per page, not individual // events. The 7 persisted events are stored in 5 batches: // [1,2] StartWorkflow, [3] WFTStarted, [4,5] WFTCompleted+WFTScheduled, @@ -43,9 +52,9 @@ func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) // leaving batches [6] and [7] for the second page. 
const maxBatchesPerPage = 3 - s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - tv := s.Tv() - runID := mustStartWorkflow(s, tv) + env := testcore.NewEnv(s.T(), testcore.WithDedicatedCluster()) + tv := env.Tv() + runID := mustStartWorkflow(env, tv) wfExecution := &commonpb.WorkflowExecution{WorkflowId: tv.WorkflowID(), RunId: runID} // Build 7 persisted events: @@ -56,7 +65,7 @@ func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) // 5: WorkflowTaskScheduled (force-created) // 6: WorkflowTaskStarted // 7: WorkflowTaskCompleted - _, err := s.TaskPoller().PollAndHandleWorkflowTask(tv, + _, err := env.TaskPoller().PollAndHandleWorkflowTask(tv, func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { return &workflowservice.RespondWorkflowTaskCompletedRequest{ ForceCreateNewWorkflowTask: true, @@ -64,7 +73,7 @@ func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) }) s.NoError(err) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(tv, + _, err = env.TaskPoller().PollAndHandleWorkflowTask(tv, func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { return &workflowservice.RespondWorkflowTaskCompletedRequest{}, nil }) @@ -73,7 +82,7 @@ func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) // Send an update to create a speculative WFT (event 8 in memory, scheduled but not polled). ctx, cancel := context.WithCancel(testcore.NewContext()) defer cancel() - updateCh := sendUpdate(ctx, s, tv) + updateCh := sendUpdate(ctx, env, tv) defer func() { go func() { <-updateCh }() }() // Wait until the speculative WFT is scheduled before fetching page 1. 
@@ -84,9 +93,9 @@ func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) // would only add event 8 (SignalReceived) with freshNextEventId=9, producing 8 events // instead of the expected 9 and causing a false test failure. s.Eventually(func() bool { - desc, descErr := s.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(), + desc, descErr := env.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(), &workflowservice.DescribeWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: wfExecution, }) return descErr == nil && desc.GetPendingWorkflowTask() != nil @@ -99,10 +108,10 @@ func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) // batches are returned ([1,2]+[3]+[4,5] = events 1..5), leaving batches [6] and [7] for // the next page. The continuation token encodes NextEventId=8 and PersistenceToken // pointing to the next DB batch — this is the "stale token" that exercises the bug. 
- histPage1, err := s.FrontendClient().GetWorkflowExecutionHistory( + histPage1, err := env.FrontendClient().GetWorkflowExecutionHistory( testcore.NewContext(), &workflowservice.GetWorkflowExecutionHistoryRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: wfExecution, MaximumPageSize: maxBatchesPerPage, }, @@ -110,7 +119,7 @@ func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) s.NoError(err) s.NotNil(histPage1.NextPageToken, "NextPageToken must be set: with maxBatchesPerPage=3 and 5 total batches, page 1 must not be the last page") - t.Logf("NEXTPAGETOKEN: %s", histPage1.NextPageToken) + s.T().Logf("NEXTPAGETOKEN: %s", histPage1.NextPageToken) firstPageEvents := histPage1.History.Events staleNextPageToken := histPage1.NextPageToken @@ -120,9 +129,9 @@ func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) // 8: WorkflowTaskScheduled (normal WFT scheduled to handle the pending update) // 9: WorkflowExecutionSignaled (flushed immediately: HasStartedWorkflowTask=false) // After this transaction, freshNextEventId=10. 
- _, signalErr := s.FrontendClient().SignalWorkflowExecution(testcore.NewContext(), + _, signalErr := env.FrontendClient().SignalWorkflowExecution(testcore.NewContext(), &workflowservice.SignalWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), WorkflowExecution: wfExecution, SignalName: tv.Any().String(), }) @@ -132,9 +141,9 @@ func Test_SpeculativeWFTEventsLostAfterSignalMidHistoryPagination(t *testing.T) allEvents := make([]*historypb.HistoryEvent, len(firstPageEvents)) copy(allEvents, firstPageEvents) for nextPageToken := staleNextPageToken; nextPageToken != nil; { - histResp, histErr := s.FrontendClient().GetWorkflowExecutionHistory(testcore.NewContext(), + histResp, histErr := env.FrontendClient().GetWorkflowExecutionHistory(testcore.NewContext(), &workflowservice.GetWorkflowExecutionHistoryRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), Execution: wfExecution, NextPageToken: nextPageToken, MaximumPageSize: maxBatchesPerPage, diff --git a/tests/priority_fairness_test.go b/tests/priority_fairness_test.go index 231f3cfb0f..09112d552e 100644 --- a/tests/priority_fairness_test.go +++ b/tests/priority_fairness_test.go @@ -660,16 +660,24 @@ func (s *FairnessSuite) testMigration(newMatcher, fairness bool) { waitForTasks := func(tp enumspb.TaskQueueType, onDraining, onActive int64) { s.T().Helper() s.EventuallyWithT(func(c *assert.CollectT) { - tasksOnDraining, tasksOnActive, _, err := s.countTasksByDrainingActive(ctx, tv, tp) + tasksOnDraining, tasksOnActive, loadedOnDraining, loadedOnActive, _, err := s.countTasksByDrainingActive(ctx, tv, tp) require.NoError(c, err) require.Equal(c, onDraining, tasksOnDraining) require.Equal(c, onActive, tasksOnActive) + // ensure that expected tasks are actually loaded to avoid poller getting regular + // task before draining loads + if tasksOnDraining > 0 { + require.NotZero(c, loadedOnDraining) + } + if tasksOnActive > 0 { + require.NotZero(c, 
loadedOnActive) + } }, 15*time.Second, 250*time.Millisecond) } waitForNoDraining := func(tp enumspb.TaskQueueType) { s.T().Helper() s.EventuallyWithT(func(c *assert.CollectT) { - _, _, hasDraining, err := s.countTasksByDrainingActive(ctx, tv, tp) + _, _, _, _, hasDraining, err := s.countTasksByDrainingActive(ctx, tv, tp) require.NoError(c, err) require.False(c, hasDraining, "draining queue should be unloaded after drain completes") }, 15*time.Second, 250*time.Millisecond) @@ -800,7 +808,7 @@ func (s *FairnessSuite) testMigration(newMatcher, fairness bool) { } func (s *FairnessSuite) countTasksByDrainingActive(ctx context.Context, tv *testvars.TestVars, tp enumspb.TaskQueueType) ( - tasksOnDraining, tasksOnActive int64, hasDraining bool, retErr error, + tasksOnDraining, tasksOnActive, loadedOnDraining, loadedOnActive int64, hasDraining bool, retErr error, ) { for i := range s.partitions { res, err := s.AdminClient().DescribeTaskQueuePartition(ctx, &adminservice.DescribeTaskQueuePartitionRequest{ @@ -813,15 +821,17 @@ func (s *FairnessSuite) countTasksByDrainingActive(ctx context.Context, tv *test BuildIds: &taskqueuepb.TaskQueueVersionSelection{Unversioned: true}, }) if err != nil { - return 0, 0, false, err + return 0, 0, 0, 0, false, err } for _, versionInfoInternal := range res.VersionsInfoInternal { for _, st := range versionInfoInternal.PhysicalTaskQueueInfo.InternalTaskQueueStatus { if st.Draining { hasDraining = true tasksOnDraining += st.ApproximateBacklogCount + loadedOnDraining += st.LoadedTasks } else { tasksOnActive += st.ApproximateBacklogCount + loadedOnActive += st.LoadedTasks } } } diff --git a/tests/schedule_migration_test.go b/tests/schedule_migration_test.go index 52a028560f..e27057b86a 100644 --- a/tests/schedule_migration_test.go +++ b/tests/schedule_migration_test.go @@ -7,7 +7,6 @@ import ( "time" "github.com/google/uuid" - "github.com/stretchr/testify/require" commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" 
schedulepb "go.temporal.io/api/schedule/v1" @@ -23,14 +22,24 @@ import ( "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/primitives" "go.temporal.io/server/common/sdk" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/service/worker/scheduler" "go.temporal.io/server/tests/testcore" "google.golang.org/protobuf/types/known/durationpb" + "google.golang.org/protobuf/types/known/timestamppb" ) -func TestScheduleMigrationV2AlreadyExists(t *testing.T) { +type ScheduleMigrationTestSuite struct { + parallelsuite.Suite[*ScheduleMigrationTestSuite] +} + +func TestScheduleMigrationTestSuite(t *testing.T) { + parallelsuite.Run(t, &ScheduleMigrationTestSuite{}) +} + +func (s *ScheduleMigrationTestSuite) TestScheduleMigrationV2AlreadyExists() { env := testcore.NewEnv( - t, + s.T(), testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), ) @@ -73,7 +82,7 @@ func TestScheduleMigrationV2AlreadyExists(t *testing.T) { }, }, ) - require.NoError(t, err) + s.NoError(err) _, err = env.GetTestCluster().SchedulerClient().DescribeSchedule( ctx, @@ -82,7 +91,7 @@ func TestScheduleMigrationV2AlreadyExists(t *testing.T) { FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, }, ) - require.NoError(t, err) + s.NoError(err) // Directly calling CreateFromMigrationState when a CHASM schedule already // exists should return AlreadyExists, matching CreateSchedule's behavior. 
@@ -103,8 +112,8 @@ func TestScheduleMigrationV2AlreadyExists(t *testing.T) { }, ) var alreadyExists *serviceerror.AlreadyExists - require.ErrorAs(t, err, &alreadyExists) - require.Contains(t, alreadyExists.Error(), sid) + s.ErrorAs(err, &alreadyExists) + s.Contains(alreadyExists.Error(), sid) // Create the V1 (workflow-backed) scheduler directly startArgs := &schedulespb.StartScheduleArgs{ @@ -117,7 +126,7 @@ func TestScheduleMigrationV2AlreadyExists(t *testing.T) { }, } inputPayloads, err := sdk.PreferProtoDataConverter.ToPayloads(startArgs) - require.NoError(t, err) + s.NoError(err) v1WorkflowID := scheduler.WorkflowIDPrefix + sid startReq := &workflowservice.StartWorkflowExecutionRequest{ Namespace: nsName, @@ -134,7 +143,7 @@ func TestScheduleMigrationV2AlreadyExists(t *testing.T) { ctx, common.CreateHistoryStartWorkflowRequest(nsID, startReq, nil, nil, time.Now().UTC()), ) - require.NoError(t, err) + s.NoError(err) _, err = env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( ctx, @@ -146,7 +155,7 @@ func TestScheduleMigrationV2AlreadyExists(t *testing.T) { }, }, ) - require.NoError(t, err) + s.NoError(err) // Issue migration. The CHASM handler will return AlreadyStarted, // and the V1 activity treats that as success (logs warning, returns nil). 
@@ -159,9 +168,9 @@ func TestScheduleMigrationV2AlreadyExists(t *testing.T) { Identity: "test", RequestId: testcore.RandomizeStr("request-id"), }) - require.NoError(t, err) + s.NoError(err) - require.Eventually(t, func() bool { + s.Eventually(func() bool { desc, err := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( ctx, &historyservice.DescribeWorkflowExecutionRequest{ @@ -186,12 +195,12 @@ func TestScheduleMigrationV2AlreadyExists(t *testing.T) { FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, }, ) - require.NoError(t, err) + s.NoError(err) } -func TestScheduleMigrationDynamicConfig(t *testing.T) { +func (s *ScheduleMigrationTestSuite) TestScheduleMigrationDynamicConfig() { env := testcore.NewEnv( - t, + s.T(), testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigration, true), ) @@ -232,7 +241,7 @@ func TestScheduleMigrationDynamicConfig(t *testing.T) { }, } inputPayloads, err := sdk.PreferProtoDataConverter.ToPayloads(startArgs) - require.NoError(t, err) + s.NoError(err) v1WorkflowID := scheduler.WorkflowIDPrefix + sid startReq := &workflowservice.StartWorkflowExecutionRequest{ Namespace: nsName, @@ -249,10 +258,10 @@ func TestScheduleMigrationDynamicConfig(t *testing.T) { ctx, common.CreateHistoryStartWorkflowRequest(nsID, startReq, nil, nil, time.Now().UTC()), ) - require.NoError(t, err) + s.NoError(err) // Wait for the per-namespace worker to pick up the V1 workflow. - require.Eventually(t, func() bool { + s.Eventually(func() bool { desc, err := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( ctx, &historyservice.DescribeWorkflowExecutionRequest{ @@ -270,7 +279,7 @@ func TestScheduleMigrationDynamicConfig(t *testing.T) { }, 10*time.Second, 500*time.Millisecond) // V1 workflow should automatically migrate due to dynamic config and complete. 
- require.Eventually(t, func() bool { + s.Eventually(func() bool { desc, err := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( ctx, &historyservice.DescribeWorkflowExecutionRequest{ @@ -295,12 +304,12 @@ func TestScheduleMigrationDynamicConfig(t *testing.T) { FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, }, ) - require.NoError(t, err) + s.NoError(err) } -func TestScheduleMigrationV1ToV2(t *testing.T) { +func (s *ScheduleMigrationTestSuite) TestScheduleMigrationV1ToV2() { env := testcore.NewEnv( - t, + s.T(), testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), ) @@ -340,7 +349,7 @@ func TestScheduleMigrationV1ToV2(t *testing.T) { }, } inputPayloads, err := sdk.PreferProtoDataConverter.ToPayloads(startArgs) - require.NoError(t, err) + s.NoError(err) v1WorkflowID := scheduler.WorkflowIDPrefix + sid startReq := &workflowservice.StartWorkflowExecutionRequest{ Namespace: nsName, @@ -357,10 +366,10 @@ func TestScheduleMigrationV1ToV2(t *testing.T) { ctx, common.CreateHistoryStartWorkflowRequest(nsID, startReq, nil, nil, time.Now().UTC()), ) - require.NoError(t, err) + s.NoError(err) // Wait for the per-namespace worker to pick up the V1 workflow. - require.Eventually(t, func() bool { + s.Eventually(func() bool { desc, err := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( ctx, &historyservice.DescribeWorkflowExecutionRequest{ @@ -385,10 +394,10 @@ func TestScheduleMigrationV1ToV2(t *testing.T) { Identity: "test", RequestId: testcore.RandomizeStr("request-id"), }) - require.NoError(t, err) + s.NoError(err) // Wait for V1 workflow to complete. 
- require.Eventually(t, func() bool { + s.Eventually(func() bool { desc, err := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( ctx, &historyservice.DescribeWorkflowExecutionRequest{ @@ -413,12 +422,12 @@ func TestScheduleMigrationV1ToV2(t *testing.T) { FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, }, ) - require.NoError(t, err) + s.NoError(err) } -func TestScheduleMigrationV2ToV1(t *testing.T) { +func (s *ScheduleMigrationTestSuite) TestScheduleMigrationV2ToV1() { env := testcore.NewEnv( - t, + s.T(), testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerCreation, false), testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerRouting, false), @@ -470,7 +479,7 @@ func TestScheduleMigrationV2ToV1(t *testing.T) { }, }, ) - require.NoError(t, err) + s.NoError(err) // Describe the CHASM schedule before migration to capture its state. v2Desc, err := env.GetTestCluster().SchedulerClient().DescribeSchedule( @@ -480,7 +489,7 @@ func TestScheduleMigrationV2ToV1(t *testing.T) { FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, }, ) - require.NoError(t, err) + s.NoError(err) v2Schedule := v2Desc.GetFrontendResponse().GetSchedule() v2ConflictToken := v2Desc.GetFrontendResponse().GetConflictToken() @@ -492,11 +501,11 @@ func TestScheduleMigrationV2ToV1(t *testing.T) { Identity: "test", RequestId: testcore.RandomizeStr("request-id"), }) - require.NoError(t, err) + s.NoError(err) // Wait for the CHASM scheduler to be closed after migration. 
var failedPreconditionErr *serviceerror.FailedPrecondition - require.Eventually(t, func() bool { + s.Eventually(func() bool { _, chasmErr := env.GetTestCluster().SchedulerClient().DescribeSchedule( ctx, &schedulerpb.DescribeScheduleRequest{ @@ -509,7 +518,7 @@ func TestScheduleMigrationV2ToV1(t *testing.T) { // Wait for the V1 system scheduler workflow to be running. sysWorkflowID := scheduler.WorkflowIDPrefix + sid - require.Eventually(t, func() bool { + s.Eventually(func() bool { _, descErr := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( ctx, &historyservice.DescribeWorkflowExecutionRequest{ @@ -527,7 +536,7 @@ func TestScheduleMigrationV2ToV1(t *testing.T) { // goes directly to the V1 path. The per-namespace worker must pick up // the workflow and register query handlers before this succeeds. var v1Desc *workflowservice.DescribeScheduleResponse - require.Eventually(t, func() bool { + s.Eventually(func() bool { v1Desc, err = env.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ Namespace: nsName, ScheduleId: sid, @@ -538,8 +547,8 @@ func TestScheduleMigrationV2ToV1(t *testing.T) { v1Schedule := v1Desc.GetSchedule() // Validate the schedule spec is preserved across migration. - require.Len(t, v1Schedule.GetSpec().GetInterval(), len(v2Schedule.GetSpec().GetInterval())) - require.Equal(t, + s.Len(v1Schedule.GetSpec().GetInterval(), len(v2Schedule.GetSpec().GetInterval())) + s.Equal( v2Schedule.GetSpec().GetInterval()[0].GetInterval().AsDuration(), v1Schedule.GetSpec().GetInterval()[0].GetInterval().AsDuration(), ) @@ -547,49 +556,49 @@ func TestScheduleMigrationV2ToV1(t *testing.T) { // Validate the action is preserved. 
v2Action := v2Schedule.GetAction().GetStartWorkflow() v1Action := v1Schedule.GetAction().GetStartWorkflow() - require.Equal(t, v2Action.GetWorkflowId(), v1Action.GetWorkflowId()) - require.Equal(t, v2Action.GetWorkflowType().GetName(), v1Action.GetWorkflowType().GetName()) - require.Equal(t, v2Action.GetTaskQueue().GetName(), v1Action.GetTaskQueue().GetName()) + s.Equal(v2Action.GetWorkflowId(), v1Action.GetWorkflowId()) + s.Equal(v2Action.GetWorkflowType().GetName(), v1Action.GetWorkflowType().GetName()) + s.Equal(v2Action.GetTaskQueue().GetName(), v1Action.GetTaskQueue().GetName()) // Validate policies are preserved. - require.Equal(t, + s.Equal( v2Schedule.GetPolicies().GetOverlapPolicy(), v1Schedule.GetPolicies().GetOverlapPolicy(), ) - require.Equal(t, + s.Equal( v2Schedule.GetPolicies().GetCatchupWindow().AsDuration(), v1Schedule.GetPolicies().GetCatchupWindow().AsDuration(), ) // Validate the paused state is correctly restored (not the migration-imposed pause). - require.Equal(t, v2Schedule.GetState().GetPaused(), v1Schedule.GetState().GetPaused()) - require.Equal(t, v2Schedule.GetState().GetNotes(), v1Schedule.GetState().GetNotes()) + s.Equal(v2Schedule.GetState().GetPaused(), v1Schedule.GetState().GetPaused()) + s.Equal(v2Schedule.GetState().GetNotes(), v1Schedule.GetState().GetNotes()) // Validate the conflict token value is preserved across migration. // V2 (CHASM) serializes as LittleEndian, V1 (workflow) as BigEndian, so decode both to int64. - require.Len(t, v2ConflictToken, 8) + s.Len(v2ConflictToken, 8) v2Token := int64(binary.LittleEndian.Uint64(v2ConflictToken)) v1ConflictToken := v1Desc.GetConflictToken() - require.Len(t, v1ConflictToken, 8) + s.Len(v1ConflictToken, 8) v1Token := int64(binary.BigEndian.Uint64(v1ConflictToken)) - require.Equal(t, v2Token, v1Token) + s.Equal(v2Token, v1Token) // Validate ListSchedules returns exactly one entry once the V1 workflow // has written its visibility records (no duplicates from V1+V2). 
var listResp *workflowservice.ListSchedulesResponse - require.Eventually(t, func() bool { + s.Eventually(func() bool { listResp, err = env.FrontendClient().ListSchedules(ctx, &workflowservice.ListSchedulesRequest{ Namespace: nsName, MaximumPageSize: 10, }) return err == nil && len(listResp.GetSchedules()) == 1 }, 30*time.Second, 500*time.Millisecond) - require.Equal(t, sid, listResp.GetSchedules()[0].GetScheduleId()) + s.Equal(sid, listResp.GetSchedules()[0].GetScheduleId()) } -func TestScheduleMigrationV2ToV1Idempotent(t *testing.T) { +func (s *ScheduleMigrationTestSuite) TestScheduleMigrationV2ToV1Idempotent() { env := testcore.NewEnv( - t, + s.T(), testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerCreation, false), testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerRouting, false), @@ -634,7 +643,7 @@ func TestScheduleMigrationV2ToV1Idempotent(t *testing.T) { }, }, ) - require.NoError(t, err) + s.NoError(err) // First migration call. _, err = env.AdminClient().MigrateSchedule(ctx, &adminservice.MigrateScheduleRequest{ @@ -644,7 +653,7 @@ func TestScheduleMigrationV2ToV1Idempotent(t *testing.T) { Identity: "test", RequestId: testcore.RandomizeStr("request-id"), }) - require.NoError(t, err) + s.NoError(err) // Second migration call should also succeed (idempotent). 
_, err = env.AdminClient().MigrateSchedule(ctx, &adminservice.MigrateScheduleRequest{ @@ -654,12 +663,12 @@ func TestScheduleMigrationV2ToV1Idempotent(t *testing.T) { Identity: "test", RequestId: testcore.RandomizeStr("request-id"), }) - require.NoError(t, err) + s.NoError(err) } -func TestCHASMScheduleDescribeAfterDisablingCreationAndMigration(t *testing.T) { +func (s *ScheduleMigrationTestSuite) TestCHASMScheduleDescribeAfterDisablingCreationAndMigration() { env := testcore.NewEnv( - t, + s.T(), testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerCreation, true), testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigration, true), @@ -696,15 +705,15 @@ func TestCHASMScheduleDescribeAfterDisablingCreationAndMigration(t *testing.T) { Identity: "test", RequestId: uuid.NewString(), }) - require.NoError(t, err) + s.NoError(err) firstDescribe, err := env.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ Namespace: nsName, ScheduleId: sid, }) - require.NoError(t, err) - require.NotNil(t, firstDescribe.GetSchedule()) - require.Eventually(t, func() bool { + s.NoError(err) + s.NotNil(firstDescribe.GetSchedule()) + s.Eventually(func() bool { listResp, listErr := env.FrontendClient().ListSchedules(ctx, &workflowservice.ListSchedulesRequest{Namespace: nsName}) if listErr != nil { return false @@ -726,12 +735,12 @@ func TestCHASMScheduleDescribeAfterDisablingCreationAndMigration(t *testing.T) { FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, }, ) - require.NoError(t, err) + s.NoError(err) env.OverrideDynamicConfig(dynamicconfig.EnableCHASMSchedulerCreation, false) env.OverrideDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigration, false) - require.Eventually(t, func() bool { + s.Eventually(func() bool { describeResp, describeErr := env.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ 
Namespace: nsName, ScheduleId: sid, @@ -754,3 +763,400 @@ func TestCHASMScheduleDescribeAfterDisablingCreationAndMigration(t *testing.T) { return false }, 10*time.Second, 200*time.Millisecond) } + +// TestScheduleMigrationV2ToV1RoutingFallback verifies that after migrating a +// CHASM schedule to V1, frontend operations with CHASM routing enabled fall +// through to the V1 workflow stack when the CHASM scheduler returns ErrClosed. +func (s *ScheduleMigrationTestSuite) TestScheduleMigrationV2ToV1RoutingFallback() { + env := testcore.NewEnv( + s.T(), + testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), + testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerCreation, true), + testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerRouting, true), + ) + + ctx := testcore.NewContext() + sid := testcore.RandomizeStr("sched-v2-to-v1-routing") + wid := testcore.RandomizeStr("sched-v2-to-v1-routing-wf") + wt := testcore.RandomizeStr("sched-v2-to-v1-routing-wt") + tq := testcore.RandomizeStr("tq") + + nsName := env.Namespace().String() + nsID := env.NamespaceID().String() + sched := &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Interval: []*schedulepb.IntervalSpec{ + {Interval: durationpb.New(1 * time.Hour)}, + }, + }, + Action: &schedulepb.ScheduleAction{ + Action: &schedulepb.ScheduleAction_StartWorkflow{ + StartWorkflow: &workflowpb.NewWorkflowExecutionInfo{ + WorkflowId: wid, + WorkflowType: &commonpb.WorkflowType{Name: wt}, + TaskQueue: &taskqueuepb.TaskQueue{Name: tq, Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + }, + }, + }, + } + + // Create CHASM schedule directly. 
+ _, err := env.GetTestCluster().SchedulerClient().CreateSchedule( + ctx, + &schedulerpb.CreateScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.CreateScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Schedule: sched, + Identity: "test", + RequestId: testcore.RandomizeStr("request-id"), + }, + }, + ) + s.NoError(err) + + // Migrate from V2 (CHASM) to V1 (workflow). + _, err = env.AdminClient().MigrateSchedule(ctx, &adminservice.MigrateScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Target: adminservice.MigrateScheduleRequest_SCHEDULER_TARGET_WORKFLOW, + Identity: "test", + RequestId: testcore.RandomizeStr("request-id"), + }) + s.NoError(err) + + // Wait for the CHASM scheduler to be closed after migration. + var failedPreconditionErr *serviceerror.FailedPrecondition + s.Eventually(func() bool { + _, chasmErr := env.GetTestCluster().SchedulerClient().DescribeSchedule( + ctx, + &schedulerpb.DescribeScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, + }, + ) + return errors.As(chasmErr, &failedPreconditionErr) + }, 10*time.Second, 500*time.Millisecond) + + // Wait for the V1 workflow to be running and query handlers registered. + s.Eventually(func() bool { + _, descErr := env.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + }) + return descErr == nil + }, 30*time.Second, 500*time.Millisecond) + + // With CHASM routing still enabled, DescribeSchedule through the frontend + // should succeed by falling through from the closed CHASM schedule to V1. 
+ descResp, err := env.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + }) + s.NoError(err) + s.NotNil(descResp.GetSchedule()) + s.Equal(wt, descResp.GetSchedule().GetAction().GetStartWorkflow().GetWorkflowType().GetName()) + // The schedule was created unpaused; migration should preserve that state + // (not the temporary migration-imposed pause). + s.False(descResp.GetSchedule().GetState().GetPaused()) + + // ListScheduleMatchingTimes should also fall through to V1. + now := time.Now().UTC() + matchResp, err := env.FrontendClient().ListScheduleMatchingTimes(ctx, &workflowservice.ListScheduleMatchingTimesRequest{ + Namespace: nsName, + ScheduleId: sid, + StartTime: timestamppb.New(now), + EndTime: timestamppb.New(now.Add(5 * time.Hour)), + }) + s.NoError(err) + s.NotEmpty(matchResp.GetStartTime()) + + // PatchSchedule (pause) should also fall through to V1. + _, err = env.FrontendClient().PatchSchedule(ctx, &workflowservice.PatchScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Patch: &schedulepb.SchedulePatch{ + Pause: "pausing via routing fallback test", + }, + Identity: "test", + }) + s.NoError(err) + + // Verify the pause took effect on V1. The patch is delivered as a signal, + // so the workflow needs time to process it. + s.Eventually(func() bool { + descResp, err = env.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + }) + return err == nil && descResp.GetSchedule().GetState().GetPaused() + }, 10*time.Second, 500*time.Millisecond) + + // DeleteSchedule should also fall through to V1. 
+ _, err = env.FrontendClient().DeleteSchedule(ctx, &workflowservice.DeleteScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Identity: "test", + }) + s.NoError(err) +} + +func (s *ScheduleMigrationTestSuite) TestScheduleUpdateAfterDelete() { + env := testcore.NewEnv( + s.T(), + testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), + testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerCreation, true), + testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerRouting, true), + ) + + ctx := testcore.NewContext() + sid := testcore.RandomizeStr("sched-update-after-delete") + wid := testcore.RandomizeStr("sched-update-after-delete-wf") + wt := testcore.RandomizeStr("sched-update-after-delete-wt") + tq := testcore.RandomizeStr("tq") + + nsName := env.Namespace().String() + nsID := env.NamespaceID().String() + + schedule := &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Interval: []*schedulepb.IntervalSpec{ + {Interval: durationpb.New(1 * time.Hour)}, + }, + }, + Action: &schedulepb.ScheduleAction{ + Action: &schedulepb.ScheduleAction_StartWorkflow{ + StartWorkflow: &workflowpb.NewWorkflowExecutionInfo{ + WorkflowId: wid, + WorkflowType: &commonpb.WorkflowType{Name: wt}, + TaskQueue: &taskqueuepb.TaskQueue{Name: tq, Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + }, + }, + }, + } + + // Create CHASM schedule. + _, err := env.GetTestCluster().SchedulerClient().CreateSchedule( + ctx, + &schedulerpb.CreateScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.CreateScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Schedule: schedule, + Identity: "test", + RequestId: testcore.RandomizeStr("request-id"), + }, + }, + ) + s.NoError(err) + + // Delete via scheduler client. 
+ _, err = env.GetTestCluster().SchedulerClient().DeleteSchedule( + ctx, + &schedulerpb.DeleteScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.DeleteScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Identity: "test", + }, + }, + ) + s.NoError(err) + + // Update via scheduler client should fail on the closed schedule. + _, err = env.GetTestCluster().SchedulerClient().UpdateSchedule( + ctx, + &schedulerpb.UpdateScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.UpdateScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Schedule: schedule, + Identity: "test", + }, + }, + ) + var failedPreconditionErr *serviceerror.FailedPrecondition + s.ErrorAs(err, &failedPreconditionErr) + + // Patch via scheduler client should also fail on the closed schedule. + _, err = env.GetTestCluster().SchedulerClient().PatchSchedule( + ctx, + &schedulerpb.PatchScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.PatchScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Patch: &schedulepb.SchedulePatch{Pause: "test"}, + Identity: "test", + }, + }, + ) + s.ErrorAs(err, &failedPreconditionErr) + + // Delete again is idempotent in CHASM — sets Closed=true again. 
+ _, err = env.GetTestCluster().SchedulerClient().DeleteSchedule( + ctx, + &schedulerpb.DeleteScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.DeleteScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Identity: "test", + }, + }, + ) + s.NoError(err) +} + +func (s *ScheduleMigrationTestSuite) TestScheduleMigrationV1ToV2WithClosedV2() { + env := testcore.NewEnv( + s.T(), + testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), + ) + + ctx := testcore.NewContext() + sid := testcore.RandomizeStr("sched-migrate-v1-v2-closed") + wid := testcore.RandomizeStr("sched-migrate-v1-v2-closed-wf") + wt := testcore.RandomizeStr("sched-migrate-v1-v2-closed-wt") + tq := testcore.RandomizeStr("tq") + + nsName := env.Namespace().String() + nsID := env.NamespaceID().String() + sched := &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Interval: []*schedulepb.IntervalSpec{ + {Interval: durationpb.New(1 * time.Hour)}, + }, + }, + Action: &schedulepb.ScheduleAction{ + Action: &schedulepb.ScheduleAction_StartWorkflow{ + StartWorkflow: &workflowpb.NewWorkflowExecutionInfo{ + WorkflowId: wid, + WorkflowType: &commonpb.WorkflowType{Name: wt}, + TaskQueue: &taskqueuepb.TaskQueue{Name: tq, Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + }, + }, + }, + } + + // Create a CHASM schedule and then delete it. 
+ _, err := env.GetTestCluster().SchedulerClient().CreateSchedule( + ctx, + &schedulerpb.CreateScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.CreateScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Schedule: sched, + Identity: "test", + RequestId: testcore.RandomizeStr("request-id"), + }, + }, + ) + s.NoError(err) + + _, err = env.GetTestCluster().SchedulerClient().DeleteSchedule( + ctx, + &schedulerpb.DeleteScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.DeleteScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Identity: "test", + }, + }, + ) + s.NoError(err) + + // Create a V1 (workflow-backed) scheduler with the same ID. + startArgs := &schedulespb.StartScheduleArgs{ + Schedule: sched, + State: &schedulespb.InternalState{ + Namespace: nsName, + NamespaceId: nsID, + ScheduleId: sid, + ConflictToken: scheduler.InitialConflictToken, + }, + } + inputPayloads, err := sdk.PreferProtoDataConverter.ToPayloads(startArgs) + s.NoError(err) + v1WorkflowID := scheduler.WorkflowIDPrefix + sid + startReq := &workflowservice.StartWorkflowExecutionRequest{ + Namespace: nsName, + WorkflowId: v1WorkflowID, + WorkflowType: &commonpb.WorkflowType{Name: scheduler.WorkflowType}, + TaskQueue: &taskqueuepb.TaskQueue{Name: primitives.PerNSWorkerTaskQueue}, + Input: inputPayloads, + Identity: "test", + RequestId: testcore.RandomizeStr("request-id"), + WorkflowIdReusePolicy: enumspb.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE, + WorkflowIdConflictPolicy: enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, + } + _, err = env.GetTestCluster().HistoryClient().StartWorkflowExecution( + ctx, + common.CreateHistoryStartWorkflowRequest(nsID, startReq, nil, nil, time.Now().UTC()), + ) + s.NoError(err) + + // Wait for the per-namespace worker to pick up the V1 workflow. 
+ s.Eventually(func() bool { + desc, err := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( + ctx, + &historyservice.DescribeWorkflowExecutionRequest{ + NamespaceId: nsID, + Request: &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: nsName, + Execution: &commonpb.WorkflowExecution{WorkflowId: v1WorkflowID}, + }, + }, + ) + if err != nil { + return false + } + return desc.GetWorkflowExecutionInfo().GetHistoryLength() > 3 + }, 10*time.Second, 500*time.Millisecond) + + // Issue migration from V1 to V2. The previously deleted CHASM execution + // does not block creation of a new one -- StartExecution succeeds because + // closed executions allow reuse of the business ID. + _, err = env.AdminClient().MigrateSchedule(ctx, &adminservice.MigrateScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Target: adminservice.MigrateScheduleRequest_SCHEDULER_TARGET_CHASM, + Identity: "test", + RequestId: testcore.RandomizeStr("request-id"), + }) + s.NoError(err) + + // Wait for the V1 workflow to complete (migration activity ran). + s.Eventually(func() bool { + desc, err := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( + ctx, + &historyservice.DescribeWorkflowExecutionRequest{ + NamespaceId: nsID, + Request: &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: nsName, + Execution: &commonpb.WorkflowExecution{WorkflowId: v1WorkflowID}, + }, + }, + ) + if err != nil { + return false + } + return desc.GetWorkflowExecutionInfo().GetStatus() == enumspb.WORKFLOW_EXECUTION_STATUS_COMPLETED + }, 10*time.Second, 500*time.Millisecond) + + // The new V2 schedule should be describable. 
+ _, err = env.GetTestCluster().SchedulerClient().DescribeSchedule( + ctx, + &schedulerpb.DescribeScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, + }, + ) + s.NoError(err) +} diff --git a/tests/schedule_test.go b/tests/schedule_test.go index 21daffeb5b..69790c0cac 100644 --- a/tests/schedule_test.go +++ b/tests/schedule_test.go @@ -102,6 +102,63 @@ func runSharedScheduleTests(t *testing.T, newContext contextFactory) { t.Run("TestLimitMemoSpecSize", func(t *testing.T) { testLimitMemoSpecSize(t, newContext) }) t.Run("TestCountSchedules", func(t *testing.T) { testCountSchedules(t, newContext) }) t.Run("TestSchedule_InternalTaskQueue", func(t *testing.T) { testScheduleInternalTaskQueue(t, newContext) }) + t.Run("TestDeletedScheduleOperations", func(t *testing.T) { testDeletedScheduleOperations(t, newContext) }) +} + +func testDeletedScheduleOperations(t *testing.T, newContext contextFactory) { + s := testcore.NewEnv(t, scheduleCommonOpts()...) + + sid := "sched-test-deleted-ops" + wid := "sched-test-deleted-ops-wf" + wt := "sched-test-deleted-ops-wt" + + schedule := &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Interval: []*schedulepb.IntervalSpec{ + {Interval: durationpb.New(1 * time.Hour)}, + }, + }, + Action: &schedulepb.ScheduleAction{ + Action: &schedulepb.ScheduleAction_StartWorkflow{ + StartWorkflow: &workflowpb.NewWorkflowExecutionInfo{ + WorkflowId: wid, + WorkflowType: &commonpb.WorkflowType{Name: wt}, + TaskQueue: &taskqueuepb.TaskQueue{Name: s.WorkerTaskQueue(), Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + }, + }, + }, + } + + // Create a schedule. + _, err := s.FrontendClient().CreateSchedule(newContext(s.Context()), &workflowservice.CreateScheduleRequest{ + Namespace: s.Namespace().String(), + ScheduleId: sid, + Schedule: schedule, + Identity: "test", + RequestId: uuid.NewString(), + }) + s.NoError(err) + + // Delete the schedule. 
+	_, err = s.FrontendClient().DeleteSchedule(newContext(s.Context()), &workflowservice.DeleteScheduleRequest{
+		Namespace:  s.Namespace().String(),
+		ScheduleId: sid,
+		Identity:   "test",
+	})
+	s.NoError(err)
+
+	// Describe should return NotFound.
+	var notFoundErr *serviceerror.NotFound
+	s.Eventually(func() bool {
+		_, descErr := s.FrontendClient().DescribeSchedule(newContext(s.Context()), &workflowservice.DescribeScheduleRequest{
+			Namespace:  s.Namespace().String(),
+			ScheduleId: sid,
+		})
+		return errors.As(descErr, &notFoundErr)
+	}, 10*time.Second, 200*time.Millisecond)
+
+	// Update, Patch, and Delete behave differently across CHASM and V1,
+	// so they are not tested here. See TestScheduleUpdateAfterDelete.
+}
 
 func testBasics(t *testing.T, newContext contextFactory) {
@@ -520,7 +577,8 @@ func testBasics(t *testing.T, newContext contextFactory) {
 		Namespace:  s.Namespace().String(),
 		ScheduleId: sid,
 	})
-	s.Error(err)
+	var notFoundErr *serviceerror.NotFound
+	s.ErrorAs(err, &notFoundErr)
 
 	s.Eventually(func() bool { // wait for visibility
 		listResp, err := s.FrontendClient().ListSchedules(newContext(s.Context()), &workflowservice.ListSchedulesRequest{
diff --git a/tests/task_queue_stats_test.go b/tests/task_queue_stats_test.go
index d4d658ad2f..a2646aeddd 100644
--- a/tests/task_queue_stats_test.go
+++ b/tests/task_queue_stats_test.go
@@ -20,211 +20,151 @@ import (
 	deploymentspb "go.temporal.io/server/api/deployment/v1"
 	"go.temporal.io/server/common"
 	"go.temporal.io/server/common/dynamicconfig"
+	"go.temporal.io/server/common/testing/parallelsuite"
 	"go.temporal.io/server/common/worker_versioning"
 	"go.temporal.io/server/tests/testcore"
 	"google.golang.org/protobuf/types/known/durationpb"
 )
 
-type (
-	// taskQueueStatsSuite encapsulates the test environment and parameters for task queue stats tests.
- taskQueueStatsSuite struct { - testcore.Env - usePriMatcher bool - minPriority int - maxPriority int - defaultPriority int - partitionCount int - } +type taskQueueExpectations struct { + BacklogCount int + MaxExtraTasks int + CachedEnabled bool +} - TaskQueueExpectations struct { - BacklogCount int - MaxExtraTasks int - CachedEnabled bool - } +// taskQueueExpectationsByType maps task queue types to their expectations +type taskQueueExpectationsByType map[enumspb.TaskQueueType]taskQueueExpectations - // TaskQueueExpectationsByType maps task queue types to their expectations - TaskQueueExpectationsByType map[enumspb.TaskQueueType]TaskQueueExpectations -) +type workflowTasksAndActivitiesPollerParams struct { + tqName string + deploymentName string + buildID string + identity string + logPrefix string + activityIDPrefix string + maxToSchedule int + maxConsecEmptyPoll int + versioningBehavior enumspb.VersioningBehavior +} -func newTaskQueueStatsSuite(env testcore.Env, usePriMatcher bool) *taskQueueStatsSuite { - return &taskQueueStatsSuite{ - Env: env, - usePriMatcher: usePriMatcher, - minPriority: 1, - maxPriority: 5, - defaultPriority: 3, - partitionCount: 2, // kept low to reduce test time on CI - } +// TaskQueueStatsSuite groups task queue stats tests that are run with different matcher configurations. 
+type TaskQueueStatsSuite struct { + parallelsuite.Suite[*TaskQueueStatsSuite] } // TODO(pri): remove once the classic matcher is removed func TestTaskQueueStats_Classic_Suite(t *testing.T) { - t.Parallel() - runTaskQueueStatsTests(t, false) // usePriMatcher = false + parallelsuite.Run(t, &TaskQueueStatsSuite{}, false) // usePriMatcher = false } func TestTaskQueueStats_Pri_Suite(t *testing.T) { - t.Parallel() - runTaskQueueStatsTests(t, true) // usePriMatcher = true + parallelsuite.Run(t, &TaskQueueStatsSuite{}, true) // usePriMatcher = true } -func runTaskQueueStatsTests(t *testing.T, usePriMatcher bool) { - baseOpts := []testcore.TestOption{ - testcore.WithDynamicConfig(dynamicconfig.EnableDeploymentVersions, true), - testcore.WithDynamicConfig(dynamicconfig.FrontendEnableWorkerVersioningWorkflowAPIs, true), - testcore.WithDynamicConfig(dynamicconfig.MatchingUseNewMatcher, usePriMatcher), - testcore.WithDynamicConfig(dynamicconfig.MatchingPriorityLevels, 5), // maxPriority - } - - // Tests WITHOUT RunTestWithMatchingBehavior - t.Run("TestDescribeTaskQueue_NonRoot", func(t *testing.T) { - env := testcore.NewEnv(t, baseOpts...) - s := newTaskQueueStatsSuite(env, usePriMatcher) - s.testDescribeTaskQueueNonRoot() - }) - - t.Run("TestNoTasks_ValidateStats", func(t *testing.T) { - opts := append(baseOpts, - testcore.WithDynamicConfig(dynamicconfig.MatchingNumTaskqueueReadPartitions, 2), - testcore.WithDynamicConfig(dynamicconfig.MatchingNumTaskqueueWritePartitions, 2), - testcore.WithDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second), - testcore.WithDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond), - ) - env := testcore.NewEnv(t, opts...) 
- s := newTaskQueueStatsSuite(env, usePriMatcher) - s.publishConsumeWorkflowTasksValidateStats(0, false) - }) - - t.Run("TestAddMultipleTasks_ValidateStats_Cached", func(t *testing.T) { - opts := append(baseOpts, - testcore.WithDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second), - testcore.WithDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Hour), - ) - env := testcore.NewEnv(t, opts...) - s := newTaskQueueStatsSuite(env, usePriMatcher) - s.testAddMultipleTasksValidateStatsCached() - }) - - // Tests WITH RunTestWithMatchingBehavior - // Note: runWithMatchingBehavior already configures partition count based on forwarding behavior. - // Do NOT override MatchingNumTaskqueueReadPartitions/WritePartitions inside the subtest. - t.Run("TestMultipleTasks_WithMatchingBehavior_ValidateStats", func(t *testing.T) { - runSuiteWithMatchingBehaviors(t, baseOpts, usePriMatcher, func(s *taskQueueStatsSuite) { - s.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) - s.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) - s.publishConsumeWorkflowTasksValidateStats(4, false) - }) - }) - - t.Run("TestCurrentVersionAbsorbsUnversionedBacklog_NoRamping", func(t *testing.T) { - runSuiteWithMatchingBehaviors(t, baseOpts, usePriMatcher, func(s *taskQueueStatsSuite) { - s.currentVersionAbsorbsUnversionedBacklogNoRamping() - }) - }) - - t.Run("TestRampingAndCurrentAbsorbUnversionedBacklog", func(t *testing.T) { - runSuiteWithMatchingBehaviors(t, baseOpts, usePriMatcher, func(s *taskQueueStatsSuite) { - s.rampingAndCurrentAbsorbsUnversionedBacklog() - }) - }) - - t.Run("TestCurrentAbsorbsUnversionedBacklog_WhenRampingToUnversioned", func(t *testing.T) { - runSuiteWithMatchingBehaviors(t, baseOpts, usePriMatcher, func(s *taskQueueStatsSuite) { - s.currentAbsorbsUnversionedBacklogWhenRampingToUnversioned() - }) - }) - - 
t.Run("TestRampingAbsorbsUnversionedBacklog_WhenCurrentIsUnversioned", func(t *testing.T) { - runSuiteWithMatchingBehaviors(t, baseOpts, usePriMatcher, func(s *taskQueueStatsSuite) { - s.rampingAbsorbsUnversionedBacklogWhenCurrentIsUnversioned() - }) - }) - - t.Run("TestInactiveVersionDoesNotAbsorbUnversionedBacklog", func(t *testing.T) { - runSuiteWithMatchingBehaviors(t, baseOpts, usePriMatcher, func(s *taskQueueStatsSuite) { - s.inactiveVersionDoesNotAbsorbUnversionedBacklog() - }) - }) -} - -// runSuiteWithMatchingBehaviors runs a test with all combinations of matching behaviors. -func runSuiteWithMatchingBehaviors( - t *testing.T, - baseOpts []testcore.TestOption, - usePriMatcher bool, - subtest func(s *taskQueueStatsSuite), -) { - runWithMatchingBehaviors(t, baseOpts, func(env *testcore.TestEnv, behavior testcore.MatchingBehavior) { - s := newTaskQueueStatsSuite(env, usePriMatcher) - subtest(s) - }) -} - -func (s *taskQueueStatsSuite) testDescribeTaskQueueNonRoot() { - resp, err := s.FrontendClient().DescribeTaskQueue(context.Background(), &workflowservice.DescribeTaskQueueRequest{ - Namespace: s.Namespace().String(), +func (s *TaskQueueStatsSuite) TestDescribeTaskQueue_NonRoot(usePriMatcher bool) { + env := newTaskQueueStatsContext(s.T(), usePriMatcher, testcore.MatchingBehavior{}) + resp, err := env.FrontendClient().DescribeTaskQueue(context.Background(), &workflowservice.DescribeTaskQueueRequest{ + Namespace: env.Namespace().String(), TaskQueue: &taskqueuepb.TaskQueue{Name: "/_sys/foo/1", Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, }) - require.NoError(s.T(), err) - require.NotNil(s.T(), resp) + s.NoError(err) + s.NotNil(resp) - _, err = s.FrontendClient().DescribeTaskQueue(context.Background(), + _, err = env.FrontendClient().DescribeTaskQueue(context.Background(), &workflowservice.DescribeTaskQueueRequest{ - Namespace: s.Namespace().String(), + Namespace: env.Namespace().String(), TaskQueue: &taskqueuepb.TaskQueue{Name: "/_sys/foo/1", Kind: 
enumspb.TASK_QUEUE_KIND_NORMAL}, ReportStats: true, }) - require.ErrorContains(s.T(), err, "DescribeTaskQueue stats are only supported for the root partition") + s.ErrorContains(err, "DescribeTaskQueue stats are only supported for the root partition") +} + +func (s *TaskQueueStatsSuite) TestNoTasks_ValidateStats(usePriMatcher bool) { + env := newTaskQueueStatsContext(s.T(), usePriMatcher, testcore.MatchingBehavior{}, + testcore.WithDynamicConfig(dynamicconfig.MatchingNumTaskqueueReadPartitions, 2), + testcore.WithDynamicConfig(dynamicconfig.MatchingNumTaskqueueWritePartitions, 2), + testcore.WithDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second), + testcore.WithDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond), + ) + env.publishConsumeWorkflowTasksValidateStats(0, false) } -func (s *taskQueueStatsSuite) testAddMultipleTasksValidateStatsCached() { +func (s *TaskQueueStatsSuite) TestAddMultipleTasks_ValidateStats_Cached(usePriMatcher bool) { + env := newTaskQueueStatsContext(s.T(), usePriMatcher, testcore.MatchingBehavior{}, + testcore.WithDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second), + testcore.WithDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Hour), + ) tqName := "tq-" + common.GenerateRandomString(5) - s.createDeploymentInTaskQueue(tqName) + env.createDeploymentInTaskQueue(tqName) // Enqueue all workflows. - total := s.enqueueWorkflows(2, tqName) + total := env.enqueueWorkflows(2, tqName) // Verify workflow add rate - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) // Expect at least *one* of the workflow/activity tasks to be in the stats. 
- expectations := TaskQueueExpectations{ + expectations := taskQueueExpectations{ BacklogCount: 1, // ie at least one task in the backlog MaxExtraTasks: total, // ie at most all tasks can be in the backlog CachedEnabled: true, } // Enqueue 1 activity set, to make sure the workflow backlog has some tasks. - s.enqueueActivitiesForEachWorkflow(1, tqName) + env.enqueueActivitiesForEachWorkflow(1, tqName) // Expect the workflow backlog to be non-empty now. // This query will cache the stats for the remainder of the test. - s.validateTaskQueueStatsByType(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, expectations, false) + env.validateTaskQueueStatsByType(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, expectations, false) // Enqueue remaining activities. - s.enqueueActivitiesForEachWorkflow(1, tqName) + env.enqueueActivitiesForEachWorkflow(1, tqName) // Poll 2 activities, ie 1 per version, to make sure the activity backlog has some tasks. - s.pollActivities(2, tqName) + env.pollActivities(2, tqName) // Verify activity dispatch rate - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, false, true) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, false, true) // Expect the activity backlog to be non-empty now. // This query will cache the stats for the remainder of the test. - s.validateTaskQueueStatsByType(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, expectations, false) + env.validateTaskQueueStatsByType(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, expectations, false) // Poll remaining activities. - s.pollActivities(total-2, tqName) + env.pollActivities(total-2, tqName) // Despite having polled all the workflows/activies; the stats won't have changed at all since they were cached. 
- s.validateTaskQueueStatsByType(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, expectations, false) - s.validateTaskQueueStatsByType(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, expectations, false) + env.validateTaskQueueStatsByType(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, expectations, false) + env.validateTaskQueueStatsByType(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, expectations, false) } -func (s *taskQueueStatsSuite) currentVersionAbsorbsUnversionedBacklogNoRamping() { - s.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) - s.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL +// TestVersioningSuite runs version-specific tests across all matching behavior combinations. +// Note: matching behaviors configure partition count based on forwarding behavior. +// Do NOT override MatchingNumTaskqueueReadPartitions/WritePartitions inside the subtest. +func (s *TaskQueueStatsSuite) TestVersioningSuite(usePriMatcher bool) { + for _, behavior := range testcore.AllMatchingBehaviors() { + s.T().Run(behavior.Name()+"Suite", func(t *testing.T) { //nolint:testifylint // nested parallelsuite.Run needs raw *testing.T + parallelsuite.Run(t, &TaskQueueStatsVersionSuite{}, usePriMatcher, behavior) + }) + } +} + +// TaskQueueStatsVersionSuite groups task queue stats tests that run across matching behavior combinations. 
+type TaskQueueStatsVersionSuite struct { + parallelsuite.Suite[*TaskQueueStatsVersionSuite] +} + +func (s *TaskQueueStatsVersionSuite) TestMultipleTasks_ValidateStats(usePriMatcher bool, behavior testcore.MatchingBehavior) { + env := newTaskQueueStatsContext(s.T(), usePriMatcher, behavior) + env.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) + env.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) + env.publishConsumeWorkflowTasksValidateStats(4, false) +} + +func (s *TaskQueueStatsVersionSuite) TestCurrentVersionAbsorbsUnversionedBacklog_NoRamping(usePriMatcher bool, behavior testcore.MatchingBehavior) { + env := newTaskQueueStatsContext(s.T(), usePriMatcher, behavior) + env.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) + env.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) defer cancel() @@ -235,30 +175,30 @@ func (s *taskQueueStatsSuite) currentVersionAbsorbsUnversionedBacklogNoRamping() // Register this version in the task queue pollerCtx, cancelPoller := context.WithCancel(testcore.NewContext()) - s.createVersionsInTaskQueue(pollerCtx, tqName, deploymentName, currentBuildID) + env.createVersionsInTaskQueue(pollerCtx, tqName, deploymentName, currentBuildID) // Set current version only (no ramping) - s.setCurrentVersion(deploymentName, currentBuildID) + env.setCurrentVersion(deploymentName, currentBuildID) // Stopping the pollers so that we verify the backlog expectations cancelPoller() // Enqueue unversioned backlog - unversionedWorkflowCount := 10 * s.partitionCount - s.startUnversionedWorkflows(unversionedWorkflowCount, tqName) + unversionedWorkflowCount := 10 * env.partitionCount + env.startUnversionedWorkflows(unversionedWorkflowCount, tqName) // Verify workflow add rate - s.validateRates(tqName, 
enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) - currentStatsExpectation := TaskQueueExpectations{ + currentStatsExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount, MaxExtraTasks: 0, } - require.EventuallyWithT(s.T(), func(c *assert.CollectT) { + s.EventuallyWithT(func(c *assert.CollectT) { a := require.New(c) // DescribeWorkerDeploymentVersion: current version should also show the full backlog for this task queue. - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[workflow]", @@ -270,7 +210,7 @@ func (s *taskQueueStatsSuite) currentVersionAbsorbsUnversionedBacklogNoRamping() ) // DescribeTaskQueue Legacy Mode: Since the task queue is part of the current version, the legacy mode should report the total backlog count. - s.requireLegacyTaskQueueStatsRelaxed( + env.requireLegacyTaskQueueStatsRelaxed( ctx, a, "DescribeTaskQueue[legacy]", @@ -281,23 +221,23 @@ func (s *taskQueueStatsSuite) currentVersionAbsorbsUnversionedBacklogNoRamping() }, 10*time.Second, 200*time.Millisecond) // The backlog count for the activity task queue should be equal to the number of activities scheduled since the activity task queue is part of the current version. 
- activitesToSchedule := 10 * s.partitionCount - s.completeWorkflowTasksAndScheduleActivities(tqName, deploymentName, currentBuildID, activitesToSchedule) + activitesToSchedule := 10 * env.partitionCount + env.completeWorkflowTasksAndScheduleActivities(tqName, deploymentName, currentBuildID, activitesToSchedule) // Verify activity add rate - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, true, false) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, true, false) - activityStatsExpectation := TaskQueueExpectations{ + activityStatsExpectation := taskQueueExpectations{ BacklogCount: activitesToSchedule, MaxExtraTasks: 0, } - require.EventuallyWithT(s.T(), func(c *assert.CollectT) { + s.EventuallyWithT(func(c *assert.CollectT) { a := require.New(c) // Since the activity task queue is part of the current version, // the DescribeWorkerDeploymentVersion should report the backlog count for the activity task queue. - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[activity][after-scheduling-activities]", @@ -309,7 +249,7 @@ func (s *taskQueueStatsSuite) currentVersionAbsorbsUnversionedBacklogNoRamping() ) // DescribeTaskQueue Legacy Mode: Since the activity task queue is part of the current version, the legacy mode should report the total backlog count. 
- s.requireLegacyTaskQueueStatsRelaxed( + env.requireLegacyTaskQueueStatsRelaxed( ctx, a, "DescribeTaskQueue[legacy][activity]", @@ -320,9 +260,10 @@ func (s *taskQueueStatsSuite) currentVersionAbsorbsUnversionedBacklogNoRamping() }, 10*time.Second, 200*time.Millisecond) } -func (s *taskQueueStatsSuite) rampingAndCurrentAbsorbsUnversionedBacklog() { - s.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) - s.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL +func (s *TaskQueueStatsVersionSuite) TestRampingAndCurrentAbsorbUnversionedBacklog(usePriMatcher bool, behavior testcore.MatchingBehavior) { + env := newTaskQueueStatsContext(s.T(), usePriMatcher, behavior) + env.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) + env.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) defer cancel() @@ -333,33 +274,33 @@ func (s *taskQueueStatsSuite) rampingAndCurrentAbsorbsUnversionedBacklog() { rampingBuildID := "v2" pollCtx, cancelPoll := context.WithCancel(ctx) - s.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, currentBuildID) - s.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, rampingBuildID) + env.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, currentBuildID) + env.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, rampingBuildID) cancelPoll() // cancel the pollers so that we can verify the backlog expectations // Set ramping version to 30% rampPercentage := 30 - s.setRampingVersion(deploymentName, rampingBuildID, rampPercentage) + env.setRampingVersion(deploymentName, rampingBuildID, rampPercentage) // Set current version - s.setCurrentVersion(deploymentName, currentBuildID) + env.setCurrentVersion(deploymentName, currentBuildID) // Enqueue unversioned backlog. 
- unversionedWorkflowCount := 10 * s.partitionCount - s.startUnversionedWorkflows(unversionedWorkflowCount, tqName) + unversionedWorkflowCount := 10 * env.partitionCount + env.startUnversionedWorkflows(unversionedWorkflowCount, tqName) // Verify workflow add rate - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) - currentExpectation := TaskQueueExpectations{ + currentExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount * (100 - rampPercentage) / 100, MaxExtraTasks: 0, } - rampingExpectation := TaskQueueExpectations{ + rampingExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount * rampPercentage / 100, MaxExtraTasks: 0, } - legacyExpectation := TaskQueueExpectations{ + legacyExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount, MaxExtraTasks: 0, } @@ -367,12 +308,12 @@ func (s *taskQueueStatsSuite) rampingAndCurrentAbsorbsUnversionedBacklog() { // Currently only testing the following API's: // - DescribeWorkerDeploymentVersion for the current and ramping versions. // - DescribeTaskQueue Legacy Mode for the current and ramping versions. - require.EventuallyWithT(s.T(), func(c *assert.CollectT) { + s.EventuallyWithT(func(c *assert.CollectT) { a := require.New(c) // DescribeWorkerDeploymentVersion: current version should also show only 70% of the unversioned backlog for this task queue // as a ramping version, with ramp set to 30%, exists and absorbs 30% of the unversioned backlog. 
- s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[current][workflow]", @@ -384,7 +325,7 @@ func (s *taskQueueStatsSuite) rampingAndCurrentAbsorbsUnversionedBacklog() { ) // DescribeWorkerDeploymentVersion: ramping version should show the remaining 30% of the unversioned backlog for this task queue - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[ramping][workflow]", @@ -395,7 +336,7 @@ func (s *taskQueueStatsSuite) rampingAndCurrentAbsorbsUnversionedBacklog() { rampingExpectation, ) // Since the task queue is part of both the current and ramping versions, the legacy mode should report the total backlog count. - s.requireLegacyTaskQueueStatsRelaxed( + env.requireLegacyTaskQueueStatsRelaxed( ctx, a, "DescribeTaskQueue[legacy][workflow]", @@ -407,7 +348,7 @@ func (s *taskQueueStatsSuite) rampingAndCurrentAbsorbsUnversionedBacklog() { // Here, since the activity task queue is present both in the current and in the ramping version, the backlog count would differ depending on the version described. // Poll with BOTH buildIDs in parallel to drain all workflow tasks (hash distribution splits them between current and ramping) - s.pollWorkflowTasksAndScheduleActivitiesParallel( + env.pollWorkflowTasksAndScheduleActivitiesParallel( workflowTasksAndActivitiesPollerParams{ tqName: tqName, deploymentName: deploymentName, @@ -433,27 +374,27 @@ func (s *taskQueueStatsSuite) rampingAndCurrentAbsorbsUnversionedBacklog() { ) // Verify activity add rate - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, true, false) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, true, false) // It is important to note that the expected values here are theoretical values based on the ramp percentage. 
In other words, 70% of the unversioned backlog // may not be scheduled on the current version by matching since it makes it's decision based on the workflowID of the workflow. However, when the number of workflows // to schedule is high, the expected value of workflows scheduled on the current version will be close to the theoretical value. Here, we shall just be verifying if // the theoretical statistics that are being reported are correct. - activitiesOnCurrentVersionExpectation := TaskQueueExpectations{ + activitiesOnCurrentVersionExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount * (100 - rampPercentage) / 100, MaxExtraTasks: 0, } - activitiesOnRampingVersionExpectation := TaskQueueExpectations{ + activitiesOnRampingVersionExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount * rampPercentage / 100, MaxExtraTasks: 0, } - require.EventuallyWithT(s.T(), func(c *assert.CollectT) { + s.EventuallyWithT(func(c *assert.CollectT) { a := require.New(c) // Validate current version activity stats - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[activity][after-scheduling-activities][current-version]", @@ -465,7 +406,7 @@ func (s *taskQueueStatsSuite) rampingAndCurrentAbsorbsUnversionedBacklog() { ) // Validate ramping version activity stats - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[activity][after-scheduling-activities][ramping-version]", @@ -479,9 +420,10 @@ func (s *taskQueueStatsSuite) rampingAndCurrentAbsorbsUnversionedBacklog() { }, 10*time.Second, 200*time.Millisecond) } -func (s *taskQueueStatsSuite) currentAbsorbsUnversionedBacklogWhenRampingToUnversioned() { - s.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) - s.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL +func (s 
*TaskQueueStatsVersionSuite) TestCurrentAbsorbsUnversionedBacklog_WhenRampingToUnversioned(usePriMatcher bool, behavior testcore.MatchingBehavior) { + env := newTaskQueueStatsContext(s.T(), usePriMatcher, behavior) + env.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) + env.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) defer cancel() @@ -491,37 +433,37 @@ func (s *taskQueueStatsSuite) currentAbsorbsUnversionedBacklogWhenRampingToUnver currentBuildID := "v1" pollCtx, cancelPoll := context.WithCancel(ctx) - s.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, currentBuildID) + env.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, currentBuildID) cancelPoll() // cancel the pollers so that we can verify the backlog expectations // Set current version. - s.setCurrentVersion(deploymentName, currentBuildID) + env.setCurrentVersion(deploymentName, currentBuildID) rampPercentage := 20 - s.setRampingVersion(deploymentName, "", rampPercentage) + env.setRampingVersion(deploymentName, "", rampPercentage) // Enqueue unversioned backlog. 
- unversionedWorkflowCount := 10 * s.partitionCount - s.startUnversionedWorkflows(unversionedWorkflowCount, tqName) + unversionedWorkflowCount := 10 * env.partitionCount + env.startUnversionedWorkflows(unversionedWorkflowCount, tqName) // Verify workflow add rate - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) - currentExpectation := TaskQueueExpectations{ + currentExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount * (100 - rampPercentage) / 100, MaxExtraTasks: 0, } - legacyExpectation := TaskQueueExpectations{ + legacyExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount, MaxExtraTasks: 0, } - require.EventuallyWithT(s.T(), func(c *assert.CollectT) { + s.EventuallyWithT(func(c *assert.CollectT) { a := require.New(c) // There is no way right now for a user to query stats of the "unversioned" version. All we can do in this case // is to query the current version's stats and see that it is attributed 80% of the unversioned backlog. - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[current][workflow][ramping-to-unversioned]", @@ -533,7 +475,7 @@ func (s *taskQueueStatsSuite) currentAbsorbsUnversionedBacklogWhenRampingToUnver ) // Since the task queue is part of both the current and ramping versions, the legacy mode should report the total backlog count. 
- s.requireLegacyTaskQueueStatsRelaxed( + env.requireLegacyTaskQueueStatsRelaxed( ctx, a, "DescribeTaskQueue[legacy][workflow][ramping-to-unversioned]", @@ -544,9 +486,10 @@ func (s *taskQueueStatsSuite) currentAbsorbsUnversionedBacklogWhenRampingToUnver }, 10*time.Second, 200*time.Millisecond) } -func (s *taskQueueStatsSuite) rampingAbsorbsUnversionedBacklogWhenCurrentIsUnversioned() { - s.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) - s.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL +func (s *TaskQueueStatsVersionSuite) TestRampingAbsorbsUnversionedBacklog_WhenCurrentIsUnversioned(usePriMatcher bool, behavior testcore.MatchingBehavior) { + env := newTaskQueueStatsContext(s.T(), usePriMatcher, behavior) + env.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) + env.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) defer cancel() @@ -556,38 +499,38 @@ func (s *taskQueueStatsSuite) rampingAbsorbsUnversionedBacklogWhenCurrentIsUnver rampingBuildID := "v2" pollCtx, cancelPoll := context.WithCancel(ctx) - s.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, rampingBuildID) + env.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, rampingBuildID) cancelPoll() // cancel the pollers so that we can verify the backlog expectations // Set current to unversioned (nil current version). - s.setCurrentVersion(deploymentName, "") + env.setCurrentVersion(deploymentName, "") // Set ramping to a versioned deployment. rampPercentage := 30 - s.setRampingVersion(deploymentName, rampingBuildID, rampPercentage) + env.setRampingVersion(deploymentName, rampingBuildID, rampPercentage) // Enqueue unversioned backlog. 
- unversionedWorkflowCount := 10 * s.partitionCount - s.startUnversionedWorkflows(unversionedWorkflowCount, tqName) + unversionedWorkflowCount := 10 * env.partitionCount + env.startUnversionedWorkflows(unversionedWorkflowCount, tqName) // Verify workflow add rate - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) - rampingExpectation := TaskQueueExpectations{ + rampingExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount * rampPercentage / 100, MaxExtraTasks: 0, } - legacyExpectation := TaskQueueExpectations{ + legacyExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflowCount, MaxExtraTasks: 0, } - require.EventuallyWithT(s.T(), func(c *assert.CollectT) { + s.EventuallyWithT(func(c *assert.CollectT) { a := require.New(c) // We can't query "unversioned" as a WorkerDeploymentVersion, but we can validate that the ramping version // is attributed its ramp share of the unversioned backlog. - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[ramping][workflow][current-unversioned]", @@ -599,7 +542,7 @@ func (s *taskQueueStatsSuite) rampingAbsorbsUnversionedBacklogWhenCurrentIsUnver ) // Legacy mode should continue to report the total backlog for the task queue. 
- s.requireLegacyTaskQueueStatsRelaxed( + env.requireLegacyTaskQueueStatsRelaxed( ctx, a, "DescribeTaskQueue[legacy][workflow][current-unversioned]", @@ -610,12 +553,13 @@ func (s *taskQueueStatsSuite) rampingAbsorbsUnversionedBacklogWhenCurrentIsUnver }, 10*time.Second, 200*time.Millisecond) } -func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { +func (s *TaskQueueStatsVersionSuite) TestInactiveVersionDoesNotAbsorbUnversionedBacklog(usePriMatcher bool, behavior testcore.MatchingBehavior) { + env := newTaskQueueStatsContext(s.T(), usePriMatcher, behavior) ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) defer cancel() - s.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) - s.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL + env.OverrideDynamicConfig(dynamicconfig.MatchingLongPollExpirationInterval, 10*time.Second) + env.OverrideDynamicConfig(dynamicconfig.TaskQueueInfoByBuildIdTTL, 1*time.Millisecond) // zero means no TTL tqName := "tq-" + common.GenerateRandomString(5) deploymentName := testcore.RandomizeStr("deployment") @@ -624,31 +568,31 @@ func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { pollCtx, cancelPoll := context.WithCancel(testcore.NewContext()) - s.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, currentBuildID) - s.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, inactiveBuildID) + env.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, currentBuildID) + env.createVersionsInTaskQueue(pollCtx, tqName, deploymentName, inactiveBuildID) // Set current version - s.setCurrentVersion(deploymentName, currentBuildID) + env.setCurrentVersion(deploymentName, currentBuildID) // Stopping the pollers so that we verify the backlog expectations cancelPoll() // Enqueue unversioned backlog. 
- unversionedWorkflows := 10 * s.partitionCount - s.startUnversionedWorkflows(unversionedWorkflows, tqName) + unversionedWorkflows := 10 * env.partitionCount + env.startUnversionedWorkflows(unversionedWorkflows, tqName) // Enqueue pinned workflows. - pinnedWorkflows := 10 * s.partitionCount - s.startPinnedWorkflows(pinnedWorkflows, tqName, deploymentName, inactiveBuildID) + pinnedWorkflows := 10 * env.partitionCount + env.startPinnedWorkflows(pinnedWorkflows, tqName, deploymentName, inactiveBuildID) // Verify workflow add rate - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, false) - currentExpectation := TaskQueueExpectations{ + currentExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflows, MaxExtraTasks: 0, } - inactiveExpectation := TaskQueueExpectations{ + inactiveExpectation := taskQueueExpectations{ BacklogCount: pinnedWorkflows, MaxExtraTasks: 0, } @@ -656,11 +600,11 @@ func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { // Currently only testing the following API's: // - DescribeWorkerDeploymentVersion // - DescribeTaskQueue Legacy Mode - require.EventuallyWithT(s.T(), func(c *assert.CollectT) { + s.EventuallyWithT(func(c *assert.CollectT) { a := require.New(c) // DescribeWorkerDeploymentVersion: current version should should show 100% of the unversioned backlog for this task queue - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[current][workflow]", @@ -672,7 +616,7 @@ func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { ) // DescribeWorkerDeploymentVersion: inactive version should only show the pinned workflows that are scheduled on it. 
- s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[inactive][workflow]", @@ -685,7 +629,7 @@ func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { }, 10*time.Second, 200*time.Millisecond) // Polling the workflow tasks and scheduling activities - s.pollWorkflowTasksAndScheduleActivitiesParallel( + env.pollWorkflowTasksAndScheduleActivitiesParallel( workflowTasksAndActivitiesPollerParams{ tqName: tqName, deploymentName: deploymentName, @@ -711,28 +655,28 @@ func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { ) // Verify workflow dispatch rate and activity add rate - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, true) - s.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, true, false) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_WORKFLOW, true, true) + env.validateRates(tqName, enumspb.TASK_QUEUE_TYPE_ACTIVITY, true, false) // Validate activity backlogs - currentActivityExpectation := TaskQueueExpectations{ + currentActivityExpectation := taskQueueExpectations{ BacklogCount: unversionedWorkflows, MaxExtraTasks: 0, } - inactiveActivityExpectation := TaskQueueExpectations{ + inactiveActivityExpectation := taskQueueExpectations{ BacklogCount: pinnedWorkflows, MaxExtraTasks: 0, } - workflowTaskQueueEmptyExpectation := TaskQueueExpectations{ + workflowTaskQueueEmptyExpectation := taskQueueExpectations{ BacklogCount: 0, MaxExtraTasks: 0, } - require.EventuallyWithT(s.T(), func(c *assert.CollectT) { + s.EventuallyWithT(func(c *assert.CollectT) { a := require.New(c) // The activity task queue of the current version should have the backlog count for the activities that were scheduled - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[current][activity]", @@ -744,7 +688,7 @@ func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { ) // The 
workflow task queue of the current version should be empty since activities were scheduled - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[current][workflow]", @@ -756,7 +700,7 @@ func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { ) // The workflow task queue of the inactive version should be empty since activities were scheduled - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[inactive][workflow]", @@ -768,7 +712,7 @@ func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { ) // The activity task queue of the inactive version should have the backlog count for the activities that were scheduled - s.requireWDVTaskQueueStatsRelaxed( + env.requireWDVTaskQueueStatsRelaxed( ctx, a, "DescribeWorkerDeploymentVersion[inactive][activity]", @@ -781,10 +725,46 @@ func (s *taskQueueStatsSuite) inactiveVersionDoesNotAbsorbUnversionedBacklog() { }, 10*time.Second, 200*time.Millisecond) } +// taskQueueStatsContext holds the per-test environment and configuration for task queue stats tests. +type taskQueueStatsContext struct { + testcore.Env + usePriMatcher bool + minPriority int + maxPriority int + defaultPriority int + partitionCount int +} + +func newTaskQueueStatsContext( + t *testing.T, + usePriMatcher bool, + behavior testcore.MatchingBehavior, + extraOpts ...testcore.TestOption, +) *taskQueueStatsContext { + opts := []testcore.TestOption{ + testcore.WithDynamicConfig(dynamicconfig.EnableDeploymentVersions, true), + testcore.WithDynamicConfig(dynamicconfig.FrontendEnableWorkerVersioningWorkflowAPIs, true), + testcore.WithDynamicConfig(dynamicconfig.MatchingUseNewMatcher, usePriMatcher), + testcore.WithDynamicConfig(dynamicconfig.MatchingPriorityLevels, 5), // maxPriority + } + opts = append(opts, behavior.Options()...) + opts = append(opts, extraOpts...) 
+ env := testcore.NewEnv(t, opts...) + behavior.InjectHooks(env) + return &taskQueueStatsContext{ + Env: env, + usePriMatcher: usePriMatcher, + minPriority: 1, + maxPriority: 5, + defaultPriority: 3, + partitionCount: 2, // kept low to reduce test time on CI + } +} + // requireWDVTaskQueueStatsRelaxed asserts task queue statistics by allowing for over-counting in multi-partition ramping scenarios. // The production code intentionally uses math.Ceil for both ramping and current percentage // calculations across partitions, which can result in slight over-counting. -func (s *taskQueueStatsSuite) requireWDVTaskQueueStatsRelaxed( +func (s *taskQueueStatsContext) requireWDVTaskQueueStatsRelaxed( ctx context.Context, a *require.Assertions, label string, @@ -792,7 +772,7 @@ func (s *taskQueueStatsSuite) requireWDVTaskQueueStatsRelaxed( tqType enumspb.TaskQueueType, deploymentName string, buildID string, - expectation TaskQueueExpectations, + expectation taskQueueExpectations, ) { stats, found, err := s.describeWDVTaskQueueStats(ctx, tqName, tqType, deploymentName, buildID) a.NoError(err) @@ -808,13 +788,13 @@ func (s *taskQueueStatsSuite) requireWDVTaskQueueStatsRelaxed( // requireLegacyTaskQueueStatsRelaxed asserts task queue statistics by allowing for over-counting in multi-partition scenarios. // The production code intentionally uses math.Ceil for both ramping and current percentage calculations across partitions, // which can result in slight over-counting. 
-func (s *taskQueueStatsSuite) requireLegacyTaskQueueStatsRelaxed( +func (s *taskQueueStatsContext) requireLegacyTaskQueueStatsRelaxed( ctx context.Context, a *require.Assertions, label string, tqName string, tqType enumspb.TaskQueueType, - expectation TaskQueueExpectations, + expectation taskQueueExpectations, ) { stats, found, err := s.describeLegacyTaskQueueStats(ctx, tqName, tqType) a.NoError(err) @@ -828,12 +808,12 @@ func (s *taskQueueStatsSuite) requireLegacyTaskQueueStatsRelaxed( } // Publishes versioned and unversioned entities; with one entity per priority (plus default priority). Multiplied by `sets`. -func (s *taskQueueStatsSuite) publishConsumeWorkflowTasksValidateStats(sets int, singlePartition bool) { +func (s *taskQueueStatsContext) publishConsumeWorkflowTasksValidateStats(sets int, singlePartition bool) { tqName := "tq-" + common.GenerateRandomString(5) s.createDeploymentInTaskQueue(tqName) // verify both workflow and activity backlogs are empty - expectations := TaskQueueExpectationsByType{ + expectations := taskQueueExpectationsByType{ enumspb.TASK_QUEUE_TYPE_WORKFLOW: { BacklogCount: 0, MaxExtraTasks: 0, @@ -862,7 +842,7 @@ func (s *taskQueueStatsSuite) publishConsumeWorkflowTasksValidateStats(sets int, } // verify workflow backlog is not empty, activity backlog is empty - expectations[enumspb.TASK_QUEUE_TYPE_WORKFLOW] = TaskQueueExpectations{ + expectations[enumspb.TASK_QUEUE_TYPE_WORKFLOW] = taskQueueExpectations{ BacklogCount: total, MaxExtraTasks: maxExtraTasksAllowed, } @@ -880,11 +860,11 @@ func (s *taskQueueStatsSuite) publishConsumeWorkflowTasksValidateStats(sets int, } // verify workflow backlog is empty, activity backlog is not - expectations[enumspb.TASK_QUEUE_TYPE_WORKFLOW] = TaskQueueExpectations{ + expectations[enumspb.TASK_QUEUE_TYPE_WORKFLOW] = taskQueueExpectations{ BacklogCount: 0, MaxExtraTasks: maxExtraTasksAllowed, } - expectations[enumspb.TASK_QUEUE_TYPE_ACTIVITY] = TaskQueueExpectations{ + 
expectations[enumspb.TASK_QUEUE_TYPE_ACTIVITY] = taskQueueExpectations{ BacklogCount: total, MaxExtraTasks: maxExtraTasksAllowed, } @@ -900,11 +880,11 @@ func (s *taskQueueStatsSuite) publishConsumeWorkflowTasksValidateStats(sets int, } // verify both workflow and activity backlogs are empty - expectations[enumspb.TASK_QUEUE_TYPE_WORKFLOW] = TaskQueueExpectations{ + expectations[enumspb.TASK_QUEUE_TYPE_WORKFLOW] = taskQueueExpectations{ BacklogCount: 0, MaxExtraTasks: maxExtraTasksAllowed, } - expectations[enumspb.TASK_QUEUE_TYPE_ACTIVITY] = TaskQueueExpectations{ + expectations[enumspb.TASK_QUEUE_TYPE_ACTIVITY] = taskQueueExpectations{ BacklogCount: 0, MaxExtraTasks: maxExtraTasksAllowed, } @@ -912,7 +892,7 @@ func (s *taskQueueStatsSuite) publishConsumeWorkflowTasksValidateStats(sets int, s.validateAllTaskQueueStats(tqName, expectations, singlePartition) } -func (s *taskQueueStatsSuite) startUnversionedWorkflows(count int, tqName string) { +func (s *taskQueueStatsContext) startUnversionedWorkflows(count int, tqName string) { wt := "functional-workflow-current-absorbs-unversioned" workflowType := &commonpb.WorkflowType{Name: wt} request := &workflowservice.StartWorkflowExecutionRequest{ @@ -932,7 +912,7 @@ func (s *taskQueueStatsSuite) startUnversionedWorkflows(count int, tqName string } } -func (s *taskQueueStatsSuite) startPinnedWorkflows(count int, tqName string, deploymentName string, buildID string) { +func (s *taskQueueStatsContext) startPinnedWorkflows(count int, tqName string, deploymentName string, buildID string) { wt := "functional-workflow-pinned" workflowType := &commonpb.WorkflowType{Name: wt} @@ -963,20 +943,7 @@ func (s *taskQueueStatsSuite) startPinnedWorkflows(count int, tqName string, dep } } -type workflowTasksAndActivitiesPollerParams struct { - tqName string - deploymentName string - buildID string - identity string - logPrefix string - activityIDPrefix string - maxToSchedule int - maxConsecEmptyPoll int - versioningBehavior 
enumspb.VersioningBehavior -} - -// pollWorkflowTasksAndScheduleActivitiesParallel polls workflow tasks and schedules activities in parallel for workers of two different buildID's. -func (s *taskQueueStatsSuite) pollWorkflowTasksAndScheduleActivitiesParallel(params ...workflowTasksAndActivitiesPollerParams) { +func (s *taskQueueStatsContext) pollWorkflowTasksAndScheduleActivitiesParallel(params ...workflowTasksAndActivitiesPollerParams) { var wg sync.WaitGroup errCh := make(chan error, len(params)) @@ -994,7 +961,7 @@ func (s *taskQueueStatsSuite) pollWorkflowTasksAndScheduleActivitiesParallel(par } } -func (s *taskQueueStatsSuite) pollWorkflowTasksAndScheduleActivities(params workflowTasksAndActivitiesPollerParams) (int, error) { +func (s *taskQueueStatsContext) pollWorkflowTasksAndScheduleActivities(params workflowTasksAndActivitiesPollerParams) (int, error) { deploymentOpts := createDeploymentOptions(params.deploymentName, params.buildID) scheduled := 0 @@ -1053,7 +1020,7 @@ func (s *taskQueueStatsSuite) pollWorkflowTasksAndScheduleActivities(params work return scheduled, nil } -func (s *taskQueueStatsSuite) completeWorkflowTasksAndScheduleActivities( +func (s *taskQueueStatsContext) completeWorkflowTasksAndScheduleActivities( tqName string, deploymentName string, buildID string, @@ -1104,7 +1071,7 @@ func (s *taskQueueStatsSuite) completeWorkflowTasksAndScheduleActivities( } // TODO (Shivam): We may have to wait for the propagation status to show completed if we are using async workflows here. 
-func (s *taskQueueStatsSuite) setCurrentVersion(deploymentName, buildID string) { +func (s *taskQueueStatsContext) setCurrentVersion(deploymentName, buildID string) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -1117,7 +1084,7 @@ func (s *taskQueueStatsSuite) setCurrentVersion(deploymentName, buildID string) } // TODO (Shivam): We may have to wait for the propagation status to show completed if we are using async workflows here. -func (s *taskQueueStatsSuite) setRampingVersion(deploymentName, buildID string, rampPercentage int) { +func (s *taskQueueStatsContext) setRampingVersion(deploymentName, buildID string, rampPercentage int) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -1130,7 +1097,7 @@ func (s *taskQueueStatsSuite) setRampingVersion(deploymentName, buildID string, require.NoError(s.T(), err) } -func (s *taskQueueStatsSuite) describeWDVTaskQueueStats( +func (s *taskQueueStatsContext) describeWDVTaskQueueStats( ctx context.Context, tqName string, tqType enumspb.TaskQueueType, @@ -1158,7 +1125,7 @@ func (s *taskQueueStatsSuite) describeWDVTaskQueueStats( // DescribeTaskQueue Legacy Mode shall report the stats for this task queue from all the different versions // that the task queue is part of. 
-func (s *taskQueueStatsSuite) describeLegacyTaskQueueStats( +func (s *taskQueueStatsContext) describeLegacyTaskQueueStats( ctx context.Context, tqName string, tqType enumspb.TaskQueueType, @@ -1175,7 +1142,7 @@ func (s *taskQueueStatsSuite) describeLegacyTaskQueueStats( return resp.GetStats(), true, nil } -func (s *taskQueueStatsSuite) enqueueWorkflows(sets int, tqName string) int { +func (s *taskQueueStatsContext) enqueueWorkflows(sets int, tqName string) int { deploymentOpts := s.deploymentOptions(tqName) var total int @@ -1224,7 +1191,7 @@ func (s *taskQueueStatsSuite) enqueueWorkflows(sets int, tqName string) int { return total } -func (s *taskQueueStatsSuite) createVersionsInTaskQueue(ctx context.Context, tqName string, deploymentName string, buildID string) { +func (s *taskQueueStatsContext) createVersionsInTaskQueue(ctx context.Context, tqName string, deploymentName string, buildID string) { go func() { _, _ = s.FrontendClient().PollWorkflowTaskQueue(ctx, &workflowservice.PollWorkflowTaskQueueRequest{ Namespace: s.Namespace().String(), @@ -1259,7 +1226,7 @@ func (s *taskQueueStatsSuite) createVersionsInTaskQueue(ctx context.Context, tqN } // TODO (Shivam): Remove this guy. 
-func (s *taskQueueStatsSuite) createDeploymentInTaskQueue(tqName string) { +func (s *taskQueueStatsContext) createDeploymentInTaskQueue(tqName string) { // Using old DeploymentData format var wg sync.WaitGroup @@ -1284,7 +1251,7 @@ func (s *taskQueueStatsSuite) createDeploymentInTaskQueue(tqName string) { wg.Wait() } -func (s *taskQueueStatsSuite) enqueueActivitiesForEachWorkflow(sets int, tqName string) int { +func (s *taskQueueStatsContext) enqueueActivitiesForEachWorkflow(sets int, tqName string) int { deploymentOpts := s.deploymentOptions(tqName) var total int @@ -1342,7 +1309,7 @@ func (s *taskQueueStatsSuite) enqueueActivitiesForEachWorkflow(sets int, tqName return total } -func (s *taskQueueStatsSuite) pollActivities(count int, tqName string) { +func (s *taskQueueStatsContext) pollActivities(count int, tqName string) { for i := 0; i < count; { pollReq := &workflowservice.PollActivityTaskQueueRequest{ Namespace: s.Namespace().String(), @@ -1367,9 +1334,9 @@ func (s *taskQueueStatsSuite) pollActivities(count int, tqName string) { s.T().Logf("Polled %d activities", count) } -func (s *taskQueueStatsSuite) validateAllTaskQueueStats( +func (s *taskQueueStatsContext) validateAllTaskQueueStats( tqName string, - expectations TaskQueueExpectationsByType, + expectations taskQueueExpectationsByType, singlePartition bool, ) { for tqType, expectation := range expectations { @@ -1380,7 +1347,7 @@ func (s *taskQueueStatsSuite) validateAllTaskQueueStats( // validateRates verifies TasksAddRate and/or TasksDispatchRate in a dedicated EventuallyWithT block. // This should be called immediately after the relevant operation (enqueue for add rate, poll for dispatch rate) // to ensure the rate is checked while still fresh (before the 30-second sliding window decays). 
-func (s *taskQueueStatsSuite) validateRates( +func (s *taskQueueStatsContext) validateRates( tqName string, tqType enumspb.TaskQueueType, expectAddRate bool, @@ -1416,10 +1383,10 @@ func (s *taskQueueStatsSuite) validateRates( }, 5*time.Second, 100*time.Millisecond) } -func (s *taskQueueStatsSuite) validateTaskQueueStatsByType( +func (s *taskQueueStatsContext) validateTaskQueueStatsByType( tqName string, tqType enumspb.TaskQueueType, - expectation TaskQueueExpectations, + expectation taskQueueExpectations, singlePartition bool, ) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) @@ -1437,11 +1404,11 @@ func (s *taskQueueStatsSuite) validateTaskQueueStatsByType( s.validateDescribeWorkerDeploymentVersion(ctx, tqName, tqType, halfExpectation) } -func (s *taskQueueStatsSuite) validateDescribeTaskQueueWithDefaultMode( +func (s *taskQueueStatsContext) validateDescribeTaskQueueWithDefaultMode( ctx context.Context, tqName string, tqType enumspb.TaskQueueType, - expectation TaskQueueExpectations, + expectation taskQueueExpectations, singlePartition bool, ) { req := &workflowservice.DescribeTaskQueueRequest{ @@ -1484,11 +1451,11 @@ func (s *taskQueueStatsSuite) validateDescribeTaskQueueWithDefaultMode( }, 5*time.Second, 100*time.Millisecond) } -func (s *taskQueueStatsSuite) validateDescribeTaskQueueWithEnhancedMode( +func (s *taskQueueStatsContext) validateDescribeTaskQueueWithEnhancedMode( ctx context.Context, tqName string, tqType enumspb.TaskQueueType, - expectation TaskQueueExpectations, + expectation taskQueueExpectations, ) { deploymentOpts := s.deploymentOptions(tqName) req := &workflowservice.DescribeTaskQueueRequest{ @@ -1545,11 +1512,11 @@ func (s *taskQueueStatsSuite) validateDescribeTaskQueueWithEnhancedMode( }, 5*time.Second, 100*time.Millisecond) } -func (s *taskQueueStatsSuite) validateDescribeWorkerDeploymentVersion( +func (s *taskQueueStatsContext) validateDescribeWorkerDeploymentVersion( ctx context.Context, tqName string, tqType 
enumspb.TaskQueueType, - expectation TaskQueueExpectations, + expectation taskQueueExpectations, ) { deploymentOpts := s.deploymentOptions(tqName) req := &workflowservice.DescribeWorkerDeploymentVersionRequest{ @@ -1573,7 +1540,7 @@ func (s *taskQueueStatsSuite) validateDescribeWorkerDeploymentVersion( req.ReportTaskQueueStats = true resp, err := s.FrontendClient().DescribeWorkerDeploymentVersion(ctx, req) - require.NoError(s.T(), err) + a.NoError(err) a.Len(resp.VersionTaskQueues, 2, "should be 1 task queue for Workflows and 1 for Activities") for _, info := range resp.VersionTaskQueues { @@ -1591,11 +1558,11 @@ func (s *taskQueueStatsSuite) validateDescribeWorkerDeploymentVersion( }, 5*time.Second, 100*time.Millisecond) } -func (s *taskQueueStatsSuite) validateTaskQueueStatsByPriority( +func (s *taskQueueStatsContext) validateTaskQueueStatsByPriority( label string, a *require.Assertions, stats map[int32]*taskqueuepb.TaskQueueStats, - taskQueueExpectation TaskQueueExpectations, + taskQueueExpectation taskQueueExpectations, ) { a.Len(stats, s.maxPriority, "%s: stats should contain %d priorities", label, s.maxPriority) @@ -1626,11 +1593,20 @@ func (s *taskQueueStatsSuite) validateTaskQueueStatsByPriority( label, taskQueueExpectation.BacklogCount, accBacklogCount) } +// TODO: Remove this once older stats tests are refactored to use the createDeploymentOptions function. 
+func (s *taskQueueStatsContext) deploymentOptions(tqName string) *deploymentpb.WorkerDeploymentOptions { + return &deploymentpb.WorkerDeploymentOptions{ + DeploymentName: tqName + "-deployment", + BuildId: "build-id", + WorkerVersioningMode: enumspb.WORKER_VERSIONING_MODE_VERSIONED, + } +} + func validateTaskQueueStatsStrict( label string, a *require.Assertions, stats *taskqueuepb.TaskQueueStats, - expectation TaskQueueExpectations, + expectation taskQueueExpectations, ) { a.Equal(int64(expectation.BacklogCount), stats.ApproximateBacklogCount, "%s: ApproximateBacklogCount should be %d, got %d", @@ -1645,7 +1621,7 @@ func validateTaskQueueStats( label string, a *require.Assertions, stats *taskqueuepb.TaskQueueStats, - expectation TaskQueueExpectations, + expectation taskQueueExpectations, ) { // Actual counter can be greater than the expected due to history retries. We make sure the counter is in // range [expected, expected+maxBacklogExtraTasks] @@ -1663,15 +1639,6 @@ func validateTaskQueueStats( label, stats.ApproximateBacklogAge.AsDuration()) } -// TODO: Remove this once older stats tests are refactored to use the createDeploymentOptions function. 
-func (s *taskQueueStatsSuite) deploymentOptions(tqName string) *deploymentpb.WorkerDeploymentOptions { - return &deploymentpb.WorkerDeploymentOptions{ - DeploymentName: tqName + "-deployment", - BuildId: "build-id", - WorkerVersioningMode: enumspb.WORKER_VERSIONING_MODE_VERSIONED, - } -} - func createDeploymentOptions(deploymentName string, buildID string) *deploymentpb.WorkerDeploymentOptions { return &deploymentpb.WorkerDeploymentOptions{ DeploymentName: deploymentName, diff --git a/tests/testcore/functional_test_base.go b/tests/testcore/functional_test_base.go index 5ca9061306..8a4ca4bf7a 100644 --- a/tests/testcore/functional_test_base.go +++ b/tests/testcore/functional_test_base.go @@ -346,6 +346,7 @@ func (s *FunctionalTestBase) SetupSubTest() { s.initAssertions() } +// TODO: remove once `parallelsuite` and testEnv is rolled out everywhere func (s *FunctionalTestBase) initAssertions() { // `s.Assertions` (as well as other test helpers which depends on `s.T()`) must be initialized on // both test and subtest levels (but not suite level, where `s.T()` is `nil`). 
diff --git a/tests/testcore/shard_salt.txt b/tests/testcore/shard_salt.txt index 8a01c2103b..b0d2298901 100644 --- a/tests/testcore/shard_salt.txt +++ b/tests/testcore/shard_salt.txt @@ -1 +1 @@ --salt-824 +-salt-5180 diff --git a/tests/testcore/test_env.go b/tests/testcore/test_env.go index fa3c3fcc5a..af2570d440 100644 --- a/tests/testcore/test_env.go +++ b/tests/testcore/test_env.go @@ -22,7 +22,6 @@ import ( "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/log" "go.temporal.io/server/common/namespace" - "go.temporal.io/server/common/testing/historyrequire" "go.temporal.io/server/common/testing/taskpoller" "go.temporal.io/server/common/testing/testhooks" "go.temporal.io/server/common/testing/testvars" @@ -37,11 +36,11 @@ var shardSalt string var ( _ Env = (*TestEnv)(nil) - sequentialSuites sync.Map - defaultTestTimeout = 90 * time.Second * debug.TimeoutMultiplier + defaultTestTimeout = 90 * time.Second * debug.TimeoutMultiplier ) type Env interface { + // T returns the *testing.T. Deprecated: use the suite's T() method instead. T() *testing.T Namespace() namespace.Name NamespaceID() namespace.ID @@ -56,8 +55,12 @@ type Env interface { type TestEnv struct { *FunctionalTestBase + + // Shadows FunctionalTestBase.Assertions with a per-test instance bound to + // this TestEnv's own *testing.T, avoiding data races when parallel tests + // share the same *FunctionalTestBase cluster. + // TODO: remove once all tests are migrated to TestEnv (and no longer use FunctionalTestBase directly). 
*require.Assertions - historyrequire.HistoryRequire Logger log.Logger @@ -69,9 +72,11 @@ type TestEnv struct { tv *testvars.TestVars ctx context.Context - sdkClient sdkclient.Client - worker sdkworker.Worker - workerTaskQueue string + sdkClientOnce sync.Once + sdkClient sdkclient.Client + sdkWorkerOnce sync.Once + sdkWorker sdkworker.Worker + sdkWorkerTQ string } type TestOption func(*testOptions) @@ -80,7 +85,6 @@ type testOptions struct { dedicatedCluster bool dynamicConfigSettings []dynamicConfigOverride timeout time.Duration - sdkWorker bool } type dynamicConfigOverride struct { @@ -96,11 +100,9 @@ func WithDedicatedCluster() TestOption { } } -// WithSdkWorker sets up an SDK client and worker for the test. -// Cleanup is handled automatically via t.Cleanup(). +// Deprecated: this option is no longer required and will be removed once all callers have been updated. func WithSdkWorker() TestOption { return func(o *testOptions) { - o.sdkWorker = true } } @@ -128,81 +130,29 @@ func WithTimeout(duration time.Duration) TestOption { } } -// sequentialSuite holds state for a suite marked with MustRunSequential. -// It manages a single dedicated cluster shared by all tests in the suite. -type sequentialSuite struct { - cluster *FunctionalTestBase -} - -// MustRunSequential marks a test suite to run its tests sequentially instead -// of in parallel. Call this at the start of your test suite before any -// subtests are created. A single dedicated cluster will be created for this -// suite and torn down when the suite completes. -func MustRunSequential(t *testing.T, reason string) { - if strings.Contains(t.Name(), "/") { - panic("MustRunSequential must be called from a top-level test, not a subtest") - } - if reason == "" { - panic("MustRunSequential requires a reason") - } - - // Create a dedicated cluster for this suite. 
- suite := &sequentialSuite{ - cluster: testClusterPool.createCluster(t, nil, false), - } - sequentialSuites.Store(t.Name(), suite) - - // Register cleanup to tear down the suite's cluster when the parent test completes. - t.Cleanup(func() { - sequentialSuites.Delete(t.Name()) - if err := suite.cluster.testCluster.TearDownCluster(); err != nil { - t.Logf("Failed to tear down sequential suite cluster: %v", err) - } - }) -} - // NewEnv creates a new test environment with access to a Temporal cluster. -// -// By default, tests are marked as parallel. Use MustRunSequential on the -// test's parent `testing.T` to run them sequentially instead. func NewEnv(t *testing.T, opts ...TestOption) *TestEnv { + t.Helper() + // Check test sharding early, before any expensive operations. checkTestShard(t) - // Check if this is a sequential suite by looking up the parent test name. - suiteName := t.Name() - if idx := strings.Index(suiteName, "/"); idx != -1 { - suiteName = suiteName[:idx] - } - suiteVal, sequential := sequentialSuites.Load(suiteName) - if !sequential { - t.Parallel() - } - var options testOptions for _, opt := range opts { opt(&options) } - var base *FunctionalTestBase - if sequential { - // Sequential suites use a single dedicated cluster for all tests. - suite := suiteVal.(*sequentialSuite) - base = suite.cluster - base.SetT(t) - } else { - // For dedicated clusters, pass all dynamic config settings at cluster creation. - var startupConfig map[dynamicconfig.Key]any - if options.dedicatedCluster && len(options.dynamicConfigSettings) > 0 { - startupConfig = make(map[dynamicconfig.Key]any, len(options.dynamicConfigSettings)) - for _, override := range options.dynamicConfigSettings { - startupConfig[override.setting.Key()] = override.value - } + // For dedicated clusters, pass all dynamic config settings at cluster creation. 
+ var startupConfig map[dynamicconfig.Key]any + if options.dedicatedCluster && len(options.dynamicConfigSettings) > 0 { + startupConfig = make(map[dynamicconfig.Key]any, len(options.dynamicConfigSettings)) + for _, override := range options.dynamicConfigSettings { + startupConfig[override.setting.Key()] = override.value } - - // Obtain the test cluster from the pool. - base = testClusterPool.get(t, options.dedicatedCluster, startupConfig) } + + // Obtain the test cluster from the pool. + base := testClusterPool.get(t, options.dedicatedCluster, startupConfig) cluster := base.GetTestCluster() // Create a dedicated namespace for the test to help with test isolation. @@ -221,7 +171,6 @@ func NewEnv(t *testing.T, opts ...TestOption) *TestEnv { env := &TestEnv{ FunctionalTestBase: base, Assertions: require.New(t), - HistoryRequire: historyrequire.New(t), cluster: cluster, nsName: ns, nsID: nsID, @@ -230,6 +179,7 @@ func NewEnv(t *testing.T, opts ...TestOption) *TestEnv { t: t, tv: testvars.New(t), ctx: setupTestTimeoutWithContext(t, options.timeout), + sdkWorkerTQ: RandomizeStr("tq-" + t.Name()), } // For shared clusters, apply all dynamic config settings as overrides. @@ -239,10 +189,6 @@ func NewEnv(t *testing.T, opts ...TestOption) *TestEnv { } } - if options.sdkWorker { - env.setupSdk() - } - return env } @@ -280,6 +226,7 @@ func (e *TestEnv) TaskPoller() *taskpoller.TaskPoller { return e.taskPoller } +// T returns the *testing.T. Deprecated: use the suite's T() method instead. func (e *TestEnv) T() *testing.T { return e.t } @@ -300,64 +247,52 @@ func (e *TestEnv) Context() context.Context { return e.ctx } -// SdkClient returns the SDK client created by WithSdkWorker. -// Panics if WithSdkWorker was not passed to NewEnv. +// SdkClient returns the SDK client. It is lazily initialized on the first call. 
func (e *TestEnv) SdkClient() sdkclient.Client { - if e.sdkClient == nil { - panic("SdkClient() requires WithSdkWorker option to be passed to NewEnv") - } + e.sdkClientOnce.Do(func() { + clientOptions := sdkclient.Options{ + HostPort: e.FrontendGRPCAddress(), + Namespace: e.nsName.String(), + Logger: log.NewSdkLogger(e.Logger), + } + + if provider := e.cluster.host.tlsConfigProvider; provider != nil { + clientOptions.ConnectionOptions.TLS = provider.FrontendClientConfig + } + + if interceptor := e.cluster.host.grpcClientInterceptor; interceptor != nil { + clientOptions.ConnectionOptions.DialOptions = []grpc.DialOption{ + grpc.WithUnaryInterceptor(interceptor.Unary()), + grpc.WithStreamInterceptor(interceptor.Stream()), + } + } + + var err error + e.sdkClient, err = sdkclient.Dial(clientOptions) + if err != nil { + e.t.Fatalf("Failed to create SDK client: %v", err) + } + e.t.Cleanup(func() { e.sdkClient.Close() }) + }) return e.sdkClient } -// SdkWorker returns the SDK worker created by WithSdkWorker. -// Panics if WithSdkWorker was not passed to NewEnv. +// SdkWorker returns the SDK worker. It is lazily initialized on the first call. func (e *TestEnv) SdkWorker() sdkworker.Worker { - if e.worker == nil { - panic("SdkWorker() requires WithSdkWorker option to be passed to NewEnv") - } - return e.worker + e.sdkWorkerOnce.Do(func() { + client := e.SdkClient() // Ensure client is initialized + e.sdkWorker = sdkworker.New(client, e.sdkWorkerTQ, sdkworker.Options{}) + if err := e.sdkWorker.Start(); err != nil { + e.t.Fatalf("Failed to start SDK worker: %v", err) + } + e.t.Cleanup(func() { e.sdkWorker.Stop() }) + }) + return e.sdkWorker } // WorkerTaskQueue returns the task queue name used by the SDK Worker. -// Panics if WithSdkWorker was not passed to NewEnv. 
func (e *TestEnv) WorkerTaskQueue() string { - if e.workerTaskQueue == "" { - panic("WorkerTaskQueue() requires WithSdkWorker option to be passed to NewEnv") - } - return e.workerTaskQueue -} - -func (e *TestEnv) setupSdk() { - clientOptions := sdkclient.Options{ - HostPort: e.FrontendGRPCAddress(), - Namespace: e.nsName.String(), - Logger: log.NewSdkLogger(e.Logger), - } - - if provider := e.cluster.host.tlsConfigProvider; provider != nil { - clientOptions.ConnectionOptions.TLS = provider.FrontendClientConfig - } - - if interceptor := e.cluster.host.grpcClientInterceptor; interceptor != nil { - clientOptions.ConnectionOptions.DialOptions = []grpc.DialOption{ - grpc.WithUnaryInterceptor(interceptor.Unary()), - grpc.WithStreamInterceptor(interceptor.Stream()), - } - } - - var err error - e.sdkClient, err = sdkclient.Dial(clientOptions) - if err != nil { - e.t.Fatalf("Failed to create SDK client: %v", err) - } - e.t.Cleanup(func() { e.sdkClient.Close() }) - - e.workerTaskQueue = RandomizeStr(e.t.Name()) - e.worker = sdkworker.New(e.sdkClient, e.workerTaskQueue, sdkworker.Options{}) - if err = e.worker.Start(); err != nil { - e.t.Fatalf("Failed to start SDK worker: %v", err) - } - e.t.Cleanup(func() { e.worker.Stop() }) + return e.sdkWorkerTQ } // OverrideDynamicConfig overrides a dynamic config setting for the duration of this test. 
diff --git a/tests/update_workflow_test.go b/tests/update_workflow_test.go index 9640c61f71..48804817c7 100644 --- a/tests/update_workflow_test.go +++ b/tests/update_workflow_test.go @@ -18,14 +18,13 @@ import ( updatepb "go.temporal.io/api/update/v1" "go.temporal.io/api/workflowservice/v1" "go.temporal.io/server/api/adminservice/v1" - "go.temporal.io/server/api/matchingservice/v1" - taskqueuespb "go.temporal.io/server/api/taskqueue/v1" "go.temporal.io/server/chasm" "go.temporal.io/server/common" "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/log/tag" "go.temporal.io/server/common/metrics" "go.temporal.io/server/common/metrics/metricstest" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/common/testing/protoutils" "go.temporal.io/server/common/testing/taskpoller" "go.temporal.io/server/common/testing/testhooks" @@ -70,524 +69,766 @@ func closeShard(s testcore.Env, wid string) { s.CloseShard(resp.NamespaceInfo.Id, wid) } +type WorkflowUpdateSuite struct { + parallelsuite.Suite[*WorkflowUpdateSuite] +} + func TestWorkflowUpdateSuite(t *testing.T) { - t.Parallel() - t.Run("EmptySpeculativeWorkflowTask_AcceptComplete", func(t *testing.T) { - testCases := []struct { - name string - useRunID bool - }{ - { - name: "with RunID", - useRunID: true, - }, - { - name: "without RunID", - useRunID: false, - }, - } + parallelsuite.Run(t, &WorkflowUpdateSuite{}) +} - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - // Uses CaptureMetricsHandler which requires a dedicated cluster to avoid metric interference. 
- s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - runID := mustStartWorkflow(s, s.Tv()) +func (s *WorkflowUpdateSuite) TestEmptySpeculativeWorkflowTask_AcceptComplete() { + testCases := []struct { + name string + useRunID bool + }{ + { + name: "with RunID", + useRunID: true, + }, + { + name: "without RunID", + useRunID: false, + }, + } - tv := s.Tv() - if tc.useRunID { - tv = tv.WithRunID(runID) - } + for _, tc := range testCases { + s.Run(tc.name, func(s *WorkflowUpdateSuite) { + // Uses CaptureMetricsHandler which requires a dedicated cluster to avoid metric interference. + env := testcore.NewEnv(s.T(), testcore.WithDedicatedCluster()) + runID := mustStartWorkflow(env, env.Tv()) - capture := s.GetTestCluster().Host().CaptureMetricsHandler().StartCapture() - defer s.GetTestCluster().Host().CaptureMetricsHandler().StopCapture(capture) + tv := env.Tv() + if tc.useRunID { + tv = tv.WithRunID(runID) + } - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT events are not written to the history yet. 
- 6 WorkflowTaskStarted -`, task.History) - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } + capture := env.GetTestCluster().Host().CaptureMetricsHandler().StartCapture() + defer env.GetTestCluster().Host().CaptureMetricsHandler().StopCapture(capture) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT events are not written to the history yet. + 6 WorkflowTaskStarted + `, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(5, 
updRequestMsg.GetEventId()) + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(5, updRequestMsg.GetEventId()) - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), - } + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - updateResultCh := sendUpdateNoError(s, tv) + updateResultCh := sendUpdateNoError(env, tv) - // Process update in workflow. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + // Process update in workflow. 
+ res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res.NewTask) + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) + + // Test non-blocking poll + for _, waitPolicy := range []*updatepb.WaitPolicy{{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_UNSPECIFIED}, nil} { + pollUpdateResp, err := pollUpdate(env, env.Tv(), waitPolicy) s.NoError(err) - s.NotNil(res.NewTask) - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) - - // Test non-blocking poll - for _, waitPolicy := range []*updatepb.WaitPolicy{{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_UNSPECIFIED}, nil} { - pollUpdateResp, err := pollUpdate(s, s.Tv(), waitPolicy) - s.NoError(err) - s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED, pollUpdateResp.Stage) - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), pollUpdateResp.Outcome.GetSuccess())) - // Even if tv doesn't have RunID, it should be returned as part of UpdateRef. - s.Equal(runID, pollUpdateResp.UpdateRef.GetWorkflowExecution().RunId) - } + s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED, pollUpdateResp.Stage) + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), pollUpdateResp.Outcome.GetSuccess())) + // Even if tv doesn't have RunID, it should be returned as part of UpdateRef. 
+ s.Equal(runID, pollUpdateResp.UpdateRef.GetWorkflowExecution().RunId) + } - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - commits, rollbacks := speculativeWorkflowTaskOutcomes(capture.Snapshot()) - s.Equal(1, commits) - s.Equal(0, rollbacks) + commits, rollbacks := speculativeWorkflowTaskOutcomes(capture.Snapshot()) + s.Equal(1, commits) + s.Equal(0, rollbacks) - events := s.GetHistory(s.Namespace().String(), tv.WorkflowExecution()) + events := env.GetHistory(env.Namespace().String(), tv.WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Was speculative WT... - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted // ...and events were written to the history when WT completes. - 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. - 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} -`, events) - }) - } - }) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Was speculative WT... + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted // ...and events were written to the history when WT completes. + 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. 
+ 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} + `, events) + }) + } +} - t.Run("NotEmptySpeculativeWorkflowTask_AcceptComplete", func(t *testing.T) { - testCases := []struct { - name string - useRunID bool - }{ - { - name: "with RunID", - useRunID: true, - }, - { - name: "without RunID", - useRunID: false, - }, - } +func (s *WorkflowUpdateSuite) TestNotEmptySpeculativeWorkflowTask_AcceptComplete() { + testCases := []struct { + name string + useRunID bool + }{ + { + name: "with RunID", + useRunID: true, + }, + { + name: "without RunID", + useRunID: false, + }, + } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - s := testcore.NewEnv(t) - runID := mustStartWorkflow(s, s.Tv()) - tv := s.Tv() - if tc.useRunID { - tv = tv.WithRunID(runID) - } + for _, tc := range testCases { + s.Run(tc.name, func(s *WorkflowUpdateSuite) { + env := testcore.NewEnv(s.T()) + runID := mustStartWorkflow(env, env.Tv()) + tv := env.Tv() + if tc.useRunID { + tv = tv.WithRunID(runID) + } - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with update unrelated command. - return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, - Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ - ActivityId: s.Tv().ActivityID(), - ActivityType: s.Tv().ActivityType(), - TaskQueue: s.Tv().TaskQueue(), - ScheduleToCloseTimeout: s.Tv().Any().InfiniteTimeout(), - }}, - }}, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 ActivityTaskScheduled - 6 WorkflowTaskScheduled // Speculative WFT with ActivityTaskScheduled(5) event after WorkflowTaskCompleted(4). 
- 7 WorkflowTaskStarted -`, task.History) - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with update unrelated command. + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, + Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ + ActivityId: env.Tv().ActivityID(), + ActivityType: env.Tv().ActivityType(), + TaskQueue: env.Tv().TaskQueue(), + ScheduleToCloseTimeout: env.Tv().Any().InfiniteTimeout(), + }}, + }}, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 ActivityTaskScheduled + 6 WorkflowTaskScheduled // Speculative WFT with ActivityTaskScheduled(5) event after WorkflowTaskCompleted(4). 
+ 7 WorkflowTaskStarted + `, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(6, updRequestMsg.GetEventId()) + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(6, updRequestMsg.GetEventId()) - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: 
s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), - } + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - updateResultCh := sendUpdateNoError(s, tv) + updateResultCh := sendUpdateNoError(env, tv) - // Process update in workflow. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) + // Process update in workflow. 
+ res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - events := s.GetHistory(s.Namespace().String(), tv.WorkflowExecution()) + events := env.GetHistory(env.Namespace().String(), tv.WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 ActivityTaskScheduled - 6 WorkflowTaskScheduled // Speculative WFT was persisted when completed (event 8) - 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted - 9 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 6} // WTScheduled event which delivered update to the worker. - 10 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 9} -`, events) - }) - } - }) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 ActivityTaskScheduled + 6 WorkflowTaskScheduled // Speculative WFT was persisted when completed (event 8) + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted + 9 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 6} // WTScheduled event which delivered update to the worker. 
+ 10 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 9} + `, events) + }) + } +} - t.Run("FirstNormalScheduledWorkflowTask_AcceptComplete", func(t *testing.T) { - testCases := []struct { - name string - useRunID bool - }{ - { - name: "with RunID", - useRunID: true, - }, - { - name: "without RunID", - useRunID: false, - }, - } +func (s *WorkflowUpdateSuite) TestFirstNormalScheduledWorkflowTask_AcceptComplete() { + testCases := []struct { + name string + useRunID bool + }{ + { + name: "with RunID", + useRunID: true, + }, + { + name: "without RunID", + useRunID: false, + }, + } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - s := testcore.NewEnv(t) - runID := mustStartWorkflow(s, s.Tv()) - tv := s.Tv() - if tc.useRunID { - tv = tv.WithRunID(runID) - } + for _, tc := range testCases { + s.Run(tc.name, func(s *WorkflowUpdateSuite) { + env := testcore.NewEnv(s.T()) + runID := mustStartWorkflow(env, env.Tv()) + tv := env.Tv() + if tc.useRunID { + tv = tv.WithRunID(runID) + } - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted // First normal WT. No speculative WT was created. -`, task.History) - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted // First normal WT. No speculative WT was created. 
+ `, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(2, updRequestMsg.GetEventId()) + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(2, updRequestMsg.GetEventId()) - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: 
msgHandler, - Logger: s.Logger, - T: s.T(), - } + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - updateResultCh := sendUpdateNoError(s, tv) + updateResultCh := sendUpdateNoError(env, tv) - // Process update in workflow. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) + // Process update in workflow. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - s.Equal(1, wtHandlerCalls) - s.Equal(1, msgHandlerCalls) + s.Equal(1, wtHandlerCalls) + s.Equal(1, msgHandlerCalls) - events := s.GetHistory(s.Namespace().String(), tv.WorkflowExecution()) + events := env.GetHistory(env.Namespace().String(), tv.WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 2} // WTScheduled event which delivered update to the worker. 
- 6 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 5} -`, events) - }) - } - }) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 2} // WTScheduled event which delivered update to the worker. + 6 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 5} + `, events) + }) + } +} - t.Run("NormalScheduledWorkflowTask_AcceptComplete", func(t *testing.T) { - testCases := []struct { - name string - useRunID bool - }{ - { - name: "with RunID", - useRunID: true, - }, - { - name: "without RunID", - useRunID: false, - }, - } +func (s *WorkflowUpdateSuite) TestNormalScheduledWorkflowTask_AcceptComplete() { + testCases := []struct { + name string + useRunID bool + }{ + { + name: "with RunID", + useRunID: true, + }, + { + name: "without RunID", + useRunID: false, + }, + } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - s := testcore.NewEnv(t) - runID := mustStartWorkflow(s, s.Tv()) - tv := s.Tv() - if tc.useRunID { - tv = tv.WithRunID(runID) - } + for _, tc := range testCases { + s.Run(tc.name, func(s *WorkflowUpdateSuite) { + env := testcore.NewEnv(s.T()) + runID := mustStartWorkflow(env, env.Tv()) + tv := env.Tv() + if tc.useRunID { + tv = tv.WithRunID(runID) + } - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionSignaled - 6 WorkflowTaskScheduled // This WT was already created by signal and no speculative WT was created. 
- 7 WorkflowTaskStarted`, task.History) - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionSignaled + 6 WorkflowTaskScheduled // This WT was already created by signal and no speculative WT was created. + 7 WorkflowTaskStarted`, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - s.Require().NotEmpty(task.Messages, "Task has no messages", task) - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + s.NotEmpty(task.Messages, "Task has no messages", task) + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(6, 
updRequestMsg.GetEventId()) + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(6, updRequestMsg.GetEventId()) - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), - } + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - // Send signal to schedule new WT. - err = s.SendSignal(s.Namespace().String(), s.Tv().WorkflowExecution(), s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.NoError(err) + // Send signal to schedule new WT. 
+ err = env.SendSignal(env.Namespace().String(), env.Tv().WorkflowExecution(), env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.NoError(err) - updateResultCh := sendUpdateNoError(s, tv) + updateResultCh := sendUpdateNoError(env, tv) - // Process update in workflow. It will be attached to existing WT. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) + // Process update in workflow. It will be attached to existing WT. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - events := s.GetHistory(s.Namespace().String(), tv.WorkflowExecution()) + events := env.GetHistory(env.Namespace().String(), tv.WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionSignaled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted - 9 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 6} // WTScheduled event which delivered update to the worker. 
- 10 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 9} -`, events) - }) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionSignaled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted + 9 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 6} // WTScheduled event which delivered update to the worker. + 10 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 9} + `, events) + }) + } +} + +func (s *WorkflowUpdateSuite) TestRunningWorkflowTask_NewEmptySpeculativeWorkflowTask_Rejected() { + // Uses CaptureMetricsHandler which requires a dedicated cluster to avoid metric interference. + env := testcore.NewEnv(s.T(), testcore.WithDedicatedCluster()) + mustStartWorkflow(env, env.Tv()) + + capture := env.GetTestCluster().Host().CaptureMetricsHandler().StartCapture() + defer env.GetTestCluster().Host().CaptureMetricsHandler().StopCapture(capture) + + var updateResultCh <-chan *workflowservice.UpdateWorkflowExecutionResponse + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Send update after 1st WT has started. + updateResultCh = sendUpdateNoError(env, env.Tv()) + // Completes WT with empty command list to create next WFT w/o events. + return nil, nil + case 2: + s.EqualHistory(` + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted`, task.History) + // Message handled rejects update. + return nil, nil + case 3: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted // Speculative WT2 disappeared and new normal WT was created. 
+ 5 WorkflowExecutionSignaled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted`, task.History) + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }}, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } - }) + } + + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1, 3: + return nil, nil + case 2: + s.Len(task.Messages, 1) + updRequestMsg := task.Messages[0] + s.EqualValues(5, updRequestMsg.GetEventId()) + + return env.UpdateRejectMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil + } + } + + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } + + // Drain first WT which starts 1st update. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + wt1Resp := res.NewTask + + // Reject update in 2nd WT. + wt2Resp, err := poller.HandlePartialWorkflowTask(wt1Resp.GetWorkflowTask(), false) + s.NoError(err) + updateResult := <-updateResultCh + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(3, wt2Resp.ResetHistoryEventId) + + // Send signal to create WT. 
+ err = env.SendSignal(env.Namespace().String(), env.Tv().WorkflowExecution(), env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.NoError(err) + + // Complete workflow. + completeWorkflowResp, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(completeWorkflowResp) + + s.Equal(3, wtHandlerCalls) + s.Equal(3, msgHandlerCalls) + + commits, rollbacks := speculativeWorkflowTaskOutcomes(capture.Snapshot()) + s.Equal(0, commits) + s.Equal(1, rollbacks) + + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionSignaled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted + 9 WorkflowExecutionCompleted`, events) +} + +func (s *WorkflowUpdateSuite) TestRunningWorkflowTask_NewNotEmptySpeculativeWorkflowTask_Rejected() { + env := testcore.NewEnv(s.T()) + + mustStartWorkflow(env, env.Tv()) + + var updateResultCh <-chan *workflowservice.UpdateWorkflowExecutionResponse + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Send update after 1st WT has started. + updateResultCh = sendUpdateNoError(env, env.Tv()) + // Completes WT with update unrelated commands to create events that will be in the next speculative WFT. 
+ return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, + Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ + ActivityId: env.Tv().ActivityID(), + ActivityType: env.Tv().ActivityType(), + TaskQueue: env.Tv().TaskQueue(), + ScheduleToCloseTimeout: env.Tv().Any().InfiniteTimeout(), + }}, + }}, nil + case 2: + s.EqualHistory(` + 4 WorkflowTaskCompleted + 5 ActivityTaskScheduled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted`, task.History) + // Message handled rejects update. + return nil, nil + case 3: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 ActivityTaskScheduled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted // Empty speculative WFT was written in to the history because it shipped events. + 9 ActivityTaskStarted + 10 ActivityTaskCompleted + 11 WorkflowTaskScheduled + 12 WorkflowTaskStarted + `, task.History) + + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }}, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil + } + } - t.Run("RunningWorkflowTask_NewEmptySpeculativeWorkflowTask_Rejected", func(t *testing.T) { - // Uses CaptureMetricsHandler which requires a dedicated cluster to avoid metric interference. 
- s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - mustStartWorkflow(s, s.Tv()) + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1, 3: + return nil, nil + case 2: + s.Len(task.Messages, 1) + updRequestMsg := task.Messages[0] + s.EqualValues(6, updRequestMsg.GetEventId()) + + return env.UpdateRejectMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil + } + } + + atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { + return env.Tv().Any().Payloads(), false, nil + } + + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + ActivityTaskHandler: atHandler, + Logger: env.Logger, + T: s.T(), + } + + // Drain first WT which starts 1st update. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + wt1Resp := res.NewTask + + // Reject update in 2nd WT. + wt2Resp, err := poller.HandlePartialWorkflowTask(wt1Resp.GetWorkflowTask(), false) + s.NoError(err) + s.NotNil(wt2Resp) + updateResult := <-updateResultCh + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(0, wt2Resp.ResetHistoryEventId) - capture := s.GetTestCluster().Host().CaptureMetricsHandler().StartCapture() - defer s.GetTestCluster().Host().CaptureMetricsHandler().StopCapture(capture) + // Schedule new WFT. + err = poller.PollAndProcessActivityTask(false) + s.NoError(err) + + // Complete workflow. 
+ completeWorkflowResp, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(completeWorkflowResp) + s.EqualValues(0, completeWorkflowResp.NewTask.ResetHistoryEventId) + + s.Equal(3, wtHandlerCalls) + s.Equal(3, msgHandlerCalls) + + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 ActivityTaskScheduled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted + 9 ActivityTaskStarted + 10 ActivityTaskCompleted + 11 WorkflowTaskScheduled + 12 WorkflowTaskStarted + 13 WorkflowTaskCompleted + 14 WorkflowExecutionCompleted`, events) +} - var updateResultCh <-chan *workflowservice.UpdateWorkflowExecutionResponse +func (s *WorkflowUpdateSuite) TestCompletedWorkflow() { + s.Run("receive outcome from completed Update", func(s *WorkflowUpdateSuite) { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) wtHandlerCalls := 0 wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { wtHandlerCalls++ switch wtHandlerCalls { case 1: - // Send update after 1st WT has started. - updateResultCh = sendUpdateNoError(s, s.Tv()) - // Completes WT with empty command list to create next WFT w/o events. + // Completes first WT with empty command list. return nil, nil case 2: - s.EqualHistory(` - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted`, task.History) - // Message handled rejects update. - return nil, nil - case 3: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted // Speculative WT2 disappeared and new normal WT was created. 
- 5 WorkflowExecutionSignaled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted`, task.History) - return []*commandpb.Command{{ + res := env.UpdateAcceptCompleteCommands(env.Tv()) + res = append(res, &commandpb.Command{ CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }}, nil + }) + return res, nil default: s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) return nil, nil @@ -598,14 +839,10 @@ func TestWorkflowUpdateSuite(t *testing.T) { msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { msgHandlerCalls++ switch msgHandlerCalls { - case 1, 3: + case 1: return nil, nil case 2: - s.Len(task.Messages, 1) - updRequestMsg := task.Messages[0] - s.EqualValues(5, updRequestMsg.GetEventId()) - - return s.UpdateRejectMessages(s.Tv(), updRequestMsg), nil + return env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), nil default: s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) return nil, nil @@ -614,111 +851,55 @@ func TestWorkflowUpdateSuite(t *testing.T) { //nolint:staticcheck // SA1019 TaskPoller replacement needed poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), WorkflowTaskHandler: wtHandler, MessageHandler: msgHandler, - Logger: s.Logger, + Logger: env.Logger, T: s.T(), } - // Drain first WT which starts 1st update. 
- res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - wt1Resp := res.NewTask - - // Reject update in 2nd WT. - wt2Resp, err := poller.HandlePartialWorkflowTask(wt1Resp.GetWorkflowTask(), false) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) s.NoError(err) - updateResult := <-updateResultCh - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) - s.EqualValues(3, wt2Resp.ResetHistoryEventId) - // Send signal to create WT. - err = s.SendSignal(s.Namespace().String(), s.Tv().WorkflowExecution(), s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.NoError(err) + // Send Update request. + updateResultCh := sendUpdateNoError(env, env.Tv()) - // Complete workflow. - completeWorkflowResp, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + // Complete Update and Workflow. + _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) s.NoError(err) - s.NotNil(completeWorkflowResp) - - s.Equal(3, wtHandlerCalls) - s.Equal(3, msgHandlerCalls) - - commits, rollbacks := speculativeWorkflowTaskOutcomes(capture.Snapshot()) - s.Equal(0, commits) - s.Equal(1, rollbacks) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionSignaled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted - 9 WorkflowExecutionCompleted`, events) - }) - t.Run("RunningWorkflowTask_NewNotEmptySpeculativeWorkflowTask_Rejected", func(t *testing.T) { - s := testcore.NewEnv(t) + // Receive Update result. + updateResult1 := <-updateResultCh + s.NotNil(updateResult1.GetOutcome().GetSuccess()) - mustStartWorkflow(s, s.Tv()) + // Send same Update request again, receiving the same Update result. 
+ updateResultCh = sendUpdateNoError(env, env.Tv()) + updateResult2 := <-updateResultCh + s.Equal(updateResult1, updateResult2) + }) - var updateResultCh <-chan *workflowservice.UpdateWorkflowExecutionResponse + s.Run("receive update failure from accepted Update", func(s *WorkflowUpdateSuite) { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) wtHandlerCalls := 0 wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { wtHandlerCalls++ switch wtHandlerCalls { case 1: - // Send update after 1st WT has started. - updateResultCh = sendUpdateNoError(s, s.Tv()) - // Completes WT with update unrelated commands to create events that will be in the next speculative WFT. - return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, - Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ - ActivityId: s.Tv().ActivityID(), - ActivityType: s.Tv().ActivityType(), - TaskQueue: s.Tv().TaskQueue(), - ScheduleToCloseTimeout: s.Tv().Any().InfiniteTimeout(), - }}, - }}, nil - case 2: - s.EqualHistory(` - 4 WorkflowTaskCompleted - 5 ActivityTaskScheduled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted`, task.History) - // Message handled rejects update. + // Completes first WT with empty command list. return nil, nil - case 3: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 ActivityTaskScheduled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted // Empty speculative WFT was written in to the history because it shipped events. 
- 9 ActivityTaskStarted - 10 ActivityTaskCompleted - 11 WorkflowTaskScheduled - 12 WorkflowTaskStarted -`, task.History) - - return []*commandpb.Command{{ + case 2: + res := env.UpdateAcceptCommands(env.Tv()) + res = append(res, &commandpb.Command{ CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }}, nil + }) + return res, nil default: s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) return nil, nil @@ -729,891 +910,739 @@ func TestWorkflowUpdateSuite(t *testing.T) { msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { msgHandlerCalls++ switch msgHandlerCalls { - case 1, 3: + case 1: return nil, nil case 2: - s.Len(task.Messages, 1) - updRequestMsg := task.Messages[0] - s.EqualValues(6, updRequestMsg.GetEventId()) - - return s.UpdateRejectMessages(s.Tv(), updRequestMsg), nil + return env.UpdateAcceptMessages(env.Tv(), task.Messages[0]), nil default: s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) return nil, nil } } - atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { - return s.Tv().Any().Payloads(), false, nil - } - //nolint:staticcheck // SA1019 TaskPoller replacement needed poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), WorkflowTaskHandler: wtHandler, MessageHandler: msgHandler, - ActivityTaskHandler: atHandler, - Logger: s.Logger, + Logger: env.Logger, T: s.T(), } - // Drain first WT which starts 
1st update. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) s.NoError(err) - wt1Resp := res.NewTask - // Reject update in 2nd WT. - wt2Resp, err := poller.HandlePartialWorkflowTask(wt1Resp.GetWorkflowTask(), false) - s.NoError(err) - s.NotNil(wt2Resp) - updateResult := <-updateResultCh - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) - s.EqualValues(0, wt2Resp.ResetHistoryEventId) + // Send Update request. + updateResultCh := sendUpdate(testcore.NewContext(env.Context()), env, env.Tv()) - // Schedule new WFT. - err = poller.PollAndProcessActivityTask(false) + // Accept Update and complete Workflow. + _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) s.NoError(err) - // Complete workflow. - completeWorkflowResp, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(completeWorkflowResp) - s.EqualValues(0, completeWorkflowResp.NewTask.ResetHistoryEventId) - - s.Equal(3, wtHandlerCalls) - s.Equal(3, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 ActivityTaskScheduled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted - 9 ActivityTaskStarted - 10 ActivityTaskCompleted - 11 WorkflowTaskScheduled - 12 WorkflowTaskStarted - 13 WorkflowTaskCompleted - 14 WorkflowExecutionCompleted`, events) - }) + // Receive Update result. 
+ updateResult1 := <-updateResultCh + s.NoError(updateResult1.err) + s.Equal("Workflow Update failed because the Workflow completed before the Update completed.", updateResult1.response.GetOutcome().GetFailure().GetMessage()) - t.Run("CompletedWorkflow", func(t *testing.T) { - t.Run("receive outcome from completed Update", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + // Send same Update request again, receiving the same failure. + updateResultCh = sendUpdate(testcore.NewContext(env.Context()), env, env.Tv()) + updateResult2 := <-updateResultCh + s.NoError(updateResult2.err) + s.Equal("Workflow Update failed because the Workflow completed before the Update completed.", updateResult2.response.GetOutcome().GetFailure().GetMessage()) + }) +} - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. 
- return nil, nil - case 2: - res := s.UpdateAcceptCompleteCommands(s.Tv()) - res = append(res, &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }) - return res, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil +func (s *WorkflowUpdateSuite) TestValidateWorkerMessages() { + testCases := []struct { + Name string + RespondWorkflowTaskError string + MessageFn func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message + CommandFn func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command + }{ + { + Name: "message-update-id-not-found-and-accepted-request-not-set", + RespondWorkflowTaskError: "wasn't found", + MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { + return []*protocolpb.Message{ + { + Id: tv.MessageID() + "_update-accepted", + ProtocolInstanceId: tv.UpdateID() + tv.Any().String(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ + AcceptedRequestMessageId: reqMsg.GetId(), + AcceptedRequestSequencingEventId: reqMsg.GetEventId(), + AcceptedRequest: nil, // Important not to pass original request back. 
+ }), + }, } - } - - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - return s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil + }, + CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { + return []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.MessageID() + "_update-accepted", + }}, + }, } - } - - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), - } - - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - - // Send Update request. - updateResultCh := sendUpdateNoError(s, s.Tv()) - - // Complete Update and Workflow. - _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - - // Receive Update result. - updateResult1 := <-updateResultCh - s.NotNil(updateResult1.GetOutcome().GetSuccess()) - - // Send same Update request again, receiving the same Update result. 
- updateResultCh = sendUpdateNoError(s, s.Tv()) - updateResult2 := <-updateResultCh - s.Equal(updateResult1, updateResult2) - }) - - t.Run("receive update failure from accepted Update", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) - - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - res := s.UpdateAcceptCommands(s.Tv()) - res = append(res, &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }) - return res, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil + }, + }, + { + Name: "message-update-id-not-found-and-accepted-request-is-set", + RespondWorkflowTaskError: "", + MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { + updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) + return []*protocolpb.Message{ + { + Id: tv.MessageID() + "_update-accepted", + ProtocolInstanceId: tv.UpdateID() + tv.Any().String(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ + AcceptedRequestMessageId: reqMsg.GetId(), + AcceptedRequestSequencingEventId: reqMsg.GetEventId(), + AcceptedRequest: updRequest, // Update will be resurrected from original request. 
+ }), + }, } - } - - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - return s.UpdateAcceptMessages(s.Tv(), task.Messages[0]), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil + }, + CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { + return []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.MessageID() + "_update-accepted", + }}, + }, } - } - - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), - } - - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - - // Send Update request. - updateResultCh := sendUpdate(testcore.NewContext(s.Context()), s, s.Tv()) - - // Accept Update and complete Workflow. - _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - - // Receive Update result. - updateResult1 := <-updateResultCh - s.NoError(updateResult1.err) - s.Equal("Workflow Update failed because the Workflow completed before the Update completed.", updateResult1.response.GetOutcome().GetFailure().GetMessage()) - - // Send same Update request again, receiving the same failure. 
- updateResultCh = sendUpdate(testcore.NewContext(s.Context()), s, s.Tv()) - updateResult2 := <-updateResultCh - s.NoError(updateResult2.err) - s.Equal("Workflow Update failed because the Workflow completed before the Update completed.", updateResult2.response.GetOutcome().GetFailure().GetMessage()) - }) - }) - - t.Run("ValidateWorkerMessages", func(t *testing.T) { - testCases := []struct { - Name string - RespondWorkflowTaskError string - MessageFn func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message - CommandFn func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command - }{ - { - Name: "message-update-id-not-found-and-accepted-request-not-set", - RespondWorkflowTaskError: "wasn't found", - MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { - return []*protocolpb.Message{ - { - Id: tv.MessageID() + "_update-accepted", - ProtocolInstanceId: tv.UpdateID() + tv.Any().String(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ - AcceptedRequestMessageId: reqMsg.GetId(), - AcceptedRequestSequencingEventId: reqMsg.GetEventId(), - AcceptedRequest: nil, // Important not to pass original request back. 
- }), - }, - } - }, - CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { - return []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.MessageID() + "_update-accepted", - }}, - }, - } - }, }, - { - Name: "message-update-id-not-found-and-accepted-request-is-set", - RespondWorkflowTaskError: "", - MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { - updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) - return []*protocolpb.Message{ - { - Id: tv.MessageID() + "_update-accepted", - ProtocolInstanceId: tv.UpdateID() + tv.Any().String(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ - AcceptedRequestMessageId: reqMsg.GetId(), - AcceptedRequestSequencingEventId: reqMsg.GetEventId(), - AcceptedRequest: updRequest, // Update will be resurrected from original request. 
- }), - }, - } - }, - CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { - return []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.MessageID() + "_update-accepted", - }}, - }, - } - }, + }, + { + Name: "command-reference-missed-message", + RespondWorkflowTaskError: "referenced absent message ID", + MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { + updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) + return []*protocolpb.Message{ + { + Id: tv.Any().String(), + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ + AcceptedRequestMessageId: reqMsg.GetId(), + AcceptedRequestSequencingEventId: reqMsg.GetEventId(), + AcceptedRequest: updRequest, + }), + }, + } }, - { - Name: "command-reference-missed-message", - RespondWorkflowTaskError: "referenced absent message ID", - MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { - updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) - return []*protocolpb.Message{ - { - Id: tv.Any().String(), - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ - AcceptedRequestMessageId: reqMsg.GetId(), - AcceptedRequestSequencingEventId: reqMsg.GetEventId(), - AcceptedRequest: updRequest, - }), - }, - } - }, - CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { - return []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: 
&commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.MessageID() + "_update-accepted", - }}, - }, - } - }, + CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { + return []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.MessageID() + "_update-accepted", + }}, + }, + } }, - { - Name: "complete-without-accept", - RespondWorkflowTaskError: "invalid state transition attempted", - MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { - updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) - return []*protocolpb.Message{ - { - Id: tv.MessageID() + "_update-completed", - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Response{ - Meta: updRequest.GetMeta(), - Outcome: &updatepb.Outcome{ - Value: &updatepb.Outcome_Success{ - Success: tv.Any().Payloads(), - }, + }, + { + Name: "complete-without-accept", + RespondWorkflowTaskError: "invalid state transition attempted", + MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { + updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) + return []*protocolpb.Message{ + { + Id: tv.MessageID() + "_update-completed", + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Response{ + Meta: updRequest.GetMeta(), + Outcome: &updatepb.Outcome{ + Value: &updatepb.Outcome_Success{ + Success: tv.Any().Payloads(), }, - }), - }, - } - }, - CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) 
[]*commandpb.Command { - return []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.MessageID() + "_update-completed", - }}, - }, - } - }, + }, + }), + }, + } }, - { - Name: "accept-twice", - RespondWorkflowTaskError: "invalid state transition attempted", - MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { - updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) - return []*protocolpb.Message{ - { - Id: tv.WithMessageIDNumber(1).MessageID(), - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ - AcceptedRequestMessageId: reqMsg.GetId(), - AcceptedRequestSequencingEventId: reqMsg.GetEventId(), - AcceptedRequest: updRequest, - }), - }, - { - Id: tv.WithMessageIDNumber(2).MessageID(), - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ - AcceptedRequestMessageId: reqMsg.GetId(), - AcceptedRequestSequencingEventId: reqMsg.GetEventId(), - AcceptedRequest: updRequest, - }), - }, - } - }, - CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { - return []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.WithMessageIDNumber(1).MessageID(), - }}, - }, - { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.WithMessageIDNumber(2).MessageID(), - }}, - }, - } - 
}, + CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { + return []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.MessageID() + "_update-completed", + }}, + }, + } }, - { - Name: "success-case", - RespondWorkflowTaskError: "", - MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { - updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) - return []*protocolpb.Message{ - { - Id: tv.MessageID() + "_update-accepted", - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ - AcceptedRequestMessageId: reqMsg.GetId(), - AcceptedRequestSequencingEventId: reqMsg.GetEventId(), - AcceptedRequest: updRequest, - }), - }, - { - Id: tv.MessageID() + "_update-completed", - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Response{ - Meta: updRequest.GetMeta(), - Outcome: &updatepb.Outcome{ - Value: &updatepb.Outcome_Success{ - Success: tv.Any().Payloads(), - }, + }, + { + Name: "accept-twice", + RespondWorkflowTaskError: "invalid state transition attempted", + MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { + updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) + return []*protocolpb.Message{ + { + Id: tv.WithMessageIDNumber(1).MessageID(), + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ + AcceptedRequestMessageId: reqMsg.GetId(), + AcceptedRequestSequencingEventId: reqMsg.GetEventId(), + AcceptedRequest: updRequest, + }), + }, + { + Id: 
tv.WithMessageIDNumber(2).MessageID(), + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ + AcceptedRequestMessageId: reqMsg.GetId(), + AcceptedRequestSequencingEventId: reqMsg.GetEventId(), + AcceptedRequest: updRequest, + }), + }, + } + }, + CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { + return []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.WithMessageIDNumber(1).MessageID(), + }}, + }, + { + CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.WithMessageIDNumber(2).MessageID(), + }}, + }, + } + }, + }, + { + Name: "success-case", + RespondWorkflowTaskError: "", + MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { + updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) + return []*protocolpb.Message{ + { + Id: tv.MessageID() + "_update-accepted", + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ + AcceptedRequestMessageId: reqMsg.GetId(), + AcceptedRequestSequencingEventId: reqMsg.GetEventId(), + AcceptedRequest: updRequest, + }), + }, + { + Id: tv.MessageID() + "_update-completed", + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Response{ + Meta: updRequest.GetMeta(), + Outcome: &updatepb.Outcome{ + Value: &updatepb.Outcome_Success{ + Success: tv.Any().Payloads(), }, - }), - }, - } - }, - CommandFn: func(t *testing.T, tv *testvars.TestVars, 
history *historypb.History) []*commandpb.Command { - return []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.MessageID() + "_update-accepted", - }}, - }, - { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.MessageID() + "_update-completed", - }}, - }, - } - }, + }, + }), + }, + } }, - { - Name: "success-case-no-commands", // PROTOCOL_MESSAGE commands are optional. - RespondWorkflowTaskError: "", - MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { - updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) - return []*protocolpb.Message{ - { - Id: tv.Any().String(), - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ - AcceptedRequestMessageId: reqMsg.GetId(), - AcceptedRequestSequencingEventId: reqMsg.GetEventId(), - AcceptedRequest: updRequest, - }), - }, - { - Id: tv.Any().String(), - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Response{ - Meta: updRequest.GetMeta(), - Outcome: &updatepb.Outcome{ - Value: &updatepb.Outcome_Success{ - Success: tv.Any().Payloads(), - }, + CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { + return []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.MessageID() + "_update-accepted", + }}, + }, + { + CommandType: 
enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.MessageID() + "_update-completed", + }}, + }, + } + }, + }, + { + Name: "success-case-no-commands", // PROTOCOL_MESSAGE commands are optional. + RespondWorkflowTaskError: "", + MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { + updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) + return []*protocolpb.Message{ + { + Id: tv.Any().String(), + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ + AcceptedRequestMessageId: reqMsg.GetId(), + AcceptedRequestSequencingEventId: reqMsg.GetEventId(), + AcceptedRequest: updRequest, + }), + }, + { + Id: tv.Any().String(), + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Response{ + Meta: updRequest.GetMeta(), + Outcome: &updatepb.Outcome{ + Value: &updatepb.Outcome_Success{ + Success: tv.Any().Payloads(), }, - }), - }, - } - }, + }, + }), + }, + } }, - { - Name: "invalid-command-order", - RespondWorkflowTaskError: "invalid state transition attempted", - MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { - updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) - return []*protocolpb.Message{ - { - Id: tv.MessageID() + "_update-accepted", - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ - AcceptedRequestMessageId: reqMsg.GetId(), - AcceptedRequestSequencingEventId: reqMsg.GetEventId(), - AcceptedRequest: updRequest, - }), - }, - { - Id: tv.MessageID() + "_update-completed", - ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), - 
SequencingId: nil, - Body: protoutils.MarshalAny(t, &updatepb.Response{ - Meta: updRequest.GetMeta(), - Outcome: &updatepb.Outcome{ - Value: &updatepb.Outcome_Success{ - Success: tv.Any().Payloads(), - }, + }, + { + Name: "invalid-command-order", + RespondWorkflowTaskError: "invalid state transition attempted", + MessageFn: func(t *testing.T, tv *testvars.TestVars, reqMsg *protocolpb.Message) []*protocolpb.Message { + updRequest := protoutils.UnmarshalAny[*updatepb.Request](t, reqMsg.GetBody()) + return []*protocolpb.Message{ + { + Id: tv.MessageID() + "_update-accepted", + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Acceptance{ + AcceptedRequestMessageId: reqMsg.GetId(), + AcceptedRequestSequencingEventId: reqMsg.GetEventId(), + AcceptedRequest: updRequest, + }), + }, + { + Id: tv.MessageID() + "_update-completed", + ProtocolInstanceId: updRequest.GetMeta().GetUpdateId(), + SequencingId: nil, + Body: protoutils.MarshalAny(t, &updatepb.Response{ + Meta: updRequest.GetMeta(), + Outcome: &updatepb.Outcome{ + Value: &updatepb.Outcome_Success{ + Success: tv.Any().Payloads(), }, - }), - }, - } - }, - CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { - return []*commandpb.Command{ - // Complete command goes before Accept command. 
- { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.MessageID() + "_update-completed", - }}, - }, - { - CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, - Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ - MessageId: tv.MessageID() + "_update-accepted", - }}, - }, - } - }, + }, + }), + }, + } }, - } - - for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) - - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - if tc.CommandFn == nil { - return nil, nil - } - return tc.CommandFn(t, s.Tv(), task.History), nil + CommandFn: func(t *testing.T, tv *testvars.TestVars, history *historypb.History) []*commandpb.Command { + return []*commandpb.Command{ + // Complete command goes before Accept command. 
+ { + CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.MessageID() + "_update-completed", + }}, + }, + { + CommandType: enumspb.COMMAND_TYPE_PROTOCOL_MESSAGE, + Attributes: &commandpb.Command_ProtocolMessageCommandAttributes{ProtocolMessageCommandAttributes: &commandpb.ProtocolMessageCommandAttributes{ + MessageId: tv.MessageID() + "_update-accepted", + }}, + }, } + }, + }, + } - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - if tc.MessageFn == nil { - return nil, nil - } - s.Require().NotEmpty(task.Messages, "expected update message in task") - updRequestMsg := task.Messages[0] - return tc.MessageFn(t, s.Tv(), updRequestMsg), nil - } + for _, tc := range testCases { + s.Run(tc.Name, func(s *WorkflowUpdateSuite) { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: t, + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + if tc.CommandFn == nil { + return nil, nil } + return tc.CommandFn(s.T(), env.Tv(), task.History), nil + } - halfSecondTimeoutCtx, cancel := context.WithTimeout(s.Context(), 500*time.Millisecond) - defer cancel() - updateResultCh := sendUpdate(halfSecondTimeoutCtx, s, s.Tv()) - - // Process update in workflow. 
- _, err := poller.PollAndProcessWorkflowTask() - updateResult := <-updateResultCh - if tc.RespondWorkflowTaskError != "" { - s.Error(err, "RespondWorkflowTaskCompleted should return an error contains `%v`", tc.RespondWorkflowTaskError) - s.Contains(err.Error(), tc.RespondWorkflowTaskError) - - var wfNotReady *serviceerror.WorkflowNotReady - s.ErrorAs(updateResult.err, &wfNotReady, "API caller should get serviceerror.WorkflowNotReady, if server got a validation error while processing worker response.") - s.Contains(updateResult.err.Error(), "Unable to perform workflow execution update due to unexpected workflow task failure.") - s.Nil(updateResult.response) - } else { - s.NoError(err) - s.NoError(updateResult.err) + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + if tc.MessageFn == nil { + return nil, nil } - }) - } - }) - - t.Run("StickySpeculativeWorkflowTask_AcceptComplete", func(t *testing.T) { - testCases := []struct { - name string - useRunID bool - }{ - { - name: "with RunID", - useRunID: true, - }, - { - name: "without RunID", - useRunID: false, - }, - } + s.NotEmpty(task.Messages, "expected update message in task") + updRequestMsg := task.Messages[0] + return tc.MessageFn(s.T(), env.Tv(), updRequestMsg), nil + } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - s := testcore.NewEnv(t) - runID := mustStartWorkflow(s, s.Tv()) - tv := s.Tv() - if tc.useRunID { - tv = tv.WithRunID(runID) - } + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - // Drain existing first WT from regular task queue, but respond with sticky queue enabled response, next WT will go to sticky queue. 
- _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - StickyAttributes: s.Tv().StickyExecutionAttributes(3 * time.Second), - }, nil - }) + halfSecondTimeoutCtx, cancel := context.WithTimeout(env.Context(), 500*time.Millisecond) + defer cancel() + updateResultCh := sendUpdate(halfSecondTimeoutCtx, env, env.Tv()) + + // Process update in workflow. + _, err := poller.PollAndProcessWorkflowTask() + updateResult := <-updateResultCh + if tc.RespondWorkflowTaskError != "" { + s.Error(err, "RespondWorkflowTaskCompleted should return an error contains `%v`", tc.RespondWorkflowTaskError) + s.Contains(err.Error(), tc.RespondWorkflowTaskError) + + var wfNotReady *serviceerror.WorkflowNotReady + s.ErrorAs(updateResult.err, &wfNotReady, "API caller should get serviceerror.WorkflowNotReady, if server got a validation error while processing worker response.") + s.Contains(updateResult.err.Error(), "Unable to perform workflow execution update due to unexpected workflow task failure.") + s.Nil(updateResult.response) + } else { s.NoError(err) + s.NoError(updateResult.err) + } + }) + } +} - go func() { - // Process update in workflow task (it is sticky). - res, err := s.TaskPoller(). - PollWorkflowTask(&workflowservice.PollWorkflowTaskQueueRequest{TaskQueue: s.Tv().StickyTaskQueue()}). - HandleTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - // This WT contains partial history because sticky was enabled. - s.EqualHistory(` - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. 
- 6 WorkflowTaskStarted`, task.History) - - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) - //nolint:testifylint // callback runs synchronously within HandleTask - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) //nolint:testifylint // callback runs synchronously within HandleTask - s.EqualValues(5, updRequestMsg.GetEventId()) //nolint:testifylint // callback runs synchronously within HandleTask - - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Commands: s.UpdateAcceptCompleteCommands(s.Tv()), - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), - }, nil - }) - //nolint:testifylint // intentional async polling pattern - s.NoError(err) - s.NotNil(res) //nolint:testifylint // intentional async polling pattern - s.EqualValues(0, res.ResetHistoryEventId) //nolint:testifylint // intentional async polling pattern - }() - - // This is to make sure that sticky poller above reached server first. - // And when update comes, stick poller is already available. - time.Sleep(500 * time.Millisecond) //nolint:forbidigo - updateResult := <-sendUpdateNoError(s, tv) - - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted - 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. 
- 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} - `, s.GetHistory(s.Namespace().String(), tv.WorkflowExecution())) - }) - } - }) +func (s *WorkflowUpdateSuite) TestStickySpeculativeWorkflowTask_AcceptComplete() { + testCases := []struct { + name string + useRunID bool + }{ + { + name: "with RunID", + useRunID: true, + }, + { + name: "without RunID", + useRunID: false, + }, + } - t.Run("StickySpeculativeWorkflowTask_AcceptComplete_StickyWorkerUnavailable", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + for _, tc := range testCases { + s.Run(tc.name, func(s *WorkflowUpdateSuite) { + env := testcore.NewEnv(s.T()) + runID := mustStartWorkflow(env, env.Tv()) + tv := env.Tv() + if tc.useRunID { + tv = tv.WithRunID(runID) + } - // Drain existing WT from regular task queue, respond with sticky attributes to enable sticky task queue. - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - StickyAttributes: s.Tv().StickyExecutionAttributes(10 * time.Second), - }, nil - }) - s.NoError(err) + // Drain existing first WT from regular task queue, but respond with sticky queue enabled response, next WT will go to sticky queue. + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + StickyAttributes: env.Tv().StickyExecutionAttributes(3 * time.Second), + }, nil + }) + s.NoError(err) - // Force-unload the sticky queue partition so that matching returns - // StickyWorkerUnavailable immediately (pm == nil) without waiting - // for the 10s stickyPollerUnavailableWindow to expire. 
- _, err = s.GetTestCluster().MatchingClient().ForceUnloadTaskQueuePartition( - testcore.NewContext(), - &matchingservice.ForceUnloadTaskQueuePartitionRequest{ - NamespaceId: s.NamespaceID().String(), - TaskQueuePartition: &taskqueuespb.TaskQueuePartition{ - TaskQueue: s.Tv().StickyTaskQueue().Name, - TaskQueueType: enumspb.TASK_QUEUE_TYPE_WORKFLOW, - }, - }) - s.NoError(err) + go func() { + // Process update in workflow task (it is sticky). + res, err := env.TaskPoller(). + PollWorkflowTask(&workflowservice.PollWorkflowTaskQueueRequest{TaskQueue: env.Tv().StickyTaskQueue()}). + HandleTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + // This WT contains partial history because sticky was enabled. + s.EqualHistory(` + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. + 6 WorkflowTaskStarted`, task.History) - // Now send an update. It should try sticky task queue first, get StickyWorkerUnavailable, - // and fall back to the normal task queue. - updateResultCh := sendUpdateNoError(s, s.Tv()) + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + //nolint:testifylint // callback runs synchronously within HandleTask + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) //nolint:testifylint // callback runs synchronously within HandleTask + s.EqualValues(5, updRequestMsg.GetEventId()) //nolint:testifylint // callback runs synchronously within HandleTask - // Process update in workflow task from non-sticky task queue. 
- res, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - // Full history from event 1 confirms the task came from the normal queue - // (sticky queue would send partial history starting after the last completed WT). - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. - 6 WorkflowTaskStarted - `, task.History) + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Commands: env.UpdateAcceptCompleteCommands(env.Tv()), + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), + }, nil + }) + //nolint:testifylint // intentional async polling pattern + s.NoError(err) + s.NotNil(res) //nolint:testifylint // intentional async polling pattern + s.EqualValues(0, res.ResetHistoryEventId) //nolint:testifylint // intentional async polling pattern + }() - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + // This is to make sure that sticky poller above reached server first. + // And when update comes, stick poller is already available. + time.Sleep(500 * time.Millisecond) //nolint:forbidigo + updateResult := <-sendUpdateNoError(env, tv) + + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted + 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. 
+ 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} + `, env.GetHistory(env.Namespace().String(), tv.WorkflowExecution())) + }) + } +} - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(5, updRequestMsg.GetEventId()) +func (s *WorkflowUpdateSuite) TestStickySpeculativeWorkflowTask_AcceptComplete_StickyWorkerUnavailable() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + // Worker gets full history because update was issued after sticky worker is gone. + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. 
+ 6 WorkflowTaskStarted + `, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil + } + } - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Commands: s.UpdateAcceptCompleteCommands(s.Tv()), - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), - }, nil - }) - s.NoError(err) - s.NotNil(res) - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - s.EqualValues(0, res.ResetHistoryEventId) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted - 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. 
- 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} -`, events) - }) + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(5, updRequestMsg.GetEventId()) + + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil + } + } - t.Run("FirstNormalScheduledWorkflowTask_Reject", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + StickyTaskQueue: env.Tv().StickyTaskQueue(), + StickyScheduleToStartTimeout: 3 * time.Second, + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted`, task.History) - return nil, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + // Drain existing WT from regular task queue, but respond with sticky enabled response to enable stick task 
queue. + _, err := poller.PollAndProcessWorkflowTask(testcore.WithRespondSticky, testcore.WithoutRetries) + s.NoError(err) - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + env.Logger.Info("Sleep 10+ seconds to make sure stickyPollerUnavailableWindow time has passed.") + time.Sleep(10*time.Second + 100*time.Millisecond) //nolint:forbidigo + env.Logger.Info("Sleep 10+ seconds is done.") - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(2, updRequestMsg.GetEventId()) + // Now send an update. It should try sticky task queue first, but got "StickyWorkerUnavailable" error + // and resend it to normal. + // This can be observed in wtHandler: if history is partial => sticky task queue is used. + updateResultCh := sendUpdateNoError(env, env.Tv()) - return s.UpdateRejectMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } - } + // Process update in workflow task from non-sticky task queue. 
+ res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), - } + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - updateResultCh := sendUpdateNoError(s, s.Tv()) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - // Process update in workflow. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - updateResp := res.NewTask - updateResult := <-updateResultCh - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) - s.EqualValues(0, updateResp.ResetHistoryEventId) - - s.Equal(1, wtHandlerCalls) - s.Equal(1, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled // First normal WT was scheduled before update and therefore all 3 events have to be written even if update was rejected. - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted // Empty completed WT. No new events were created after it. -`, events) - }) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted + 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. 
+ 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} + `, events) +} - t.Run("EmptySpeculativeWorkflowTask_Reject", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) +func (s *WorkflowUpdateSuite) TestFirstNormalScheduledWorkflowTask_Reject() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted`, task.History) + return nil, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil + } + } - updateResultCh := sendUpdateNoError(s, s.Tv()) + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(2, updRequestMsg.GetEventId()) + + return env.UpdateRejectMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil + } + } - // Process update in workflow. 
- res, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. - 6 WorkflowTaskStarted - `, task.History) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + updateResultCh := sendUpdateNoError(env, env.Tv()) - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(5, updRequestMsg.GetEventId()) + // Process update in workflow. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + updateResp := res.NewTask + updateResult := <-updateResultCh + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(0, updateResp.ResetHistoryEventId) - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateRejectMessages(s.Tv(), updRequestMsg), - }, nil - }) - s.NoError(err) - updateResult := <-updateResultCh - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) - s.EqualValues(3, res.ResetHistoryEventId) + s.Equal(1, wtHandlerCalls) + s.Equal(1, msgHandlerCalls) - // Send signal to create WT. 
- err = s.SendSignal(s.Namespace().String(), s.Tv().WorkflowExecution(), s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.NoError(err) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - // Process signal and complete workflow. - res, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionSignaled // Speculative WT was dropped and history starts from 5 again. - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted`, task.History) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled // First normal WT was scheduled before update and therefore all 3 events have to be written even if update was rejected. + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted // Empty completed WT. No new events were created after it. + `, events) +} - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Commands: []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ - CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}, - }, +func (s *WorkflowUpdateSuite) TestEmptySpeculativeWorkflowTask_Reject() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) + + updateResultCh := sendUpdateNoError(env, env.Tv()) + + // Process update in workflow. 
+ res, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. + 6 WorkflowTaskStarted + `, task.History) + + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(5, updRequestMsg.GetEventId()) + + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateRejectMessages(env.Tv(), updRequestMsg), + }, nil + }) + s.NoError(err) + updateResult := <-updateResultCh + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(3, res.ResetHistoryEventId) + + // Send signal to create WT. + err = env.SendSignal(env.Namespace().String(), env.Tv().WorkflowExecution(), env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.NoError(err) + + // Process signal and complete workflow. + res, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionSignaled // Speculative WT was dropped and history starts from 5 again. 
+ 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted`, task.History) + + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Commands: []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ + CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}, }, }, - }, nil - }) - s.NoError(err) - s.NotNil(res) + }, + }, nil + }) + s.NoError(err) + s.NotNil(res) - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -1624,1684 +1653,1684 @@ func TestWorkflowUpdateSuite(t *testing.T) { 8 WorkflowTaskCompleted 9 WorkflowExecutionCompleted `, events) - }) - - t.Run("NotEmptySpeculativeWorkflowTask_Reject", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) - - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, - Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ - ActivityId: s.Tv().ActivityID(), - ActivityType: s.Tv().ActivityType(), - TaskQueue: s.Tv().TaskQueue(), - ScheduleToCloseTimeout: s.Tv().Any().InfiniteTimeout(), - }}, - }}, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 ActivityTaskScheduled - 6 WorkflowTaskScheduled // Speculative WFT will be written to the history because there is ActivityTaskScheduled(5) event. 
- 7 WorkflowTaskStarted -`, task.History) - return nil, nil - case 3: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 ActivityTaskScheduled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted // Empty speculative WFT was written to the history because it shipped events. - 9 ActivityTaskStarted - 10 ActivityTaskCompleted - 11 WorkflowTaskScheduled - 12 WorkflowTaskStarted -`, task.History) - return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }}, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } - - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1, 3: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) - - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(6, updRequestMsg.GetEventId()) - - return s.UpdateRejectMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } - } - - atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { - return s.Tv().Any().Payloads(), false, nil - } - - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: 
s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - ActivityTaskHandler: atHandler, - Logger: s.Logger, - T: s.T(), - } - - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) - - updateResultCh := sendUpdateNoError(s, s.Tv()) - - // Process update in workflow. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - updateResult := <-updateResultCh - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) - s.EqualValues(0, res.NewTask.ResetHistoryEventId, "no reset of event ID should happened after update rejection if it was delivered with normal workflow task") - - err = poller.PollAndProcessActivityTask(false) - s.NoError(err) - - // Complete workflow. - res, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - - s.Equal(3, wtHandlerCalls) - s.Equal(3, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 ActivityTaskScheduled - 6 WorkflowTaskScheduled // Speculative WFT (6-8) presents in the history even though update was rejected. 
- 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted - 9 ActivityTaskStarted - 10 ActivityTaskCompleted - 11 WorkflowTaskScheduled - 12 WorkflowTaskStarted - 13 WorkflowTaskCompleted - 14 WorkflowExecutionCompleted`, events) - }) - - t.Run("1stAccept_2ndAccept_2ndComplete_1stComplete", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) - tv1 := s.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1).WithActivityIDNumber(1) - tv2 := s.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2).WithActivityIDNumber(2) - - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted`, task.History) - return append(s.UpdateAcceptCommands(tv1), &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, - Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ - ActivityId: tv1.ActivityID(), - ActivityType: tv1.ActivityType(), - TaskQueue: tv1.TaskQueue(), - ScheduleToCloseTimeout: tv1.Any().InfiniteTimeout(), - }}, - }), nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted // 1st update is accepted. - 6 ActivityTaskScheduled - 7 WorkflowTaskScheduled // New normal WT is created because of the 2nd update. 
- 8 WorkflowTaskStarted`, task.History) - return append(s.UpdateAcceptCommands(tv2), &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, - Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ - ActivityId: tv2.ActivityID(), - ActivityType: tv2.ActivityType(), - TaskQueue: tv2.TaskQueue(), - ScheduleToCloseTimeout: tv2.Any().InfiniteTimeout(), - }}, - }), nil - case 3: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted - 6 ActivityTaskScheduled - 7 WorkflowTaskScheduled - 8 WorkflowTaskStarted - 9 WorkflowTaskCompleted - 10 WorkflowExecutionUpdateAccepted // 2nd update is accepted. - 11 ActivityTaskScheduled - 12 ActivityTaskStarted - 13 ActivityTaskCompleted - 14 WorkflowTaskScheduled - 15 WorkflowTaskStarted -`, task.History) - return s.UpdateCompleteCommands(tv2), nil - case 4: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted - 6 ActivityTaskScheduled - 7 WorkflowTaskScheduled - 8 WorkflowTaskStarted - 9 WorkflowTaskCompleted - 10 WorkflowExecutionUpdateAccepted - 11 ActivityTaskScheduled - 12 ActivityTaskStarted - 13 ActivityTaskCompleted - 14 WorkflowTaskScheduled - 15 WorkflowTaskStarted - 16 WorkflowTaskCompleted - 17 WorkflowExecutionUpdateCompleted // 2nd update is completed. 
- 18 ActivityTaskStarted - 19 ActivityTaskCompleted - 20 WorkflowTaskScheduled - 21 WorkflowTaskStarted -`, task.History) - return s.UpdateCompleteCommands(tv1), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } - - var upd1RequestMsg, upd2RequestMsg *protocolpb.Message - - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - upd1RequestMsg = task.Messages[0] - upd1Request := protoutils.UnmarshalAny[*updatepb.Request](s.T(), upd1RequestMsg.GetBody()) - s.Equal("args-value-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), upd1Request.GetInput().GetArgs())) - s.EqualValues(2, upd1RequestMsg.GetEventId()) - return s.UpdateAcceptMessages(tv1, upd1RequestMsg), nil - case 2: - upd2RequestMsg = task.Messages[0] - upd2Request := protoutils.UnmarshalAny[*updatepb.Request](s.T(), upd2RequestMsg.GetBody()) - s.Equal("args-value-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), upd2Request.GetInput().GetArgs())) - s.EqualValues(7, upd2RequestMsg.GetEventId()) - return s.UpdateAcceptMessages(tv2, upd2RequestMsg), nil - case 3: - s.NotNil(upd2RequestMsg) - return s.UpdateCompleteMessages(tv2, upd2RequestMsg), nil - case 4: - s.NotNil(upd1RequestMsg) - return s.UpdateCompleteMessages(tv1, upd1RequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } - } +} - atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { - return s.Tv().Any().Payloads(), false, nil +func (s *WorkflowUpdateSuite) TestNotEmptySpeculativeWorkflowTask_Reject() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) 
([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, + Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ + ActivityId: env.Tv().ActivityID(), + ActivityType: env.Tv().ActivityType(), + TaskQueue: env.Tv().TaskQueue(), + ScheduleToCloseTimeout: env.Tv().Any().InfiniteTimeout(), + }}, + }}, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 ActivityTaskScheduled + 6 WorkflowTaskScheduled // Speculative WFT will be written to the history because there is ActivityTaskScheduled(5) event. + 7 WorkflowTaskStarted + `, task.History) + return nil, nil + case 3: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 ActivityTaskScheduled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted // Empty speculative WFT was written to the history because it shipped events. 
+ 9 ActivityTaskStarted + 10 ActivityTaskCompleted + 11 WorkflowTaskScheduled + 12 WorkflowTaskStarted + `, task.History) + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }}, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - ActivityTaskHandler: atHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1, 3: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(6, updRequestMsg.GetEventId()) + + return env.UpdateRejectMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - updateResultCh1 := sendUpdateNoError(s, tv1) - - // Accept update1 in normal WT1. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) - - // Send 2nd update and create speculative WT2. - updateResultCh2 := sendUpdateNoError(s, tv2) - - // Poll for WT2 which 2nd update. Accept update2. 
- res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) + atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { + return env.Tv().Any().Payloads(), false, nil + } - err = poller.PollAndProcessActivityTask(false) - s.NoError(err) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + ActivityTaskHandler: atHandler, + Logger: env.Logger, + T: s.T(), + } - // Complete update2 in WT3. - res, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - updateResult2 := <-updateResultCh2 - s.Equal("success-result-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), updateResult2.GetOutcome().GetSuccess())) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - err = poller.PollAndProcessActivityTask(false) - s.NoError(err) + updateResultCh := sendUpdateNoError(env, env.Tv()) - // Complete update1 in WT4. 
- res, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - updateResult1 := <-updateResultCh1 - s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED, updateResult1.Stage) - s.Equal("success-result-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), updateResult1.GetOutcome().GetSuccess())) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) - - s.Equal(4, wtHandlerCalls) - s.Equal(4, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 2} // WTScheduled event which delivered update to the worker. - 6 ActivityTaskScheduled - 7 WorkflowTaskScheduled - 8 WorkflowTaskStarted - 9 WorkflowTaskCompleted - 10 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 7} // WTScheduled event which delivered update to the worker. - 11 ActivityTaskScheduled - 12 ActivityTaskStarted - 13 ActivityTaskCompleted - 14 WorkflowTaskScheduled - 15 WorkflowTaskStarted - 16 WorkflowTaskCompleted - 17 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 10} // 2nd update is completed. - 18 ActivityTaskStarted - 19 ActivityTaskCompleted - 20 WorkflowTaskScheduled - 21 WorkflowTaskStarted - 22 WorkflowTaskCompleted - 23 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 5} // 1st update is completed. -`, events) - }) + // Process update in workflow. 
+ res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + updateResult := <-updateResultCh + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(0, res.NewTask.ResetHistoryEventId, "no reset of event ID should happened after update rejection if it was delivered with normal workflow task") - t.Run("1stAccept_2ndReject_1stComplete", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + err = poller.PollAndProcessActivityTask(false) + s.NoError(err) - tv1 := s.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1).WithActivityIDNumber(1) - tv2 := s.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2).WithActivityIDNumber(2) + // Complete workflow. + res, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted`, task.History) - return append(s.UpdateAcceptCommands(tv1), &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, - Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ - ActivityId: tv1.ActivityID(), - ActivityType: tv1.ActivityType(), - TaskQueue: tv1.TaskQueue(), - ScheduleToCloseTimeout: tv1.Any().InfiniteTimeout(), - }}, - }), nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted // 1st update is accepted. - 6 ActivityTaskScheduled - 7 WorkflowTaskScheduled // Speculative WFT with WorkflowExecutionUpdateAccepted(5) event. 
- 8 WorkflowTaskStarted -`, task.History) - // Message handler rejects 2nd update. - return nil, nil - case 3: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted - 6 ActivityTaskScheduled - 7 WorkflowTaskScheduled - 8 WorkflowTaskStarted - 9 WorkflowTaskCompleted // Speculative WFT is written to the history because it shipped event. - 10 ActivityTaskStarted - 11 ActivityTaskCompleted - 12 WorkflowTaskScheduled - 13 WorkflowTaskStarted -`, task.History) - return s.UpdateCompleteCommands(tv1), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + s.Equal(3, wtHandlerCalls) + s.Equal(3, msgHandlerCalls) - var upd1RequestMsg *protocolpb.Message - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - upd1RequestMsg = task.Messages[0] - upd1Request := protoutils.UnmarshalAny[*updatepb.Request](s.T(), upd1RequestMsg.GetBody()) - s.Equal("args-value-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), upd1Request.GetInput().GetArgs())) - s.EqualValues(2, upd1RequestMsg.GetEventId()) - return s.UpdateAcceptMessages(tv1, upd1RequestMsg), nil - case 2: - upd2RequestMsg := task.Messages[0] - upd2Request := protoutils.UnmarshalAny[*updatepb.Request](s.T(), upd2RequestMsg.GetBody()) - s.Equal("args-value-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), upd2Request.GetInput().GetArgs())) - s.EqualValues(7, upd2RequestMsg.GetEventId()) - return s.UpdateRejectMessages(tv2, upd2RequestMsg), nil - case 3: - s.NotNil(upd1RequestMsg) - return s.UpdateCompleteMessages(tv1, upd1RequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } - } + events := 
env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 ActivityTaskScheduled + 6 WorkflowTaskScheduled // Speculative WFT (6-8) presents in the history even though update was rejected. + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted + 9 ActivityTaskStarted + 10 ActivityTaskCompleted + 11 WorkflowTaskScheduled + 12 WorkflowTaskStarted + 13 WorkflowTaskCompleted + 14 WorkflowExecutionCompleted`, events) +} - atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { - return s.Tv().Any().Payloads(), false, nil +func (s *WorkflowUpdateSuite) Test1stAccept_2ndAccept_2ndComplete_1stComplete() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + tv1 := env.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1).WithActivityIDNumber(1) + tv2 := env.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2).WithActivityIDNumber(2) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted`, task.History) + return append(env.UpdateAcceptCommands(tv1), &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, + Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ + ActivityId: tv1.ActivityID(), + ActivityType: tv1.ActivityType(), + TaskQueue: tv1.TaskQueue(), + ScheduleToCloseTimeout: tv1.Any().InfiniteTimeout(), + }}, + }), nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted // 1st update is accepted. 
+ 6 ActivityTaskScheduled + 7 WorkflowTaskScheduled // New normal WT is created because of the 2nd update. + 8 WorkflowTaskStarted`, task.History) + return append(env.UpdateAcceptCommands(tv2), &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, + Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ + ActivityId: tv2.ActivityID(), + ActivityType: tv2.ActivityType(), + TaskQueue: tv2.TaskQueue(), + ScheduleToCloseTimeout: tv2.Any().InfiniteTimeout(), + }}, + }), nil + case 3: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted + 6 ActivityTaskScheduled + 7 WorkflowTaskScheduled + 8 WorkflowTaskStarted + 9 WorkflowTaskCompleted + 10 WorkflowExecutionUpdateAccepted // 2nd update is accepted. + 11 ActivityTaskScheduled + 12 ActivityTaskStarted + 13 ActivityTaskCompleted + 14 WorkflowTaskScheduled + 15 WorkflowTaskStarted + `, task.History) + return env.UpdateCompleteCommands(tv2), nil + case 4: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted + 6 ActivityTaskScheduled + 7 WorkflowTaskScheduled + 8 WorkflowTaskStarted + 9 WorkflowTaskCompleted + 10 WorkflowExecutionUpdateAccepted + 11 ActivityTaskScheduled + 12 ActivityTaskStarted + 13 ActivityTaskCompleted + 14 WorkflowTaskScheduled + 15 WorkflowTaskStarted + 16 WorkflowTaskCompleted + 17 WorkflowExecutionUpdateCompleted // 2nd update is completed. 
+ 18 ActivityTaskStarted + 19 ActivityTaskCompleted + 20 WorkflowTaskScheduled + 21 WorkflowTaskStarted + `, task.History) + return env.UpdateCompleteCommands(tv1), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - ActivityTaskHandler: atHandler, - Logger: s.Logger, - T: s.T(), + var upd1RequestMsg, upd2RequestMsg *protocolpb.Message + + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + upd1RequestMsg = task.Messages[0] + upd1Request := protoutils.UnmarshalAny[*updatepb.Request](s.T(), upd1RequestMsg.GetBody()) + s.Equal("args-value-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), upd1Request.GetInput().GetArgs())) + s.EqualValues(2, upd1RequestMsg.GetEventId()) + return env.UpdateAcceptMessages(tv1, upd1RequestMsg), nil + case 2: + upd2RequestMsg = task.Messages[0] + upd2Request := protoutils.UnmarshalAny[*updatepb.Request](s.T(), upd2RequestMsg.GetBody()) + s.Equal("args-value-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), upd2Request.GetInput().GetArgs())) + s.EqualValues(7, upd2RequestMsg.GetEventId()) + return env.UpdateAcceptMessages(tv2, upd2RequestMsg), nil + case 3: + s.NotNil(upd2RequestMsg) + return env.UpdateCompleteMessages(tv2, upd2RequestMsg), nil + case 4: + s.NotNil(upd1RequestMsg) + return env.UpdateCompleteMessages(tv1, upd1RequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - updateResultCh1 := sendUpdateNoError(s, 
tv1) + atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { + return env.Tv().Any().Payloads(), false, nil + } - // Accept update1 in WT1. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + ActivityTaskHandler: atHandler, + Logger: env.Logger, + T: s.T(), + } - // Send 2nd update and create speculative WT2. - updateResultCh2 := sendUpdateNoError(s, tv2) + updateResultCh1 := sendUpdateNoError(env, tv1) - // Poll for WT2 which 2nd update. Reject update2. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - s.EqualValues(0, res.NewTask.ResetHistoryEventId, "no reset of event ID should happened after update rejection if it was delivered with workflow task which had events") + // Accept update1 in normal WT1. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - updateResult2 := <-updateResultCh2 - s.Equal("rejection-of-"+tv2.UpdateID(), updateResult2.GetOutcome().GetFailure().GetMessage()) + // Send 2nd update and create speculative WT2. + updateResultCh2 := sendUpdateNoError(env, tv2) - err = poller.PollAndProcessActivityTask(false) - s.NoError(err) + // Poll for WT2 which 2nd update. Accept update2. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - // Complete update1 in WT3. 
- res, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - updateResult1 := <-updateResultCh1 - s.Equal("success-result-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), updateResult1.GetOutcome().GetSuccess())) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) - - s.Equal(3, wtHandlerCalls) - s.Equal(3, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 2} // WTScheduled event which delivered update to the worker. - 6 ActivityTaskScheduled - 7 WorkflowTaskScheduled - 8 WorkflowTaskStarted - 9 WorkflowTaskCompleted // WT which had rejected update. - 10 ActivityTaskStarted - 11 ActivityTaskCompleted - 12 WorkflowTaskScheduled - 13 WorkflowTaskStarted - 14 WorkflowTaskCompleted - 15 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 5} -`, events) - }) + err = poller.PollAndProcessActivityTask(false) + s.NoError(err) - t.Run("SpeculativeWorkflowTask_Fail", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + // Complete update2 in WT3. 
+ res, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + updateResult2 := <-updateResultCh2 + s.Equal("success-result-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), updateResult2.GetOutcome().GetSuccess())) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) + err = poller.PollAndProcessActivityTask(false) + s.NoError(err) - // Use test context with shorter timeout for this specific operation - timeoutCtx, cancel := context.WithTimeout(s.Context(), 2*time.Second) - defer cancel() - updateResultCh := sendUpdate(timeoutCtx, s, s.Tv()) + // Complete update1 in WT4. + res, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + updateResult1 := <-updateResultCh1 + s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED, updateResult1.Stage) + s.Equal("success-result-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), updateResult1.GetOutcome().GetSuccess())) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - // Try to accept update in workflow: get malformed response. - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. - 6 WorkflowTaskStarted - `, task.History) + s.Equal(4, wtHandlerCalls) + s.Equal(4, msgHandlerCalls) - s.Require().NotEmpty(task.Messages, "expected update message in task") - updRequestMsg := task.Messages[0] - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Commands: s.UpdateAcceptCommands(s.Tv()), - // Emulate bug in worker/SDK update handler code. Return malformed acceptance response. 
- Messages: []*protocolpb.Message{ - { - Id: s.Tv().MessageID() + "_update-accepted", - ProtocolInstanceId: s.Tv().Any().String(), - SequencingId: nil, - Body: protoutils.MarshalAny(s.T(), &updatepb.Acceptance{ - AcceptedRequestMessageId: updRequestMsg.GetId(), - AcceptedRequestSequencingEventId: updRequestMsg.GetEventId(), - AcceptedRequest: nil, // must not be nil! - }), - }, - }, - }, nil - }) - s.Error(err) - s.Contains(err.Error(), "wasn't found") - - // Update is aborted, speculative WFT failure is recorded into the history. - updateResult := <-updateResultCh - var wfNotReady *serviceerror.WorkflowNotReady - s.ErrorAs(updateResult.err, &wfNotReady) - s.Contains(updateResult.err.Error(), "Unable to perform workflow execution update due to unexpected workflow task failure.") - - // New transient WFT is created and is now included in the history. - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskFailed - 8 WorkflowTaskScheduled`, events) - - // Send Update again. It will be delivered on existing transient WFT. - updateResultCh = sendUpdate(timeoutCtx, s, s.Tv()) - - // Try to accept 2nd update in workflow: get error. Poller will fail WFT, but the registry won't be cleared and Update won't be aborted. 
- _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskFailed - 8 WorkflowTaskScheduled // Transient WFT - 9 WorkflowTaskStarted`, task.History) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - s.Require().NotEmpty(task.Messages, "expected update message in task") - updRequestMsg := task.Messages[0] - s.EqualValues(8, updRequestMsg.GetEventId()) - // Returning error will cause the poller to fail WFT. - return nil, errors.New("malformed request") - }) - // The error is from RespondWorkflowTaskFailed, which should go w/o error. - s.NoError(err) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 2} // WTScheduled event which delivered update to the worker. + 6 ActivityTaskScheduled + 7 WorkflowTaskScheduled + 8 WorkflowTaskStarted + 9 WorkflowTaskCompleted + 10 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 7} // WTScheduled event which delivered update to the worker. + 11 ActivityTaskScheduled + 12 ActivityTaskStarted + 13 ActivityTaskCompleted + 14 WorkflowTaskScheduled + 15 WorkflowTaskStarted + 16 WorkflowTaskCompleted + 17 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 10} // 2nd update is completed. + 18 ActivityTaskStarted + 19 ActivityTaskCompleted + 20 WorkflowTaskScheduled + 21 WorkflowTaskStarted + 22 WorkflowTaskCompleted + 23 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 5} // 1st update is completed. + `, events) +} - // Update timed out, but stays in the registry and will be delivered again on the new transient WFT. 
- updateResult = <-updateResultCh - s.Error(updateResult.err) - s.True(common.IsContextDeadlineExceededErr(updateResult.err), "UpdateWorkflowExecution must timeout after 2 seconds") - s.Nil(updateResult.response) - - // This WFT failure wasn't recorded because WFT was transient, but the scheduled event is included. - events = s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskFailed - 8 WorkflowTaskScheduled`, events) - - // Try to accept 2nd update in workflow 2nd time: get error. Poller will fail WT. Update is not aborted. - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - // 1st attempt UpdateWorkflowExecution call has timed out but the - // update is still running - s.Require().NotEmpty(task.Messages, "expected update message in task") - updRequestMsg := task.Messages[0] - s.EqualValues(8, updRequestMsg.GetEventId()) - // Fail WT one more time. This is transient WT and shouldn't appear in the history. - // Returning error will cause the poller to fail WT. - return nil, errors.New("malformed request") - }) - // The error is from RespondWorkflowTaskFailed, which should go w/o error. - s.NoError(err) +func (s *WorkflowUpdateSuite) Test1stAccept_2ndReject_1stComplete() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) - events = s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskFailed - 8 WorkflowTaskScheduled`, events) - - // Complete Update and workflow. 
- _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskFailed - 8 WorkflowTaskScheduled // Transient WFT - 9 WorkflowTaskStarted`, task.History) + tv1 := env.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1).WithActivityIDNumber(1) + tv2 := env.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2).WithActivityIDNumber(2) - s.Require().NotEmpty(task.Messages, "expected update message in task") - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - Commands: append(s.UpdateAcceptCompleteCommands(s.Tv()), &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }), - }, nil - }) - s.NoError(err) + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted`, task.History) + return append(env.UpdateAcceptCommands(tv1), &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, + Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ + ActivityId: tv1.ActivityID(), + ActivityType: tv1.ActivityType(), + TaskQueue: tv1.TaskQueue(), + ScheduleToCloseTimeout: tv1.Any().InfiniteTimeout(), + }}, + }), nil + case 2: + s.EqualHistory(` + 1 
WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted // 1st update is accepted. + 6 ActivityTaskScheduled + 7 WorkflowTaskScheduled // Speculative WFT with WorkflowExecutionUpdateAccepted(5) event. + 8 WorkflowTaskStarted + `, task.History) + // Message handler rejects 2nd update. + return nil, nil + case 3: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted + 6 ActivityTaskScheduled + 7 WorkflowTaskScheduled + 8 WorkflowTaskStarted + 9 WorkflowTaskCompleted // Speculative WFT is written to the history because it shipped event. + 10 ActivityTaskStarted + 11 ActivityTaskCompleted + 12 WorkflowTaskScheduled + 13 WorkflowTaskStarted + `, task.History) + return env.UpdateCompleteCommands(tv1), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil + } + } - events = s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskFailed - 8 WorkflowTaskScheduled - 9 WorkflowTaskStarted - 10 WorkflowTaskCompleted // Transient WFT was completed successfully and ended up in the history. 
- 11 WorkflowExecutionUpdateAccepted - 12 WorkflowExecutionUpdateCompleted - 13 WorkflowExecutionCompleted`, events) - }) + var upd1RequestMsg *protocolpb.Message + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + upd1RequestMsg = task.Messages[0] + upd1Request := protoutils.UnmarshalAny[*updatepb.Request](s.T(), upd1RequestMsg.GetBody()) + s.Equal("args-value-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), upd1Request.GetInput().GetArgs())) + s.EqualValues(2, upd1RequestMsg.GetEventId()) + return env.UpdateAcceptMessages(tv1, upd1RequestMsg), nil + case 2: + upd2RequestMsg := task.Messages[0] + upd2Request := protoutils.UnmarshalAny[*updatepb.Request](s.T(), upd2RequestMsg.GetBody()) + s.Equal("args-value-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), upd2Request.GetInput().GetArgs())) + s.EqualValues(7, upd2RequestMsg.GetEventId()) + return env.UpdateRejectMessages(tv2, upd2RequestMsg), nil + case 3: + s.NotNil(upd1RequestMsg) + return env.UpdateCompleteMessages(tv1, upd1RequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil + } + } - t.Run("StartedSpeculativeWorkflowTask_ConvertToNormalBecauseOfBufferedSignal", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { + return env.Tv().Any().Payloads(), false, nil + } - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. 
- return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. Events 5 and 6 are written into the history when signal is received. - 6 WorkflowTaskStarted -`, task.History) - // Send signal which will be buffered. This will persist MS and speculative WT must be converted to normal. - err := s.SendSignal(s.Namespace().String(), s.Tv().WorkflowExecution(), s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.NoError(err) - return nil, nil - case 3: - s.EqualHistory(` - 7 WorkflowTaskCompleted - 8 WorkflowExecutionSignaled // It was buffered and got to the history after WT is completed. - 9 WorkflowTaskScheduled - 10 WorkflowTaskStarted`, task.History) - return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }}, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + ActivityTaskHandler: atHandler, + Logger: env.Logger, + T: s.T(), + } - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1, 3: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] + updateResultCh1 := sendUpdateNoError(env, tv1) - s.EqualValues(5, updRequestMsg.GetEventId()) + // Accept update1 in WT1. 
+ _, err := poller.PollAndProcessWorkflowTask()
+ s.NoError(err)

- // Update is rejected but corresponding speculative WT will be in the history anyway, because it was converted to normal due to buffered signal.
- return s.UpdateRejectMessages(s.Tv(), updRequestMsg), nil
- default:
- s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls)
- return nil, nil
- }
- }

+ // Send 2nd update and create speculative WT2.
+ updateResultCh2 := sendUpdateNoError(env, tv2)

- //nolint:staticcheck // SA1019 TaskPoller replacement needed
- poller := &testcore.TaskPoller{
- Client: s.FrontendClient(),
- Namespace: s.Namespace().String(),
- TaskQueue: s.Tv().TaskQueue(),
- Identity: s.Tv().WorkerIdentity(),
- WorkflowTaskHandler: wtHandler,
- MessageHandler: msgHandler,
- Logger: s.Logger,
- T: s.T(),
- }

+ // Poll for WT2 with the 2nd update. Reject update2.
+ res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries)
+ s.NoError(err)
+ s.NotNil(res)
+ s.EqualValues(0, res.NewTask.ResetHistoryEventId, "no reset of event ID should have happened after update rejection if it was delivered with workflow task which had events")

- // Drain first WT.
- _, err := poller.PollAndProcessWorkflowTask()
- s.NoError(err)

+ updateResult2 := <-updateResultCh2
+ s.Equal("rejection-of-"+tv2.UpdateID(), updateResult2.GetOutcome().GetFailure().GetMessage())

- updateResultCh := sendUpdateNoError(s, s.Tv())

+ err = poller.PollAndProcessActivityTask(false)
+ s.NoError(err)

- // Process update in workflow.
- res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries)
- s.NoError(err)
- updateResp := res.NewTask
- updateResult := <-updateResultCh
- s.Equal("rejection-of-"+s.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage())
- s.EqualValues(0, updateResp.ResetHistoryEventId)

+ // Complete update1 in WT3. 
+ res, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + updateResult1 := <-updateResultCh1 + s.Equal("success-result-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), updateResult1.GetOutcome().GetSuccess())) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - // Complete workflow. - completeWorkflowResp, err := poller.HandlePartialWorkflowTask(updateResp.GetWorkflowTask(), false) - s.NoError(err) - s.NotNil(completeWorkflowResp) - s.Nil(completeWorkflowResp.GetWorkflowTask()) - s.EqualValues(0, completeWorkflowResp.ResetHistoryEventId) - - s.Equal(3, wtHandlerCalls) - s.Equal(3, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted // Update was rejected on speculative WT, but events 5-7 are in the history because of buffered signal. - 8 WorkflowExecutionSignaled - 9 WorkflowTaskScheduled - 10 WorkflowTaskStarted - 11 WorkflowTaskCompleted - 12 WorkflowExecutionCompleted`, events) - }) + s.Equal(3, wtHandlerCalls) + s.Equal(3, msgHandlerCalls) - t.Run("ScheduledSpeculativeWorkflowTask_ConvertToNormalBecauseOfSignal", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. 
- return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // It was initially speculative WT but was already converted to normal when signal was received. - 6 WorkflowExecutionSignaled - 7 WorkflowTaskStarted`, task.History) - return nil, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 2} // WTScheduled event which delivered update to the worker. + 6 ActivityTaskScheduled + 7 WorkflowTaskScheduled + 8 WorkflowTaskStarted + 9 WorkflowTaskCompleted // WT which had rejected update. + 10 ActivityTaskStarted + 11 ActivityTaskCompleted + 12 WorkflowTaskScheduled + 13 WorkflowTaskStarted + 14 WorkflowTaskCompleted + 15 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 5} + `, events) +} - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - s.Require().NotEmpty(task.Messages, "expected update message in task") - updRequestMsg := task.Messages[0] +func (s *WorkflowUpdateSuite) TestSpeculativeWorkflowTask_Fail() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) + + // Use test context with shorter timeout for this specific operation + timeoutCtx, cancel := context.WithTimeout(env.Context(), 2*time.Second) + defer cancel() + updateResultCh := sendUpdate(timeoutCtx, env, env.Tv()) + + // Try to accept update in workflow: get malformed response. 
+ _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. + 6 WorkflowTaskStarted + `, task.History) + + s.NotEmpty(task.Messages, "expected update message in task") + updRequestMsg := task.Messages[0] + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Commands: env.UpdateAcceptCommands(env.Tv()), + // Emulate bug in worker/SDK update handler code. Return malformed acceptance response. + Messages: []*protocolpb.Message{ + { + Id: env.Tv().MessageID() + "_update-accepted", + ProtocolInstanceId: env.Tv().Any().String(), + SequencingId: nil, + Body: protoutils.MarshalAny(s.T(), &updatepb.Acceptance{ + AcceptedRequestMessageId: updRequestMsg.GetId(), + AcceptedRequestSequencingEventId: updRequestMsg.GetEventId(), + AcceptedRequest: nil, // must not be nil! + }), + }, + }, + }, nil + }) + s.Error(err) + s.Contains(err.Error(), "wasn't found") + + // Update is aborted, speculative WFT failure is recorded into the history. + updateResult := <-updateResultCh + var wfNotReady *serviceerror.WorkflowNotReady + s.ErrorAs(updateResult.err, &wfNotReady) + s.Contains(updateResult.err.Error(), "Unable to perform workflow execution update due to unexpected workflow task failure.") + + // New transient WFT is created and is now included in the history. + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskFailed + 8 WorkflowTaskScheduled`, events) + + // Send Update again. It will be delivered on existing transient WFT. 
+ updateResultCh = sendUpdate(timeoutCtx, env, env.Tv()) + + // Try to accept 2nd update in workflow: get error. Poller will fail WFT, but the registry won't be cleared and Update won't be aborted. + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskFailed + 8 WorkflowTaskScheduled // Transient WFT + 9 WorkflowTaskStarted`, task.History) + + s.NotEmpty(task.Messages, "expected update message in task") + updRequestMsg := task.Messages[0] + s.EqualValues(8, updRequestMsg.GetEventId()) + // Returning error will cause the poller to fail WFT. + return nil, errors.New("malformed request") + }) + // The error is from RespondWorkflowTaskFailed, which should go w/o error. + s.NoError(err) + + // Update timed out, but stays in the registry and will be delivered again on the new transient WFT. + updateResult = <-updateResultCh + s.Error(updateResult.err) + s.True(common.IsContextDeadlineExceededErr(updateResult.err), "UpdateWorkflowExecution must timeout after 2 seconds") + s.Nil(updateResult.response) + + // This WFT failure wasn't recorded because WFT was transient, but the scheduled event is included. + events = env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskFailed + 8 WorkflowTaskScheduled`, events) + + // Try to accept 2nd update in workflow 2nd time: get error. Poller will fail WT. Update is not aborted. 
+ _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + // 1st attempt UpdateWorkflowExecution call has timed out but the + // update is still running + s.NotEmpty(task.Messages, "expected update message in task") + updRequestMsg := task.Messages[0] + s.EqualValues(8, updRequestMsg.GetEventId()) + // Fail WT one more time. This is transient WT and shouldn't appear in the history. + // Returning error will cause the poller to fail WT. + return nil, errors.New("malformed request") + }) + // The error is from RespondWorkflowTaskFailed, which should go w/o error. + s.NoError(err) - s.EqualValues(6, updRequestMsg.GetEventId()) + events = env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskFailed + 8 WorkflowTaskScheduled`, events) + + // Complete Update and workflow. 
+ _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskFailed + 8 WorkflowTaskScheduled // Transient WFT + 9 WorkflowTaskStarted`, task.History) + + s.NotEmpty(task.Messages, "expected update message in task") + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + Commands: append(env.UpdateAcceptCompleteCommands(env.Tv()), &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }), + }, nil + }) + s.NoError(err) - // Update is rejected but corresponding speculative WT was already converted to normal, - // and will be in the history anyway. - return s.UpdateRejectMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } - } + events = env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskFailed + 8 WorkflowTaskScheduled + 9 WorkflowTaskStarted + 10 WorkflowTaskCompleted // Transient WFT was completed successfully and ended up in the history. 
+ 11 WorkflowExecutionUpdateAccepted + 12 WorkflowExecutionUpdateCompleted + 13 WorkflowExecutionCompleted`, events) +} - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), +func (s *WorkflowUpdateSuite) TestStartedSpeculativeWorkflowTask_ConvertToNormalBecauseOfBufferedSignal() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. Events 5 and 6 are written into the history when signal is received. + 6 WorkflowTaskStarted + `, task.History) + // Send signal which will be buffered. This will persist MS and speculative WT must be converted to normal. + err := env.SendSignal(env.Namespace().String(), env.Tv().WorkflowExecution(), env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.NoError(err) + return nil, nil + case 3: + s.EqualHistory(` + 7 WorkflowTaskCompleted + 8 WorkflowExecutionSignaled // It was buffered and got to the history after WT is completed. 
+ 9 WorkflowTaskScheduled + 10 WorkflowTaskStarted`, task.History) + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }}, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1, 3: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + + s.EqualValues(5, updRequestMsg.GetEventId()) + + // Update is rejected but corresponding speculative WT will be in the history anyway, because it was converted to normal due to buffered signal. + return env.UpdateRejectMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil + } + } - updateResultCh := sendUpdateNoError(s, s.Tv()) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - // Send signal which will NOT be buffered because speculative WT is not started yet (only scheduled). - // This will persist MS and speculative WT must be converted to normal. - err = s.SendSignal(s.Namespace().String(), s.Tv().WorkflowExecution(), s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.NoError(err) + // Drain first WT. 
+ _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - // Process update in workflow. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - updateResult := <-updateResultCh - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) - - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowExecutionSignaled - 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted // Update was rejected but WT events 5,7,8 are in the history because of signal. -`, events) - }) + updateResultCh := sendUpdateNoError(env, env.Tv()) - t.Run("SpeculativeWorkflowTask_StartToCloseTimeout", func(t *testing.T) { - // Uses CaptureMetricsHandler which requires a dedicated cluster to avoid metric interference. - s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - capture := s.GetTestCluster().Host().CaptureMetricsHandler().StartCapture() - defer s.GetTestCluster().Host().CaptureMetricsHandler().StopCapture(capture) - - request := &workflowservice.StartWorkflowExecutionRequest{ - RequestId: s.Tv().Any().String(), - Namespace: s.Namespace().String(), - WorkflowId: s.Tv().WorkflowID(), - WorkflowType: s.Tv().WorkflowType(), - TaskQueue: s.Tv().TaskQueue(), - WorkflowTaskTimeout: durationpb.New(1 * time.Second), // Important! - } + // Process update in workflow. 
+ res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + updateResp := res.NewTask + updateResult := <-updateResultCh + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(0, updateResp.ResetHistoryEventId) - _, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), request) - s.NoError(err) + // Complete workflow. + completeWorkflowResp, err := poller.HandlePartialWorkflowTask(updateResp.GetWorkflowTask(), false) + s.NoError(err) + s.NotNil(completeWorkflowResp) + s.Nil(completeWorkflowResp.GetWorkflowTask()) + s.EqualValues(0, completeWorkflowResp.ResetHistoryEventId) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. - 6 WorkflowTaskStarted -`, task.History) - // Emulate slow worker: sleep little more than WT timeout. - time.Sleep(request.WorkflowTaskTimeout.AsDuration() + 100*time.Millisecond) //nolint:forbidigo - // This doesn't matter because WT times out before update is applied. - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - case 3: - // Speculative WT timed out and retried as normal WT. - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskTimedOut - 8 WorkflowTaskScheduled {"Attempt":2 } // Transient WT. 
- 9 WorkflowTaskStarted`, task.History) - commands := append(s.UpdateAcceptCompleteCommands(s.Tv()), - &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }) - return commands, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + s.Equal(3, wtHandlerCalls) + s.Equal(3, msgHandlerCalls) - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - s.Len(task.Messages, 1) - updRequestMsg := task.Messages[0] + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - // This doesn't matter because WT times out before update is applied. - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - case 3: - // Update is still in registry and was sent again. - updRequestMsg := task.Messages[0] + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted // Update was rejected on speculative WT, but events 5-7 are in the history because of buffered signal. 
+ 8 WorkflowExecutionSignaled + 9 WorkflowTaskScheduled + 10 WorkflowTaskStarted + 11 WorkflowTaskCompleted + 12 WorkflowExecutionCompleted`, events) +} - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } +func (s *WorkflowUpdateSuite) TestScheduledSpeculativeWorkflowTask_ConvertToNormalBecauseOfSignal() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // It was initially speculative WT but was already converted to normal when signal was received. 
+ 6 WorkflowExecutionSignaled + 7 WorkflowTaskStarted`, task.History) + return nil, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + s.NotEmpty(task.Messages, "expected update message in task") + updRequestMsg := task.Messages[0] + + s.EqualValues(6, updRequestMsg.GetEventId()) + + // Update is rejected but corresponding speculative WT was already converted to normal, + // and will be in the history anyway. + return env.UpdateRejectMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. - _, err = poller.PollAndProcessWorkflowTask() - s.NoError(err) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - updateResultCh := sendUpdateNoError(s, s.Tv()) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - // Try to process update in workflow, but it takes more than WT timeout. So, WT times out. 
- _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.Error(err) - s.Equal("Workflow task not found.", err.Error()) + updateResultCh := sendUpdateNoError(env, env.Tv()) - // ensure correct metrics were recorded - snap := capture.Snapshot() + // Send signal which will NOT be buffered because speculative WT is not started yet (only scheduled). + // This will persist MS and speculative WT must be converted to normal. + err = env.SendSignal(env.Namespace().String(), env.Tv().WorkflowExecution(), env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.NoError(err) - var speculativeWorkflowTaskTimeoutTasks int - for _, m := range snap[metrics.TaskRequests.Name()] { - if m.Tags[metrics.OperationTagName] == metrics.TaskTypeTimerActiveTaskSpeculativeWorkflowTaskTimeout { - speculativeWorkflowTaskTimeoutTasks += 1 - } - } - s.Equal(1, speculativeWorkflowTaskTimeoutTasks, "expected 1 speculative workflow task timeout task to be created") + // Process update in workflow. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + updateResult := <-updateResultCh + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - var speculativeStartToCloseTimeouts int - for _, m := range snap[metrics.StartToCloseTimeoutCounter.Name()] { - if m.Tags[metrics.OperationTagName] == metrics.TaskTypeTimerActiveTaskSpeculativeWorkflowTaskTimeout { - speculativeStartToCloseTimeouts += 1 - } - } - s.Equal(1, speculativeStartToCloseTimeouts, "expected 1 timeout of a speculative workflow task timeout task") + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - // New normal WT was created on server after speculative WT has timed out. - // It will accept and complete update first and workflow itself with the same WT. 
- res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - updateResp := res.NewTask - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - s.EqualValues(0, updateResp.ResetHistoryEventId) - s.Nil(updateResp.GetWorkflowTask()) - - s.Equal(3, wtHandlerCalls) - s.Equal(3, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskTimedOut // Timeout of speculative WT writes events 5-7 - 8 WorkflowTaskScheduled {"Attempt":2 } - 9 WorkflowTaskStarted - 10 WorkflowTaskCompleted - 11 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 8} // WTScheduled event which delivered update to the worker. - 12 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 11} - 13 WorkflowExecutionCompleted`, events) - }) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - t.Run("SpeculativeWorkflowTask_ScheduleToStartTimeout", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowExecutionSignaled + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted // Update was rejected but WT events 5,7,8 are in the history because of signal. + `, events) +} - // Drain first WT and respond with sticky enabled response to enable sticky task queue. 
- stickyScheduleToStartTimeout := 1 * time.Second - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - StickyAttributes: s.Tv().StickyExecutionAttributes(stickyScheduleToStartTimeout), - }, nil - }) - s.NoError(err) +func (s *WorkflowUpdateSuite) TestSpeculativeWorkflowTask_StartToCloseTimeout() { + // Uses CaptureMetricsHandler which requires a dedicated cluster to avoid metric interference. + env := testcore.NewEnv(s.T(), testcore.WithDedicatedCluster()) + capture := env.GetTestCluster().Host().CaptureMetricsHandler().StartCapture() + defer env.GetTestCluster().Host().CaptureMetricsHandler().StopCapture(capture) + + request := &workflowservice.StartWorkflowExecutionRequest{ + RequestId: env.Tv().Any().String(), + Namespace: env.Namespace().String(), + WorkflowId: env.Tv().WorkflowID(), + WorkflowType: env.Tv().WorkflowType(), + TaskQueue: env.Tv().TaskQueue(), + WorkflowTaskTimeout: durationpb.New(1 * time.Second), // Important! + } - sendUpdateNoError(s, s.Tv()) + _, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), request) + s.NoError(err) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. + 6 WorkflowTaskStarted + `, task.History) + // Emulate slow worker: sleep little more than WT timeout. + time.Sleep(request.WorkflowTaskTimeout.AsDuration() + 100*time.Millisecond) //nolint:forbidigo + // This doesn't matter because WT times out before update is applied. 
+ return env.UpdateAcceptCompleteCommands(env.Tv()), nil + case 3: + // Speculative WT timed out and retried as normal WT. + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskTimedOut + 8 WorkflowTaskScheduled {"Attempt":2 } // Transient WT. + 9 WorkflowTaskStarted`, task.History) + commands := append(env.UpdateAcceptCompleteCommands(env.Tv()), + &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }) + return commands, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil + } + } - s.Logger.Info("Wait for sticky timeout to fire. Sleep poller.StickyScheduleToStartTimeout+ seconds.", tag.Duration("StickyScheduleToStartTimeout", stickyScheduleToStartTimeout)) - time.Sleep(stickyScheduleToStartTimeout + 100*time.Millisecond) //nolint:forbidigo - s.Logger.Info("Sleep is done.") + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + s.Len(task.Messages, 1) + updRequestMsg := task.Messages[0] + + // This doesn't matter because WT times out before update is applied. + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + case 3: + // Update is still in registry and was sent again. 
+ updRequestMsg := task.Messages[0] + + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil + } + } - // Try to process update in workflow, poll from normal task queue. - res, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - // Speculative WFT timed out on sticky task queue. Server sent full history with sticky timeout event. - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. - 6 WorkflowTaskTimedOut - 7 WorkflowTaskScheduled {"Attempt":1} // Normal WT. - 8 WorkflowTaskStarted`, task.History) - - // Reject update, but WFT will still be in the history due to timeout on sticky queue. - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateRejectMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - s.NotNil(res) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT was written into the history because of timeout. - 6 WorkflowTaskTimedOut - 7 WorkflowTaskScheduled {"Attempt":1} // Second attempt WT is normal WT (clear stickiness reset attempts count). - 8 WorkflowTaskStarted - 9 WorkflowTaskCompleted // Normal WT is completed and events are in the history even update was rejected. 
- `, events) - }) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - t.Run("SpeculativeWorkflowTask_ScheduleToStartTimeoutOnNormalTaskQueue", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + // Drain first WT. + _, err = poller.PollAndProcessWorkflowTask() + s.NoError(err) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled {"TaskQueue": {"Kind": 1}} // Speculative WT timed out on normal(1) task queue. - 6 WorkflowTaskTimedOut - 7 WorkflowTaskScheduled {"Attempt":1} // Normal WT is scheduled. - 8 WorkflowTaskStarted -`, task.History) - return nil, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + updateResultCh := sendUpdateNoError(env, env.Tv()) - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + // Try to process update in workflow, but it takes more than WT timeout. So, WT times out. 
+ _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.Error(err) + s.Equal("Workflow task not found.", err.Error()) - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(7, updRequestMsg.GetEventId()) + // ensure correct metrics were recorded + snap := capture.Snapshot() - return s.UpdateRejectMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + var speculativeWorkflowTaskTimeoutTasks int + for _, m := range snap[metrics.TaskRequests.Name()] { + if m.Tags[metrics.OperationTagName] == metrics.TaskTypeTimerActiveTaskSpeculativeWorkflowTaskTimeout { + speculativeWorkflowTaskTimeoutTasks += 1 } + } + s.Equal(1, speculativeWorkflowTaskTimeoutTasks, "expected 1 speculative workflow task timeout task to be created") - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + var speculativeStartToCloseTimeouts int + for _, m := range snap[metrics.StartToCloseTimeoutCounter.Name()] { + if m.Tags[metrics.OperationTagName] == metrics.TaskTypeTimerActiveTaskSpeculativeWorkflowTaskTimeout { + speculativeStartToCloseTimeouts += 1 } + } + s.Equal(1, speculativeStartToCloseTimeouts, "expected 1 timeout of a speculative workflow task timeout task") - // Drain existing WT from normal task queue. - _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) + // New normal WT was created on server after speculative WT has timed out. + // It will accept and complete update first and workflow itself with the same WT. 
+ res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + updateResp := res.NewTask + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + s.EqualValues(0, updateResp.ResetHistoryEventId) + s.Nil(updateResp.GetWorkflowTask()) - // Now send an update. It will create a speculative WT on normal task queue, - // which will time out in 5 seconds and create new normal WT. - updateResultCh := sendUpdateNoError(s, s.Tv()) + s.Equal(3, wtHandlerCalls) + s.Equal(3, msgHandlerCalls) - // TODO: it would be nice to shutdown matching before sending an update to emulate case which is actually being tested here. - // But test infrastructure doesn't support it. 5 seconds sleep will cause same observable effect. - s.Logger.Info("Sleep 5+ seconds to make sure tasks.SpeculativeWorkflowTaskScheduleToStartTimeout time has passed.") - time.Sleep(5*time.Second + 100*time.Millisecond) //nolint:forbidigo - s.Logger.Info("Sleep 5+ seconds is done.") + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - // Process update in workflow. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - updateResult := <-updateResultCh - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) - s.EqualValues(0, res.NewTask.ResetHistoryEventId, "no reset of event ID should happened after update rejection if it was delivered with normal workflow task") - - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled {"TaskQueue": {"Kind": 1}} // Speculative WT timed out on normal(1) task queue. 
- 6 WorkflowTaskTimedOut - 7 WorkflowTaskScheduled {"Attempt":1} // Normal WT is scheduled. Even update was rejected, WT is in the history. - 8 WorkflowTaskStarted - 9 WorkflowTaskCompleted -`, events) - }) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskTimedOut // Timeout of speculative WT writes events 5-7 + 8 WorkflowTaskScheduled {"Attempt":2 } + 9 WorkflowTaskStarted + 10 WorkflowTaskCompleted + 11 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 8} // WTScheduled event which delivered update to the worker. + 12 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 11} + 13 WorkflowExecutionCompleted`, events) +} - t.Run("StartedSpeculativeWorkflowTask_TerminateWorkflow", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) +func (s *WorkflowUpdateSuite) TestSpeculativeWorkflowTask_ScheduleToStartTimeout() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + // Drain first WT and respond with sticky enabled response to enable sticky task queue. + stickyScheduleToStartTimeout := 1 * time.Second + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + StickyAttributes: env.Tv().StickyExecutionAttributes(stickyScheduleToStartTimeout), + }, nil + }) + s.NoError(err) + + sendUpdateNoError(env, env.Tv()) + + env.Logger.Info("Wait for sticky timeout to fire. 
Sleep poller.StickyScheduleToStartTimeout+ seconds.", tag.Duration("StickyScheduleToStartTimeout", stickyScheduleToStartTimeout)) + time.Sleep(stickyScheduleToStartTimeout + 100*time.Millisecond) //nolint:forbidigo + env.Logger.Info("Sleep is done.") + + // Try to process update in workflow, poll from normal task queue. + res, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + // Speculative WFT timed out on sticky task queue. Server sent full history with sticky timeout event. + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. + 6 WorkflowTaskTimedOut + 7 WorkflowTaskScheduled {"Attempt":1} // Normal WT. + 8 WorkflowTaskStarted`, task.History) + + // Reject update, but WFT will still be in the history due to timeout on sticky queue. + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateRejectMessages(env.Tv(), task.Messages[0]), + }, nil + }) + s.NoError(err) + s.NotNil(res) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - // Terminate workflow while speculative WT is running. 
- _, err := s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(s.Context()), &workflowservice.TerminateWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), - WorkflowExecution: s.Tv().WorkflowExecution(), - Reason: s.Tv().Any().String(), - }) - s.NoError(err) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT was written into the history because of timeout. + 6 WorkflowTaskTimedOut + 7 WorkflowTaskScheduled {"Attempt":1} // Second attempt WT is normal WT (clear stickiness reset attempts count). + 8 WorkflowTaskStarted + 9 WorkflowTaskCompleted // Normal WT is completed and events are in the history even update was rejected. + `, events) +} - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. - 6 WorkflowTaskStarted`, task.History) - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } +func (s *WorkflowUpdateSuite) TestSpeculativeWorkflowTask_ScheduleToStartTimeoutOnNormalTaskQueue() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled {"TaskQueue": {"Kind": 1}} // Speculative WT timed out on normal(1) task queue. 
+ 6 WorkflowTaskTimedOut + 7 WorkflowTaskScheduled {"Attempt":1} // Normal WT is scheduled. + 8 WorkflowTaskStarted + `, task.History) + return nil, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(7, updRequestMsg.GetEventId()) + + return env.UpdateRejectMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), - } + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + 
Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + // Drain existing WT from normal task queue. + _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) - oneSecondTimeoutCtx, cancel := context.WithTimeout(s.Context(), 1*time.Second) - defer cancel() - updateResultCh := sendUpdate(oneSecondTimeoutCtx, s, s.Tv()) + // Now send an update. It will create a speculative WT on normal task queue, + // which will time out in 5 seconds and create new normal WT. + updateResultCh := sendUpdateNoError(env, env.Tv()) - // Process update in workflow. - _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.Error(err) - s.ErrorAs(err, new(*serviceerror.NotFound)) - s.ErrorContains(err, "Workflow task not found.") - - updateResult := <-updateResultCh - s.Error(updateResult.err) - var notFound *serviceerror.NotFound - s.ErrorAs(updateResult.err, ¬Found) - s.ErrorContains(updateResult.err, update.AbortedByWorkflowClosingErr.Error()) - s.Nil(updateResult.response) - - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT was converted to normal WT during termination. 
- 6 WorkflowTaskStarted - 7 WorkflowTaskFailed - 8 WorkflowExecutionTerminated`, events) - - msResp, err := s.AdminClient().DescribeMutableState(testcore.NewContext(s.Context()), &adminservice.DescribeMutableStateRequest{ - Namespace: s.Namespace().String(), - Execution: s.Tv().WorkflowExecution(), - Archetype: chasm.WorkflowArchetype, - }) - s.NoError(err) - s.EqualValues(7, msResp.GetDatabaseMutableState().GetExecutionInfo().GetCompletionEventBatchId(), "completion_event_batch_id should point to WTFailed event") - }) + // TODO: it would be nice to shutdown matching before sending an update to emulate case which is actually being tested here. + // But test infrastructure doesn't support it. 5 seconds sleep will cause same observable effect. + env.Logger.Info("Sleep 5+ seconds to make sure tasks.SpeculativeWorkflowTaskScheduleToStartTimeout time has passed.") + time.Sleep(5*time.Second + 100*time.Millisecond) //nolint:forbidigo + env.Logger.Info("Sleep 5+ seconds is done.") - t.Run("ScheduledSpeculativeWorkflowTask_TerminateWorkflow", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + // Process update in workflow. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + updateResult := <-updateResultCh + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(0, res.NewTask.ResetHistoryEventId, "no reset of event ID should happened after update rejection if it was delivered with normal workflow task") - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. 
- return nil, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled {"TaskQueue": {"Kind": 1}} // Speculative WT timed out on normal(1) task queue. + 6 WorkflowTaskTimedOut + 7 WorkflowTaskScheduled {"Attempt":1} // Normal WT is scheduled. Even update was rejected, WT is in the history. + 8 WorkflowTaskStarted + 9 WorkflowTaskCompleted + `, events) +} + +func (s *WorkflowUpdateSuite) TestStartedSpeculativeWorkflowTask_TerminateWorkflow() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + // Terminate workflow while speculative WT is running. 
+ _, err := env.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(env.Context()), &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + WorkflowExecution: env.Tv().WorkflowExecution(), + Reason: env.Tv().Any().String(), + }) + s.NoError(err) + + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. + 6 WorkflowTaskStarted`, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. 
- _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - oneSecondTimeoutCtx, cancel := context.WithTimeout(s.Context(), 1*time.Second) - defer cancel() - updateResultCh := sendUpdate(oneSecondTimeoutCtx, s, s.Tv()) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - // Terminate workflow after speculative WT is scheduled but not started. - _, err = s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(s.Context()), &workflowservice.TerminateWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), - WorkflowExecution: s.Tv().WorkflowExecution(), - Reason: s.Tv().Any().String(), - }) - s.NoError(err) + oneSecondTimeoutCtx, cancel := context.WithTimeout(env.Context(), 1*time.Second) + defer cancel() + updateResultCh := sendUpdate(oneSecondTimeoutCtx, env, env.Tv()) - updateResult := <-updateResultCh - s.Error(updateResult.err) - var notFound *serviceerror.NotFound - s.ErrorAs(updateResult.err, ¬Found) - s.ErrorContains(updateResult.err, update.AbortedByWorkflowClosingErr.Error()) - s.Nil(updateResult.response) - - s.Equal(1, wtHandlerCalls) - s.Equal(1, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionTerminated // Speculative WTScheduled event is not written to history if WF is terminated. 
-`, events) - - msResp, err := s.AdminClient().DescribeMutableState(testcore.NewContext(s.Context()), &adminservice.DescribeMutableStateRequest{ - Namespace: s.Namespace().String(), - Execution: s.Tv().WorkflowExecution(), - Archetype: chasm.WorkflowArchetype, - }) - s.NoError(err) - s.EqualValues(5, msResp.GetDatabaseMutableState().GetExecutionInfo().GetCompletionEventBatchId(), "completion_event_batch_id should point to WFTerminated event") + // Process update in workflow. + _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.Error(err) + s.ErrorAs(err, new(*serviceerror.NotFound)) + s.ErrorContains(err, "Workflow task not found.") + + updateResult := <-updateResultCh + s.Error(updateResult.err) + var notFound *serviceerror.NotFound + s.ErrorAs(updateResult.err, ¬Found) + s.ErrorContains(updateResult.err, update.AbortedByWorkflowClosingErr.Error()) + s.Nil(updateResult.response) + + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) + + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT was converted to normal WT during termination. 
+ 6 WorkflowTaskStarted + 7 WorkflowTaskFailed + 8 WorkflowExecutionTerminated`, events) + + msResp, err := env.AdminClient().DescribeMutableState(testcore.NewContext(env.Context()), &adminservice.DescribeMutableStateRequest{ + Namespace: env.Namespace().String(), + Execution: env.Tv().WorkflowExecution(), + Archetype: chasm.WorkflowArchetype, }) + s.NoError(err) + s.EqualValues(7, msResp.GetDatabaseMutableState().GetExecutionInfo().GetCompletionEventBatchId(), "completion_event_batch_id should point to WTFailed event") +} - t.Run("CompleteWorkflow_AbortUpdates", func(t *testing.T) { - type testCase struct { - name string - description string - updateErr map[string]string // Update error by completionCommand.Name. - updateFailure string - commands func(s *testcore.FunctionalTestBase, tv *testvars.TestVars) []*commandpb.Command - messages func(s *testcore.FunctionalTestBase, tv *testvars.TestVars, updRequestMsg *protocolpb.Message) []*protocolpb.Message +func (s *WorkflowUpdateSuite) TestScheduledSpeculativeWorkflowTask_TerminateWorkflow() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. 
+ return nil, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } - type completionCommand struct { - name string - finalStatus enumspb.WorkflowExecutionStatus - useRunID bool - command func(tv *testvars.TestVars) *commandpb.Command + } + + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } - testCases := []testCase{ - { - name: "update admitted", - description: "update in stateAdmitted must get an error", - updateErr: map[string]string{ - "workflow completed": update.AbortedByWorkflowClosingErr.Error(), - "workflow continued as new without runID": "workflow operation can not be applied because workflow is closing", - "workflow continued as new with runID": "workflow operation can not be applied because workflow is closing", - "workflow failed": update.AbortedByWorkflowClosingErr.Error(), - }, - updateFailure: "", - commands: func(s *testcore.FunctionalTestBase, _ *testvars.TestVars) []*commandpb.Command { return nil }, - messages: func(s *testcore.FunctionalTestBase, _ *testvars.TestVars, _ *protocolpb.Message) []*protocolpb.Message { - return nil - }, + } + + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } + + // Drain first WT. 
+ _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) + + oneSecondTimeoutCtx, cancel := context.WithTimeout(env.Context(), 1*time.Second) + defer cancel() + updateResultCh := sendUpdate(oneSecondTimeoutCtx, env, env.Tv()) + + // Terminate workflow after speculative WT is scheduled but not started. + _, err = env.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(env.Context()), &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + WorkflowExecution: env.Tv().WorkflowExecution(), + Reason: env.Tv().Any().String(), + }) + s.NoError(err) + + updateResult := <-updateResultCh + s.Error(updateResult.err) + var notFound *serviceerror.NotFound + s.ErrorAs(updateResult.err, ¬Found) + s.ErrorContains(updateResult.err, update.AbortedByWorkflowClosingErr.Error()) + s.Nil(updateResult.response) + + s.Equal(1, wtHandlerCalls) + s.Equal(1, msgHandlerCalls) + + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionTerminated // Speculative WTScheduled event is not written to history if WF is terminated. + `, events) + + msResp, err := env.AdminClient().DescribeMutableState(testcore.NewContext(env.Context()), &adminservice.DescribeMutableStateRequest{ + Namespace: env.Namespace().String(), + Execution: env.Tv().WorkflowExecution(), + Archetype: chasm.WorkflowArchetype, + }) + s.NoError(err) + s.EqualValues(5, msResp.GetDatabaseMutableState().GetExecutionInfo().GetCompletionEventBatchId(), "completion_event_batch_id should point to WFTerminated event") +} + +func (s *WorkflowUpdateSuite) TestCompleteWorkflow_AbortUpdates() { + type testCase struct { + name string + description string + updateErr map[string]string // Update error by completionCommand.Name. 
+ updateFailure string + commands func(env *testcore.TestEnv, tv *testvars.TestVars) []*commandpb.Command + messages func(env *testcore.TestEnv, tv *testvars.TestVars, updRequestMsg *protocolpb.Message) []*protocolpb.Message + } + type completionCommand struct { + name string + finalStatus enumspb.WorkflowExecutionStatus + useRunID bool + command func(tv *testvars.TestVars) *commandpb.Command + } + testCases := []testCase{ + { + name: "update admitted", + description: "update in stateAdmitted must get an error", + updateErr: map[string]string{ + "workflow completed": update.AbortedByWorkflowClosingErr.Error(), + "workflow continued as new without runID": "workflow operation can not be applied because workflow is closing", + "workflow continued as new with runID": "workflow operation can not be applied because workflow is closing", + "workflow failed": update.AbortedByWorkflowClosingErr.Error(), }, - { - name: "update accepted", - description: "update in stateAccepted must get an update failure", - updateErr: map[string]string{"*": ""}, - updateFailure: "Workflow Update failed because the Workflow completed before the Update completed.", - commands: func(s *testcore.FunctionalTestBase, tv *testvars.TestVars) []*commandpb.Command { - return s.UpdateAcceptCommands(tv) - }, - messages: func(s *testcore.FunctionalTestBase, tv *testvars.TestVars, updRequestMsg *protocolpb.Message) []*protocolpb.Message { - return s.UpdateAcceptMessages(tv, updRequestMsg) - }, + updateFailure: "", + commands: func(env *testcore.TestEnv, _ *testvars.TestVars) []*commandpb.Command { return nil }, + messages: func(env *testcore.TestEnv, _ *testvars.TestVars, _ *protocolpb.Message) []*protocolpb.Message { + return nil }, - { - name: "update completed", - description: "completed update must not be affected by workflow completion", - updateErr: map[string]string{"*": ""}, - updateFailure: "", - commands: func(s *testcore.FunctionalTestBase, tv *testvars.TestVars) []*commandpb.Command { - return 
s.UpdateAcceptCompleteCommands(tv) - }, - messages: func(s *testcore.FunctionalTestBase, tv *testvars.TestVars, updRequestMsg *protocolpb.Message) []*protocolpb.Message { - return s.UpdateAcceptCompleteMessages(tv, updRequestMsg) - }, + }, + { + name: "update accepted", + description: "update in stateAccepted must get an update failure", + updateErr: map[string]string{"*": ""}, + updateFailure: "Workflow Update failed because the Workflow completed before the Update completed.", + commands: func(env *testcore.TestEnv, tv *testvars.TestVars) []*commandpb.Command { + return env.UpdateAcceptCommands(tv) }, - { - name: "update rejected", - description: "rejected update must be rejected with rejection from workflow", - updateErr: map[string]string{"*": ""}, - updateFailure: "rejection-of-", // Rejection from workflow. - commands: func(s *testcore.FunctionalTestBase, tv *testvars.TestVars) []*commandpb.Command { return nil }, - messages: func(s *testcore.FunctionalTestBase, tv *testvars.TestVars, updRequestMsg *protocolpb.Message) []*protocolpb.Message { - return s.UpdateRejectMessages(tv, updRequestMsg) - }, + messages: func(env *testcore.TestEnv, tv *testvars.TestVars, updRequestMsg *protocolpb.Message) []*protocolpb.Message { + return env.UpdateAcceptMessages(tv, updRequestMsg) }, - } + }, + { + name: "update completed", + description: "completed update must not be affected by workflow completion", + updateErr: map[string]string{"*": ""}, + updateFailure: "", + commands: func(env *testcore.TestEnv, tv *testvars.TestVars) []*commandpb.Command { + return env.UpdateAcceptCompleteCommands(tv) + }, + messages: func(env *testcore.TestEnv, tv *testvars.TestVars, updRequestMsg *protocolpb.Message) []*protocolpb.Message { + return env.UpdateAcceptCompleteMessages(tv, updRequestMsg) + }, + }, + { + name: "update rejected", + description: "rejected update must be rejected with rejection from workflow", + updateErr: map[string]string{"*": ""}, + updateFailure: "rejection-of-", // 
Rejection from workflow. + commands: func(env *testcore.TestEnv, tv *testvars.TestVars) []*commandpb.Command { return nil }, + messages: func(env *testcore.TestEnv, tv *testvars.TestVars, updRequestMsg *protocolpb.Message) []*protocolpb.Message { + return env.UpdateRejectMessages(tv, updRequestMsg) + }, + }, + } - workflowCompletionCommands := []completionCommand{ - { - name: "workflow completed", - finalStatus: enumspb.WORKFLOW_EXECUTION_STATUS_COMPLETED, - useRunID: false, - command: func(_ *testvars.TestVars) *commandpb.Command { - return &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - } - }, + workflowCompletionCommands := []completionCommand{ + { + name: "workflow completed", + finalStatus: enumspb.WORKFLOW_EXECUTION_STATUS_COMPLETED, + useRunID: false, + command: func(_ *testvars.TestVars) *commandpb.Command { + return &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + } }, - { - name: "workflow continued as new with runID", - finalStatus: enumspb.WORKFLOW_EXECUTION_STATUS_CONTINUED_AS_NEW, - useRunID: true, - command: func(tv *testvars.TestVars) *commandpb.Command { - return &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_CONTINUE_AS_NEW_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_ContinueAsNewWorkflowExecutionCommandAttributes{ContinueAsNewWorkflowExecutionCommandAttributes: &commandpb.ContinueAsNewWorkflowExecutionCommandAttributes{ - WorkflowType: tv.WorkflowType(), - TaskQueue: tv.TaskQueue(), - }}, - } - }, + }, + { + name: "workflow continued as new with runID", + finalStatus: 
enumspb.WORKFLOW_EXECUTION_STATUS_CONTINUED_AS_NEW, + useRunID: true, + command: func(tv *testvars.TestVars) *commandpb.Command { + return &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_CONTINUE_AS_NEW_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_ContinueAsNewWorkflowExecutionCommandAttributes{ContinueAsNewWorkflowExecutionCommandAttributes: &commandpb.ContinueAsNewWorkflowExecutionCommandAttributes{ + WorkflowType: tv.WorkflowType(), + TaskQueue: tv.TaskQueue(), + }}, + } }, - { - name: "workflow continued as new without runID", - finalStatus: enumspb.WORKFLOW_EXECUTION_STATUS_RUNNING, // This is the status of new run because update doesn't go to particular runID. - useRunID: false, - command: func(tv *testvars.TestVars) *commandpb.Command { - return &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_CONTINUE_AS_NEW_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_ContinueAsNewWorkflowExecutionCommandAttributes{ContinueAsNewWorkflowExecutionCommandAttributes: &commandpb.ContinueAsNewWorkflowExecutionCommandAttributes{ - WorkflowType: tv.WorkflowType(), - TaskQueue: tv.TaskQueue(), - }}, - } - }, + }, + { + name: "workflow continued as new without runID", + finalStatus: enumspb.WORKFLOW_EXECUTION_STATUS_RUNNING, // This is the status of new run because update doesn't go to particular runID. 
+ useRunID: false, + command: func(tv *testvars.TestVars) *commandpb.Command { + return &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_CONTINUE_AS_NEW_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_ContinueAsNewWorkflowExecutionCommandAttributes{ContinueAsNewWorkflowExecutionCommandAttributes: &commandpb.ContinueAsNewWorkflowExecutionCommandAttributes{ + WorkflowType: tv.WorkflowType(), + TaskQueue: tv.TaskQueue(), + }}, + } }, - { - name: "workflow failed", - finalStatus: enumspb.WORKFLOW_EXECUTION_STATUS_FAILED, - useRunID: true, - command: func(tv *testvars.TestVars) *commandpb.Command { - return &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_FAIL_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_FailWorkflowExecutionCommandAttributes{FailWorkflowExecutionCommandAttributes: &commandpb.FailWorkflowExecutionCommandAttributes{ - Failure: tv.Any().ApplicationFailure(), - }}, - } - }, + }, + { + name: "workflow failed", + finalStatus: enumspb.WORKFLOW_EXECUTION_STATUS_FAILED, + useRunID: true, + command: func(tv *testvars.TestVars) *commandpb.Command { + return &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_FAIL_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_FailWorkflowExecutionCommandAttributes{FailWorkflowExecutionCommandAttributes: &commandpb.FailWorkflowExecutionCommandAttributes{ + Failure: tv.Any().ApplicationFailure(), + }}, + } }, - } - - for _, tc := range testCases { - for _, wfCC := range workflowCompletionCommands { - t.Run(tc.name+" "+wfCC.name, func(t *testing.T) { - s := testcore.NewEnv(t) - runID := mustStartWorkflow(s, s.Tv()) - tv := s.Tv() - if wfCC.useRunID { - tv = tv.WithRunID(runID) - } + }, + } - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. 
- return nil, nil - case 2: - return append(tc.commands(s.FunctionalTestBase, s.Tv()), wfCC.command(s.Tv())), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + for _, tc := range testCases { + for _, wfCC := range workflowCompletionCommands { + s.Run(tc.name+" "+wfCC.name, func(s *WorkflowUpdateSuite) { + env := testcore.NewEnv(s.T()) + runID := mustStartWorkflow(env, env.Tv()) + tv := env.Tv() + if wfCC.useRunID { + tv = tv.WithRunID(runID) + } - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - return tc.messages(s.FunctionalTestBase, s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. 
+ return nil, nil + case 2: + return append(tc.commands(env, env.Tv()), wfCC.command(env.Tv())), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + return tc.messages(env, env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - updateResultCh := sendUpdate(testcore.NewContext(s.Context()), s, tv) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - // Complete workflow. 
- _, err = poller.PollAndProcessWorkflowTask() - s.NoError(err) + updateResultCh := sendUpdate(testcore.NewContext(env.Context()), env, tv) - updateResult := <-updateResultCh - expectedUpdateErr := tc.updateErr[wfCC.name] - if expectedUpdateErr == "" { - expectedUpdateErr = tc.updateErr["*"] - } - if expectedUpdateErr != "" { - s.Error(updateResult.err, tc.description) - s.Equal(expectedUpdateErr, updateResult.err.Error()) - } else { - s.NoError(updateResult.err, tc.description) - } + // Complete workflow. + _, err = poller.PollAndProcessWorkflowTask() + s.NoError(err) - if tc.updateFailure != "" { - s.NotNil(updateResult.response.GetOutcome().GetFailure(), tc.description) - s.Contains(updateResult.response.GetOutcome().GetFailure().GetMessage(), tc.updateFailure, tc.description) - } else { - s.Nil(updateResult.response.GetOutcome().GetFailure(), tc.description) - } + updateResult := <-updateResultCh + expectedUpdateErr := tc.updateErr[wfCC.name] + if expectedUpdateErr == "" { + expectedUpdateErr = tc.updateErr["*"] + } + if expectedUpdateErr != "" { + s.Error(updateResult.err, tc.description) + s.Equal(expectedUpdateErr, updateResult.err.Error()) + } else { + s.NoError(updateResult.err, tc.description) + } - if expectedUpdateErr == "" && tc.updateFailure == "" { - s.Equal(runID, updateResult.response.GetUpdateRef().GetWorkflowExecution().GetRunId(), "update wasn't applied to the same run as was started") - } + if tc.updateFailure != "" { + s.NotNil(updateResult.response.GetOutcome().GetFailure(), tc.description) + s.Contains(updateResult.response.GetOutcome().GetFailure().GetMessage(), tc.updateFailure, tc.description) + } else { + s.Nil(updateResult.response.GetOutcome().GetFailure(), tc.description) + } - // Check that update didn't block workflow completion. 
- descResp, err := s.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(s.Context()), &workflowservice.DescribeWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), - Execution: tv.WorkflowExecution(), - }) - s.NoError(err) - s.Equal(wfCC.finalStatus, descResp.WorkflowExecutionInfo.Status) + if expectedUpdateErr == "" && tc.updateFailure == "" { + s.Equal(runID, updateResult.response.GetUpdateRef().GetWorkflowExecution().GetRunId(), "update wasn't applied to the same run as was started") + } - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) + // Check that update didn't block workflow completion. + descResp, err := env.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(env.Context()), &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + Execution: tv.WorkflowExecution(), }) - } - } - }) - - t.Run("SpeculativeWorkflowTask_Heartbeat", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) - - // Drain first WT. - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) - - updateResultCh := sendUpdateNoError(s, s.Tv()) - - // Heartbeat from speculative WT (no messages, no commands). - var updRequestMsg *protocolpb.Message - res, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Events (5 and 6) are for speculative WT, but they won't disappear after reject because speculative WT is converted to normal during heartbeat. 
- 6 WorkflowTaskStarted - `, task.History) - - s.Len(task.Messages, 1) - updRequestMsg = task.Messages[0] - s.EqualValues(5, updRequestMsg.GetEventId()) + s.NoError(err) + s.Equal(wfCC.finalStatus, descResp.WorkflowExecutionInfo.Status) - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - ReturnNewWorkflowTask: true, - ForceCreateNewWorkflowTask: true, - }, nil + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) }) - s.NoError(err) - - // Reject update from workflow. - updateResp, err := s.TaskPoller().HandleWorkflowTask(s.Tv(), - res.GetWorkflowTask(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - s.EqualHistory(` - 7 WorkflowTaskCompleted - 8 WorkflowTaskScheduled // New WT (after heartbeat) is normal and won't disappear from the history after reject. - 9 WorkflowTaskStarted - `, task.History) - - s.Empty(task.Messages) + } + } +} - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateRejectMessages(s.Tv(), updRequestMsg), - }, nil - }) - s.NoError(err) - s.NotNil(updateResp) +func (s *WorkflowUpdateSuite) TestSpeculativeWorkflowTask_Heartbeat() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + // Drain first WT. + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) + + updateResultCh := sendUpdateNoError(env, env.Tv()) + + // Heartbeat from speculative WT (no messages, no commands). 
+ var updRequestMsg *protocolpb.Message + res, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Events (5 and 6) are for speculative WT, but they won't disappear after reject because speculative WT is converted to normal during heartbeat. + 6 WorkflowTaskStarted + `, task.History) + + s.Len(task.Messages, 1) + updRequestMsg = task.Messages[0] + s.EqualValues(5, updRequestMsg.GetEventId()) + + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + ReturnNewWorkflowTask: true, + ForceCreateNewWorkflowTask: true, + }, nil + }) + s.NoError(err) + + // Reject update from workflow. + updateResp, err := env.TaskPoller().HandleWorkflowTask(env.Tv(), + res.GetWorkflowTask(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + s.EqualHistory(` + 7 WorkflowTaskCompleted + 8 WorkflowTaskScheduled // New WT (after heartbeat) is normal and won't disappear from the history after reject. 
+ 9 WorkflowTaskStarted + `, task.History) + + s.Empty(task.Messages) + + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateRejectMessages(env.Tv(), updRequestMsg), + }, nil + }) + s.NoError(err) + s.NotNil(updateResp) - updateResult := <-updateResultCh - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) - s.EqualValues(0, updateResp.ResetHistoryEventId, "no reset of event ID should happened after update rejection because of heartbeat") + updateResult := <-updateResultCh + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(0, updateResp.ResetHistoryEventId, "no reset of event ID should happened after update rejection because of heartbeat") - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` + s.EqualHistoryEvents(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -3313,757 +3342,757 @@ func TestWorkflowUpdateSuite(t *testing.T) { 9 WorkflowTaskStarted 10 WorkflowTaskCompleted // After heartbeat new normal WT was created and events are written into the history even update is rejected. `, events) - }) - - t.Run("ScheduledSpeculativeWorkflowTask_LostUpdate", func(t *testing.T) { - s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - mustStartWorkflow(s, s.Tv()) - - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. 
- return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionSignaled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted`, task.History) - return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }}, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } +} - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - s.Empty(task.Messages, "update lost due to lost update registry") - return nil, nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } +func (s *WorkflowUpdateSuite) TestScheduledSpeculativeWorkflowTask_LostUpdate() { + env := testcore.NewEnv(s.T(), testcore.WithDedicatedCluster()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. 
+ return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionSignaled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted`, task.History) + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }}, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + s.Empty(task.Messages, "update lost due to lost update registry") + return nil, nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. 
- _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) - - halfSecondTimeoutCtx, cancel := context.WithTimeout(s.Context(), 500*time.Millisecond) - defer cancel() - updateResult := <-sendUpdate(halfSecondTimeoutCtx, s, s.Tv()) - s.Error(updateResult.err) - s.True(common.IsContextDeadlineExceededErr(updateResult.err), updateResult.err.Error()) - s.Nil(updateResult.response) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - // Lose update registry. Speculative WFT and update registry disappear. - loseUpdateRegistryAndAbandonPendingUpdates(s, s.Tv()) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) + + halfSecondTimeoutCtx, cancel := context.WithTimeout(env.Context(), 500*time.Millisecond) + defer cancel() + updateResult := <-sendUpdate(halfSecondTimeoutCtx, env, env.Tv()) + s.Error(updateResult.err) + s.True(common.IsContextDeadlineExceededErr(updateResult.err), updateResult.err.Error()) + s.Nil(updateResult.response) + + // Lose update registry. Speculative WFT and update registry disappear. + loseUpdateRegistryAndAbandonPendingUpdates(env, env.Tv()) + + // Ensure, there is no WFT. + pollCtx, cancel := context.WithTimeout(env.Context(), common.MinLongPollTimeout*2) + defer cancel() + pollResponse, err := env.FrontendClient().PollWorkflowTaskQueue(pollCtx, &workflowservice.PollWorkflowTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + }) + s.NoError(err) + s.Nil(pollResponse.Messages, "there should not be new WFT with messages") - // Ensure, there is no WFT. 
- pollCtx, cancel := context.WithTimeout(s.Context(), common.MinLongPollTimeout*2) - defer cancel() - pollResponse, err := s.FrontendClient().PollWorkflowTaskQueue(pollCtx, &workflowservice.PollWorkflowTaskQueueRequest{ - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - }) - s.NoError(err) - s.Nil(pollResponse.Messages, "there should not be new WFT with messages") + // Send signal to schedule new WT. + err = env.SendSignal(env.Namespace().String(), env.Tv().WorkflowExecution(), env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.NoError(err) - // Send signal to schedule new WT. - err = s.SendSignal(s.Namespace().String(), s.Tv().WorkflowExecution(), s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.NoError(err) + // Complete workflow and check that there is update messages. + completeWorkflowResp, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) + s.NotNil(completeWorkflowResp) - // Complete workflow and check that there is update messages. 
- completeWorkflowResp, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) - s.NotNil(completeWorkflowResp) - - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionSignaled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted - 9 WorkflowExecutionCompleted`, events) - }) + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - t.Run("StartedSpeculativeWorkflowTask_LostUpdate", func(t *testing.T) { - s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - mustStartWorkflow(s, s.Tv()) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. Events 5 and 6 will be lost. - 6 WorkflowTaskStarted -`, task.History) - - // Lose update registry. Update is lost and NotFound error will be returned to RespondWorkflowTaskCompleted. 
- loseUpdateRegistryAndAbandonPendingUpdates(s, s.Tv()) - - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - case 3: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionSignaled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted -`, task.History) - return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }}, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionSignaled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted + 9 WorkflowExecutionCompleted`, events) +} - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - s.EqualValues(5, updRequestMsg.GetEventId()) +func (s *WorkflowUpdateSuite) TestStartedSpeculativeWorkflowTask_LostUpdate() { + env := testcore.NewEnv(s.T(), testcore.WithDedicatedCluster()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. Events 5 and 6 will be lost. 
+ 6 WorkflowTaskStarted + `, task.History) - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - case 3: - s.Empty(task.Messages, "no messages since update registry was lost") - return nil, nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + // Lose update registry. Update is lost and NotFound error will be returned to RespondWorkflowTaskCompleted. + loseUpdateRegistryAndAbandonPendingUpdates(env, env.Tv()) + + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + case 3: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionSignaled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted + `, task.History) + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }}, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + s.EqualValues(5, updRequestMsg.GetEventId()) + + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + case 3: + s.Empty(task.Messages, "no messages since update 
registry was lost") + return nil, nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - halfSecondTimeoutCtx, cancel := context.WithTimeout(s.Context(), 500*time.Millisecond) - defer cancel() - updateResultCh := sendUpdate(halfSecondTimeoutCtx, s, s.Tv()) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - // Process update in workflow. - _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.Error(err) - s.ErrorAs(err, new(*serviceerror.NotFound)) - s.ErrorContains(err, "Workflow task not found") + halfSecondTimeoutCtx, cancel := context.WithTimeout(env.Context(), 500*time.Millisecond) + defer cancel() + updateResultCh := sendUpdate(halfSecondTimeoutCtx, env, env.Tv()) - updateResult := <-updateResultCh - s.Error(updateResult.err) - s.True(common.IsContextDeadlineExceededErr(updateResult.err), updateResult.err.Error()) - s.Nil(updateResult.response) + // Process update in workflow. + _, err = poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.Error(err) + s.ErrorAs(err, new(*serviceerror.NotFound)) + s.ErrorContains(err, "Workflow task not found") - // Send signal to schedule new WFT. 
- err = s.SendSignal(s.Namespace().String(), s.Tv().WorkflowExecution(), s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.NoError(err) + updateResult := <-updateResultCh + s.Error(updateResult.err) + s.True(common.IsContextDeadlineExceededErr(updateResult.err), updateResult.err.Error()) + s.Nil(updateResult.response) - // Complete workflow. - completeWorkflowResp, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) - s.NotNil(completeWorkflowResp) - - s.Equal(3, wtHandlerCalls) - s.Equal(3, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionSignaled - 6 WorkflowTaskScheduled - 7 WorkflowTaskStarted - 8 WorkflowTaskCompleted - 9 WorkflowExecutionCompleted`, events) - }) + // Send signal to schedule new WFT. + err = env.SendSignal(env.Namespace().String(), env.Tv().WorkflowExecution(), env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.NoError(err) - t.Run("FirstNormalWorkflowTask_UpdateResurrectedAfterRegistryCleared", func(t *testing.T) { - s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - mustStartWorkflow(s, s.Tv()) + // Complete workflow. + completeWorkflowResp, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) + s.NotNil(completeWorkflowResp) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted -`, task.History) - // Clear update registry. Update will be resurrected in registry from acceptance message. 
- clearUpdateRegistryAndAbortPendingUpdates(s, s.Tv()) - - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted - 6 WorkflowExecutionUpdateCompleted - 7 WorkflowExecutionSignaled - 8 WorkflowTaskScheduled - 9 WorkflowTaskStarted -`, task.History) - return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }}, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + s.Equal(3, wtHandlerCalls) + s.Equal(3, msgHandlerCalls) - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - updRequestMsg := task.Messages[0] - s.EqualValues(2, updRequestMsg.GetEventId()) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - case 2: - s.Empty(task.Messages, "update must be processed and not delivered again") - return nil, nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionSignaled + 6 WorkflowTaskScheduled + 7 WorkflowTaskStarted + 8 WorkflowTaskCompleted + 9 WorkflowExecutionCompleted`, events) +} + +func (s *WorkflowUpdateSuite) TestFirstNormalWorkflowTask_UpdateResurrectedAfterRegistryCleared() { + env := testcore.NewEnv(s.T(), 
testcore.WithDedicatedCluster()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + `, task.History) + // Clear update registry. Update will be resurrected in registry from acceptance message. + clearUpdateRegistryAndAbortPendingUpdates(env, env.Tv()) + + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted + 6 WorkflowExecutionUpdateCompleted + 7 WorkflowExecutionSignaled + 8 WorkflowTaskScheduled + 9 WorkflowTaskStarted + `, task.History) + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }}, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + updRequestMsg := task.Messages[0] + s.EqualValues(2, updRequestMsg.GetEventId()) + + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + case 2: + 
s.Empty(task.Messages, "update must be processed and not delivered again") + return nil, nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - updateResultCh := sendUpdateNoError(s, s.Tv()) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - // Process update in workflow. Update won't be found on server but will be resurrected from acceptance message and completed. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) + updateResultCh := sendUpdateNoError(env, env.Tv()) - // Client receives resurrected Update outcome. - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + // Process update in workflow. Update won't be found on server but will be resurrected from acceptance message and completed. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) - // Signal to create new WFT which shouldn't get any updates. - err = s.SendSignal(s.Namespace().String(), s.Tv().WorkflowExecution(), s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.NoError(err) + // Client receives resurrected Update outcome. + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - // Complete workflow. 
- completeWorkflowResp, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(completeWorkflowResp) - - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted - 6 WorkflowExecutionUpdateCompleted - 7 WorkflowExecutionSignaled - 8 WorkflowTaskScheduled - 9 WorkflowTaskStarted - 10 WorkflowTaskCompleted - 11 WorkflowExecutionCompleted`, events) - }) + // Signal to create new WFT which shouldn't get any updates. + err = env.SendSignal(env.Namespace().String(), env.Tv().WorkflowExecution(), env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.NoError(err) - t.Run("ScheduledSpeculativeWorkflowTask_DeduplicateID", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + // Complete workflow. + completeWorkflowResp, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(completeWorkflowResp) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. 
- 6 WorkflowTaskStarted -`, task.History) - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - s.Len(task.Messages, 1, "2nd update must be deduplicated by ID") - updRequestMsg := task.Messages[0] + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted + 6 WorkflowExecutionUpdateCompleted + 7 WorkflowExecutionSignaled + 8 WorkflowTaskScheduled + 9 WorkflowTaskStarted + 10 WorkflowTaskCompleted + 11 WorkflowExecutionCompleted`, events) +} + +func (s *WorkflowUpdateSuite) TestScheduledSpeculativeWorkflowTask_DeduplicateID() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. 
+ 6 WorkflowTaskStarted + `, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + s.Len(task.Messages, 1, "2nd update must be deduplicated by ID") + updRequestMsg := task.Messages[0] + + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - updateResultCh := sendUpdateNoError(s, s.Tv()) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - // Send second update with the same ID. - updateResultCh2 := sendUpdateNoError(s, s.Tv()) + updateResultCh := sendUpdateNoError(env, env.Tv()) - // Process update in workflow. 
- res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - updateResp := res.NewTask - updateResult := <-updateResultCh - updateResult2 := <-updateResultCh2 - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult2.GetOutcome().GetSuccess())) - s.EqualValues(0, updateResp.ResetHistoryEventId) - - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted - 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. - 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} -`, events) - }) + // Send second update with the same ID. + updateResultCh2 := sendUpdateNoError(env, env.Tv()) - t.Run("StartedSpeculativeWorkflowTask_DeduplicateID", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + // Process update in workflow. 
+ res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + updateResp := res.NewTask + updateResult := <-updateResultCh + updateResult2 := <-updateResultCh2 + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult2.GetOutcome().GetSuccess())) + s.EqualValues(0, updateResp.ResetHistoryEventId) - var updateResultCh2 <-chan *workflowservice.UpdateWorkflowExecutionResponse + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - // Send second update with the same ID when WT is started but not completed. - updateResultCh2 = sendUpdateNoError(s, s.Tv()) - - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. 
- 6 WorkflowTaskStarted -`, task.History) - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - s.Len(task.Messages, 1, "2nd update should not has reached server yet") - updRequestMsg := task.Messages[0] - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted + 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. + 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} + `, events) +} + +func (s *WorkflowUpdateSuite) TestStartedSpeculativeWorkflowTask_DeduplicateID() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + var updateResultCh2 <-chan *workflowservice.UpdateWorkflowExecutionResponse + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + // Send second update with the same ID when WT is started but not completed. 
+ updateResultCh2 = sendUpdateNoError(env, env.Tv()) + + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. + 6 WorkflowTaskStarted + `, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + s.Len(task.Messages, 1, "2nd update should not has reached server yet") + updRequestMsg := task.Messages[0] + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - updateResultCh := sendUpdateNoError(s, s.Tv()) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - // Process update in workflow. 
- res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - s.EqualValues(0, res.NewTask.ResetHistoryEventId) - - updateResult2 := <-updateResultCh2 - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult2.GetOutcome().GetSuccess())) - - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted - 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. - 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} -`, events) - }) + updateResultCh := sendUpdateNoError(env, env.Tv()) - t.Run("CompletedSpeculativeWorkflowTask_DeduplicateID", func(t *testing.T) { - testCases := []struct { - Name string - CloseShard bool - }{ - { - Name: "no shard reload", - CloseShard: false, - }, - { - Name: "with shard reload", - CloseShard: true, - }, - } + // Process update in workflow. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + s.EqualValues(0, res.NewTask.ResetHistoryEventId) - for _, tc := range testCases { - t.Run(tc.Name, func(t *testing.T) { - // Uses closeShard conditionally which requires a dedicated cluster. - var opts []testcore.TestOption - if tc.CloseShard { - opts = append(opts, testcore.WithDedicatedCluster()) - } - s := testcore.NewEnv(t, opts...) 
- mustStartWorkflow(s, s.Tv()) + updateResult2 := <-updateResultCh2 + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult2.GetOutcome().GetSuccess())) - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. - 6 WorkflowTaskStarted -`, task.History) - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - case 3: - return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }}, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - case 3: - s.Empty(task.Messages, "2nd update must be deduplicated by ID ") - return nil, nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 
WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted + 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. + 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} + `, events) +} + +func (s *WorkflowUpdateSuite) TestCompletedSpeculativeWorkflowTask_DeduplicateID() { + testCases := []struct { + Name string + CloseShard bool + }{ + { + Name: "no shard reload", + CloseShard: false, + }, + { + Name: "with shard reload", + CloseShard: true, + }, + } + + for _, tc := range testCases { + s.Run(tc.Name, func(s *WorkflowUpdateSuite) { + // Uses closeShard conditionally which requires a dedicated cluster. + var opts []testcore.TestOption + if tc.CloseShard { + opts = append(opts, testcore.WithDedicatedCluster()) + } + env := testcore.NewEnv(s.T(), opts...) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. 
+ 6 WorkflowTaskStarted + `, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + case 3: + return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }}, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + case 3: + s.Empty(task.Messages, "2nd update must be deduplicated by ID ") + return nil, nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) - - updateResultCh := sendUpdateNoError(s, s.Tv()) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - // Process update in workflow. 
- _, err = poller.PollAndProcessWorkflowTask() - s.NoError(err) - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - if tc.CloseShard { - // Close shard to make sure that for completed updates deduplication works even after shard reload. - closeShard(s, s.Tv().WorkflowID()) - } + updateResultCh := sendUpdateNoError(env, env.Tv()) - // Send second update with the same ID. It must return immediately. - updateResult2 := <-sendUpdateNoError(s, s.Tv()) + // Process update in workflow. + _, err = poller.PollAndProcessWorkflowTask() + s.NoError(err) + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - // Ensure, there is no new WT. - pollCtx, cancel := context.WithTimeout(s.Context(), common.MinLongPollTimeout*2) - defer cancel() - pollResponse, err := s.FrontendClient().PollWorkflowTaskQueue(pollCtx, &workflowservice.PollWorkflowTaskQueueRequest{ - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - }) - s.NoError(err) - s.Nil(pollResponse.Messages, "there must be no new WT") + if tc.CloseShard { + // Close shard to make sure that for completed updates deduplication works even after shard reload. + closeShard(env, env.Tv().WorkflowID()) + } - s.Equal( - "success-result-of-"+s.Tv().UpdateID(), - testcore.DecodeString(s.T(), updateResult2.GetOutcome().GetSuccess()), - "results of the first update must be available") + // Send second update with the same ID. It must return immediately. + updateResult2 := <-sendUpdateNoError(env, env.Tv()) - // Send signal to schedule new WT. 
- err = s.SendSignal(s.Namespace().String(), s.Tv().WorkflowExecution(), s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.NoError(err) + // Ensure, there is no new WT. + pollCtx, cancel := context.WithTimeout(env.Context(), common.MinLongPollTimeout*2) + defer cancel() + pollResponse, err := env.FrontendClient().PollWorkflowTaskQueue(pollCtx, &workflowservice.PollWorkflowTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + }) + s.NoError(err) + s.Nil(pollResponse.Messages, "there must be no new WT") - // Complete workflow. - completeWorkflowResp, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) - s.NotNil(completeWorkflowResp) + s.Equal( + "success-result-of-"+env.Tv().UpdateID(), + testcore.DecodeString(s.T(), updateResult2.GetOutcome().GetSuccess()), + "results of the first update must be available") - s.Equal(3, wtHandlerCalls) - s.Equal(3, msgHandlerCalls) + // Send signal to schedule new WT. + err = env.SendSignal(env.Namespace().String(), env.Tv().WorkflowExecution(), env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.NoError(err) - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) + // Complete workflow. + completeWorkflowResp, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) + s.NotNil(completeWorkflowResp) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted - 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. 
- 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} - 10 WorkflowExecutionSignaled - 11 WorkflowTaskScheduled - 12 WorkflowTaskStarted - 13 WorkflowTaskCompleted - 14 WorkflowExecutionCompleted -`, events) - }) - } - }) + s.Equal(3, wtHandlerCalls) + s.Equal(3, msgHandlerCalls) - t.Run("StaleSpeculativeWorkflowTask_Fail_BecauseOfDifferentStartedId", func(t *testing.T) { - s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - /* - Test scenario: - An update created a speculative WT and WT is dispatched to the worker (started). - Update registry is cleared, speculative WT disappears from server. - Update is retired and second speculative WT is scheduled but not dispatched yet. - An activity completes, it converts the 2nd speculative WT into normal one. - The first speculative WT responds back, server fails request because WorkflowTaskStarted event Id is mismatched. - The second speculative WT responds back and server completes it. - */ + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - mustStartWorkflow(s, s.Tv()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted + 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} // WTScheduled event which delivered update to the worker. + 9 WorkflowExecutionUpdateCompleted {"AcceptedEventId": 8} + 10 WorkflowExecutionSignaled + 11 WorkflowTaskScheduled + 12 WorkflowTaskStarted + 13 WorkflowTaskCompleted + 14 WorkflowExecutionCompleted + `, events) + }) + } +} - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Schedule activity. 
- return []*commandpb.Command{{ - CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, - Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ - ActivityId: s.Tv().ActivityID(), - ActivityType: s.Tv().ActivityType(), - TaskQueue: s.Tv().TaskQueue(), - ScheduleToCloseTimeout: s.Tv().Any().InfiniteTimeout(), - }}, - }}, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } +func (s *WorkflowUpdateSuite) TestStaleSpeculativeWorkflowTask_Fail_BecauseOfDifferentStartedId() { + env := testcore.NewEnv(s.T(), testcore.WithDedicatedCluster()) + /* + Test scenario: + An update created a speculative WT and WT is dispatched to the worker (started). + Update registry is cleared, speculative WT disappears from server. + Update is retired and second speculative WT is scheduled but not dispatched yet. + An activity completes, it converts the 2nd speculative WT into normal one. + The first speculative WT responds back, server fails request because WorkflowTaskStarted event Id is mismatched. + The second speculative WT responds back and server completes it. + */ + + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Schedule activity. 
+ return []*commandpb.Command{{ + CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, + Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ + ActivityId: env.Tv().ActivityID(), + ActivityType: env.Tv().ActivityType(), + TaskQueue: env.Tv().TaskQueue(), + ScheduleToCloseTimeout: env.Tv().Any().InfiniteTimeout(), + }}, + }}, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { - return s.Tv().Any().Payloads(), false, nil - } + atHandler := func(task *workflowservice.PollActivityTaskQueueResponse) (*commonpb.Payloads, bool, error) { + return env.Tv().Any().Payloads(), false, nil + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - WorkflowTaskHandler: wtHandler, - ActivityTaskHandler: atHandler, - Logger: s.Logger, - T: s.T(), - } + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + WorkflowTaskHandler: wtHandler, + ActivityTaskHandler: atHandler, + Logger: env.Logger, + T: s.T(), + } - // First WT will schedule activity. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) + // First WT will schedule activity. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) - // Send 1st update. It will create 2nd WT as speculative. - sendUpdateNoError(s, s.Tv()) + // Send 1st update. It will create 2nd WT as speculative. 
+ sendUpdateNoError(env, env.Tv()) - // Poll 2nd speculative WT with 1st update. - wt2, err := s.FrontendClient().PollWorkflowTaskQueue(testcore.NewContext(s.Context()), &workflowservice.PollWorkflowTaskQueueRequest{ - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - }) - s.NoError(err) - s.NotNil(wt2) - s.NotEmpty(wt2.TaskToken, "2nd workflow task must have valid task token") - s.Len(wt2.Messages, 1, "2nd workflow task must have a message with 1st update") - s.EqualValues(7, wt2.StartedEventId) - s.EqualValues(6, wt2.Messages[0].GetEventId()) - s.EqualHistory(` + // Poll 2nd speculative WT with 1st update. + wt2, err := env.FrontendClient().PollWorkflowTaskQueue(testcore.NewContext(env.Context()), &workflowservice.PollWorkflowTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + }) + s.NoError(err) + s.NotNil(wt2) + s.NotEmpty(wt2.TaskToken, "2nd workflow task must have valid task token") + s.Len(wt2.Messages, 1, "2nd workflow task must have a message with 1st update") + s.EqualValues(7, wt2.StartedEventId) + s.EqualValues(6, wt2.Messages[0].GetEventId()) + s.EqualHistory(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -4072,28 +4101,28 @@ func TestWorkflowUpdateSuite(t *testing.T) { 6 WorkflowTaskScheduled 7 WorkflowTaskStarted`, wt2.History) - // Clear update registry. Speculative WFT disappears from server. - clearUpdateRegistryAndAbortPendingUpdates(s, s.Tv()) + // Clear update registry. Speculative WFT disappears from server. + clearUpdateRegistryAndAbortPendingUpdates(env, env.Tv()) - // Wait for update request to be retry by frontend and recreated in registry. This will create a 3rd WFT as speculative. - waitUpdateAdmitted(s, s.Tv()) + // Wait for update request to be retry by frontend and recreated in registry. This will create a 3rd WFT as speculative. + waitUpdateAdmitted(env, env.Tv()) - // Before polling for the 3rd speculative WT, process activity. 
This will convert 3rd speculative WT to normal WT. - err = poller.PollAndProcessActivityTask(false) - s.NoError(err) + // Before polling for the 3rd speculative WT, process activity. This will convert 3rd speculative WT to normal WT. + err = poller.PollAndProcessActivityTask(false) + s.NoError(err) - // Poll the 3rd WFT (not speculative anymore) but must have 2nd update. - wt3, err := s.FrontendClient().PollWorkflowTaskQueue(testcore.NewContext(s.Context()), &workflowservice.PollWorkflowTaskQueueRequest{ - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - }) - s.NoError(err) - s.NotNil(wt3) - s.NotEmpty(wt3.TaskToken, "3rd workflow task must have valid task token") - s.Len(wt3.Messages, 1, "3rd workflow task must have a message with 2nd update") - s.EqualValues(9, wt3.StartedEventId) - s.EqualValues(8, wt3.Messages[0].GetEventId()) - s.EqualHistory(` + // Poll the 3rd WFT (not speculative anymore) but must have 2nd update. + wt3, err := env.FrontendClient().PollWorkflowTaskQueue(testcore.NewContext(env.Context()), &workflowservice.PollWorkflowTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + }) + s.NoError(err) + s.NotNil(wt3) + s.NotEmpty(wt3.TaskToken, "3rd workflow task must have valid task token") + s.Len(wt3.Messages, 1, "3rd workflow task must have a message with 2nd update") + s.EqualValues(9, wt3.StartedEventId) + s.EqualValues(8, wt3.Messages[0].GetEventId()) + s.EqualHistory(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -4104,28 +4133,28 @@ func TestWorkflowUpdateSuite(t *testing.T) { 8 ActivityTaskCompleted 9 WorkflowTaskStarted`, wt3.History) - // Now try to complete 2nd WT (speculative). It should fail because WorkflowTaskStarted event Id is mismatched. 
- _, err = s.FrontendClient().RespondWorkflowTaskCompleted(testcore.NewContext(s.Context()), &workflowservice.RespondWorkflowTaskCompletedRequest{ - Namespace: s.Namespace().String(), - TaskToken: wt2.TaskToken, - Commands: s.UpdateAcceptCompleteCommands(s.Tv()), - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), wt2.Messages[0]), - }) - s.Error(err, "Must fail because WorkflowTaskStarted event Id is different.") - s.ErrorAs(err, new(*serviceerror.NotFound)) - s.Contains(err.Error(), "Workflow task not found") - - // Complete 3rd WT. It should succeed. - _, err = s.FrontendClient().RespondWorkflowTaskCompleted(testcore.NewContext(s.Context()), &workflowservice.RespondWorkflowTaskCompletedRequest{ - Namespace: s.Namespace().String(), - TaskToken: wt3.TaskToken, - Commands: s.UpdateAcceptCompleteCommands(s.Tv()), - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), wt3.Messages[0]), - }) - s.NoError(err) + // Now try to complete 2nd WT (speculative). It should fail because WorkflowTaskStarted event Id is mismatched. + _, err = env.FrontendClient().RespondWorkflowTaskCompleted(testcore.NewContext(env.Context()), &workflowservice.RespondWorkflowTaskCompletedRequest{ + Namespace: env.Namespace().String(), + TaskToken: wt2.TaskToken, + Commands: env.UpdateAcceptCompleteCommands(env.Tv()), + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), wt2.Messages[0]), + }) + s.Error(err, "Must fail because WorkflowTaskStarted event Id is different.") + s.ErrorAs(err, new(*serviceerror.NotFound)) + s.Contains(err.Error(), "Workflow task not found") + + // Complete 3rd WT. It should succeed. 
+ _, err = env.FrontendClient().RespondWorkflowTaskCompleted(testcore.NewContext(env.Context()), &workflowservice.RespondWorkflowTaskCompletedRequest{ + Namespace: env.Namespace().String(), + TaskToken: wt3.TaskToken, + Commands: env.UpdateAcceptCompleteCommands(env.Tv()), + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), wt3.Messages[0]), + }) + s.NoError(err) - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -4139,62 +4168,62 @@ func TestWorkflowUpdateSuite(t *testing.T) { 11 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId":8} 12 WorkflowExecutionUpdateCompleted `, events) - }) - - t.Run("StaleSpeculativeWorkflowTask_Fail_BecauseOfDifferentStartTime", func(t *testing.T) { - s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - /* - Test scenario: - An update created a speculative WT and WT is dispatched to the worker (started). - WF context is cleared, speculative WT is disappeared from server. - Update is retried and second speculative WT is dispatched to worker with same WT scheduled/started Id and update Id. - The first speculative WT respond back, server reject it because startTime is different. - The second speculative WT respond back, server accept it. 
- */ - mustStartWorkflow(s, s.Tv()) +} - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - return nil, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } +func (s *WorkflowUpdateSuite) TestStaleSpeculativeWorkflowTask_Fail_BecauseOfDifferentStartTime() { + env := testcore.NewEnv(s.T(), testcore.WithDedicatedCluster()) + /* + Test scenario: + An update created a speculative WT and WT is dispatched to the worker (started). + WF context is cleared, speculative WT is disappeared from server. + Update is retried and second speculative WT is dispatched to worker with same WT scheduled/started Id and update Id. + The first speculative WT respond back, server reject it because startTime is different. + The second speculative WT respond back, server accept it. + */ + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + return nil, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - WorkflowTaskHandler: wtHandler, - Logger: s.Logger, - T: s.T(), - } + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + WorkflowTaskHandler: wtHandler, + Logger: env.Logger, + T: s.T(), + } - // First WT will schedule activity. 
- res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.NotNil(res) + // First WT will schedule activity. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.NotNil(res) - // Send update. It will create 2nd WT as speculative. - sendUpdateNoError(s, s.Tv()) + // Send update. It will create 2nd WT as speculative. + sendUpdateNoError(env, env.Tv()) - // Poll 2nd speculative WT with 1st update. - wt2, err := s.FrontendClient().PollWorkflowTaskQueue(testcore.NewContext(s.Context()), &workflowservice.PollWorkflowTaskQueueRequest{ - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - }) - s.NoError(err) - s.NotNil(wt2) - s.NotEmpty(wt2.TaskToken, "2nd workflow task must have valid task token") - s.Len(wt2.Messages, 1, "2nd workflow task must have a message with 1st update") - s.EqualValues(6, wt2.StartedEventId) - s.EqualValues(5, wt2.Messages[0].GetEventId()) - s.EqualHistory(` + // Poll 2nd speculative WT with 1st update. + wt2, err := env.FrontendClient().PollWorkflowTaskQueue(testcore.NewContext(env.Context()), &workflowservice.PollWorkflowTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + }) + s.NoError(err) + s.NotNil(wt2) + s.NotEmpty(wt2.TaskToken, "2nd workflow task must have valid task token") + s.Len(wt2.Messages, 1, "2nd workflow task must have a message with 1st update") + s.EqualValues(6, wt2.StartedEventId) + s.EqualValues(5, wt2.Messages[0].GetEventId()) + s.EqualHistory(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -4202,24 +4231,24 @@ func TestWorkflowUpdateSuite(t *testing.T) { 5 WorkflowTaskScheduled 6 WorkflowTaskStarted`, wt2.History) - // Clear update registry. Speculative WFT disappears from server. - clearUpdateRegistryAndAbortPendingUpdates(s, s.Tv()) + // Clear update registry. Speculative WFT disappears from server. 
+ clearUpdateRegistryAndAbortPendingUpdates(env, env.Tv()) - // Wait for update request to be retry by frontend and recreated in registry. This will create a 3rd WFT as speculative. - waitUpdateAdmitted(s, s.Tv()) + // Wait for update request to be retry by frontend and recreated in registry. This will create a 3rd WFT as speculative. + waitUpdateAdmitted(env, env.Tv()) - // Poll for the 3rd speculative WT. - wt3, err := s.FrontendClient().PollWorkflowTaskQueue(testcore.NewContext(s.Context()), &workflowservice.PollWorkflowTaskQueueRequest{ - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - }) - s.NoError(err) - s.NotNil(wt3) - s.NotEmpty(wt3.TaskToken, "3rd workflow task must have valid task token") - s.Len(wt3.Messages, 1, "3rd workflow task must have a message with 1st update") - s.EqualValues(6, wt3.StartedEventId) - s.EqualValues(5, wt3.Messages[0].GetEventId()) - s.EqualHistory(` + // Poll for the 3rd speculative WT. + wt3, err := env.FrontendClient().PollWorkflowTaskQueue(testcore.NewContext(env.Context()), &workflowservice.PollWorkflowTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + }) + s.NoError(err) + s.NotNil(wt3) + s.NotEmpty(wt3.TaskToken, "3rd workflow task must have valid task token") + s.Len(wt3.Messages, 1, "3rd workflow task must have a message with 1st update") + s.EqualValues(6, wt3.StartedEventId) + s.EqualValues(5, wt3.Messages[0].GetEventId()) + s.EqualHistory(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -4227,28 +4256,28 @@ func TestWorkflowUpdateSuite(t *testing.T) { 5 WorkflowTaskScheduled 6 WorkflowTaskStarted`, wt3.History) - // Now try to complete 2nd (speculative) WT, it should fail. 
- _, err = s.FrontendClient().RespondWorkflowTaskCompleted(testcore.NewContext(s.Context()), &workflowservice.RespondWorkflowTaskCompletedRequest{ - Namespace: s.Namespace().String(), - TaskToken: wt2.TaskToken, - Commands: s.UpdateAcceptCompleteCommands(s.Tv()), - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), wt2.Messages[0]), - }) - s.Error(err, "Must fail because workflow task start time is different.") - s.ErrorAs(err, new(*serviceerror.NotFound)) - s.Contains(err.Error(), "Workflow task not found") - - // Try to complete 3rd WT, it should succeed - _, err = s.FrontendClient().RespondWorkflowTaskCompleted(testcore.NewContext(s.Context()), &workflowservice.RespondWorkflowTaskCompletedRequest{ - Namespace: s.Namespace().String(), - TaskToken: wt3.TaskToken, - Commands: s.UpdateAcceptCompleteCommands(s.Tv()), - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), wt3.Messages[0]), - }) - s.NoError(err, "2nd speculative WT should be completed because it has same WT scheduled/started Id and startTime matches the accepted message is valid (same update Id)") + // Now try to complete 2nd (speculative) WT, it should fail. 
+ _, err = env.FrontendClient().RespondWorkflowTaskCompleted(testcore.NewContext(env.Context()), &workflowservice.RespondWorkflowTaskCompletedRequest{ + Namespace: env.Namespace().String(), + TaskToken: wt2.TaskToken, + Commands: env.UpdateAcceptCompleteCommands(env.Tv()), + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), wt2.Messages[0]), + }) + s.Error(err, "Must fail because workflow task start time is different.") + s.ErrorAs(err, new(*serviceerror.NotFound)) + s.Contains(err.Error(), "Workflow task not found") + + // Try to complete 3rd WT, it should succeed + _, err = env.FrontendClient().RespondWorkflowTaskCompleted(testcore.NewContext(env.Context()), &workflowservice.RespondWorkflowTaskCompletedRequest{ + Namespace: env.Namespace().String(), + TaskToken: wt3.TaskToken, + Commands: env.UpdateAcceptCompleteCommands(env.Tv()), + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), wt3.Messages[0]), + }) + s.NoError(err, "2nd speculative WT should be completed because it has same WT scheduled/started Id and startTime matches the accepted message is valid (same update Id)") - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -4259,54 +4288,54 @@ func TestWorkflowUpdateSuite(t *testing.T) { 8 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId":5} 9 WorkflowExecutionUpdateCompleted `, events) - }) +} - t.Run("StaleSpeculativeWorkflowTask_Fail_NewWorkflowTaskWith2Updates", func(t *testing.T) { - s := testcore.NewEnv(t, testcore.WithDedicatedCluster()) - /* - Test scenario: - An update created a speculative WT and WT is dispatched to the worker (started). - Mutable state cleared, speculative WT and update registry are disappeared from server. - First update is retried and another update come in. 
-			Second speculative WT is dispatched to worker with same WT scheduled/started Id but 2 updates.
-			The first speculative WT responds back, server rejected it (different start time).
-			The second speculative WT responds back, server accepted it.
-		*/
-
-		mustStartWorkflow(s, s.Tv())
-		tv1 := s.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1)
-		tv2 := s.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2)
-
-		testCtx := testcore.NewContext(s.Context())
-
-		// Drain first WFT.
-		wt1, err := s.FrontendClient().PollWorkflowTaskQueue(testCtx, &workflowservice.PollWorkflowTaskQueueRequest{
-			Namespace: s.Namespace().String(),
-			TaskQueue: s.Tv().TaskQueue(),
-		})
-		s.NoError(err)
-		s.NotNil(wt1)
-		_, err = s.FrontendClient().RespondWorkflowTaskCompleted(testCtx, &workflowservice.RespondWorkflowTaskCompletedRequest{
-			Namespace: s.Namespace().String(),
-			TaskToken: wt1.TaskToken,
-		})
-		s.NoError(err)
+func (s *WorkflowUpdateSuite) TestStaleSpeculativeWorkflowTask_Fail_NewWorkflowTaskWith2Updates() {
+	env := testcore.NewEnv(s.T(), testcore.WithDedicatedCluster())
+	/*
+		Test scenario:
+		An update created a speculative WT and WT is dispatched to the worker (started).
+		Mutable state cleared; speculative WT and update registry have disappeared from the server.
+		First update is retried and another update comes in.
+		Second speculative WT is dispatched to worker with same WT scheduled/started Id but 2 updates.
+		The first speculative WT responds back, server rejected it (different start time).
+		The second speculative WT responds back, server accepted it.
+	*/
+
+	mustStartWorkflow(env, env.Tv())
+	tv1 := env.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1)
+	tv2 := env.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2)
+
+	testCtx := testcore.NewContext(env.Context())
+
+	// Drain first WFT.
+ wt1, err := env.FrontendClient().PollWorkflowTaskQueue(testCtx, &workflowservice.PollWorkflowTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + }) + s.NoError(err) + s.NotNil(wt1) + _, err = env.FrontendClient().RespondWorkflowTaskCompleted(testCtx, &workflowservice.RespondWorkflowTaskCompletedRequest{ + Namespace: env.Namespace().String(), + TaskToken: wt1.TaskToken, + }) + s.NoError(err) - // Send 1st update. It will create 2nd speculative WFT. - sendUpdateNoError(s, tv1) + // Send 1st update. It will create 2nd speculative WFT. + sendUpdateNoError(env, tv1) - // Poll 2nd speculative WFT with 1st update. - wt2, err := s.FrontendClient().PollWorkflowTaskQueue(testCtx, &workflowservice.PollWorkflowTaskQueueRequest{ - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - }) - s.NoError(err) - s.NotNil(wt2) - s.NotEmpty(wt2.TaskToken, "2nd workflow task must have valid task token") - s.Len(wt2.Messages, 1, "2nd workflow task must have a message with 1st update") - s.EqualValues(6, wt2.StartedEventId) - s.EqualValues(5, wt2.Messages[0].GetEventId()) - s.EqualHistory(` + // Poll 2nd speculative WFT with 1st update. + wt2, err := env.FrontendClient().PollWorkflowTaskQueue(testCtx, &workflowservice.PollWorkflowTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + }) + s.NoError(err) + s.NotNil(wt2) + s.NotEmpty(wt2.TaskToken, "2nd workflow task must have valid task token") + s.Len(wt2.Messages, 1, "2nd workflow task must have a message with 1st update") + s.EqualValues(6, wt2.StartedEventId) + s.EqualValues(5, wt2.Messages[0].GetEventId()) + s.EqualHistory(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -4314,29 +4343,29 @@ func TestWorkflowUpdateSuite(t *testing.T) { 5 WorkflowTaskScheduled 6 WorkflowTaskStarted`, wt2.History) - // Clear update registry. Speculative WFT disappears from server. 
- clearUpdateRegistryAndAbortPendingUpdates(s, s.Tv()) + // Clear update registry. Speculative WFT disappears from server. + clearUpdateRegistryAndAbortPendingUpdates(env, env.Tv()) - // Make sure UpdateWorkflowExecution call for the update "1" is retried and new (3rd) WFT is created as speculative with updateID=1. - waitUpdateAdmitted(s, tv1) + // Make sure UpdateWorkflowExecution call for the update "1" is retried and new (3rd) WFT is created as speculative with updateID=1. + waitUpdateAdmitted(env, tv1) - // Send 2nd update (with DIFFERENT updateId). It reuses already created 3rd WFT. - sendUpdateNoError(s, tv2) - // updateID=1 is still blocked. There must be 2 blocked updates now. + // Send 2nd update (with DIFFERENT updateId). It reuses already created 3rd WFT. + sendUpdateNoError(env, tv2) + // updateID=1 is still blocked. There must be 2 blocked updates now. - // Poll the 3rd speculative WFT. - wt3, err := s.FrontendClient().PollWorkflowTaskQueue(testCtx, &workflowservice.PollWorkflowTaskQueueRequest{ - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - }) - s.NoError(err) - s.NotNil(wt3) - s.NotEmpty(wt3.TaskToken, "3rd workflow task must have valid task token") - s.Len(wt3.Messages, 2, "3rd workflow task must have a message with 1st and 2nd updates") - s.EqualValues(6, wt3.StartedEventId) - s.EqualValues(5, wt3.Messages[0].GetEventId()) - s.EqualValues(5, wt3.Messages[1].GetEventId()) - s.EqualHistory(` + // Poll the 3rd speculative WFT. 
+ wt3, err := env.FrontendClient().PollWorkflowTaskQueue(testCtx, &workflowservice.PollWorkflowTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + }) + s.NoError(err) + s.NotNil(wt3) + s.NotEmpty(wt3.TaskToken, "3rd workflow task must have valid task token") + s.Len(wt3.Messages, 2, "3rd workflow task must have a message with 1st and 2nd updates") + s.EqualValues(6, wt3.StartedEventId) + s.EqualValues(5, wt3.Messages[0].GetEventId()) + s.EqualValues(5, wt3.Messages[1].GetEventId()) + s.EqualHistory(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -4344,35 +4373,35 @@ func TestWorkflowUpdateSuite(t *testing.T) { 5 WorkflowTaskScheduled 6 WorkflowTaskStarted`, wt3.History) - // Now try to complete 2nd speculative WT, it should fail because start time does not match. - _, err = s.FrontendClient().RespondWorkflowTaskCompleted(testCtx, &workflowservice.RespondWorkflowTaskCompletedRequest{ - Namespace: s.Namespace().String(), - TaskToken: wt2.TaskToken, - Commands: s.UpdateAcceptCompleteCommands(tv1), - Messages: s.UpdateAcceptCompleteMessages(tv1, wt2.Messages[0]), - ReturnNewWorkflowTask: true, - }) - s.Error(err, "Must fail because start time is different.") - s.Contains(err.Error(), "Workflow task not found") - s.ErrorAs(err, new(*serviceerror.NotFound)) - - // Complete of the 3rd WT should succeed. It must accept both updates. 
- wt4Resp, err := s.FrontendClient().RespondWorkflowTaskCompleted(testCtx, &workflowservice.RespondWorkflowTaskCompletedRequest{ - Namespace: s.Namespace().String(), - TaskToken: wt3.TaskToken, - Commands: append( - s.UpdateAcceptCompleteCommands(tv1), - s.UpdateAcceptCompleteCommands(tv2)...), - Messages: append( - s.UpdateAcceptCompleteMessages(tv1, wt3.Messages[0]), - s.UpdateAcceptCompleteMessages(tv2, wt3.Messages[1])...), - ReturnNewWorkflowTask: true, - }) - s.NoError(err) - s.NotNil(wt4Resp) + // Now try to complete 2nd speculative WT, it should fail because start time does not match. + _, err = env.FrontendClient().RespondWorkflowTaskCompleted(testCtx, &workflowservice.RespondWorkflowTaskCompletedRequest{ + Namespace: env.Namespace().String(), + TaskToken: wt2.TaskToken, + Commands: env.UpdateAcceptCompleteCommands(tv1), + Messages: env.UpdateAcceptCompleteMessages(tv1, wt2.Messages[0]), + ReturnNewWorkflowTask: true, + }) + s.Error(err, "Must fail because start time is different.") + s.Contains(err.Error(), "Workflow task not found") + s.ErrorAs(err, new(*serviceerror.NotFound)) + + // Complete of the 3rd WT should succeed. It must accept both updates. 
+ wt4Resp, err := env.FrontendClient().RespondWorkflowTaskCompleted(testCtx, &workflowservice.RespondWorkflowTaskCompletedRequest{ + Namespace: env.Namespace().String(), + TaskToken: wt3.TaskToken, + Commands: append( + env.UpdateAcceptCompleteCommands(tv1), + env.UpdateAcceptCompleteCommands(tv2)...), + Messages: append( + env.UpdateAcceptCompleteMessages(tv1, wt3.Messages[0]), + env.UpdateAcceptCompleteMessages(tv2, wt3.Messages[1])...), + ReturnNewWorkflowTask: true, + }) + s.NoError(err) + s.NotNil(wt4Resp) - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -4385,1412 +4414,1430 @@ func TestWorkflowUpdateSuite(t *testing.T) { 10 WorkflowExecutionUpdateAccepted {"AcceptedRequestSequencingEventId": 5} 11 WorkflowExecutionUpdateCompleted `, events) - }) - - t.Run("SpeculativeWorkflowTask_WorkerSkippedProcessing_RejectByServer", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) - tv1 := s.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1) - tv2 := s.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2) - - var update2ResultCh <-chan *workflowservice.UpdateWorkflowExecutionResponse - - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. - return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Speculative WT. 
- 6 WorkflowTaskStarted -`, task.History) - update2ResultCh = sendUpdateNoError(s, tv2) - return nil, nil - case 3: - s.EqualHistory(` - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted`, task.History) - commands := append(s.UpdateAcceptCompleteCommands(tv2), - &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, - }) - return commands, nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil - } - } - - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) - - s.Equal("args-value-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.EqualValues(5, updRequestMsg.GetEventId()) - - // Don't process update in WT. 
- return nil, nil - case 3: - s.Len(task.Messages, 1) - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) +} - s.Equal("args-value-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.EqualValues(5, updRequestMsg.GetEventId()) - return s.UpdateAcceptCompleteMessages(tv2, updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } +func (s *WorkflowUpdateSuite) TestSpeculativeWorkflowTask_WorkerSkippedProcessing_RejectByServer() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + tv1 := env.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1) + tv2 := env.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2) + + var update2ResultCh <-chan *workflowservice.UpdateWorkflowExecutionResponse + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. + return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Speculative WT. 
+ 6 WorkflowTaskStarted + `, task.History) + update2ResultCh = sendUpdateNoError(env, tv2) + return nil, nil + case 3: + s.EqualHistory(` + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted`, task.History) + commands := append(env.UpdateAcceptCompleteCommands(tv2), + &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}}, + }) + return commands, nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Identity: "old_worker", - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + + s.Equal("args-value-of-"+tv1.UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.EqualValues(5, updRequestMsg.GetEventId()) + + // Don't process update in WT. 
+ return nil, nil + case 3: + s.Len(task.Messages, 1) + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + + s.Equal("args-value-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.EqualValues(5, updRequestMsg.GetEventId()) + return env.UpdateAcceptCompleteMessages(tv2, updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) - - updateResultCh := sendUpdateNoError(s, tv1) - - // Process 2nd WT which ignores update message. - res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - updateResp := res.NewTask - updateResult := <-updateResultCh - s.Equal("Workflow Update is rejected because it wasn't processed by worker. Probably, Workflow Update is not supported by the worker.", updateResult.GetOutcome().GetFailure().GetMessage()) - s.EqualValues(3, updateResp.ResetHistoryEventId) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Identity: "old_worker", + Logger: env.Logger, + T: s.T(), + } - // Process 3rd WT which completes 2nd update and workflow. 
- update2Resp, err := poller.HandlePartialWorkflowTask(updateResp.GetWorkflowTask(), false) - s.NoError(err) - s.NotNil(update2Resp) - update2Result := <-update2ResultCh - s.Equal("success-result-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), update2Result.GetOutcome().GetSuccess())) - - s.Equal(3, wtHandlerCalls) - s.Equal(3, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // 1st speculative WT is not present in the history. This is 2nd speculative WT. - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted - 8 WorkflowExecutionUpdateAccepted - 9 WorkflowExecutionUpdateCompleted - 10 WorkflowExecutionCompleted`, events) - }) + // Drain first WT. + _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - t.Run("LastWorkflowTask_HasUpdateMessage", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + updateResultCh := sendUpdateNoError(env, tv1) - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - completeWorkflowCommand := &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ - CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{ - Result: s.Tv().Any().Payloads(), - }, - }, - } - return append(s.UpdateAcceptCommands(s.Tv()), completeWorkflowCommand), nil - }, - MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - return 
s.UpdateAcceptMessages(s.Tv(), task.Messages[0]), nil - }, - Logger: s.Logger, - T: s.T(), - } + // Process 2nd WT which ignores update message. + res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + updateResp := res.NewTask + updateResult := <-updateResultCh + s.Equal("Workflow Update is rejected because it wasn't processed by worker. Probably, Workflow Update is not supported by the worker.", updateResult.GetOutcome().GetFailure().GetMessage()) + s.EqualValues(3, updateResp.ResetHistoryEventId) - updateResultCh := sendUpdateNoErrorWaitPolicyAccepted(s, s.Tv()) - _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - updateResult := <-updateResultCh - s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED, updateResult.GetStage()) - s.Equal("Workflow Update failed because the Workflow completed before the Update completed.", updateResult.GetOutcome().GetFailure().GetMessage()) + // Process 3rd WT which completes 2nd update and workflow. 
+ update2Resp, err := poller.HandlePartialWorkflowTask(updateResp.GetWorkflowTask(), false) + s.NoError(err) + s.NotNil(update2Resp) + update2Result := <-update2ResultCh + s.Equal("success-result-of-"+tv2.UpdateID(), testcore.DecodeString(s.T(), update2Result.GetOutcome().GetSuccess())) - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted - 6 WorkflowExecutionCompleted - `, s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution())) - }) + s.Equal(3, wtHandlerCalls) + s.Equal(3, msgHandlerCalls) - t.Run("SpeculativeWorkflowTask_QueryFailureClearsWFContext", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // 1st speculative WT is not present in the history. This is 2nd speculative WT. + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted + 8 WorkflowExecutionUpdateAccepted + 9 WorkflowExecutionUpdateCompleted + 10 WorkflowExecutionCompleted`, events) +} - wtHandlerCalls := 0 - wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - wtHandlerCalls++ - switch wtHandlerCalls { - case 1: - // Completes first WT with empty command list. 
- return nil, nil - case 2: - s.EqualHistory(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted -`, task.History) - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - default: - s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) - return nil, nil +func (s *WorkflowUpdateSuite) TestLastWorkflowTask_HasUpdateMessage() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + completeWorkflowCommand := &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ + CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{ + Result: env.Tv().Any().Payloads(), + }, + }, } - } + return append(env.UpdateAcceptCommands(env.Tv()), completeWorkflowCommand), nil + }, + MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + return env.UpdateAcceptMessages(env.Tv(), task.Messages[0]), nil + }, + Logger: env.Logger, + T: s.T(), + } - msgHandlerCalls := 0 - msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - msgHandlerCalls++ - switch msgHandlerCalls { - case 1: - return nil, nil - case 2: - updRequestMsg := task.Messages[0] - updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + updateResultCh := sendUpdateNoErrorWaitPolicyAccepted(env, env.Tv()) + _, err := 
poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + updateResult := <-updateResultCh + s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED, updateResult.GetStage()) + s.Equal("Workflow Update failed because the Workflow completed before the Update completed.", updateResult.GetOutcome().GetFailure().GetMessage()) - s.Equal("args-value-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) - s.Equal(s.Tv().HandlerName(), updRequest.GetInput().GetName()) - s.EqualValues(5, updRequestMsg.GetEventId()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted + 6 WorkflowExecutionCompleted + `, env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution())) +} - return s.UpdateAcceptCompleteMessages(s.Tv(), updRequestMsg), nil - default: - s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) - return nil, nil - } +func (s *WorkflowUpdateSuite) TestSpeculativeWorkflowTask_QueryFailureClearsWFContext() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + wtHandlerCalls := 0 + wtHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + wtHandlerCalls++ + switch wtHandlerCalls { + case 1: + // Completes first WT with empty command list. 
+ return nil, nil + case 2: + s.EqualHistory(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + `, task.History) + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + default: + s.Failf("wtHandler called too many times", "wtHandler shouldn't be called %d times", wtHandlerCalls) + return nil, nil } + } - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - WorkflowTaskHandler: wtHandler, - MessageHandler: msgHandler, - Logger: s.Logger, - T: s.T(), + msgHandlerCalls := 0 + msgHandler := func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + msgHandlerCalls++ + switch msgHandlerCalls { + case 1: + return nil, nil + case 2: + updRequestMsg := task.Messages[0] + updRequest := protoutils.UnmarshalAny[*updatepb.Request](s.T(), updRequestMsg.GetBody()) + + s.Equal("args-value-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updRequest.GetInput().GetArgs())) + s.Equal(env.Tv().HandlerName(), updRequest.GetInput().GetName()) + s.EqualValues(5, updRequestMsg.GetEventId()) + + return env.UpdateAcceptCompleteMessages(env.Tv(), updRequestMsg), nil + default: + s.Failf("msgHandler called too many times", "msgHandler shouldn't be called %d times", msgHandlerCalls) + return nil, nil } + } - // Drain first WT. - _, err := poller.PollAndProcessWorkflowTask() - s.NoError(err) + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + WorkflowTaskHandler: wtHandler, + MessageHandler: msgHandler, + Logger: env.Logger, + T: s.T(), + } - updateResultCh := sendUpdateNoError(s, s.Tv()) + // Drain first WT. 
+ _, err := poller.PollAndProcessWorkflowTask() + s.NoError(err) - type QueryResult struct { - Resp *workflowservice.QueryWorkflowResponse - Err error - } - queryFn := func(resCh chan<- QueryResult) { - // There is no query handler, and query timeout is ok for this test. - // But first query must not time out before 2nd query reached server, - // because 2 queries overflow the query buffer (default size 1), - // which leads to clearing of WF context. - shortCtx, cancel := context.WithTimeout(s.Context(), 100*time.Millisecond) - defer cancel() - queryResp, err := s.FrontendClient().QueryWorkflow(shortCtx, &workflowservice.QueryWorkflowRequest{ - Namespace: s.Namespace().String(), - Execution: s.Tv().WorkflowExecution(), - Query: &querypb.WorkflowQuery{ - QueryType: s.Tv().Any().String(), - }, - }) - resCh <- QueryResult{Resp: queryResp, Err: err} - } + updateResultCh := sendUpdateNoError(env, env.Tv()) - query1ResultCh := make(chan QueryResult) - query2ResultCh := make(chan QueryResult) - go queryFn(query1ResultCh) - go queryFn(query2ResultCh) - query1Res := <-query1ResultCh - query2Res := <-query2ResultCh - s.Error(query1Res.Err) - s.Error(query2Res.Err) - s.Nil(query1Res.Resp) - s.Nil(query2Res.Resp) - - var queryBufferFullErr *serviceerror.ResourceExhausted - if common.IsContextDeadlineExceededErr(query1Res.Err) { - s.True(common.IsContextDeadlineExceededErr(query1Res.Err), "one of query errors must be CDE") - s.ErrorAs(query2Res.Err, &queryBufferFullErr, "one of query errors must `query buffer is full`") - s.Contains(query2Res.Err.Error(), "query buffer is full", "one of query errors must `query buffer is full`") - } else { - s.ErrorAs(query1Res.Err, &queryBufferFullErr, "one of query errors must `query buffer is full`") - s.Contains(query1Res.Err.Error(), "query buffer is full", "one of query errors must `query buffer is full`") - s.True(common.IsContextDeadlineExceededErr(query2Res.Err), "one of query errors must be CDE") - } + type QueryResult struct { + Resp 
*workflowservice.QueryWorkflowResponse + Err error + } + queryFn := func(resCh chan<- QueryResult) { + // There is no query handler, and query timeout is ok for this test. + // But first query must not time out before 2nd query reached server, + // because 2 queries overflow the query buffer (default size 1), + // which leads to clearing of WF context. + shortCtx, cancel := context.WithTimeout(env.Context(), 100*time.Millisecond) + defer cancel() + queryResp, err := env.FrontendClient().QueryWorkflow(shortCtx, &workflowservice.QueryWorkflowRequest{ + Namespace: env.Namespace().String(), + Execution: env.Tv().WorkflowExecution(), + Query: &querypb.WorkflowQuery{ + QueryType: env.Tv().Any().String(), + }, + }) + resCh <- QueryResult{Resp: queryResp, Err: err} + } - // "query buffer is full" error clears WF context. If update registry is not cleared together with context (old behaviour), - // then update stays there but speculative WFT which supposed to deliver it, is cleared. - // Subsequent retry attempts of "UpdateWorkflowExecution" API wouldn't help, because update is deduped by registry, - // and new WFT is not created. Update is not delivered to the worker until new WFT is created. - // If registry is cleared together with WF context (current behaviour), retries of "UpdateWorkflowExecution" - // will create new update and WFT. 
+ query1ResultCh := make(chan QueryResult) + query2ResultCh := make(chan QueryResult) + go queryFn(query1ResultCh) + go queryFn(query2ResultCh) + query1Res := <-query1ResultCh + query2Res := <-query2ResultCh + s.Error(query1Res.Err) + s.Error(query2Res.Err) + s.Nil(query1Res.Resp) + s.Nil(query2Res.Resp) + + var queryBufferFullErr *serviceerror.ResourceExhausted + if common.IsContextDeadlineExceededErr(query1Res.Err) { + s.True(common.IsContextDeadlineExceededErr(query1Res.Err), "one of query errors must be CDE") + s.ErrorAs(query2Res.Err, &queryBufferFullErr, "one of query errors must `query buffer is full`") + s.Contains(query2Res.Err.Error(), "query buffer is full", "one of query errors must `query buffer is full`") + } else { + s.ErrorAs(query1Res.Err, &queryBufferFullErr, "one of query errors must `query buffer is full`") + s.Contains(query1Res.Err.Error(), "query buffer is full", "one of query errors must `query buffer is full`") + s.True(common.IsContextDeadlineExceededErr(query2Res.Err), "one of query errors must be CDE") + } - // Wait to make sure that UpdateWorkflowExecution call is retried, update and speculative WFT are recreated. - time.Sleep(500 * time.Millisecond) //nolint:forbidigo + // "query buffer is full" error clears WF context. If update registry is not cleared together with context (old behaviour), + // then update stays there but speculative WFT which supposed to deliver it, is cleared. + // Subsequent retry attempts of "UpdateWorkflowExecution" API wouldn't help, because update is deduped by registry, + // and new WFT is not created. Update is not delivered to the worker until new WFT is created. + // If registry is cleared together with WF context (current behaviour), retries of "UpdateWorkflowExecution" + // will create new update and WFT. - // Process update in workflow. 
- res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - updateResp := res.NewTask - updateResult := <-updateResultCh - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) - s.EqualValues(0, updateResp.ResetHistoryEventId) - - s.Equal(2, wtHandlerCalls) - s.Equal(2, msgHandlerCalls) - - events := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled // Was speculative WT... - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted // ...and events were written to the history when WT completes. - 8 WorkflowExecutionUpdateAccepted - 9 WorkflowExecutionUpdateCompleted -`, events) - }) + // Wait to make sure that UpdateWorkflowExecution call is retried, update and speculative WFT are recreated. + time.Sleep(500 * time.Millisecond) //nolint:forbidigo - t.Run("UpdatesAreSentToWorkerInOrderOfAdmission", func(t *testing.T) { - s := testcore.NewEnv(t) - // If our implementation is not in fact ordering updates correctly, then it may be ordering them - // non-deterministically. This number should be high enough that the false-negative rate of the test is low, but - // must not exceed our limit on number of in-flight updates. If we were picking a random ordering then the - // false-negative rate would be 1/(nUpdates!). - nUpdates := 10 - - mustStartWorkflow(s, s.Tv()) - for i := range nUpdates { - // Sequentially send updates one by one. - sendUpdateNoError(s, s.Tv().WithUpdateIDNumber(i)) - } + // Process update in workflow. 
+ res, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + updateResp := res.NewTask + updateResult := <-updateResultCh + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + s.EqualValues(0, updateResp.ResetHistoryEventId) - wtHandlerCalls := 0 - msgHandlerCalls := 0 - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - msgHandlerCalls++ - var commands []*commandpb.Command - for i := range task.Messages { - commands = append(commands, s.UpdateAcceptCompleteCommands(s.Tv().WithMessageIDNumber(i))...) - } - return commands, nil - }, - MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - wtHandlerCalls++ - s.Len(task.Messages, nUpdates) - var messages []*protocolpb.Message - // Updates were sent in sequential order of updateId => messages must be ordered in the same way. - for i, m := range task.Messages { - s.Equal(s.Tv().WithUpdateIDNumber(i).UpdateID(), m.ProtocolInstanceId) - messages = append(messages, s.UpdateAcceptCompleteMessages(s.Tv().WithMessageIDNumber(i), m)...) 
- } - return messages, nil - }, - Logger: s.Logger, - T: s.T(), - } - _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - s.Equal(1, wtHandlerCalls) - s.Equal(1, msgHandlerCalls) - - expectedHistory := ` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted -` - for i := range nUpdates { - tvi := s.Tv().WithUpdateIDNumber(i) - expectedHistory += fmt.Sprintf(` - %d WorkflowExecutionUpdateAccepted {"AcceptedRequest":{"Meta": {"UpdateId": "%s"}}} - %d WorkflowExecutionUpdateCompleted {"Meta": {"UpdateId": "%s"}}`, - 5+2*i, tvi.UpdateID(), - 6+2*i, tvi.UpdateID()) - } + s.Equal(2, wtHandlerCalls) + s.Equal(2, msgHandlerCalls) - history := s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution()) - s.EqualHistoryEvents(expectedHistory, history) - }) + events := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) - t.Run("WaitAccepted_GotCompleted", func(t *testing.T) { - s := testcore.NewEnv(t) - mustStartWorkflow(s, s.Tv()) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled // Was speculative WT... + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted // ...and events were written to the history when WT completes. 
+ 8 WorkflowExecutionUpdateAccepted + 9 WorkflowExecutionUpdateCompleted + `, events) +} - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - return s.UpdateAcceptCompleteCommands(s.Tv()), nil - }, - MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - return s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), nil - }, - Logger: s.Logger, - T: s.T(), - } +func (s *WorkflowUpdateSuite) TestUpdatesAreSentToWorkerInOrderOfAdmission() { + env := testcore.NewEnv(s.T()) + // If our implementation is not in fact ordering updates correctly, then it may be ordering them + // non-deterministically. This number should be high enough that the false-negative rate of the test is low, but + // must not exceed our limit on number of in-flight updates. If we were picking a random ordering then the + // false-negative rate would be 1/(nUpdates!). + nUpdates := 10 + + mustStartWorkflow(env, env.Tv()) + for i := range nUpdates { + // Sequentially send updates one by one. + sendUpdateNoError(env, env.Tv().WithUpdateIDNumber(i)) + } - // Send Update with intent to wait for Accepted stage only, - updateResultCh := sendUpdateNoErrorWaitPolicyAccepted(s, s.Tv()) - _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) - s.NoError(err) - updateResult := <-updateResultCh - // but Update was accepted and completed on the same WFT, and outcome was returned. 
- s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED, updateResult.GetStage()) - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + wtHandlerCalls := 0 + msgHandlerCalls := 0 + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + msgHandlerCalls++ + var commands []*commandpb.Command + for i := range task.Messages { + commands = append(commands, env.UpdateAcceptCompleteCommands(env.Tv().WithMessageIDNumber(i))...) + } + return commands, nil + }, + MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + wtHandlerCalls++ + s.Len(task.Messages, nUpdates) + var messages []*protocolpb.Message + // Updates were sent in sequential order of updateId => messages must be ordered in the same way. + for i, m := range task.Messages { + s.Equal(env.Tv().WithUpdateIDNumber(i).UpdateID(), m.ProtocolInstanceId) + messages = append(messages, env.UpdateAcceptCompleteMessages(env.Tv().WithMessageIDNumber(i), m)...) 
+ } + return messages, nil + }, + Logger: env.Logger, + T: s.T(), + } + _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + s.Equal(1, wtHandlerCalls) + s.Equal(1, msgHandlerCalls) + + expectedHistory := ` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + ` + for i := range nUpdates { + tvi := env.Tv().WithUpdateIDNumber(i) + expectedHistory += fmt.Sprintf(` + %d WorkflowExecutionUpdateAccepted {"AcceptedRequest":{"Meta": {"UpdateId": "%s"}}} + %d WorkflowExecutionUpdateCompleted {"Meta": {"UpdateId": "%s"}}`, + 5+2*i, tvi.UpdateID(), + 6+2*i, tvi.UpdateID()) + } - s.EqualHistoryEvents(` + history := env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution()) + s.EqualHistoryEvents(expectedHistory, history) +} + +func (s *WorkflowUpdateSuite) TestWaitAccepted_GotCompleted() { + env := testcore.NewEnv(s.T()) + mustStartWorkflow(env, env.Tv()) + + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + return env.UpdateAcceptCompleteCommands(env.Tv()), nil + }, + MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + return env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), nil + }, + Logger: env.Logger, + T: s.T(), + } + + // Send Update with intent to wait for Accepted stage only, + updateResultCh := sendUpdateNoErrorWaitPolicyAccepted(env, env.Tv()) + _, err := poller.PollAndProcessWorkflowTask(testcore.WithoutRetries) + s.NoError(err) + updateResult := <-updateResultCh + // but Update was accepted and completed on the same WFT, and outcome was returned. 
+ s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED, updateResult.GetStage()) + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateResult.GetOutcome().GetSuccess())) + + s.EqualHistoryEvents(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted 4 WorkflowTaskCompleted 5 WorkflowExecutionUpdateAccepted 6 WorkflowExecutionUpdateCompleted - `, s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution())) - }) + `, env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution())) +} - t.Run("ContinueAsNew_UpdateIsNotCarriedOver", func(t *testing.T) { - s := testcore.NewEnv(t) - firstRunID := mustStartWorkflow(s, s.Tv()) - tv1 := s.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1) - tv2 := s.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2) +func (s *WorkflowUpdateSuite) TestContinueAsNew_UpdateIsNotCarriedOver() { + env := testcore.NewEnv(s.T()) + firstRunID := mustStartWorkflow(env, env.Tv()) + tv1 := env.Tv().WithUpdateIDNumber(1).WithMessageIDNumber(1) + tv2 := env.Tv().WithUpdateIDNumber(2).WithMessageIDNumber(2) + + /* + 1st Update goes to the 1st run and accepted (but not completed) by Workflow. + While this WFT is running, 2nd Update is sent, and WFT is completing with CAN for the 1st run. + There are 2 Updates in the registry of the 1st run: 1st is accepted and 2nd is admitted. + Both of them are aborted but with different errors: + - Admitted Update is aborted with retryable "workflow is closing" error. SDK should retry this error + and new attempt should land on the new run. + - Accepted Update is aborted with update failure. 
+ */ + + var update2ResponseCh <-chan updateResponseErr + + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller1 := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + // Send 2nd Update while WFT is running. + update2ResponseCh = sendUpdate(env.Context(), env, tv2) + canCommand := &commandpb.Command{ + CommandType: enumspb.COMMAND_TYPE_CONTINUE_AS_NEW_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_ContinueAsNewWorkflowExecutionCommandAttributes{ContinueAsNewWorkflowExecutionCommandAttributes: &commandpb.ContinueAsNewWorkflowExecutionCommandAttributes{ + WorkflowType: env.Tv().WorkflowType(), + TaskQueue: env.Tv().WithTaskQueueNumber(2).TaskQueue(), + }}, + } + return append(env.UpdateAcceptCommands(tv1), canCommand), nil + }, + MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + return env.UpdateAcceptMessages(tv1, task.Messages[0]), nil + }, + Logger: env.Logger, + T: s.T(), + } - /* - 1st Update goes to the 1st run and accepted (but not completed) by Workflow. - While this WFT is running, 2nd Update is sent, and WFT is completing with CAN for the 1st run. - There are 2 Updates in the registry of the 1st run: 1st is accepted and 2nd is admitted. - Both of them are aborted but with different errors: - - Admitted Update is aborted with retryable "workflow is closing" error. SDK should retry this error - and new attempt should land on the new run. - - Accepted Update is aborted with update failure. 
- */ + //nolint:staticcheck // SA1019 TaskPoller replacement needed + poller2 := &testcore.TaskPoller{ + Client: env.FrontendClient(), + Namespace: env.Namespace().String(), + TaskQueue: env.Tv().WithTaskQueueNumber(2).TaskQueue(), + Identity: env.Tv().WorkerIdentity(), + WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { + return nil, nil + }, + MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { + s.Empty(task.Messages, "no Updates should be carried over to the 2nd run") + return nil, nil + }, + Logger: env.Logger, + T: s.T(), + } - var update2ResponseCh <-chan updateResponseErr + update1ResponseCh := sendUpdate(env.Context(), env, tv1) + _, err := poller1.PollAndProcessWorkflowTask() + s.NoError(err) - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller1 := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - // Send 2nd Update while WFT is running. 
- update2ResponseCh = sendUpdate(s.Context(), s, tv2) - canCommand := &commandpb.Command{ - CommandType: enumspb.COMMAND_TYPE_CONTINUE_AS_NEW_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_ContinueAsNewWorkflowExecutionCommandAttributes{ContinueAsNewWorkflowExecutionCommandAttributes: &commandpb.ContinueAsNewWorkflowExecutionCommandAttributes{ - WorkflowType: s.Tv().WorkflowType(), - TaskQueue: s.Tv().WithTaskQueueNumber(2).TaskQueue(), - }}, - } - return append(s.UpdateAcceptCommands(tv1), canCommand), nil - }, - MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - return s.UpdateAcceptMessages(tv1, task.Messages[0]), nil - }, - Logger: s.Logger, - T: s.T(), - } + _, err = poller2.PollAndProcessWorkflowTask() + s.NoError(err) - //nolint:staticcheck // SA1019 TaskPoller replacement needed - poller2 := &testcore.TaskPoller{ - Client: s.FrontendClient(), - Namespace: s.Namespace().String(), - TaskQueue: s.Tv().WithTaskQueueNumber(2).TaskQueue(), - Identity: s.Tv().WorkerIdentity(), - WorkflowTaskHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*commandpb.Command, error) { - return nil, nil - }, - MessageHandler: func(task *workflowservice.PollWorkflowTaskQueueResponse) ([]*protocolpb.Message, error) { - s.Empty(task.Messages, "no Updates should be carried over to the 2nd run") - return nil, nil - }, - Logger: s.Logger, - T: s.T(), - } + update1Response := <-update1ResponseCh + s.NoError(update1Response.err) + s.Equal("Workflow Update failed because the Workflow completed before the Update completed.", update1Response.response.GetOutcome().GetFailure().GetMessage()) - update1ResponseCh := sendUpdate(s.Context(), s, tv1) - _, err := poller1.PollAndProcessWorkflowTask() - s.NoError(err) + update2Response := <-update2ResponseCh + s.Error(update2Response.err) + var resourceExhausted *serviceerror.ResourceExhausted + s.ErrorAs(update2Response.err, &resourceExhausted) + s.Equal("workflow 
operation can not be applied because workflow is closing", update2Response.err.Error()) - _, err = poller2.PollAndProcessWorkflowTask() - s.NoError(err) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted + 6 WorkflowExecutionContinuedAsNew`, env.GetHistory(env.Namespace().String(), env.Tv().WithRunID(firstRunID).WorkflowExecution())) - update1Response := <-update1ResponseCh - s.NoError(update1Response.err) - s.Equal("Workflow Update failed because the Workflow completed before the Update completed.", update1Response.response.GetOutcome().GetFailure().GetMessage()) - - update2Response := <-update2ResponseCh - s.Error(update2Response.err) - var resourceExhausted *serviceerror.ResourceExhausted - s.ErrorAs(update2Response.err, &resourceExhausted) - s.Equal("workflow operation can not be applied because workflow is closing", update2Response.err.Error()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted - 6 WorkflowExecutionContinuedAsNew`, s.GetHistory(s.Namespace().String(), s.Tv().WithRunID(firstRunID).WorkflowExecution())) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted`, s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution())) - }) + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted`, env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution())) +} - t.Run("ContinueAsNew_Suggestion", func(t *testing.T) { - // setup CAN suggestion to be at 2nd Update - s := testcore.NewEnv(t, - testcore.WithDynamicConfig(dynamicconfig.WorkflowExecutionMaxTotalUpdates, 3), - testcore.WithDynamicConfig(dynamicconfig.WorkflowExecutionMaxTotalUpdatesSuggestContinueAsNewThreshold, 
0.5), - ) +func (s *WorkflowUpdateSuite) TestContinueAsNew_Suggestion() { + // setup CAN suggestion to be at 2nd Update + env := testcore.NewEnv(s.T(), + testcore.WithDynamicConfig(dynamicconfig.WorkflowExecutionMaxTotalUpdates, 3), + testcore.WithDynamicConfig(dynamicconfig.WorkflowExecutionMaxTotalUpdatesSuggestContinueAsNewThreshold, 0.5), + ) + + // start workflow + mustStartWorkflow(env, env.Tv()) + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) + + // send Update #1 - no CAN suggested + tv1 := env.Tv().WithUpdateIDNumber(1) + updateResultCh := sendUpdateNoError(env, tv1) + _, err = env.TaskPoller().PollAndHandleWorkflowTask(tv1, + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted {"SuggestContinueAsNew": false} + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted {"SuggestContinueAsNew": false}`, task.History.Events) + + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(tv1, task.Messages[0]), + }, nil + }) + s.NoError(err) + <-updateResultCh + + // send Update #2 - CAN suggested + tv2 := env.Tv().WithUpdateIDNumber(2) + updateResultCh = sendUpdateNoError(env, tv2) + _, err = env.TaskPoller().PollAndHandleWorkflowTask(tv2, + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + s.EqualHistoryEventsSuffix(` + WorkflowTaskStarted {"SuggestContinueAsNew": true}`, task.History.Events) + + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(tv2, task.Messages[0]), + }, nil + }) + s.NoError(err) + <-updateResultCh +} - // start workflow - mustStartWorkflow(s, s.Tv()) - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), 
taskpoller.DrainWorkflowTask) - s.NoError(err) +type UpdateWithStartSuite struct { + parallelsuite.Suite[*UpdateWithStartSuite] +} - // send Update #1 - no CAN suggested - tv1 := s.Tv().WithUpdateIDNumber(1) - updateResultCh := sendUpdateNoError(s, tv1) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(tv1, - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted {"SuggestContinueAsNew": false} - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted {"SuggestContinueAsNew": false}`, task.History.Events) +func TestUpdateWithStartSuite(t *testing.T) { + parallelsuite.Run(t, &UpdateWithStartSuite{}) +} - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(tv1, task.Messages[0]), - }, nil +type multiopsResponseErr struct { + response *workflowservice.ExecuteMultiOperationResponse + err error +} + +func (s *UpdateWithStartSuite) sendUpdateWithStart(env testcore.Env, startReq *workflowservice.StartWorkflowExecutionRequest, updateReq *workflowservice.UpdateWorkflowExecutionRequest) chan multiopsResponseErr { + ctx := testcore.NewContext(env.Context()) + capture := env.GetTestCluster().Host().CaptureMetricsHandler().StartCapture() + defer env.GetTestCluster().Host().CaptureMetricsHandler().StopCapture(capture) + + retCh := make(chan multiopsResponseErr) + go func() { + resp, err := env.FrontendClient().ExecuteMultiOperation( + ctx, + &workflowservice.ExecuteMultiOperationRequest{ + Namespace: env.Namespace().String(), + Operations: []*workflowservice.ExecuteMultiOperationRequest_Operation{ + { + Operation: &workflowservice.ExecuteMultiOperationRequest_Operation_StartWorkflow{ + StartWorkflow: startReq, + }, + }, + { + Operation: &workflowservice.ExecuteMultiOperationRequest_Operation_UpdateWorkflow{ + UpdateWorkflow: updateReq, + }, 
+ }, + }, }) - s.NoError(err) - <-updateResultCh - // send Update #2 - CAN suggested - tv2 := s.Tv().WithUpdateIDNumber(2) - updateResultCh = sendUpdateNoError(s, tv2) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(tv2, - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - s.EqualHistoryEventsSuffix(` - WorkflowTaskStarted {"SuggestContinueAsNew": true}`, task.History.Events) + if err == nil { + // Use assert (not require) in goroutine - require calls t.FailNow() which panics + //nolint:testifylint // intentional use of assert in goroutine + assert.Len(s.T(), resp.Responses, 2) - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(tv2, task.Messages[0]), - }, nil - }) - s.NoError(err) - <-updateResultCh - }) + startRes := resp.Responses[0].Response.(*workflowservice.ExecuteMultiOperationResponse_Response_StartWorkflow).StartWorkflow + //nolint:testifylint // intentional use of assert in goroutine + assert.NotEmpty(s.T(), startRes.RunId) - t.Run("UpdateWithStart", func(t *testing.T) { - type multiopsResponseErr struct { - response *workflowservice.ExecuteMultiOperationResponse - err error + updateRes := resp.Responses[1].Response.(*workflowservice.ExecuteMultiOperationResponse_Response_UpdateWorkflow).UpdateWorkflow + if updateReq.WaitPolicy.LifecycleStage == enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED { + //nolint:testifylint // intentional use of assert in goroutine + assert.NotNil(s.T(), updateRes.Outcome) + //nolint:testifylint // intentional use of assert in goroutine + assert.NotEmpty(s.T(), updateRes.Outcome.String()) + } } - sendUpdateWithStart := func(s testcore.Env, ctx context.Context, startReq *workflowservice.StartWorkflowExecutionRequest, updateReq *workflowservice.UpdateWorkflowExecutionRequest) chan multiopsResponseErr { - capture := s.GetTestCluster().Host().CaptureMetricsHandler().StartCapture() - defer 
s.GetTestCluster().Host().CaptureMetricsHandler().StopCapture(capture) + // make sure there's no lock contention + //nolint:testifylint // intentional use of assert in goroutine + assert.Empty(s.T(), capture.Snapshot()[metrics.TaskWorkflowBusyCounter.Name()]) - retCh := make(chan multiopsResponseErr) - go func() { - resp, err := s.FrontendClient().ExecuteMultiOperation( - ctx, - &workflowservice.ExecuteMultiOperationRequest{ - Namespace: s.Namespace().String(), - Operations: []*workflowservice.ExecuteMultiOperationRequest_Operation{ - { - Operation: &workflowservice.ExecuteMultiOperationRequest_Operation_StartWorkflow{ - StartWorkflow: startReq, - }, - }, - { - Operation: &workflowservice.ExecuteMultiOperationRequest_Operation_UpdateWorkflow{ - UpdateWorkflow: updateReq, - }, - }, - }, - }) + retCh <- multiopsResponseErr{resp, err} + }() + return retCh +} - if err == nil { - // Use assert (not require) in goroutine - require calls t.FailNow() which panics - //nolint:testifylint // intentional use of assert in goroutine - assert.Len(s.T(), resp.Responses, 2) +func (s *UpdateWithStartSuite) updateWithStartReq(env testcore.Env, tv *testvars.TestVars) *workflowservice.StartWorkflowExecutionRequest { + return &workflowservice.StartWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + WorkflowId: tv.WorkflowID(), + WorkflowType: tv.WorkflowType(), + TaskQueue: tv.TaskQueue(), + Identity: tv.WorkerIdentity(), + } +} - startRes := resp.Responses[0].Response.(*workflowservice.ExecuteMultiOperationResponse_Response_StartWorkflow).StartWorkflow - assert.NotEmpty(s.T(), startRes.RunId) +func (s *UpdateWithStartSuite) TestWorkflowIsNotRunning() { + for _, p := range []enumspb.WorkflowIdConflictPolicy{ + enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING, + enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING, + enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, + } { + s.Run(fmt.Sprintf("start workflow and send update (with conflict policy %v)", p), func(s 
*UpdateWithStartSuite) { + s.Run("and accept", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = p + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) - updateRes := resp.Responses[1].Response.(*workflowservice.ExecuteMultiOperationResponse_Response_UpdateWorkflow).UpdateWorkflow - if updateReq.WaitPolicy.LifecycleStage == enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED { - assert.NotNil(s.T(), updateRes.Outcome) - assert.NotEmpty(s.T(), updateRes.Outcome.String()) - } - } + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + }, nil + }) + s.NoError(err) - // make sure there's no lock contention - //nolint:testifylint // intentional use of assert in goroutine - assert.Empty(s.T(), capture.Snapshot()[metrics.TaskWorkflowBusyCounter.Name()]) + uwsRes := <-uwsCh + s.NoError(err) + startResp := uwsRes.response.Responses[0].GetStartWorkflow() + updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() + requireStartedAndRunning(s.T(), startResp) + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateRep.GetOutcome().GetSuccess())) - retCh <- multiopsResponseErr{resp, err} - }() - return retCh - } + // poll update to ensure same outcome is returned + pollRes, err := pollUpdate(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + s.NoError(err) + s.Equal(updateRep.Outcome.String(), pollRes.Outcome.String()) - startWorkflowReq := func(s 
testcore.Env, tv *testvars.TestVars) *workflowservice.StartWorkflowExecutionRequest { - return &workflowservice.StartWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), - WorkflowId: tv.WorkflowID(), - WorkflowType: tv.WorkflowType(), - TaskQueue: tv.TaskQueue(), - Identity: tv.WorkerIdentity(), - } - } + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowExecutionUpdateAccepted + 6 WorkflowExecutionUpdateCompleted`, env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution())) + }) - t.Run("workflow is not running", func(t *testing.T) { - for _, p := range []enumspb.WorkflowIdConflictPolicy{ - enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING, - enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING, - enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, - } { - t.Run(fmt.Sprintf("start workflow and send update (with conflict policy %v)", p), func(t *testing.T) { - t.Run("and accept", func(t *testing.T) { - s := testcore.NewEnv(t) - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = p - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - - uwsRes := <-uwsCh - s.NoError(err) - startResp := uwsRes.response.Responses[0].GetStartWorkflow() - updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() - requireStartedAndRunning(s.T(), startResp) - s.Equal("success-result-of-"+s.Tv().UpdateID(), 
testcore.DecodeString(s.T(), updateRep.GetOutcome().GetSuccess())) - - // poll update to ensure same outcome is returned - pollRes, err := pollUpdate(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - s.NoError(err) - s.Equal(updateRep.Outcome.String(), pollRes.Outcome.String()) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowExecutionUpdateAccepted - 6 WorkflowExecutionUpdateCompleted`, s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution())) - }) + s.Run("and reject", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) + startReq := s.updateWithStartReq(env, env.Tv()) + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) - t.Run("and reject", func(t *testing.T) { - s := testcore.NewEnv(t) - startReq := startWorkflowReq(s, s.Tv()) - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateRejectMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - - uwsRes := <-uwsCh - s.NoError(uwsRes.err) - startResp := uwsRes.response.Responses[0].GetStartWorkflow() - updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() - requireStartedAndRunning(s.T(), startResp) - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateRep.GetOutcome().GetFailure().GetMessage()) - - // poll update to 
ensure same outcome is returned - _, err = pollUpdate(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - s.Error(err) - s.ErrorAs(err, new(*serviceerror.NotFound)) - - s.EqualHistoryEvents(` - 1 WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted`, s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution())) + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateRejectMessages(env.Tv(), task.Messages[0]), + }, nil }) - }) - } - }) + s.NoError(err) - t.Run("workflow is running", func(t *testing.T) { - t.Run("workflow id conflict policy use-existing: only send update", func(t *testing.T) { - t.Run("and accept", func(t *testing.T) { - s := testcore.NewEnv(t) - // start workflow - _, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startWorkflowReq(s, s.Tv())) - s.NoError(err) - - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) - - // update-with-start - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - }, nil - }) - 
s.NoError(err) - - uwsRes := <-uwsCh - s.NoError(uwsRes.err) - startResp := uwsRes.response.Responses[0].GetStartWorkflow() - updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() - requireNotStartedButRunning(s.T(), startResp) - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateRep.GetOutcome().GetSuccess())) - - // poll update to ensure same outcome is returned - pollRes, err := pollUpdate(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - s.NoError(err) - s.Equal(updateRep.Outcome.String(), pollRes.Outcome.String()) - - s.EqualHistoryEvents(` + uwsRes := <-uwsCh + s.NoError(uwsRes.err) + startResp := uwsRes.response.Responses[0].GetStartWorkflow() + updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() + requireStartedAndRunning(s.T(), startResp) + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateRep.GetOutcome().GetFailure().GetMessage()) + + // poll update to ensure same outcome is returned + _, err = pollUpdate(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + s.Error(err) + s.ErrorAs(err, new(*serviceerror.NotFound)) + + s.EqualHistoryEvents(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - 5 WorkflowTaskScheduled - 6 WorkflowTaskStarted - 7 WorkflowTaskCompleted - 8 WorkflowExecutionUpdateAccepted - 9 WorkflowExecutionUpdateCompleted`, s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution())) + 4 WorkflowTaskCompleted`, env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution())) + }) + }) + } +} + +func (s *UpdateWithStartSuite) TestWorkflowIsRunning() { + s.Run("workflow id conflict policy use-existing: only send update", func(s *UpdateWithStartSuite) { + s.Run("and accept", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) + + // start workflow + _, err := 
env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), s.updateWithStartReq(env, env.Tv())) + s.NoError(err) + + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) + + // update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + }, nil }) + s.NoError(err) + + uwsRes := <-uwsCh + s.NoError(uwsRes.err) + startResp := uwsRes.response.Responses[0].GetStartWorkflow() + updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() + requireNotStartedButRunning(s.T(), startResp) + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateRep.GetOutcome().GetSuccess())) + + // poll update to ensure same outcome is returned + pollRes, err := pollUpdate(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + s.NoError(err) + s.Equal(updateRep.Outcome.String(), pollRes.Outcome.String()) + + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + 5 WorkflowTaskScheduled + 6 WorkflowTaskStarted + 7 WorkflowTaskCompleted + 8 WorkflowExecutionUpdateAccepted + 9 WorkflowExecutionUpdateCompleted`, env.GetHistory(env.Namespace().String(), env.Tv().WorkflowExecution())) + }) - t.Run("and reject", func(t *testing.T) { 
- s := testcore.NewEnv(t) - // start workflow - _, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startWorkflowReq(s, s.Tv())) - s.NoError(err) + s.Run("and reject", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) + // start workflow + _, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), s.updateWithStartReq(env, env.Tv())) + s.NoError(err) - // update-with-start - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateRejectMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - - uwsRes := <-uwsCh - s.NoError(uwsRes.err) - startResp := uwsRes.response.Responses[0].GetStartWorkflow() - updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() - requireNotStartedButRunning(s.T(), startResp) - s.Equal("rejection-of-"+s.Tv().UpdateID(), updateRep.GetOutcome().GetFailure().GetMessage()) - - // poll update to ensure same outcome is returned - _, err = pollUpdate(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - s.Error(err) - s.ErrorAs(err, new(*serviceerror.NotFound)) - - s.EqualHistoryEvents(` - 1 
WorkflowExecutionStarted - 2 WorkflowTaskScheduled - 3 WorkflowTaskStarted - 4 WorkflowTaskCompleted - `, s.GetHistory(s.Namespace().String(), s.Tv().WorkflowExecution())) + // update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateRejectMessages(env.Tv(), task.Messages[0]), + }, nil }) - }) + s.NoError(err) - t.Run("workflow id conflict policy terminate-existing", func(t *testing.T) { - t.Run("terminate workflow first, then start and update", func(t *testing.T) { - s := testcore.NewEnv(t) - // start workflow - firstWF, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startWorkflowReq(s, s.Tv())) - s.NoError(err) + uwsRes := <-uwsCh + s.NoError(uwsRes.err) + startResp := uwsRes.response.Responses[0].GetStartWorkflow() + updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() + requireNotStartedButRunning(s.T(), startResp) + s.Equal("rejection-of-"+env.Tv().UpdateID(), updateRep.GetOutcome().GetFailure().GetMessage()) + + // poll update to ensure same outcome is returned + _, err = pollUpdate(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + s.Error(err) + s.ErrorAs(err, new(*serviceerror.NotFound)) + + s.EqualHistoryEvents(` + 1 WorkflowExecutionStarted + 2 WorkflowTaskScheduled + 3 WorkflowTaskStarted + 4 WorkflowTaskCompleted + `, env.GetHistory(env.Namespace().String(), 
env.Tv().WorkflowExecution())) + }) + }) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) + s.Run("workflow id conflict policy terminate-existing", func(s *UpdateWithStartSuite) { + s.Run("terminate workflow first, then start and update", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) - // update-with-start - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) + // start workflow + firstWF, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), s.updateWithStartReq(env, env.Tv())) + s.NoError(err) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - - uwsRes := <-uwsCh - s.NoError(uwsRes.err) - startResp := uwsRes.response.Responses[0].GetStartWorkflow() - updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() - requireStartedAndRunning(s.T(), startResp) - s.Equal(startResp.RunId, updateRep.UpdateRef.GetWorkflowExecution().RunId) - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateRep.GetOutcome().GetSuccess())) - - // ensure workflow was terminated - descResp, err := s.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(s.Context()), - &workflowservice.DescribeWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), - Execution: &commonpb.WorkflowExecution{WorkflowId: startReq.WorkflowId, RunId: 
firstWF.RunId}, - }) - s.NoError(err) - s.Equal(enumspb.WORKFLOW_EXECUTION_STATUS_TERMINATED, descResp.WorkflowExecutionInfo.Status) - - // poll update to ensure same outcome is returned - pollRes, err := pollUpdate(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - s.NoError(err) - s.Equal(updateRep.Outcome.String(), pollRes.Outcome.String()) + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) + + // update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + }, nil }) + s.NoError(err) - t.Run("given an accepted update, attach to it", func(t *testing.T) { - s := testcore.NewEnv(t) - // 1st update-with-start - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) - uwsCh1 := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - - // accept the update - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return 
&workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - - uwsRes1 := <-uwsCh1 - s.NoError(uwsRes1.err) - startResp1 := uwsRes1.response.Responses[0].GetStartWorkflow() - updateRep1 := uwsRes1.response.Responses[1].GetUpdateWorkflow() - s.True(startResp1.Started) - s.Equal(startResp1.RunId, updateRep1.UpdateRef.GetWorkflowExecution().RunId) - s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED, updateRep1.Stage) - - // 2nd update-with-start: attaches to update instead of terminating workflow - uwsCh2 := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - - uwsRes2 := <-uwsCh2 - s.NoError(uwsRes2.err) - startResp2 := uwsRes2.response.Responses[0].GetStartWorkflow() - updateRep2 := uwsRes2.response.Responses[1].GetUpdateWorkflow() - s.False(startResp2.Started) - s.Equal(startResp2.RunId, startResp1.RunId) // no termination - s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED, updateRep2.Stage) + uwsRes := <-uwsCh + s.NoError(uwsRes.err) + startResp := uwsRes.response.Responses[0].GetStartWorkflow() + updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() + requireStartedAndRunning(s.T(), startResp) + s.Equal(startResp.RunId, updateRep.UpdateRef.GetWorkflowExecution().RunId) + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateRep.GetOutcome().GetSuccess())) + + // ensure workflow was terminated + descResp, err := env.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(env.Context()), + &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{WorkflowId: startReq.WorkflowId, RunId: firstWF.RunId}, }) - }) + s.NoError(err) + s.Equal(enumspb.WORKFLOW_EXECUTION_STATUS_TERMINATED, descResp.WorkflowExecutionInfo.Status) - t.Run("workflow id conflict policy fail: abort multi operation", func(t 
*testing.T) { - s := testcore.NewEnv(t) - _, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startWorkflowReq(s, s.Tv())) - s.NoError(err) + // poll update to ensure same outcome is returned + pollRes, err := pollUpdate(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + s.NoError(err) + s.Equal(updateRep.Outcome.String(), pollRes.Outcome.String()) + }) - // start workflow - startWorkflowReq(s, s.Tv()) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) + s.Run("given an accepted update, attach to it", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) - // update-with-start - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL - updateReq := updateWorkflowRequest(s, s.Tv(), &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - uwsRes := <-uwsCh - s.Error(uwsRes.err) - s.Equal("Update-with-Start could not be executed.", uwsRes.err.Error()) - errs := uwsRes.err.(*serviceerror.MultiOperationExecution).OperationErrors() - s.Len(errs, 2) - var alreadyStartedErr *serviceerror.WorkflowExecutionAlreadyStarted - s.ErrorAs(errs[0], &alreadyStartedErr) - s.Equal("Operation was aborted.", errs[1].Error()) - }) + // 1st update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) + uwsCh1 := s.sendUpdateWithStart(env, startReq, updateReq) - t.Run("receive completed update result", func(t *testing.T) { - _ = testcore.NewEnv(t) // unused s - for _, p := range 
[]enumspb.WorkflowIdConflictPolicy{ - enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING, - enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING, - enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, - } { - t.Run(fmt.Sprintf("for workflow id conflict policy %v", p), func(t *testing.T) { - s := testcore.NewEnv(t) - startReq := startWorkflowReq(s, s.Tv()) - updReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - - // 1st update-with-start - uwsCh1 := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updReq) - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - uwsRes1 := <-uwsCh1 - s.NoError(uwsRes1.err) - - // 2nd update-with-start: using *same* UpdateID - but *different* RequestID - uwsRes2 := <-sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updReq) - s.NoError(uwsRes2.err) - - s.Equal(uwsRes1.response.Responses[0].GetStartWorkflow().RunId, uwsRes2.response.Responses[0].GetStartWorkflow().RunId) - s.Equal(uwsRes1.response.Responses[1].GetUpdateWorkflow().Outcome.String(), uwsRes2.response.Responses[1].GetUpdateWorkflow().Outcome.String()) - - // poll update to ensure same outcome is returned - pollRes, err := pollUpdate(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - s.NoError(err) - s.Equal(uwsRes1.response.Responses[1].GetUpdateWorkflow().Outcome.String(), pollRes.Outcome.String()) - }) - } - }) + // accept the update + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) 
(*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptMessages(env.Tv(), task.Messages[0]), + }, nil + }) + s.NoError(err) - t.Run("dedupes start", func(t *testing.T) { - _ = testcore.NewEnv(t) // unused s - for _, p := range []enumspb.WorkflowIdConflictPolicy{ - enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING, - enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING, - enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, - } { - t.Run(fmt.Sprintf("for workflow id conflict policy %v", p), func(t *testing.T) { - s := testcore.NewEnv(t) - startReq := startWorkflowReq(s, s.Tv()) - startReq.RequestId = "request_id" - startReq.WorkflowIdConflictPolicy = p - updReq1 := updateWorkflowRequest(s, s.Tv().WithUpdateIDNumber(1), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - - // 1st update-with-start - uwsCh1 := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updReq1) - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - uwsRes1 := <-uwsCh1 - s.NoError(uwsRes1.err) - - // 2nd update-with-start: using *same* RequestID - but *different* UpdateID - updReq2 := updateWorkflowRequest(s, s.Tv().WithUpdateIDNumber(2), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - uwsCh2 := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updReq2) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return 
&workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - uwsRes2 := <-uwsCh2 - s.NoError(uwsRes1.err) - - s.Equal(uwsRes1.response.Responses[0].GetStartWorkflow().RunId, uwsRes2.response.Responses[0].GetStartWorkflow().RunId) - }) - } - }) + uwsRes1 := <-uwsCh1 + s.NoError(uwsRes1.err) + startResp1 := uwsRes1.response.Responses[0].GetStartWorkflow() + updateRep1 := uwsRes1.response.Responses[1].GetUpdateWorkflow() + s.True(startResp1.Started) + s.Equal(startResp1.RunId, updateRep1.UpdateRef.GetWorkflowExecution().RunId) + s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED, updateRep1.Stage) + + // 2nd update-with-start: attaches to update instead of terminating workflow + uwsCh2 := s.sendUpdateWithStart(env, startReq, updateReq) + + uwsRes2 := <-uwsCh2 + s.NoError(uwsRes2.err) + startResp2 := uwsRes2.response.Responses[0].GetStartWorkflow() + updateRep2 := uwsRes2.response.Responses[1].GetUpdateWorkflow() + s.False(startResp2.Started) + s.Equal(startResp2.RunId, startResp1.RunId) // no termination + s.Equal(enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED, updateRep2.Stage) }) + }) - t.Run("workflow is closed", func(t *testing.T) { - t.Run("workflow id reuse policy allow-duplicate", func(t *testing.T) { - s := testcore.NewEnv(t) - // start and terminate workflow - initialWorkflow, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startWorkflowReq(s, s.Tv())) - s.NoError(err) + s.Run("workflow id conflict policy fail: abort multi operation", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) + _, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), s.updateWithStartReq(env, env.Tv())) + s.NoError(err) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) + // start workflow + s.updateWithStartReq(env, env.Tv()) 
+ _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) - _, err = s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(s.Context()), - &workflowservice.TerminateWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), - WorkflowExecution: s.Tv().WorkflowExecution(), - Reason: s.Tv().Any().String(), - }) - s.NoError(err) + // update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL + updateReq := updateWorkflowRequest(env, env.Tv(), &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + uwsRes := <-uwsCh + s.Error(uwsRes.err) + s.Equal("Update-with-Start could not be executed.", uwsRes.err.Error()) + errs := uwsRes.err.(*serviceerror.MultiOperationExecution).OperationErrors() + s.Len(errs, 2) + var alreadyStartedErr *serviceerror.WorkflowExecutionAlreadyStarted + s.ErrorAs(errs[0], &alreadyStartedErr) + s.Equal("Operation was aborted.", errs[1].Error()) + }) - // update-with-start - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdReusePolicy = enumspb.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE - updateReq := updateWorkflowRequest(s, s.Tv(), + s.Run("receive completed update result", func(s *UpdateWithStartSuite) { + _ = testcore.NewEnv(s.T()) // unused s + for _, p := range []enumspb.WorkflowIdConflictPolicy{ + enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING, + enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING, + enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, + } { + s.Run(fmt.Sprintf("for workflow id conflict policy %v", p), func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) + startReq := s.updateWithStartReq(env, env.Tv()) + updReq := updateWorkflowRequest(env, env.Tv(), &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - 
uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), + // 1st update-with-start + uwsCh1 := s.sendUpdateWithStart(env, startReq, updReq) + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), }, nil }) s.NoError(err) + uwsRes1 := <-uwsCh1 + s.NoError(uwsRes1.err) - uwsRes := <-uwsCh - s.NoError(uwsRes.err) - startResp := uwsRes.response.Responses[0].GetStartWorkflow() - updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() - requireStartedAndRunning(s.T(), startResp) - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), updateRep.GetOutcome().GetSuccess())) + // 2nd update-with-start: using *same* UpdateID - but *different* RequestID + uwsRes2 := <-s.sendUpdateWithStart(env, startReq, updReq) + s.NoError(uwsRes2.err) - // ensure terminated workflow is not locked by update-with-start - err = s.SendSignal(s.Namespace().String(), &commonpb.WorkflowExecution{ - WorkflowId: s.Tv().WorkflowID(), - RunId: initialWorkflow.RunId, - }, s.Tv().Any().String(), s.Tv().Any().Payloads(), s.Tv().Any().String()) - s.ErrorContains(err, "workflow execution already completed") + s.Equal(uwsRes1.response.Responses[0].GetStartWorkflow().RunId, uwsRes2.response.Responses[0].GetStartWorkflow().RunId) + s.Equal(uwsRes1.response.Responses[1].GetUpdateWorkflow().Outcome.String(), uwsRes2.response.Responses[1].GetUpdateWorkflow().Outcome.String()) // poll update to ensure same outcome is returned - pollRes, err := pollUpdate(s, s.Tv(), + pollRes, err := pollUpdate(env, env.Tv(), &updatepb.WaitPolicy{LifecycleStage: 
enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) s.NoError(err) - s.Equal(updateRep.Outcome.String(), pollRes.Outcome.String()) + s.Equal(uwsRes1.response.Responses[1].GetUpdateWorkflow().Outcome.String(), pollRes.Outcome.String()) }) + } + }) - t.Run("workflow id reuse policy reject-duplicate", func(t *testing.T) { - s := testcore.NewEnv(t) - // start and terminate workflow - _, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startWorkflowReq(s, s.Tv())) - s.NoError(err) - - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) + s.Run("dedupes start", func(s *UpdateWithStartSuite) { + _ = testcore.NewEnv(s.T()) // unused s + for _, p := range []enumspb.WorkflowIdConflictPolicy{ + enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING, + enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING, + enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, + } { + s.Run(fmt.Sprintf("for workflow id conflict policy %v", p), func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.RequestId = "request_id" + startReq.WorkflowIdConflictPolicy = p + updReq1 := updateWorkflowRequest(env, env.Tv().WithUpdateIDNumber(1), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - _, err = s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(s.Context()), - &workflowservice.TerminateWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), - WorkflowExecution: s.Tv().WorkflowExecution(), - Reason: s.Tv().Any().String(), + // 1st update-with-start + uwsCh1 := s.sendUpdateWithStart(env, startReq, updReq1) + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: 
env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + }, nil }) s.NoError(err) + uwsRes1 := <-uwsCh1 + s.NoError(uwsRes1.err) - - // update-with-start - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdReusePolicy = enumspb.WORKFLOW_ID_REUSE_POLICY_REJECT_DUPLICATE - updateReq := updateWorkflowRequest(s, s.Tv(), + // 2nd update-with-start: using *same* RequestID - but *different* UpdateID + updReq2 := updateWorkflowRequest(env, env.Tv().WithUpdateIDNumber(2), &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) + uwsCh2 := s.sendUpdateWithStart(env, startReq, updReq2) + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + }, nil + }) + s.NoError(err) + uwsRes2 := <-uwsCh2 + s.NoError(uwsRes2.err) - uwsRes := <-uwsCh - s.Error(uwsRes.err) - s.Equal("Update-with-Start could not be executed.", uwsRes.err.Error()) - errs := uwsRes.err.(*serviceerror.MultiOperationExecution).OperationErrors() - s.Len(errs, 2) - s.Contains(errs[0].Error(), "Workflow execution already finished") - var alreadyStartedErr *serviceerror.WorkflowExecutionAlreadyStarted - s.ErrorAs(errs[0], &alreadyStartedErr) - s.Equal("Operation was aborted.", errs[1].Error()) + s.Equal(uwsRes1.response.Responses[0].GetStartWorkflow().RunId, uwsRes2.response.Responses[0].GetStartWorkflow().RunId) }) + } + }) +} - t.Run("receive completed update result", func(t *testing.T) { - _ = testcore.NewEnv(t) // unused s - for _, p := range []enumspb.WorkflowIdConflictPolicy{ - enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING, - enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING, -
enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, - } { - t.Run(fmt.Sprintf("for workflow id conflict policy %v", p), func(t *testing.T) { - s := testcore.NewEnv(t) - // 1st update-with-start - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = p - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - Commands: s.UpdateAcceptCompleteCommands(s.Tv()), - }, nil - }) - s.NoError(err) - - uwsRes := <-uwsCh - s.NoError(uwsRes.err) - startResp1 := uwsRes.response.Responses[0].GetStartWorkflow() - _ = uwsRes.response.Responses[1].GetUpdateWorkflow() - requireStartedAndRunning(s.T(), startResp1) - - // terminate workflow - _, err = s.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(s.Context()), - &workflowservice.TerminateWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), - WorkflowExecution: s.Tv().WorkflowExecution(), - Reason: s.Tv().Any().String(), - }) - s.NoError(err) - - // 2nd update-with-start (using the same Update ID but different Request ID) - uwsRes = <-sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - - s.NoError(uwsRes.err) - startResp := uwsRes.response.Responses[0].GetStartWorkflow() - updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() - s.False(startResp.Started) - s.Equal(enumspb.WORKFLOW_EXECUTION_STATUS_TERMINATED, startResp.Status) - // TODO: check startResp.Running - s.Equal("success-result-of-"+s.Tv().UpdateID(), testcore.DecodeString(s.T(), 
updateRep.GetOutcome().GetSuccess())) - }) - } +func (s *UpdateWithStartSuite) TestWorkflowIsClosed() { + s.Run("workflow id reuse policy allow-duplicate", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) + + // start and terminate workflow + initialWorkflow, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), s.updateWithStartReq(env, env.Tv())) + s.NoError(err) + + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) + + _, err = env.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(env.Context()), + &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + WorkflowExecution: env.Tv().WorkflowExecution(), + Reason: env.Tv().Any().String(), }) - }) + s.NoError(err) - t.Run("workflow start conflict", func(t *testing.T) { - t.Run("workflow id conflict policy fail: use-existing", func(t *testing.T) { - s := testcore.NewEnv(t) - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + // update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdReusePolicy = enumspb.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + }, nil + }) + s.NoError(err) - 
// simulate a race condition - s.InjectHook(testhooks.NewHook(testhooks.UpdateWithStartInBetweenLockAndStart, func() { - _, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startReq) - s.NoError(err) - })) + uwsRes := <-uwsCh + s.NoError(uwsRes.err) + startResp := uwsRes.response.Responses[0].GetStartWorkflow() + updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() + requireStartedAndRunning(s.T(), startResp) + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateRep.GetOutcome().GetSuccess())) + + // ensure terminated workflow is not locked by update-with-start + err = env.SendSignal(env.Namespace().String(), &commonpb.WorkflowExecution{ + WorkflowId: env.Tv().WorkflowID(), + RunId: initialWorkflow.RunId, + }, env.Tv().Any().String(), env.Tv().Any().Payloads(), env.Tv().Any().String()) + s.ErrorContains(err, "workflow execution already completed") + + // poll update to ensure same outcome is returned + pollRes, err := pollUpdate(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + s.NoError(err) + s.Equal(updateRep.Outcome.String(), pollRes.Outcome.String()) + }) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) + s.Run("workflow id reuse policy reject-duplicate", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{}, nil - }) - s.NoError(err) + // start and terminate workflow + _, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), s.updateWithStartReq(env, env.Tv())) + s.NoError(err) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task 
*workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) - <-uwsCh + _, err = env.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(env.Context()), + &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + WorkflowExecution: env.Tv().WorkflowExecution(), + Reason: env.Tv().Any().String(), }) - }) + s.NoError(err) - t.Run("update is aborted by closing workflow", func(t *testing.T) { - t.Run("retry request once when workflow was not started", func(t *testing.T) { - s := testcore.NewEnv(t) - // start workflow - _, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startWorkflowReq(s, s.Tv())) - s.NoError(err) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) + // update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdReusePolicy = enumspb.WORKFLOW_ID_REUSE_POLICY_REJECT_DUPLICATE + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + + uwsRes := <-uwsCh + s.Error(uwsRes.err) + s.Equal("Update-with-Start could not be executed.", uwsRes.err.Error()) + errs := uwsRes.err.(*serviceerror.MultiOperationExecution).OperationErrors() + s.Len(errs, 2) + s.Contains(errs[0].Error(), "Workflow execution already finished") + var alreadyStartedErr *serviceerror.WorkflowExecutionAlreadyStarted + s.ErrorAs(errs[0], &alreadyStartedErr) + s.Equal("Operation was aborted.", errs[1].Error()) + }) - // update-with-start - startReq := 
startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) + s.Run("receive completed update result", func(s *UpdateWithStartSuite) { + _ = testcore.NewEnv(s.T()) // unused s + for _, p := range []enumspb.WorkflowIdConflictPolicy{ + enumspb.WORKFLOW_ID_CONFLICT_POLICY_TERMINATE_EXISTING, + enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING, + enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, + } { + s.Run(fmt.Sprintf("for workflow id conflict policy %v", p), func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) + // 1st update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = p + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) - // wait until the update is admitted - then complete workflow - waitUpdateAdmitted(s, s.Tv()) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Commands: []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ - CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}, - }, - }, - }, + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + Commands: env.UpdateAcceptCompleteCommands(env.Tv()), }, nil }) s.NoError(err) - 
// update-with-start will do a server-side retry + uwsRes := <-uwsCh + s.NoError(uwsRes.err) + startResp1 := uwsRes.response.Responses[0].GetStartWorkflow() + _ = uwsRes.response.Responses[1].GetUpdateWorkflow() + requireStartedAndRunning(s.T(), startResp1) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), - }, nil + // terminate workflow + _, err = env.FrontendClient().TerminateWorkflowExecution(testcore.NewContext(env.Context()), + &workflowservice.TerminateWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + WorkflowExecution: env.Tv().WorkflowExecution(), + Reason: env.Tv().Any().String(), }) s.NoError(err) - uwsRes := <-uwsCh + // 2nd update-with-start (using the same Update ID but different Request ID) + uwsRes = <-s.sendUpdateWithStart(env, startReq, updateReq) + s.NoError(uwsRes.err) + startResp := uwsRes.response.Responses[0].GetStartWorkflow() + updateRep := uwsRes.response.Responses[1].GetUpdateWorkflow() + s.False(startResp.Started) + s.Equal(enumspb.WORKFLOW_EXECUTION_STATUS_TERMINATED, startResp.Status) + // TODO: check startResp.Running + s.Equal("success-result-of-"+env.Tv().UpdateID(), testcore.DecodeString(s.T(), updateRep.GetOutcome().GetSuccess())) }) + } + }) +} - t.Run("return retryable error after retry", func(t *testing.T) { - s := testcore.NewEnv(t) - // start workflow - _, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startWorkflowReq(s, s.Tv())) - s.NoError(err) - _, err = s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), taskpoller.DrainWorkflowTask) - s.NoError(err) +func (s *UpdateWithStartSuite) TestWorkflowStartConflict() { + s.Run("workflow id conflict policy fail: use-existing", func(s *UpdateWithStartSuite) { + env := 
testcore.NewEnv(s.T()) + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_COMPLETED}) + + // simulate a race condition + env.InjectHook(testhooks.NewHook(testhooks.UpdateWithStartInBetweenLockAndStart, func() { + _, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), startReq) + s.NoError(err) + })) - // update-with-start - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{}, nil + }) + s.NoError(err) + + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + }, nil + }) + s.NoError(err) - // wait until the update is admitted - waitUpdateAdmitted(s, s.Tv()) + <-uwsCh + }) +} - s.InjectHook(testhooks.NewHook(testhooks.UpdateWithStartOnClosingWorkflowRetry, func() { - _, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(s.Context()), startWorkflowReq(s, s.Tv())) - s.NoError(err) - })) +func (s 
*UpdateWithStartSuite) TestUpdateIsAbortedByClosingWorkflow() { + s.Run("retry request once when workflow was not started", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) - // complete workflow (twice including retry) - for range 2 { - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Commands: []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ - CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}, - }, - }, - }, - }, nil - }) - s.NoError(err) - } + // start workflow + _, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), s.updateWithStartReq(env, env.Tv())) + s.NoError(err) + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) - // ensure update-with-start returns retryable error - uwsRes := <-uwsCh - s.Error(uwsRes.err) - errs := uwsRes.err.(*serviceerror.MultiOperationExecution).OperationErrors() - s.Len(errs, 2) - s.Equal("Operation was aborted.", errs[0].Error()) - s.ErrorContains(errs[1], update.AbortedByWorkflowClosingErr.Error()) - s.ErrorAs(errs[1], new(*serviceerror.Aborted)) - }) + // update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) - t.Run("do not retry when workflow was started", func(t *testing.T) { - s := testcore.NewEnv(t) - // update-with-start - startReq := 
startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING - updateReq := updateWorkflowRequest(s, s.Tv(), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) - uwsCh := sendUpdateWithStart(s, testcore.NewContext(s.Context()), startReq, updateReq) - - // wait until the update is admitted - then complete workflow - waitUpdateAdmitted(s, s.Tv()) - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Commands: []*commandpb.Command{ - { - CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, - Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ - CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}, - }, - }, + // wait until the update is admitted - then complete workflow + waitUpdateAdmitted(env, env.Tv()) + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Commands: []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ + CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}, }, - }, nil - }) - s.NoError(err) + }, + }, + }, nil + }) + s.NoError(err) - uwsRes := <-uwsCh - s.Error(uwsRes.err) - errs := uwsRes.err.(*serviceerror.MultiOperationExecution).OperationErrors() - s.Len(errs, 2) - s.ErrorContains(errs[1], update.AbortedByWorkflowClosingErr.Error()) + // update-with-start will do a server-side retry + + _, err = 
env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + }, nil }) - }) + s.NoError(err) - t.Run("return update rate limit error", func(t *testing.T) { - // lower maximum total number of updates for testing purposes - s := testcore.NewEnv(t, - testcore.WithDynamicConfig(dynamicconfig.WorkflowExecutionMaxTotalUpdates, 1), - ) + uwsRes := <-uwsCh + s.NoError(uwsRes.err) + }) - ctx := testcore.NewContext(s.Context()) - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + s.Run("return retryable error after retry", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) - // allows 1st - updateReq := updateWorkflowRequest(s, s.Tv().WithUpdateIDNumber(0), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) - uwsCh := sendUpdateWithStart(s, ctx, startReq, updateReq) - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), + // start workflow + _, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), s.updateWithStartReq(env, env.Tv())) + s.NoError(err) + _, err = env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), taskpoller.DrainWorkflowTask) + s.NoError(err) + + // update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + + // wait until the update is admitted + waitUpdateAdmitted(env, env.Tv()) + + 
env.InjectHook(testhooks.NewHook(testhooks.UpdateWithStartOnClosingWorkflowRetry, func() { + _, err := env.FrontendClient().StartWorkflowExecution(testcore.NewContext(env.Context()), s.updateWithStartReq(env, env.Tv())) + s.NoError(err) + })) + + // complete workflow (twice including retry) + for range 2 { + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptCompleteMessages(s.Tv(), task.Messages[0]), + Commands: []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ + CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}, + }, + }, + }, }, nil }) s.NoError(err) - uwsRes := <-uwsCh - s.NoError(uwsRes.err) + } - // denies 2nd - updateReq = updateWorkflowRequest(s, s.Tv().WithUpdateIDNumber(1), updateReq.WaitPolicy) - select { - case <-sendUpdateWithStart(s, ctx, startReq, updateReq): - err = (<-sendUpdateWithStart(s, ctx, startReq, updateReq)).err - s.Error(err) - errs := err.(*serviceerror.MultiOperationExecution).OperationErrors() - s.Len(errs, 2) - s.Equal("Operation was aborted.", errs[0].Error()) - s.Contains(errs[1].Error(), "limit on the total number of distinct updates in this workflow has been reached") - case <-ctx.Done(): - s.Fail("timed out waiting for update") - } - }) + // ensure update-with-start returns retryable error + uwsRes := <-uwsCh + s.Error(uwsRes.err) + errs := uwsRes.err.(*serviceerror.MultiOperationExecution).OperationErrors() + s.Len(errs, 2) + s.Equal("Operation was aborted.", errs[0].Error()) + s.ErrorContains(errs[1], update.AbortedByWorkflowClosingErr.Error()) + s.ErrorAs(errs[1], new(*serviceerror.Aborted)) + }) - t.Run("return update in-flight limit error", 
func(t *testing.T) { - // lower maximum in-flight updates for testing purposes - maxInFlight := 1 - s := testcore.NewEnv(t, - testcore.WithDynamicConfig(dynamicconfig.WorkflowExecutionMaxInFlightUpdates, maxInFlight), - ) + s.Run("do not retry when workflow was started", func(s *UpdateWithStartSuite) { + env := testcore.NewEnv(s.T()) - ctx := testcore.NewContext(s.Context()) - startReq := startWorkflowReq(s, s.Tv()) - startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + // update-with-start + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + updateReq := updateWorkflowRequest(env, env.Tv(), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) - // Start workflow and admit 1st update (but don't complete it) - updateReq := updateWorkflowRequest(s, s.Tv().WithUpdateIDNumber(0), - &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) - uwsCh := sendUpdateWithStart(s, ctx, startReq, updateReq) + // wait until the update is admitted - then complete workflow + waitUpdateAdmitted(env, env.Tv()) + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Commands: []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_COMPLETE_WORKFLOW_EXECUTION, + Attributes: &commandpb.Command_CompleteWorkflowExecutionCommandAttributes{ + CompleteWorkflowExecutionCommandAttributes: &commandpb.CompleteWorkflowExecutionCommandAttributes{}, + }, + }, + }, + }, nil + }) + s.NoError(err) - // Poll workflow task but only accept, don't complete the update - _, err := s.TaskPoller().PollAndHandleWorkflowTask(s.Tv(), - func(task 
*workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { - return &workflowservice.RespondWorkflowTaskCompletedRequest{ - Messages: s.UpdateAcceptMessages(s.Tv(), task.Messages[0]), - }, nil - }) - s.NoError(err) - uwsRes := <-uwsCh - s.NoError(uwsRes.err) + uwsRes := <-uwsCh + s.Error(uwsRes.err) + errs := uwsRes.err.(*serviceerror.MultiOperationExecution).OperationErrors() + s.Len(errs, 2) + s.ErrorContains(errs[1], update.AbortedByWorkflowClosingErr.Error()) + }) +} - // Try to send 2nd update-with-start while 1st is still in-flight (not completed) - updateReq = updateWorkflowRequest(s, s.Tv().WithUpdateIDNumber(1), updateReq.WaitPolicy) - uwsCh = sendUpdateWithStart(s, ctx, startReq, updateReq) - select { - case uwsRes := <-uwsCh: - err = uwsRes.err - s.Error(err) +func (s *UpdateWithStartSuite) TestReturnUpdateRateLimitError() { + // lower maximum total number of updates for testing purposes + env := testcore.NewEnv(s.T(), + testcore.WithDynamicConfig(dynamicconfig.WorkflowExecutionMaxTotalUpdates, 1), + ) + + ctx := testcore.NewContext(env.Context()) + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + + // allows 1st + updateReq := updateWorkflowRequest(env, env.Tv().WithUpdateIDNumber(0), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) + uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptCompleteMessages(env.Tv(), task.Messages[0]), + }, nil + }) + s.NoError(err) + uwsRes := <-uwsCh + s.NoError(uwsRes.err) + + // denies 2nd + updateReq = updateWorkflowRequest(env, 
env.Tv().WithUpdateIDNumber(1), updateReq.WaitPolicy) + select { + case <-s.sendUpdateWithStart(env, startReq, updateReq): + err = (<-s.sendUpdateWithStart(env, startReq, updateReq)).err + s.Error(err) + errs := err.(*serviceerror.MultiOperationExecution).OperationErrors() + s.Len(errs, 2) + s.Equal("Operation was aborted.", errs[0].Error()) + s.Contains(errs[1].Error(), "limit on the total number of distinct updates in this workflow has been reached") + case <-ctx.Done(): + s.Fail("timed out waiting for update") + } +} - var multiOpsErr *serviceerror.MultiOperationExecution - s.ErrorAs(err, &multiOpsErr) - - errs := multiOpsErr.OperationErrors() - s.Len(errs, 2) - s.Equal("Operation was aborted.", errs[0].Error()) - s.Contains(errs[1].Error(), "limit on number of concurrent in-flight updates has been reached") - - // Verify ResourceExhausted error is accessible with all details preserved - var resExhausted *serviceerror.ResourceExhausted - s.ErrorAs(errs[1], &resExhausted) - s.Equal(enumspb.RESOURCE_EXHAUSTED_CAUSE_CONCURRENT_LIMIT, resExhausted.Cause) - s.Equal(enumspb.RESOURCE_EXHAUSTED_SCOPE_NAMESPACE, resExhausted.Scope) - s.Contains(resExhausted.Message, "limit on number of concurrent in-flight updates") - case <-ctx.Done(): - s.Fail("timed out waiting for update") - } +func (s *UpdateWithStartSuite) TestReturnUpdateInFlightLimitError() { + // lower maximum in-flight updates for testing purposes + maxInFlight := 1 + env := testcore.NewEnv(s.T(), + testcore.WithDynamicConfig(dynamicconfig.WorkflowExecutionMaxInFlightUpdates, maxInFlight), + ) + + ctx := testcore.NewContext(env.Context()) + startReq := s.updateWithStartReq(env, env.Tv()) + startReq.WorkflowIdConflictPolicy = enumspb.WORKFLOW_ID_CONFLICT_POLICY_USE_EXISTING + + // Start workflow and admit 1st update (but don't complete it) + updateReq := updateWorkflowRequest(env, env.Tv().WithUpdateIDNumber(0), + &updatepb.WaitPolicy{LifecycleStage: enumspb.UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ACCEPTED}) + 
uwsCh := s.sendUpdateWithStart(env, startReq, updateReq) + + // Poll workflow task but only accept, don't complete the update + _, err := env.TaskPoller().PollAndHandleWorkflowTask(env.Tv(), + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Messages: env.UpdateAcceptMessages(env.Tv(), task.Messages[0]), + }, nil }) - }) + s.NoError(err) + uwsRes := <-uwsCh + s.NoError(uwsRes.err) + + // Try to send 2nd update-with-start while 1st is still in-flight (not completed) + updateReq = updateWorkflowRequest(env, env.Tv().WithUpdateIDNumber(1), updateReq.WaitPolicy) + uwsCh = s.sendUpdateWithStart(env, startReq, updateReq) + select { + case uwsRes := <-uwsCh: + err = uwsRes.err + s.Error(err) + + var multiOpsErr *serviceerror.MultiOperationExecution + s.ErrorAs(err, &multiOpsErr) + + errs := multiOpsErr.OperationErrors() + s.Len(errs, 2) + s.Equal("Operation was aborted.", errs[0].Error()) + s.Contains(errs[1].Error(), "limit on number of concurrent in-flight updates has been reached") + + // Verify ResourceExhausted error is accessible with all details preserved + var resExhausted *serviceerror.ResourceExhausted + s.ErrorAs(errs[1], &resExhausted) + s.Equal(enumspb.RESOURCE_EXHAUSTED_CAUSE_CONCURRENT_LIMIT, resExhausted.Cause) + s.Equal(enumspb.RESOURCE_EXHAUSTED_SCOPE_NAMESPACE, resExhausted.Scope) + s.Contains(resExhausted.Message, "limit on number of concurrent in-flight updates") + case <-ctx.Done(): + s.Fail("timed out waiting for update") + } } diff --git a/tests/user_metadata_test.go b/tests/user_metadata_test.go index 2f4422530b..f34ca5d717 100644 --- a/tests/user_metadata_test.go +++ b/tests/user_metadata_test.go @@ -4,132 +4,149 @@ import ( "testing" "github.com/google/uuid" - "github.com/stretchr/testify/suite" commonpb "go.temporal.io/api/common/v1" sdkpb "go.temporal.io/api/sdk/v1" updatepb "go.temporal.io/api/update/v1" 
"go.temporal.io/api/workflowservice/v1" - "go.temporal.io/server/common/testing/testvars" + "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/tests/testcore" ) type UserMetadataSuite struct { - testcore.FunctionalTestBase + parallelsuite.Suite[*UserMetadataSuite] } -func TestUserMetadataSuite(t *testing.T) { - t.Parallel() - suite.Run(t, new(UserMetadataSuite)) +func TestUserMetadata(t *testing.T) { + parallelsuite.Run(t, &UserMetadataSuite{}) } -func (s *UserMetadataSuite) TestUserMetadata() { - getDescribeWorkflowExecutionInfo := func(client workflowservice.WorkflowServiceClient, namespace string, workflowID string, runID string) (*workflowservice.DescribeWorkflowExecutionResponse, error) { - return client.DescribeWorkflowExecution(testcore.NewContext(), &workflowservice.DescribeWorkflowExecutionRequest{ - Namespace: namespace, - Execution: &commonpb.WorkflowExecution{ - WorkflowId: workflowID, - RunId: runID, - }, - }) +func (s *UserMetadataSuite) TestStartWorkflowExecutionRecordsUserMetadata() { + env := testcore.NewEnv(s.T()) + metadata := &sdkpb.UserMetadata{ + Summary: &commonpb.Payload{ + Metadata: map[string][]byte{"test_summary_key": []byte(`test_summary_val`)}, + Data: []byte(`Test summary Data`), + }, + Details: &commonpb.Payload{ + Metadata: map[string][]byte{"test_details_key": []byte(`test_details_val`)}, + Data: []byte(`Test Details Data`), + }, } - prepareTestUserMetadata := func() *sdkpb.UserMetadata { - return &sdkpb.UserMetadata{ - Summary: &commonpb.Payload{ - Metadata: map[string][]byte{"test_summary_key": []byte(`test_summary_val`)}, - Data: []byte(`Test summary Data`), - }, - Details: &commonpb.Payload{ - Metadata: map[string][]byte{"test_details_key": []byte(`test_details_val`)}, - Data: []byte(`Test Details Data`), - }, - } + request := &workflowservice.StartWorkflowExecutionRequest{ + RequestId: uuid.NewString(), + Namespace: env.Namespace().String(), + WorkflowId: env.Tv().WorkflowID(), + WorkflowType: 
env.Tv().WorkflowType(), + TaskQueue: env.Tv().TaskQueue(), + UserMetadata: metadata, } - s.Run("StartWorkflowExecution records UserMetadata", func() { - tv := testvars.New(s.T()) - metadata := prepareTestUserMetadata() - request := &workflowservice.StartWorkflowExecutionRequest{ - RequestId: uuid.NewString(), - Namespace: s.Namespace().String(), - WorkflowId: tv.WorkflowID(), - WorkflowType: tv.WorkflowType(), - TaskQueue: tv.TaskQueue(), - UserMetadata: metadata, - } - - we, err := s.FrontendClient().StartWorkflowExecution(testcore.NewContext(), request) - s.NoError(err) + we, err := env.FrontendClient().StartWorkflowExecution(env.Context(), request) + s.NoError(err) - // Verify that the UserMetadata associated with the start event is returned in the describe response. - describeInfo, err := getDescribeWorkflowExecutionInfo(s.FrontendClient(), s.Namespace().String(), tv.WorkflowID(), we.RunId) - s.NoError(err) - s.EqualExportedValues(metadata, describeInfo.ExecutionConfig.UserMetadata) + // Verify that the UserMetadata associated with the start event is returned in the describe response. 
+ describeInfo, err := env.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(), &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: env.Tv().WorkflowID(), + RunId: we.RunId, + }, }) + s.NoError(err) + s.EqualExportedValues(metadata, describeInfo.ExecutionConfig.UserMetadata) +} - s.Run("SignalWithStartWorkflowExecution records UserMetadata", func() { - tv := testvars.New(s.T()) - metadata := prepareTestUserMetadata() - request := &workflowservice.SignalWithStartWorkflowExecutionRequest{ - RequestId: uuid.NewString(), - Namespace: s.Namespace().String(), - WorkflowId: tv.WorkflowID(), - WorkflowType: tv.WorkflowType(), - TaskQueue: tv.TaskQueue(), - SignalName: "TEST-SIGNAL", - UserMetadata: metadata, - } +func (s *UserMetadataSuite) TestSignalWithStartWorkflowExecutionRecordsUserMetadata() { + env := testcore.NewEnv(s.T()) + metadata := &sdkpb.UserMetadata{ + Summary: &commonpb.Payload{ + Metadata: map[string][]byte{"test_summary_key": []byte(`test_summary_val`)}, + Data: []byte(`Test summary Data`), + }, + Details: &commonpb.Payload{ + Metadata: map[string][]byte{"test_details_key": []byte(`test_details_val`)}, + Data: []byte(`Test Details Data`), + }, + } + request := &workflowservice.SignalWithStartWorkflowExecutionRequest{ + RequestId: uuid.NewString(), + Namespace: env.Namespace().String(), + WorkflowId: env.Tv().WorkflowID(), + WorkflowType: env.Tv().WorkflowType(), + TaskQueue: env.Tv().TaskQueue(), + SignalName: "TEST-SIGNAL", + UserMetadata: metadata, + } - we, err := s.FrontendClient().SignalWithStartWorkflowExecution(testcore.NewContext(), request) - s.NoError(err) + we, err := env.FrontendClient().SignalWithStartWorkflowExecution(env.Context(), request) + s.NoError(err) - // Verify that the UserMetadata associated with the start event is returned in the describe response. 
- describeInfo, err := getDescribeWorkflowExecutionInfo(s.FrontendClient(), s.Namespace().String(), tv.WorkflowID(), we.RunId) - s.NoError(err) - s.EqualExportedValues(metadata, describeInfo.ExecutionConfig.UserMetadata) + // Verify that the UserMetadata associated with the start event is returned in the describe response. + describeInfo, err := env.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(), &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: env.Tv().WorkflowID(), + RunId: we.RunId, + }, }) + s.NoError(err) + s.EqualExportedValues(metadata, describeInfo.ExecutionConfig.UserMetadata) +} - s.Run("ExecuteMultiOperation records UserMetadata", func() { - tv := testvars.New(s.T()) - metadata := prepareTestUserMetadata() - startWorkflowRequest := &workflowservice.StartWorkflowExecutionRequest{ - RequestId: uuid.NewString(), - Namespace: s.Namespace().String(), - WorkflowId: tv.WorkflowID(), - WorkflowType: tv.WorkflowType(), - TaskQueue: tv.TaskQueue(), - UserMetadata: metadata, - } - updateWorkflowRequest := &workflowservice.UpdateWorkflowExecutionRequest{ - Namespace: s.Namespace().String(), - WorkflowExecution: &commonpb.WorkflowExecution{WorkflowId: tv.WorkflowID()}, - Request: &updatepb.Request{ - Meta: &updatepb.Meta{UpdateId: "UPDATE_ID"}, - Input: &updatepb.Input{Name: "NAME"}, - }, - } - request := &workflowservice.ExecuteMultiOperationRequest{ - Namespace: s.Namespace().String(), - Operations: []*workflowservice.ExecuteMultiOperationRequest_Operation{ - { // start workflow operation - Operation: &workflowservice.ExecuteMultiOperationRequest_Operation_StartWorkflow{ - StartWorkflow: startWorkflowRequest, - }, +func (s *UserMetadataSuite) TestExecuteMultiOperationRecordsUserMetadata() { + env := testcore.NewEnv(s.T()) + metadata := &sdkpb.UserMetadata{ + Summary: &commonpb.Payload{ + Metadata: map[string][]byte{"test_summary_key": 
[]byte(`test_summary_val`)}, + Data: []byte(`Test summary Data`), + }, + Details: &commonpb.Payload{ + Metadata: map[string][]byte{"test_details_key": []byte(`test_details_val`)}, + Data: []byte(`Test Details Data`), + }, + } + startWorkflowRequest := &workflowservice.StartWorkflowExecutionRequest{ + RequestId: uuid.NewString(), + Namespace: env.Namespace().String(), + WorkflowId: env.Tv().WorkflowID(), + WorkflowType: env.Tv().WorkflowType(), + TaskQueue: env.Tv().TaskQueue(), + UserMetadata: metadata, + } + updateWorkflowRequest := &workflowservice.UpdateWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + WorkflowExecution: &commonpb.WorkflowExecution{WorkflowId: env.Tv().WorkflowID()}, + Request: &updatepb.Request{ + Meta: &updatepb.Meta{UpdateId: "UPDATE_ID"}, + Input: &updatepb.Input{Name: "NAME"}, + }, + } + request := &workflowservice.ExecuteMultiOperationRequest{ + Namespace: env.Namespace().String(), + Operations: []*workflowservice.ExecuteMultiOperationRequest_Operation{ + { // start workflow operation + Operation: &workflowservice.ExecuteMultiOperationRequest_Operation_StartWorkflow{ + StartWorkflow: startWorkflowRequest, }, - { // update workflow operation - Operation: &workflowservice.ExecuteMultiOperationRequest_Operation_UpdateWorkflow{ - UpdateWorkflow: updateWorkflowRequest, - }, + }, + { // update workflow operation + Operation: &workflowservice.ExecuteMultiOperationRequest_Operation_UpdateWorkflow{ + UpdateWorkflow: updateWorkflowRequest, }, }, - } + }, + } - _, err := s.FrontendClient().ExecuteMultiOperation(testcore.NewContext(), request) - s.NoError(err) + _, err := env.FrontendClient().ExecuteMultiOperation(env.Context(), request) + s.NoError(err) - // Verify that the UserMetadata associated with the start event is returned in the describe response. 
- describeInfo, err := getDescribeWorkflowExecutionInfo(s.FrontendClient(), s.Namespace().String(), tv.WorkflowID(), "") - s.NoError(err) - s.EqualExportedValues(metadata, describeInfo.ExecutionConfig.UserMetadata) + // Verify that the UserMetadata associated with the start event is returned in the describe response. + describeInfo, err := env.FrontendClient().DescribeWorkflowExecution(testcore.NewContext(), &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: env.Tv().WorkflowID(), + }, }) - + s.NoError(err) + s.EqualExportedValues(metadata, describeInfo.ExecutionConfig.UserMetadata) } diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index c053ccff5a..31a83c44d4 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -75,27 +75,20 @@ const ( type Versioning3Suite struct { testcore.FunctionalTestBase - useV32 bool deploymentWorkflowVersion workerdeployment.DeploymentWorkflowVersion - useRevisionNumbers bool - useNewDeploymentData bool } -func TestVersioning3FunctionalSuiteV2(t *testing.T) { +func TestVersioning3FunctionalSuite(t *testing.T) { t.Parallel() suite.Run(t, &Versioning3Suite{ deploymentWorkflowVersion: workerdeployment.VersionDataRevisionNumber, - useV32: true, - useRevisionNumbers: true, - useNewDeploymentData: true, }) } func (s *Versioning3Suite) SetupSuite() { dynamicConfigOverrides := map[dynamicconfig.Key]any{ - dynamicconfig.MatchingDeploymentWorkflowVersion.Key(): int(s.deploymentWorkflowVersion), - dynamicconfig.UseRevisionNumberForWorkerVersioning.Key(): s.useRevisionNumbers, - dynamicconfig.MatchingForwarderMaxChildrenPerNode.Key(): partitionTreeDegree, + dynamicconfig.MatchingDeploymentWorkflowVersion.Key(): int(s.deploymentWorkflowVersion), + dynamicconfig.MatchingForwarderMaxChildrenPerNode.Key(): partitionTreeDegree, // Make sure we don't hit the rate limiter in tests 
dynamicconfig.FrontendGlobalNamespaceNamespaceReplicationInducingAPIsRPS.Key(): 1000, @@ -131,7 +124,7 @@ func (s *Versioning3Suite) TestPinnedTask_NoProperPoller() { // Cancel the poller after condition is met cancelPoller() - s.startWorkflow(tv, tv.VersioningOverridePinned(s.useV32)) + s.startWorkflow(tv, tv.VersioningOverridePinned()) s.idlePollWorkflow(context.Background(), tv, false, ver3MinPollTime, "unversioned worker should not receive pinned task") // Sleeping to let the pollers arrive to server before ending the test. @@ -153,69 +146,43 @@ func (s *Versioning3Suite) TestUnpinnedTask_NonCurrentDeployment() { } func (s *Versioning3Suite) TestUnpinnedTask_OldDeployment() { - if s.useNewDeploymentData == true { - s.RunTestWithMatchingBehavior( - func() { - tv := testvars.New(s) - tvOldDeployment := tv.WithBuildIDNumber(1) - tvNewDeployment := tv.WithBuildIDNumber(2) - - // previous current deployment - s.updateTaskQueueDeploymentDataWithRoutingConfig(tvOldDeployment, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tvOldDeployment.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 1, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tvOldDeployment.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf) - - // current deployment - s.updateTaskQueueDeploymentDataWithRoutingConfig(tvNewDeployment, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tvNewDeployment.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 2, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tvNewDeployment.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf) - - s.startWorkflow(tv, nil) 
- - s.idlePollWorkflow( - context.Background(), - tvOldDeployment, - true, - ver3MinPollTime, - "old deployment should not receive unpinned task", - ) - // Sleeping to let the pollers arrive to server before ending the test. - time.Sleep(200 * time.Millisecond) //nolint:forbidigo - }, - ) - } else { - s.RunTestWithMatchingBehavior( - func() { - tv := testvars.New(s) - tvOldDeployment := tv.WithBuildIDNumber(1) - tvNewDeployment := tv.WithBuildIDNumber(2) - // previous current deployment - s.updateTaskQueueDeploymentData(tvOldDeployment, true, 0, false, time.Minute, tqTypeWf) - // current deployment - s.updateTaskQueueDeploymentData(tvNewDeployment, true, 0, false, 0, tqTypeWf) - - s.startWorkflow(tv, nil) - - s.idlePollWorkflow( - context.Background(), - tvOldDeployment, - true, - ver3MinPollTime, - "old deployment should not receive unpinned task", - ) - // Sleeping to let the pollers arrive to server before ending the test. - time.Sleep(200 * time.Millisecond) //nolint:forbidigo - }, - ) - } + s.RunTestWithMatchingBehavior( + func() { + tv := testvars.New(s) + tvOldDeployment := tv.WithBuildIDNumber(1) + tvNewDeployment := tv.WithBuildIDNumber(2) + + // previous current deployment + s.updateTaskQueueDeploymentDataWithRoutingConfig(tvOldDeployment, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tvOldDeployment.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 1, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tvOldDeployment.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeWf) + + // current deployment + s.updateTaskQueueDeploymentDataWithRoutingConfig(tvNewDeployment, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tvNewDeployment.DeploymentVersionString()), + 
CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 2, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tvNewDeployment.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeWf) + + s.startWorkflow(tv, nil) + + s.idlePollWorkflow( + context.Background(), + tvOldDeployment, + true, + ver3MinPollTime, + "old deployment should not receive unpinned task", + ) + // Sleeping to let the pollers arrive to server before ending the test. + time.Sleep(200 * time.Millisecond) //nolint:forbidigo + }, + ) } func (s *Versioning3Suite) TestSessionActivityResourceSpecificTaskQueueNotRegisteredInVersion() { @@ -346,24 +313,24 @@ func (s *Versioning3Suite) testWorkflowWithPinnedOverride(sticky bool) { // Wait for the version to be present in the task queue. Version existence is required before it can be set as an override. s.validatePinnedVersionExistsInTaskQueue(tv) - runID := s.startWorkflow(tv, tv.VersioningOverridePinned(s.useV32)) + runID := s.startWorkflow(tv, tv.VersioningOverridePinned()) s.WaitForChannel(ctx, wftCompleted) - s.verifyWorkflowVersioning(s.Assertions, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(s.useV32), nil) + s.verifyWorkflowVersioning(s.Assertions, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(), nil) s.verifyVersioningSAs(tv, vbPinned, enumspb.WORKFLOW_EXECUTION_STATUS_RUNNING, tv) if sticky { s.verifyWorkflowStickyQueue(tv.WithRunID(runID)) } s.WaitForChannel(ctx, actCompleted) - s.verifyWorkflowVersioning(s.Assertions, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(s.useV32), nil) + s.verifyWorkflowVersioning(s.Assertions, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(), nil) s.pollWftAndHandle(tv, sticky, nil, func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { s.NotNil(task) return respondCompleteWorkflow(tv, vbUnpinned), nil 
}) - s.verifyWorkflowVersioning(s.Assertions, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(s.useV32), nil) + s.verifyWorkflowVersioning(s.Assertions, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(), nil) } func (s *Versioning3Suite) TestQueryWithPinnedOverride_NoSticky() { @@ -434,9 +401,9 @@ func (s *Versioning3Suite) testPinnedQuery_DrainedVersion(pollersPresent bool, r return respondCompleteWorkflow(tv, vbPinned), nil }) - s.startWorkflow(tv, tv.VersioningOverridePinned(s.useV32)) + s.startWorkflow(tv, tv.VersioningOverridePinned()) s.WaitForChannel(ctx, wftCompleted) - s.verifyWorkflowVersioning(s.Assertions, tv, vbPinned, tv.Deployment(), tv.VersioningOverridePinned(s.useV32), nil) + s.verifyWorkflowVersioning(s.Assertions, tv, vbPinned, tv.Deployment(), tv.VersioningOverridePinned(), nil) // create version v2 and make it current which shall make v1 go from current -> draining/drained idlePollerDone = make(chan struct{}) @@ -529,10 +496,10 @@ func (s *Versioning3Suite) testQueryWithPinnedOverride(sticky bool) { a.True(resp.GetIsMember()) }, 10*time.Second, 100*time.Millisecond) - runID := s.startWorkflow(tv, tv.VersioningOverridePinned(s.useV32)) + runID := s.startWorkflow(tv, tv.VersioningOverridePinned()) s.WaitForChannel(ctx, wftCompleted) - s.verifyWorkflowVersioning(s.Assertions, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(s.useV32), nil) + s.verifyWorkflowVersioning(s.Assertions, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(), nil) if sticky { s.verifyWorkflowStickyQueue(tv.WithRunID(runID)) } @@ -646,7 +613,7 @@ func (s *Versioning3Suite) testPinnedWorkflowWithLateActivityPoller() { }) s.waitForDeploymentDataPropagation(tv, versionStatusInactive, false, tqTypeWf) - override := tv.VersioningOverridePinned(s.useV32) + override := tv.VersioningOverridePinned() s.startWorkflow(tv, override) s.WaitForChannel(ctx, wftCompleted) @@ -756,7 +723,7 @@ func (s *Versioning3Suite) 
TestSearchByUsedVersion() { }) s.waitForDeploymentDataPropagation(tv, versionStatusInactive, false, tqTypeWf) - s.startWorkflow(tv, tv.VersioningOverridePinned(s.useV32)) + s.startWorkflow(tv, tv.VersioningOverridePinned()) <-wftCompleted s.verifyVersioningSAs(tv, vbPinned, enumspb.WORKFLOW_EXECUTION_STATUS_COMPLETED, tv) @@ -1346,17 +1313,13 @@ func (s *Versioning3Suite) testTransitionFromWft(sticky bool, toUnversioned bool s.warmUpSticky(tv1) } - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 1, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv1, true, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 1, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeWf, tqTypeAct) runID := s.startWorkflow(tv1, nil) s.pollWftAndHandle(tv1, false, nil, @@ -1380,15 +1343,11 @@ func (s *Versioning3Suite) testTransitionFromWft(sticky bool, toUnversioned bool if toUnversioned { // unset A as current - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: nil, - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 2, - }, 
map[string]*deploymentspb.WorkerDeploymentVersionData{}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv1, false, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: nil, + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 2, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{}, []string{}, tqTypeWf, tqTypeAct) s.unversionedPollWftAndHandle(tv1, false, nil, func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { @@ -1401,19 +1360,15 @@ func (s *Versioning3Suite) testTransitionFromWft(sticky bool, toUnversioned bool } else { // Set B as the current deployment - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 2, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }, tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, - }}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv2, true, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 2, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }, 
tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, + }}, []string{}, tqTypeWf, tqTypeAct) s.pollWftAndHandle(tv2, false, nil, func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { @@ -1472,17 +1427,13 @@ func (s *Versioning3Suite) testDoubleTransition(unversionedSrc bool, signal bool if !unversionedSrc { // sourceV is v1, set current version to it - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 1, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv1, true, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 1, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeWf, tqTypeAct) } s.doPollWftAndHandle(tv1, !unversionedSrc, false, nil, @@ -1499,19 +1450,15 @@ func (s *Versioning3Suite) testDoubleTransition(unversionedSrc bool, signal bool } // set current version to v2 - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), - 
CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 2, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }, tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, - }}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv2, true, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 2, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }, tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, + }}, []string{}, tqTypeWf, tqTypeAct) // poll activity from v2 worker, this should start a transition but should not immediately start the activity. 
go s.idlePollActivity(context.Background(), tv2, true, time.Minute, "v2 worker should not receive the activity") @@ -1531,29 +1478,21 @@ func (s *Versioning3Suite) testDoubleTransition(unversionedSrc bool, signal bool // Back to sourceV if unversionedSrc { - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: nil, - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 3, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv2, false, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: nil, + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 3, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{}, []string{}, tqTypeWf, tqTypeAct) } else { - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 3, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }, tv2.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, - }}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv1, true, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 3, + }, 
map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }, tv2.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, + }}, []string{}, tqTypeWf, tqTypeAct) } // Now poll for wf task from sourceV while there is a transition to v2 @@ -1575,19 +1514,15 @@ func (s *Versioning3Suite) testDoubleTransition(unversionedSrc bool, signal bool }) // Set v2 as the current version again - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 4, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }, tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, - }}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv2, true, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 4, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }, tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, + }}, []string{}, tqTypeWf, tqTypeAct) s.pollWftAndHandle(tv2, false, nil, func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { @@ -1623,35 
+1558,27 @@ func (s *Versioning3Suite) nexusTaskStaysOnCurrentDeployment() { } // current deployment is -> tv1 - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 1, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeNexus) - } else { - s.updateTaskQueueDeploymentData(tv1, true, 0, false, 0, tqTypeNexus) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 1, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeNexus) // local poller with deployment A receives task s.pollAndDispatchNexusTask(tv1, nexusRequest) // current deployment is now -> tv2 - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 2, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }, tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, - }}, []string{}, tqTypeNexus) - } else { - s.updateTaskQueueDeploymentData(tv2, true, 0, false, 0, 
tqTypeNexus) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 2, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }, tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, + }}, []string{}, tqTypeNexus) // Pollers of tv1 are there but should not get any task go s.idlePollNexus(tv1, true, ver3MinPollTime, "nexus task should not go to the old deployment") @@ -1684,17 +1611,13 @@ func (s *Versioning3Suite) TestEagerActivity() { s.OverrideDynamicConfig(dynamicconfig.EnableActivityEagerExecution, true) tv := testvars.New(s) - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 1, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv, true, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 1, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, 
tqTypeWf, tqTypeAct) s.startWorkflow(tv, nil) poller, resp := s.pollWftAndHandle(tv, false, nil, @@ -1762,17 +1685,13 @@ func (s *Versioning3Suite) testTransitionFromActivity(sticky bool) { s.warmUpSticky(tv1) } - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 1, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv1, true, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv1, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv1.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 1, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeWf, tqTypeAct) runID := s.startWorkflow(tv1, nil) s.pollWftAndHandle(tv1, false, nil, @@ -1832,19 +1751,15 @@ func (s *Versioning3Suite) testTransitionFromActivity(sticky bool) { s.verifyWorkflowVersioning(s.Assertions, tv1, vbUnpinned, tv1.Deployment(), nil, nil) // 2. 
Set d2 as the current deployment - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 2, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }, tv1.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, - }}, []string{}, tqTypeWf, tqTypeAct) - } else { - s.updateTaskQueueDeploymentData(tv2, true, 0, false, 0, tqTypeWf, tqTypeAct) - } + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 2, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }, tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, + }}, []string{}, tqTypeWf, tqTypeAct) // Although updateTaskQueueDeploymentData waits for deployment data to reach the TQs, backlogged // tasks might still be waiting behind the old deployment's poll channel. 
Partition manage should // immediately react to the deployment data changes, but there still is a race possible and the @@ -1936,30 +1851,23 @@ func (s *Versioning3Suite) testIndependentActivity(behavior enumspb.VersioningBe tvAct := testvars.New(s).WithDeploymentSeriesNumber(2).WithTaskQueueNumber(2) // Set current deployment for each TQ - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tvWf, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tvWf.DeploymentVersionString()), + s.updateTaskQueueDeploymentDataWithRoutingConfig(tvWf, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tvWf.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 1, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tvWf.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeWf) + + if !unversionedActivity { + // Different deployment here for the activity TQ. + s.updateTaskQueueDeploymentDataWithRoutingConfig(tvAct, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tvAct.DeploymentVersionString()), CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), RevisionNumber: 1, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tvWf.DeploymentVersion().GetBuildId(): { + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tvAct.DeploymentVersion().GetBuildId(): { Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf) - - if !unversionedActivity { - // Different deployment here for the activity TQ. 
- s.updateTaskQueueDeploymentDataWithRoutingConfig(tvAct, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tvAct.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 1, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tvAct.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeAct) - } - } else { - s.updateTaskQueueDeploymentData(tvWf, true, 0, false, 0, tqTypeWf) - if !unversionedActivity { - s.updateTaskQueueDeploymentData(tvAct, true, 0, false, 0, tqTypeAct) - } + }}, []string{}, tqTypeAct) } s.startWorkflow(tvWf, nil) @@ -2025,7 +1933,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritance_ExpectInherit(crossTq bo var override *workflowpb.VersioningOverride if withOverride { - override = tv1.VersioningOverridePinned(s.useV32) + override = tv1.VersioningOverridePinned() } // This is the registered behavior which can be unpinned, but only if withOverride. 
We want @@ -2112,29 +2020,21 @@ func (s *Versioning3Suite) testChildWorkflowInheritance_ExpectInherit(crossTq bo close(wfStarted) // force panic if replayed // make v2 current for both parent and child and unblock the wf to start the child - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), + CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), + RevisionNumber: 2, + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeWf) + if crossTq { + s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2Child, &deploymentpb.RoutingConfig{ + CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2Child.DeploymentVersionString()), CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), RevisionNumber: 2, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { + }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2Child.DeploymentVersion().GetBuildId(): { Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, }}, []string{}, tqTypeWf) - } else { - s.updateTaskQueueDeploymentData(tv2, true, 0, false, 0, tqTypeWf) - } - if crossTq { - if s.useNewDeploymentData { - s.updateTaskQueueDeploymentDataWithRoutingConfig(tv2Child, &deploymentpb.RoutingConfig{ - CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2Child.DeploymentVersionString()), - CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), - RevisionNumber: 2, - }, 
map[string]*deploymentspb.WorkerDeploymentVersionData{tv2Child.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf) - } else { - s.updateTaskQueueDeploymentData(tv2Child, true, 0, false, 0, tqTypeWf) - } } currentChanged <- struct{}{} @@ -2934,13 +2834,9 @@ func (s *Versioning3Suite) TestDescribeTaskQueueVersioningInfo() { RevisionNumber: revisionNumber, } - if s.useNewDeploymentData { - s.syncTaskQueueDeploymentDataWithRoutingConfig(tv, newRoutingConfig, map[string]*deploymentspb.WorkerDeploymentVersionData{tv.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf) - } else { - s.syncTaskQueueDeploymentData(tv, false, 20, false, t1, tqTypeWf) - } + s.syncTaskQueueDeploymentDataWithRoutingConfig(tv, newRoutingConfig, map[string]*deploymentspb.WorkerDeploymentVersionData{tv.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeWf) wfInfo, err := s.FrontendClient().DescribeTaskQueue(ctx, &workflowservice.DescribeTaskQueueRequest{ Namespace: s.Namespace().String(), TaskQueue: tv.TaskQueue(), @@ -2962,13 +2858,9 @@ func (s *Versioning3Suite) TestDescribeTaskQueueVersioningInfo() { CurrentVersionChangedTime: timestamp.TimePtr(t1), RevisionNumber: revisionNumber, } - if s.useNewDeploymentData { - s.syncTaskQueueDeploymentDataWithRoutingConfig(tv, newRoutingConfig, map[string]*deploymentspb.WorkerDeploymentVersionData{tv.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeAct) - } else { - s.syncTaskQueueDeploymentData(tv, true, 0, false, t1, tqTypeAct) - } + s.syncTaskQueueDeploymentDataWithRoutingConfig(tv, newRoutingConfig, map[string]*deploymentspb.WorkerDeploymentVersionData{tv.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, 
tqTypeAct) actInfo, err := s.FrontendClient().DescribeTaskQueue(ctx, &workflowservice.DescribeTaskQueueRequest{ Namespace: s.Namespace().String(), @@ -2992,13 +2884,9 @@ func (s *Versioning3Suite) TestDescribeTaskQueueVersioningInfo() { RampingVersionPercentageChangedTime: timestamp.TimePtr(t2), RevisionNumber: 2, } - if s.useNewDeploymentData { - s.syncTaskQueueDeploymentDataWithRoutingConfig(tv, newRoutingConfig, map[string]*deploymentspb.WorkerDeploymentVersionData{tv.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeAct) - } else { - s.syncTaskQueueDeploymentData(tv, false, 10, true, t2, tqTypeAct) - } + s.syncTaskQueueDeploymentDataWithRoutingConfig(tv, newRoutingConfig, map[string]*deploymentspb.WorkerDeploymentVersionData{tv.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }}, []string{}, tqTypeAct) s.waitForDeploymentDataPropagation(tv, versionStatusNil, true, tqTypeAct) actInfo, err = s.FrontendClient().DescribeTaskQueue(ctx, &workflowservice.DescribeTaskQueueRequest{ @@ -3018,9 +2906,6 @@ func (s *Versioning3Suite) TestDescribeTaskQueueVersioningInfo() { } func (s *Versioning3Suite) TestSyncDeploymentUserDataWithRoutingConfig_Update() { - if s.useNewDeploymentData == false { - s.T().Skip() - } tv := testvars.New(s) data := s.getTaskQueueDeploymentData(tv, tqTypeAct) @@ -3196,96 +3081,6 @@ func (s *Versioning3Suite) TestSyncDeploymentUserDataWithRoutingConfig_Update() } -func (s *Versioning3Suite) TestSyncDeploymentUserData_Update() { - if s.useNewDeploymentData == true { - s.T().Skip() - } - tv := testvars.New(s) - - data := s.getTaskQueueDeploymentData(tv, tqTypeAct) - s.Nil(data) - data = s.getTaskQueueDeploymentData(tv, tqTypeWf) - s.Nil(data) - - t1 := time.Now() - tv1 := tv.WithBuildIDNumber(1) - - s.syncTaskQueueDeploymentData(tv1, true, 0, false, t1, tqTypeAct) - data = s.getTaskQueueDeploymentData(tv, tqTypeAct) - 
s.ProtoEqual(&persistencespb.DeploymentData{Versions: []*deploymentspb.DeploymentVersionData{ - {Version: tv1.DeploymentVersion(), CurrentSinceTime: timestamp.TimePtr(t1), RoutingUpdateTime: timestamp.TimePtr(t1)}, - }}, data) - data = s.getTaskQueueDeploymentData(tv, tqTypeWf) - s.Nil(data) - - // Changing things with an older timestamp should not have effect. - t0 := t1.Add(-time.Second) - s.syncTaskQueueDeploymentData(tv1, false, 0, false, t0, tqTypeAct) - data = s.getTaskQueueDeploymentData(tv, tqTypeAct) - s.ProtoEqual(&persistencespb.DeploymentData{Versions: []*deploymentspb.DeploymentVersionData{ - {Version: tv1.DeploymentVersion(), CurrentSinceTime: timestamp.TimePtr(t1), RoutingUpdateTime: timestamp.TimePtr(t1)}, - }}, data) - - // Changing things with a newer timestamp should apply - t2 := t1.Add(time.Second) - s.syncTaskQueueDeploymentData(tv1, false, 20, false, t2, tqTypeAct) - data = s.getTaskQueueDeploymentData(tv, tqTypeAct) - s.ProtoEqual(&persistencespb.DeploymentData{Versions: []*deploymentspb.DeploymentVersionData{ - {Version: tv1.DeploymentVersion(), CurrentSinceTime: nil, RampingSinceTime: timestamp.TimePtr(t2), RampPercentage: 20, RoutingUpdateTime: timestamp.TimePtr(t2)}, - }}, data) - - // Add another version, this time to both tq types - tv2 := tv.WithBuildIDNumber(2) - s.syncTaskQueueDeploymentData(tv2, false, 10, false, t1, tqTypeAct, tqTypeWf) - data = s.getTaskQueueDeploymentData(tv, tqTypeAct) - s.ProtoEqual(&persistencespb.DeploymentData{Versions: []*deploymentspb.DeploymentVersionData{ - {Version: tv1.DeploymentVersion(), CurrentSinceTime: nil, RampingSinceTime: timestamp.TimePtr(t2), RampPercentage: 20, RoutingUpdateTime: timestamp.TimePtr(t2)}, - {Version: tv2.DeploymentVersion(), CurrentSinceTime: nil, RampingSinceTime: timestamp.TimePtr(t1), RampPercentage: 10, RoutingUpdateTime: timestamp.TimePtr(t1)}, - }}, data) - data = s.getTaskQueueDeploymentData(tv, tqTypeWf) - s.ProtoEqual(&persistencespb.DeploymentData{Versions: 
[]*deploymentspb.DeploymentVersionData{ - {Version: tv2.DeploymentVersion(), CurrentSinceTime: nil, RampingSinceTime: timestamp.TimePtr(t1), RampPercentage: 10, RoutingUpdateTime: timestamp.TimePtr(t1)}, - }}, data) - - // Make v2 current - s.syncTaskQueueDeploymentData(tv2, true, 0, false, t2, tqTypeAct) - data = s.getTaskQueueDeploymentData(tv, tqTypeAct) - s.ProtoEqual(&persistencespb.DeploymentData{Versions: []*deploymentspb.DeploymentVersionData{ - {Version: tv1.DeploymentVersion(), CurrentSinceTime: nil, RampingSinceTime: timestamp.TimePtr(t2), RampPercentage: 20, RoutingUpdateTime: timestamp.TimePtr(t2)}, - {Version: tv2.DeploymentVersion(), CurrentSinceTime: timestamp.TimePtr(t2), RoutingUpdateTime: timestamp.TimePtr(t2)}, - }}, data) - - // Forget v1 - s.forgetTaskQueueDeploymentVersion(tv1, tqTypeAct, false) - data = s.getTaskQueueDeploymentData(tv, tqTypeAct) - s.ProtoEqual(&persistencespb.DeploymentData{Versions: []*deploymentspb.DeploymentVersionData{ - {Version: tv2.DeploymentVersion(), CurrentSinceTime: timestamp.TimePtr(t2), RoutingUpdateTime: timestamp.TimePtr(t2)}, - }}, data) - - // Forget v1 again should be a noop - s.forgetTaskQueueDeploymentVersion(tv1, tqTypeAct, false) - data = s.getTaskQueueDeploymentData(tv, tqTypeAct) - s.ProtoEqual(&persistencespb.DeploymentData{Versions: []*deploymentspb.DeploymentVersionData{ - {Version: tv2.DeploymentVersion(), CurrentSinceTime: timestamp.TimePtr(t2), RoutingUpdateTime: timestamp.TimePtr(t2)}, - }}, data) - - // Ramp unversioned - s.syncTaskQueueDeploymentData(tv2, false, 90, true, t2, tqTypeAct) - data = s.getTaskQueueDeploymentData(tv, tqTypeAct) - s.ProtoEqual(&persistencespb.DeploymentData{Versions: []*deploymentspb.DeploymentVersionData{ - {Version: tv2.DeploymentVersion(), CurrentSinceTime: timestamp.TimePtr(t2), RoutingUpdateTime: timestamp.TimePtr(t2)}, - }, - UnversionedRampData: &deploymentspb.DeploymentVersionData{RampingSinceTime: timestamp.TimePtr(t2), RampPercentage: 90, 
RoutingUpdateTime: timestamp.TimePtr(t2)}, - }, data) - - // Forget v2 - s.forgetTaskQueueDeploymentVersion(tv2, tqTypeAct, false) - data = s.getTaskQueueDeploymentData(tv, tqTypeAct) - s.ProtoEqual(&persistencespb.DeploymentData{ - UnversionedRampData: &deploymentspb.DeploymentVersionData{RampingSinceTime: timestamp.TimePtr(t2), RampPercentage: 90, RoutingUpdateTime: timestamp.TimePtr(t2)}, - }, data) -} - func (s *Versioning3Suite) setCurrentDeployment(tv *testvars.TestVars) { ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) defer cancel() @@ -3294,11 +3089,7 @@ func (s *Versioning3Suite) setCurrentDeployment(tv *testvars.TestVars) { Namespace: s.Namespace().String(), DeploymentName: tv.DeploymentSeries(), } - if s.useV32 { - req.BuildId = tv.BuildID() - } else { - req.Version = tv.DeploymentVersionString() //nolint:staticcheck // SA1019: worker versioning v0.31 - } + req.BuildId = tv.BuildID() _, err := s.FrontendClient().SetWorkerDeploymentCurrentVersion(ctx, req) var notFound *serviceerror.NotFound if errors.As(err, ¬Found) || (err != nil && strings.Contains(err.Error(), serviceerror.NewFailedPreconditionf(workerdeployment.ErrCurrentVersionDoesNotHaveAllTaskQueues, tv.DeploymentVersionStringV32()).Error())) { @@ -3340,10 +3131,8 @@ func (s *Versioning3Suite) setRampingDeployment( ) { ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) defer cancel() - v := tv.DeploymentVersionString() bid := tv.BuildID() if rampUnversioned { - v = "__unversioned__" bid = "" } @@ -3353,11 +3142,7 @@ func (s *Versioning3Suite) setRampingDeployment( DeploymentName: tv.DeploymentSeries(), Percentage: percentage, } - if s.useV32 { - req.BuildId = bid - } else { - req.Version = v //nolint:staticcheck // SA1019: worker versioning v0.31 - } + req.BuildId = bid _, err := s.FrontendClient().SetWorkerDeploymentRampingVersion(ctx, req) var notFound *serviceerror.NotFound if errors.As(err, ¬Found) || (err != nil && strings.Contains(err.Error(), 
serviceerror.NewFailedPreconditionf(workerdeployment.ErrRampingVersionDoesNotHaveAllTaskQueues, tv.DeploymentVersionStringV32()).Error())) { @@ -3644,28 +3429,13 @@ func (s *Versioning3Suite) verifyWorkflowVersioning( )) } - if s.useV32 { - // v0.32 override - a.Equal(override.GetAutoUpgrade(), versioningInfo.GetVersioningOverride().GetAutoUpgrade()) - a.Equal(override.GetPinned().GetVersion().GetBuildId(), versioningInfo.GetVersioningOverride().GetPinned().GetVersion().GetBuildId()) - a.Equal(override.GetPinned().GetVersion().GetDeploymentName(), versioningInfo.GetVersioningOverride().GetPinned().GetVersion().GetDeploymentName()) - a.Equal(override.GetPinned().GetBehavior(), versioningInfo.GetVersioningOverride().GetPinned().GetBehavior()) - if worker_versioning.OverrideIsPinned(override) { - a.Equal(override.GetPinned().GetVersion().GetDeploymentName(), dwf.WorkflowExecutionInfo.GetWorkerDeploymentName()) - } - } else { - // v0.31 override - a.Equal(override.GetBehavior().String(), versioningInfo.GetVersioningOverride().GetBehavior().String()) //nolint:staticcheck // SA1019: worker versioning v0.31 - if actualOverrideDeployment := versioningInfo.GetVersioningOverride().GetPinnedVersion(); override.GetPinnedVersion() != actualOverrideDeployment { //nolint:staticcheck // SA1019: worker versioning v0.31 - a.Fail(fmt.Sprintf("pinned override mismatch. 
expected: {%s}, actual: {%s}", - override.GetPinnedVersion(), //nolint:staticcheck // SA1019: worker versioning v0.31 - actualOverrideDeployment, - )) - } - if worker_versioning.OverrideIsPinned(override) { - d, _ := worker_versioning.WorkerDeploymentVersionFromStringV31(override.GetPinnedVersion()) //nolint:staticcheck // SA1019: worker versioning v0.31 - a.Equal(d.GetDeploymentName(), dwf.WorkflowExecutionInfo.GetWorkerDeploymentName()) - } + // v0.32 override + a.Equal(override.GetAutoUpgrade(), versioningInfo.GetVersioningOverride().GetAutoUpgrade()) + a.Equal(override.GetPinned().GetVersion().GetBuildId(), versioningInfo.GetVersioningOverride().GetPinned().GetVersion().GetBuildId()) + a.Equal(override.GetPinned().GetVersion().GetDeploymentName(), versioningInfo.GetVersioningOverride().GetPinned().GetVersion().GetDeploymentName()) + a.Equal(override.GetPinned().GetBehavior(), versioningInfo.GetVersioningOverride().GetPinned().GetBehavior()) + if worker_versioning.OverrideIsPinned(override) { + a.Equal(override.GetPinned().GetVersion().GetDeploymentName(), dwf.WorkflowExecutionInfo.GetWorkerDeploymentName()) } if !versioningInfo.GetVersionTransition().Equal(transition) { @@ -4308,10 +4078,6 @@ func (s *Versioning3Suite) verifyVersioningSAs( } func (s *Versioning3Suite) TestAutoUpgradeWorkflows_NoBouncingBetweenVersions() { - if !s.useRevisionNumbers { - s.T().Skip("This test is only supported on revision number mechanics") - } - s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueReadPartitions, 1) s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueWritePartitions, 1) @@ -4365,9 +4131,6 @@ func (s *Versioning3Suite) TestAutoUpgradeWorkflows_NoBouncingBetweenVersions() } func (s *Versioning3Suite) TestWorkflowTQLags_DependentActivityStartsTransition() { - if !s.useRevisionNumbers { - s.T().Skip("This test is only supported on revision number mechanics") - } /* The aim of this test is to show the following does not occur when using revisionNumber 
mechanics: - If the workflow TQ lags behind the activity TQ, with respect to the current version of a deployment, the activity should not be @@ -4459,9 +4222,6 @@ func (s *Versioning3Suite) TestWorkflowTQLags_DependentActivityStartsTransition( } func (s *Versioning3Suite) TestActivityTQLags_DependentActivityCompletesOnTheNewVersion() { - if !s.useRevisionNumbers { - s.T().Skip("This test is only supported on revision number mechanics") - } /* The aim of this test is to show the following does not occur when using revisionNumber mechanics: - If the activity TQ lags behind the workflow TQ, with respect to the current version of a deployment, the activity should not be @@ -4564,10 +4324,6 @@ func (s *Versioning3Suite) TestActivityTQLags_DependentActivityCompletesOnTheNew // the test is present to show that revision number mechanics work as expected even when the task-queue // partitions have a more updated view of the current version than the mutable state of a workflow. func (s *Versioning3Suite) TestChildStartsWithParentRevision_SameTQ_TQAhead() { - if !s.useRevisionNumbers { - s.T().Skip("This test is only supported on revision number mechanics") - } - s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueReadPartitions, 1) s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueWritePartitions, 1) @@ -4696,10 +4452,6 @@ func (s *Versioning3Suite) TestVersionedPoller_FailsWithEmptyNormalName() { } func (s *Versioning3Suite) TestChildStartsWithParentRevision_SameTQ_TQLags() { - if !s.useRevisionNumbers { - s.T().Skip("This test is only supported on revision number mechanics") - } - s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueReadPartitions, 1) s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueWritePartitions, 1) @@ -4796,10 +4548,6 @@ func (s *Versioning3Suite) TestChildStartsWithParentRevision_SameTQ_TQLags() { // TestChildStartsWithNoInheritedAutoUpgradeInfo_CrossTQ demonstrates that a child workflow of an AutoUpgrade parent, not 
sharing // the same task queue, starts with no inherited auto upgrade info. func (s *Versioning3Suite) TestChildStartsWithNoInheritedAutoUpgradeInfo_CrossTQ() { - if !s.useRevisionNumbers { - s.T().Skip("This test is only supported on revision number mechanics") - } - s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueReadPartitions, 1) s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueWritePartitions, 1) @@ -4885,10 +4633,6 @@ func (s *Versioning3Suite) TestChildStartsWithNoInheritedAutoUpgradeInfo_CrossTQ // Tests testing continue-as-new of an AutoUpgrade workflow using revision number mechanics. func (s *Versioning3Suite) TestContinueAsNewOfAutoUpgradeWorkflow_RevisionNumberMechanics() { - if !s.useRevisionNumbers { - s.T().Skip("This test is only supported on revision number mechanics") - } - s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueReadPartitions, 1) s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueWritePartitions, 1) @@ -4980,10 +4724,6 @@ func (s *Versioning3Suite) TestContinueAsNewOfAutoUpgradeWorkflow_RevisionNumber // If testContinueAsNew is true, tests a ContinueAsNew followed by retry; otherwise tests a direct retry of a workflow. // If testChildWorkflow is true, tests that a child workflow's retry doesn't bounce back (child spawned by parent with retry policy). 
func (s *Versioning3Suite) testRetryNoBounceBack(testContinueAsNew bool, testChildWorkflow bool) { - if !s.useRevisionNumbers { - s.T().Skip("This test is only supported on revision number mechanics") - } - s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueReadPartitions, 1) s.OverrideDynamicConfig(dynamicconfig.MatchingNumTaskqueueWritePartitions, 1) diff --git a/tests/worker_deployment_test.go b/tests/worker_deployment_test.go index 465947fcc2..a8ca1060b6 100644 --- a/tests/worker_deployment_test.go +++ b/tests/worker_deployment_test.go @@ -38,7 +38,7 @@ type ( } ) -func TestWorkerDeploymentSuiteV2(t *testing.T) { +func TestWorkerDeploymentSuite(t *testing.T) { t.Parallel() suite.Run(t, &WorkerDeploymentSuite{workflowVersion: workerdeployment.VersionDataRevisionNumber}) } diff --git a/tests/worker_deployment_version_test.go b/tests/worker_deployment_version_test.go index d50e833b37..3e05795b4c 100644 --- a/tests/worker_deployment_version_test.go +++ b/tests/worker_deployment_version_test.go @@ -26,6 +26,8 @@ import ( "go.temporal.io/sdk/worker" "go.temporal.io/sdk/workflow" deploymentspb "go.temporal.io/server/api/deployment/v1" + "go.temporal.io/server/api/matchingservice/v1" + persistencespb "go.temporal.io/server/api/persistence/v1" "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/testing/testhooks" "go.temporal.io/server/common/testing/testvars" @@ -63,7 +65,7 @@ var ( testRandomMetadataValue = []byte("random metadata value") ) -func TestDeploymentVersionSuiteV2(t *testing.T) { +func TestDeploymentVersionSuite(t *testing.T) { t.Parallel() suite.Run(t, &DeploymentVersionSuite{workflowVersion: workerdeployment.VersionDataRevisionNumber, useV32: true}) } @@ -145,7 +147,9 @@ func (s *DeploymentVersionSuite) startVersionWorkflow(ctx context.Context, tv *t s.EventuallyWithT(func(t *assert.CollectT) { a := assert.New(t) resp, err := s.describeVersion(tv) - a.NoError(err) + if !a.NoError(err) { + return + } // regardless of s.useV32, 
we want to read both version formats a.Equal(tv.DeploymentVersionString(), resp.GetWorkerDeploymentVersionInfo().GetVersion()) a.Equal(tv.ExternalDeploymentVersion().GetDeploymentName(), resp.GetWorkerDeploymentVersionInfo().GetDeploymentVersion().GetDeploymentName()) @@ -156,7 +160,9 @@ func (s *DeploymentVersionSuite) startVersionWorkflow(ctx context.Context, tv *t Namespace: s.Namespace().String(), DeploymentName: tv.DeploymentSeries(), }) - a.NoError(err) + if !a.NoError(err) { + return + } var versionSummaryNames []string var versionSummaryVersions []*deploymentpb.WorkerDeploymentVersion for _, versionSummary := range newResp.GetWorkerDeploymentInfo().GetVersionSummaries() { @@ -945,6 +951,12 @@ func (s *DeploymentVersionSuite) TestDeleteVersion_ValidDelete() { s.tryDeleteVersion(ctx, tv1, "", false) } +func (s *DeploymentVersionSuite) skipBeforeVersion(version workerdeployment.DeploymentWorkflowVersion) { + if s.workflowVersion < version { + s.T().Skipf("test supports version %v and newer", version) + } +} + func (s *DeploymentVersionSuite) TestDeleteVersion_ValidDelete_SkipDrainage() { s.OverrideDynamicConfig(dynamicconfig.PollerHistoryTTL, 500*time.Millisecond) @@ -1078,7 +1090,7 @@ func (s *DeploymentVersionSuite) TestVersionMissingTaskQueues_InvalidSetCurrentV pollerCancel1() // Start a workflow on task_queue_1 to increase the add rate - s.startWorkflow(tv1, tv1.VersioningOverridePinned(s.useV32)) + s.startWorkflow(tv1, tv1.VersioningOverridePinned()) // SetCurrent tv2 err = s.setCurrent(tv2, false) @@ -1136,7 +1148,7 @@ func (s *DeploymentVersionSuite) TestVersionMissingTaskQueues_InvalidSetRampingV pollerCancel1() // Start a workflow on task_queue_1 to increase the add rate - s.startWorkflow(tv1, tv1.VersioningOverridePinned(s.useV32)) + s.startWorkflow(tv1, tv1.VersioningOverridePinned()) // SetRampingVersion to tv2 err = s.setRamping(tv2, 0) @@ -3329,3 +3341,107 @@ func (s *DeploymentVersionSuite) TestReactivationSignalCache_Deduplication_Reset var 
result2 string s.NoError(resetRun2.Get(ctx, &result2)) } + +func (s *DeploymentVersionSuite) TestDeleteVersion_ThenRecreateByPolling() { + s.skipBeforeVersion(workerdeployment.VersionDataRevisionNumber) + s.OverrideDynamicConfig(dynamicconfig.PollerHistoryTTL, 500*time.Millisecond) + + ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) + defer cancel() + tv := testvars.New(s).WithBuildIDNumber(1) + + s.startVersionWorkflow(ctx, tv) + + vd := s.getTaskQueueVersionData(tv, enumspb.TASK_QUEUE_TYPE_WORKFLOW, tv.ExternalDeploymentVersion()) + s.Equal(int64(0), vd.GetRevisionNumber()) + s.False(vd.GetDeleted()) + + // Wait for pollers to go away + s.EventuallyWithT(func(t *assert.CollectT) { + resp, err := s.FrontendClient().DescribeTaskQueue(ctx, &workflowservice.DescribeTaskQueueRequest{ + Namespace: s.Namespace().String(), + TaskQueue: tv.TaskQueue(), + TaskQueueType: enumspb.TASK_QUEUE_TYPE_WORKFLOW, + }) + require.NoError(t, err) + require.Empty(t, resp.Pollers) + }, 5*time.Second, time.Second) + + // Delete the version + s.tryDeleteVersion(ctx, tv, "", false) + // Verify the version is gone from the task queue + s.EventuallyWithT(func(t *assert.CollectT) { + vd = s.getTaskQueueVersionData(tv, enumspb.TASK_QUEUE_TYPE_WORKFLOW, tv.ExternalDeploymentVersion()) + require.New(t).Nil(vd) + }, time.Second*5, time.Millisecond*200) + + // Verify the version is gone from the deployment + s.EventuallyWithT(func(t *assert.CollectT) { + a := require.New(t) + resp, err := s.FrontendClient().DescribeWorkerDeployment(ctx, &workflowservice.DescribeWorkerDeploymentRequest{ + Namespace: s.Namespace().String(), + DeploymentName: tv.DeploymentSeries(), + }) + a.NoError(err) + for _, vs := range resp.GetWorkerDeploymentInfo().GetVersionSummaries() { + //nolint:staticcheck // SA1019 deprecated Version will clean up later + a.NotEqual(tv.DeploymentVersionString(), vs.Version) + } + }, time.Second*5, time.Millisecond*200) + + // Poll again to recreate the version + + 
s.startVersionWorkflow(ctx, tv) + + // Verify the version is back (undeleted) in the deployment + s.EventuallyWithT(func(t *assert.CollectT) { + a := require.New(t) + resp, err := s.FrontendClient().DescribeWorkerDeployment(ctx, &workflowservice.DescribeWorkerDeploymentRequest{ + Namespace: s.Namespace().String(), + DeploymentName: tv.DeploymentSeries(), + }) + a.NoError(err) + found := false + for _, vs := range resp.GetWorkerDeploymentInfo().GetVersionSummaries() { + //nolint:staticcheck // SA1019 deprecated Version will clean up later + if vs.Version == tv.DeploymentVersionString() { + found = true + } + } + a.True(found, "version should be recreated after polling") + }, time.Second*5, time.Millisecond*200) + + // Ensure the version data revived properly in the task queue + vd = s.getTaskQueueVersionData(tv, enumspb.TASK_QUEUE_TYPE_WORKFLOW, tv.ExternalDeploymentVersion()) + s.Equal(int64(0), vd.GetRevisionNumber()) + s.False(vd.GetDeleted()) +} + +// getTaskQueueDeploymentData gets the deployment data for a given TQ type. The data is always +// returned from the WF type root partition, so no need to wait for propagation before calling this +// function. 
+func (s *DeploymentVersionSuite) getTaskQueueDeploymentData( + tv *testvars.TestVars, + tqType enumspb.TaskQueueType, +) *persistencespb.DeploymentData { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) + defer cancel() + resp, err := s.GetTestCluster().MatchingClient().GetTaskQueueUserData( + ctx, + &matchingservice.GetTaskQueueUserDataRequest{ + NamespaceId: s.NamespaceID().String(), + TaskQueue: tv.TaskQueue().GetName(), + TaskQueueType: tqTypeWf, + }) + s.NoError(err) + return resp.GetUserData().GetData().GetPerType()[int32(tqType)].GetDeploymentData() +} + +func (s *DeploymentVersionSuite) getTaskQueueVersionData( + tv *testvars.TestVars, + tqType enumspb.TaskQueueType, + version *deploymentpb.WorkerDeploymentVersion, +) *deploymentspb.WorkerDeploymentVersionData { + data := s.getTaskQueueDeploymentData(tv, tqType) + return data.GetDeploymentsData()[version.GetDeploymentName()].GetVersions()[version.GetBuildId()] +} diff --git a/tests/workflow_alias_search_attribute_test.go b/tests/workflow_alias_search_attribute_test.go index bb56c8082b..557a866307 100644 --- a/tests/workflow_alias_search_attribute_test.go +++ b/tests/workflow_alias_search_attribute_test.go @@ -104,7 +104,7 @@ func (s *WorkflowAliasSearchAttributeTestSuite) createWorkflow( WorkflowType: tv.WorkflowType(), TaskQueue: tv.TaskQueue(), Identity: tv.WorkerIdentity(), - VersioningOverride: tv.VersioningOverridePinned(true), + VersioningOverride: tv.VersioningOverridePinned(), SearchAttributes: sa, } return s.FrontendClient().StartWorkflowExecution(ctx, request) diff --git a/tests/xdc/history_replication_signals_and_updates_test.go b/tests/xdc/history_replication_signals_and_updates_test.go index 28f0cfdce2..3a7cde3006 100644 --- a/tests/xdc/history_replication_signals_and_updates_test.go +++ b/tests/xdc/history_replication_signals_and_updates_test.go @@ -1046,7 +1046,7 @@ func (c *hrsuTestCluster) getHistoryForRunId(ctx context.Context, runId string) } func (c 
*hrsuTestCluster) pollWorkflowResult(ctx context.Context, runId string) *historypb.HistoryEvent { - getHistoryWithLongPoll := func(token []byte) ([]*historypb.HistoryEvent, []byte) { + getHistoryWithLongPoll := func(token []byte) ([]*historypb.HistoryEvent, []byte, error) { responseInner, err := c.testCluster.FrontendClient().GetWorkflowExecutionHistory(ctx, &workflowservice.GetWorkflowExecutionHistoryRequest{ Namespace: c.t.tv.NamespaceName().String(), Execution: &commonpb.WorkflowExecution{ @@ -1058,25 +1058,33 @@ func (c *hrsuTestCluster) pollWorkflowResult(ctx context.Context, runId string) NextPageToken: token, HistoryEventFilterType: enumspb.HISTORY_EVENT_FILTER_TYPE_CLOSE_EVENT, }) - c.t.s.NoError(err) - return responseInner.History.Events, responseInner.NextPageToken + if err != nil { + return nil, nil, err + } + return responseInner.History.Events, responseInner.NextPageToken, nil } var token []byte var allEvents []*historypb.HistoryEvent - multiPoll := false for { - events, nextPageToken := getHistoryWithLongPoll(token) + if ctx.Err() != nil { + c.t.s.NoError(ctx.Err(), "context expired while waiting for workflow result") + return nil + } + events, nextPageToken, err := getHistoryWithLongPoll(token) + if err != nil { + // Transient error (e.g. CurrentBranchChanged after conflict resolution): retry from scratch. + token = nil + continue + } allEvents = append(allEvents, events...) 
if nextPageToken == nil { break } token = nextPageToken - multiPoll = true } c.t.s.Len(allEvents, 1) - c.t.s.True(multiPoll, "Expected to have multiple polls of history events") return allEvents[0] } diff --git a/tests/xdc/nexus_request_forwarding_test.go b/tests/xdc/nexus_request_forwarding_test.go index 7740020a99..ab1be70471 100644 --- a/tests/xdc/nexus_request_forwarding_test.go +++ b/tests/xdc/nexus_request_forwarding_test.go @@ -67,15 +67,11 @@ func TestNexusRequestForwardingTestSuite(t *testing.T) { } func (s *NexusRequestForwardingSuite) SetupSuite() { - re, err := dynamicconfig.ConvertWildcardStringListToRegexp([]string{"internal-test-*"}) - if err != nil { - panic(err) - } s.dynamicConfigOverrides = map[dynamicconfig.Key]any{ // Make sure we don't hit the rate limiter in tests dynamicconfig.FrontendGlobalNamespaceNamespaceReplicationInducingAPIsRPS.Key(): 1000, dynamicconfig.RefreshNexusEndpointsMinWait.Key(): 1 * time.Millisecond, - dynamicconfig.FrontendNexusRequestHeadersBlacklist.Key(): dynamicconfig.GetTypedPropertyFn(re), + dynamicconfig.FrontendNexusRequestHeadersBlacklist.Key(): []string{"internal-test-*"}, } s.setupSuite() } diff --git a/tools/testrunner/log.go b/tools/testrunner/log.go index 05592d9181..94017ec1c4 100644 --- a/tools/testrunner/log.go +++ b/tools/testrunner/log.go @@ -333,3 +333,21 @@ func isTestResultBoundary(line string) bool { func shouldStopOnTestBoundary(line string, _ int, _ int) bool { return isTestResultBoundary(line) } + +// parseFailedTestsFromOutput extracts failing test names from gotestsum stdout. +// It looks for "--- FAIL: TestName" lines produced as tests complete, and is +// used when the test binary was killed externally before producing a JUnit XML. 
+func parseFailedTestsFromOutput(stdout string) []string { + var failed []string + seen := make(map[string]struct{}) + for _, line := range strings.Split(strings.ReplaceAll(stdout, "\r\n", "\n"), "\n") { + line = strings.TrimSpace(line) + if !strings.HasPrefix(line, "--- FAIL: ") { + continue + } + if name, ok := parseTripleDashTestName(line); ok { + addUniqueTest(&failed, seen, name) + } + } + return failed +} diff --git a/tools/testrunner/log_test.go b/tools/testrunner/log_test.go index 9285190dfa..2cde8bcf5e 100644 --- a/tools/testrunner/log_test.go +++ b/tools/testrunner/log_test.go @@ -22,6 +22,50 @@ func TestParseTestTimeouts(t *testing.T) { require.Equal(t, string(logOutput), stacktrace) } +func TestParseFailedTestsFromOutput(t *testing.T) { + t.Run("ExtractsFailedTestNames", func(t *testing.T) { + stdout := ` +=== RUN TestFoo +=== RUN TestFoo/SubTest1 + foo_test.go:42: assertion failed +--- FAIL: TestFoo/SubTest1 (1.23s) +--- FAIL: TestFoo (1.23s) +=== RUN TestBar +--- PASS: TestBar (0.00s) +=== RUN TestBaz + baz_test.go:10: something wrong +--- FAIL: TestBaz (0.50s) +FAIL +` + got := parseFailedTestsFromOutput(stdout) + require.Equal(t, []string{"TestFoo/SubTest1", "TestFoo", "TestBaz"}, got) + }) + + t.Run("DeduplicatesDuplicateLines", func(t *testing.T) { + stdout := ` +--- FAIL: TestDupe (0.10s) +--- FAIL: TestDupe (0.10s) +--- FAIL: TestOther (0.20s) +` + got := parseFailedTestsFromOutput(stdout) + require.Equal(t, []string{"TestDupe", "TestOther"}, got) + }) + + t.Run("ReturnsEmptyWhenNoFailures", func(t *testing.T) { + stdout := ` +=== RUN TestPass +--- PASS: TestPass (0.01s) +ok go.temporal.io/server/tests +` + got := parseFailedTestsFromOutput(stdout) + require.Empty(t, got) + }) + + t.Run("ReturnsEmptyOnEmptyInput", func(t *testing.T) { + require.Empty(t, parseFailedTestsFromOutput("")) + }) +} + func TestParseAlerts_DataRaceAndPanic(t *testing.T) { input, err := os.ReadFile("testdata/alerts-input.log") require.NoError(t, err) diff --git 
a/tools/testrunner/testrunner.go b/tools/testrunner/testrunner.go index 357f5d5dfb..f0565d5bc1 100644 --- a/tools/testrunner/testrunner.go +++ b/tools/testrunner/testrunner.go @@ -13,6 +13,7 @@ import ( "slices" "strconv" "strings" + "time" "github.com/google/uuid" ) @@ -25,6 +26,11 @@ const ( crashReportNameFlag = "--crashreportname=" gotestsumPathFlag = "--gotestsum-path=" + // goTestTimeoutFlagEq is the go test flag whose value is also used as the + // testrunner's total-run deadline (so results are flushed before an external + // kill such as a GitHub Actions timeout). + goTestTimeoutFlagEq = "-timeout=" + // fullRerunThreshold is the number of test failures above which we do a full + // rerun instead of retrying only the failed tests. fullRerunThreshold = 20 @@ -70,6 +76,7 @@ type runner struct { maxAttempts int crashName string alerts []alert + totalTimeout time.Duration // derived from the -timeout go test flag } func newRunner() *runner { @@ -81,6 +88,17 @@ func newRunner() *runner { // nolint:revive,cognitive-complexity func (r *runner) sanitizeAndParseArgs(command string, args []string) ([]string, error) { + // Pre-pass: read the go test -timeout value and use it as the testrunner's + // total deadline so results are flushed before an external kill (e.g. GitHub + // Actions timeout). The flag is NOT consumed — it still passes through to gotestsum. 
+ for _, arg := range args { + if strings.HasPrefix(arg, goTestTimeoutFlagEq) { + if d, err := time.ParseDuration(strings.TrimPrefix(arg, goTestTimeoutFlagEq)); err == nil { + r.totalTimeout = d + } + } + } + var sanitizedArgs []string for _, arg := range args { if strings.HasPrefix(arg, maxAttemptsFlag) { @@ -189,6 +207,12 @@ func Main() { log.Fatalf("failed to parse command line options: %v", err) } + if r.totalTimeout > 0 { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, r.totalTimeout) + defer cancel() + } + switch command { case testCommand: r.runTests(ctx, args) @@ -208,6 +232,30 @@ func (r *runner) reportCrash() { } } +// writeCurrentReport writes the merged report from all completed attempts to the +// final output path. It is called after each attempt so that partial results +// survive if the process is killed externally between attempts. +// Reporting errors (e.g. unexpected missing reruns) are intentionally ignored +// here; they are only checked for the final write at the end of runTests. +func (r *runner) writeCurrentReport() { + reports := r.allReports() + if len(reports) == 0 { + return + } + merged, err := mergeReports(reports) + if err != nil { + log.Printf("warning: failed to merge reports for intermediate write: %v", err) + return + } + if len(r.alerts) > 0 { + merged.appendAlertsSuite(r.alerts) + } + merged.path = r.junitOutputPath + if err := merged.write(); err != nil { + log.Printf("warning: failed to write intermediate report: %v", err) + } +} + func (r *runner) runTests(ctx context.Context, args []string) { var currentAttempt *attempt for a := 1; a <= r.maxAttempts; a++ { @@ -217,6 +265,27 @@ func (r *runner) runTests(ctx context.Context, args []string) { stdout, err := currentAttempt.run(ctx, args) // Extract prominent alerts from this attempt's output. r.alerts = append(r.alerts, parseAlerts(stdout)...) + + // Check whether our total timeout fired (context deadline exceeded). 
+ // This happens when the go test binary hangs and never produces its own + // "test timed out" panic. We collect whatever results are available from + // completed attempts and from the partially-executed current attempt, then + // flush the XML before the external kill arrives. + if ctx.Err() != nil { + log.Printf("total timeout reached, collecting partial results from %d completed attempt(s)", a-1) + // Try to read whatever gotestsum managed to write before it was killed. + if readErr := currentAttempt.junitReport.read(); readErr != nil { + // gotestsum didn't finish writing a JUnit XML. Fall back to parsing + // stdout for any "--- FAIL:" lines that completed before the kill. + if failedTests := parseFailedTestsFromOutput(stdout); len(failedTests) > 0 { + currentAttempt.junitReport = generateStatic(failedTests, "total timeout", "Timeout") + } + // If no failed tests are found either, the current attempt's report + // remains empty and mergeReports will include only prior attempts. + } + break + } + if err != nil && !errors.As(err, &currentAttempt.exitErr) { log.Fatalf("test run failed with an unexpected error: %v", err) } @@ -237,6 +306,11 @@ func (r *runner) runTests(ctx context.Context, args []string) { log.Fatal(err) } + // Write intermediate results so they survive if we are killed externally + // between attempts (e.g. a GitHub Actions job timeout fires after this + // attempt but before the next one completes). + r.writeCurrentReport() + // If the run completely successfull, no need to retry. if currentAttempt.exitErr == nil { break } @@ -282,7 +356,9 @@ func (r *runner) runTests(ctx context.Context, args []string) { if err = mergedReport.write(); err != nil { log.Fatal(err) } - if len(mergedReport.reportingErrs) > 0 { + // Skip the strict rerun-coverage check when the total timeout fired: the + // in-progress attempt was killed before it could execute all expected tests. 
+ if len(mergedReport.reportingErrs) > 0 && ctx.Err() == nil { log.Fatal(mergedReport.reportingErrs) } diff --git a/tools/testrunner/testrunner_test.go b/tools/testrunner/testrunner_test.go index 726b69ae6d..1eccfcb5b4 100644 --- a/tools/testrunner/testrunner_test.go +++ b/tools/testrunner/testrunner_test.go @@ -3,6 +3,7 @@ package testrunner import ( "os" "testing" + "time" "github.com/stretchr/testify/require" ) @@ -35,6 +36,35 @@ func TestRunnerSanitizeAndParseArgs(t *testing.T) { require.Equal(t, "test.cover.out", r.coverProfilePath) }) + t.Run("TotalTimeoutDerivedFromGoTestTimeout", func(t *testing.T) { + r := newRunner() + args, err := r.sanitizeAndParseArgs(testCommand, []string{ + "--gotestsum-path=/bin/gotestsum", + "--junitfile=test.xml", + "--", + "-timeout=35m", + "-coverprofile=test.cover.out", + }) + require.NoError(t, err) + // The testrunner should derive its total deadline from the go test -timeout flag. + require.Equal(t, 35*time.Minute, r.totalTimeout) + // The flag must still be present in the passthrough args so gotestsum/go test + // also honour it. + require.Contains(t, args, "-timeout=35m") + }) + + t.Run("TotalTimeoutNotSetWhenNoGoTestTimeout", func(t *testing.T) { + r := newRunner() + _, err := r.sanitizeAndParseArgs(testCommand, []string{ + "--gotestsum-path=/bin/gotestsum", + "--junitfile=test.xml", + "--", + "-coverprofile=test.cover.out", + }) + require.NoError(t, err) + require.Zero(t, r.totalTimeout) + }) + t.Run("GoTestSumPathMissing", func(t *testing.T) { r := newRunner() _, err := r.sanitizeAndParseArgs(testCommand, []string{ @@ -124,6 +154,43 @@ func TestStripRunFromArgs(t *testing.T) { }) } +func TestWriteCurrentReport(t *testing.T) { + out, err := os.CreateTemp("", "junit-report-*.xml") + require.NoError(t, err) + defer func() { _ = os.Remove(out.Name()) }() + + r := newRunner() + r.junitOutputPath = out.Name() + + // Simulate attempt 1 completing with failures. 
+ j1 := &junitReport{path: "testdata/junit-attempt-1.xml"} + require.NoError(t, j1.read()) + a1 := r.newAttempt() + a1.junitReport = j1 + + r.writeCurrentReport() + + result := &junitReport{path: out.Name()} + require.NoError(t, result.read()) + require.Equal(t, 2, result.Failures) + require.Len(t, result.Suites, 1) + + // Simulate attempt 2 also completing. The intermediate write should now + // contain failures from both attempts, so that if the process is killed + // before attempt 3 the file on disk already has the full picture. + j2 := &junitReport{path: "testdata/junit-attempt-2.xml"} + require.NoError(t, j2.read()) + a2 := r.newAttempt() + a2.junitReport = j2 + + r.writeCurrentReport() + + result2 := &junitReport{path: out.Name()} + require.NoError(t, result2.read()) + require.Equal(t, 4, result2.Failures) // 2 from attempt 1 + 2 from attempt 2 + require.Len(t, result2.Suites, 2) +} + func TestRunnerReportCrash(t *testing.T) { out, err := os.CreateTemp("", "junit-report-*.xml") require.NoError(t, err)