From a335871a3dfe0023df8e6bc465beb5836881f457 Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Mon, 19 Aug 2024 13:04:51 -0600 Subject: [PATCH 01/15] Handle Worker Out of Memory exceptions with retry --- .../OutOfProcMiddleware.cs | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs index 4d514c670..e99dbaced 100644 --- a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs +++ b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs @@ -214,7 +214,8 @@ await this.LifeCycleNotificationHelper.OrchestratorCompletedAsync( isReplay: false); } } - else if (context.TryGetOrchestrationErrorDetails(out Exception? exception)) + else if (context.TryGetOrchestrationErrorDetails(out Exception? exception) + && (exception?.GetType() != typeof(OrchestrationFailureException) || !exception.Message.Contains("OutOfMemoryException"))) { // the function failed because the orchestrator failed. @@ -235,6 +236,20 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync( exception?.Message ?? string.Empty, isReplay: false); } + else if (exception?.GetType() == typeof(OrchestrationFailureException) && exception.Message.Contains("OutOfMemoryException")) + { + string reason = $"Out Of Memory exception thrown by the worker runtime: {exception}"; + + this.TraceHelper.FunctionAborted( + this.Options.HubName, + functionName.Name, + instance.InstanceId, + reason, + functionType: FunctionType.Orchestrator); + + // This will abort the current execution and force an durable retry + throw new SessionAbortedException(reason); + } else { // the function failed for some other reason @@ -556,6 +571,20 @@ public async Task CallActivityAsync(DispatchMiddlewareContext dispatchContext, F result: serializedOutput), }; } + else if (result.Exception is not null && result.Exception.Message.Contains("OutOfMemoryException")) + { + string reason = $"Out Of Memory exception thrown by the worker runtime: {result.Exception}"; + + this.TraceHelper.FunctionAborted( + this.Options.HubName, + functionName.Name, + instance.InstanceId, + reason, + functionType: FunctionType.Activity); + + // This will abort the current execution and force an durable retry + throw new SessionAbortedException(reason); + } else { this.TraceHelper.FunctionFailed( From f2942b2621487fdb5f2b854dd70e0f1881356004 Mon Sep 17 00:00:00 2001 From: David Justo Date: Fri, 13 Sep 2024 13:37:41 -0700 Subject: [PATCH 02/15] implement type-based solution --- .../RemoteOrchestratorContext.cs | 45 ++++++++++++++-- .../OutOfProcMiddleware.cs | 52 ++++++++----------- 2 files changed, 62 insertions(+), 35 deletions(-) diff --git a/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs b/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs index 9c4d6a02e..ea3489637 100644 --- a/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs +++ b/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs @@ -71,7 +71,7 @@ internal bool TryGetOrchestrationErrorDetails(out Exception? failure) return hasError; } - internal void SetResult(IEnumerable actions, string customStatus) + internal void TrySetResult(IEnumerable actions, string customStatus) { var result = new OrchestratorExecutionResult { @@ -79,7 +79,7 @@ internal void SetResult(IEnumerable actions, string customSt Actions = actions, }; - this.SetResultInternal(result); + this.TrySetResultInternal(result); } // TODO: This method should be considered deprecated because SDKs should no longer be returning results as JSON. @@ -117,10 +117,39 @@ internal void SetResult(string orchestratorResponseJsonText) innerException: jsonReaderException); } - this.SetResultInternal(result); + this.TrySetResultInternal(result); } - private void SetResultInternal(OrchestratorExecutionResult result) + /// + /// Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected. + /// + /// + /// Today, this method only checks for . In the future, we may want to add more cases. + /// Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield + /// a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be + /// handled in this method. + /// However, our tests reveal that OOMs are, surprisngly, caught and returned as a `OrchestratorExecutionResult` + /// by the isolated process, and thus need special handling. + /// It's unclear if all OOMs are caught by the isolated process (probably not), and also if there are other platform-level + /// errors that are also caught in the isolated process and returned as a `OrchestratorExecutionResult`. Let's add them + /// to this method as we encounter them. + /// + /// The failure details of the orchestrator. + /// If an OOM error is detected. + private void ThrowIfPlatformLevelException(FailureDetails failureDetails) + { + if (failureDetails.InnerFailure?.IsCausedBy() ?? false) + { + throw new OutOfMemoryException(failureDetails.ErrorMessage); + } + + if (failureDetails.InnerFailure != null) + { + this.ThrowIfPlatformLevelException(failureDetails.InnerFailure); + } + } + + private void TrySetResultInternal(OrchestratorExecutionResult result) { // Look for an orchestration completion action to see if we need to grab the output. foreach (OrchestratorAction action in result.Actions) @@ -133,6 +162,14 @@ private void SetResultInternal(OrchestratorExecutionResult result) if (completeAction.OrchestrationStatus == OrchestrationStatus.Failed) { + // If the orchestrator failed due to a platform-level error in the isolated process, + // we should re-throw that exception in the host (this process) invocation pipeline, + // so the invocation can be retried. + if (completeAction.FailureDetails != null) + { + this.ThrowIfPlatformLevelException(completeAction.FailureDetails); + } + string message = completeAction switch { { FailureDetails: { } f } => f.ErrorMessage, diff --git a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs index e99dbaced..8ca785998 100644 --- a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs +++ b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs @@ -138,10 +138,15 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync( byte[] triggerReturnValueBytes = Convert.FromBase64String(triggerReturnValue); P.OrchestratorResponse response = P.OrchestratorResponse.Parser.ParseFrom(triggerReturnValueBytes); - context.SetResult( + + // TrySetResult may throw if a platform-level error is encountered (like an out of memory exception). + context.TrySetResult( response.Actions.Select(ProtobufUtils.ToOrchestratorAction), response.CustomStatus); + // Here we throw if the orchestrator completed with an application-level error. When we do this, + // the function's result type will be of type `OrchestrationFailureException` which is reserved + // for application-level errors that do not need to be re-tried. context.ThrowIfFailed(); }, #pragma warning restore CS0618 // Type or member is obsolete (not intended for general public use) @@ -159,6 +164,20 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync( // Re-throw so we can abort this invocation. this.HostLifetimeService.OnStopping.ThrowIfCancellationRequested(); } + + // we abort the invocation on "platform level errors" such as: + // - a timeout + // - an out of memory exception + // - a worker process exit + if (functionResult.Exception is Host.FunctionTimeoutException + // see in RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling + || functionResult.Exception?.InnerException is OutOfMemoryException + || (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitException") ?? false)) + { + // TODO: the `WorkerProcessExitException` type is not exposed in our dependencies, it's part of WebJobs.Host.Script. + // Should we add that dependency or should it be exposed in WebJobs.Host? + throw functionResult.Exception; + } } catch (Exception hostRuntimeException) { @@ -214,8 +233,7 @@ await this.LifeCycleNotificationHelper.OrchestratorCompletedAsync( isReplay: false); } } - else if (context.TryGetOrchestrationErrorDetails(out Exception? exception) - && (exception?.GetType() != typeof(OrchestrationFailureException) || !exception.Message.Contains("OutOfMemoryException"))) + else if (context.TryGetOrchestrationErrorDetails(out Exception? exception)) { // the function failed because the orchestrator failed. @@ -236,20 +254,6 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync( exception?.Message ?? string.Empty, isReplay: false); } - else if (exception?.GetType() == typeof(OrchestrationFailureException) && exception.Message.Contains("OutOfMemoryException")) - { - string reason = $"Out Of Memory exception thrown by the worker runtime: {exception}"; - - this.TraceHelper.FunctionAborted( - this.Options.HubName, - functionName.Name, - instance.InstanceId, - reason, - functionType: FunctionType.Orchestrator); - - // This will abort the current execution and force an durable retry - throw new SessionAbortedException(reason); - } else { // the function failed for some other reason @@ -571,20 +575,6 @@ public async Task CallActivityAsync(DispatchMiddlewareContext dispatchContext, F result: serializedOutput), }; } - else if (result.Exception is not null && result.Exception.Message.Contains("OutOfMemoryException")) - { - string reason = $"Out Of Memory exception thrown by the worker runtime: {result.Exception}"; - - this.TraceHelper.FunctionAborted( - this.Options.HubName, - functionName.Name, - instance.InstanceId, - reason, - functionType: FunctionType.Activity); - - // This will abort the current execution and force an durable retry - throw new SessionAbortedException(reason); - } else { this.TraceHelper.FunctionFailed( From f2ed1e9b8df221db5850067db08675f4099732eb Mon Sep 17 00:00:00 2001 From: David Justo Date: Mon, 30 Sep 2024 10:37:35 -0700 Subject: [PATCH 03/15] add e2e tests --- .../smoketest-dotnet-isolated-v4.yml | 15 ++ .../DotNetIsolated/DotNetIsolated.sln | 25 +++ .../DotNetIsolated/FaultyOrchestrators.cs | 160 ++++++++++++++++++ .../OOProcSmokeTests/DotNetIsolated/host.json | 3 +- 4 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 test/SmokeTests/OOProcSmokeTests/DotNetIsolated/DotNetIsolated.sln create mode 100644 test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs diff --git a/.github/workflows/smoketest-dotnet-isolated-v4.yml b/.github/workflows/smoketest-dotnet-isolated-v4.yml index f818ff7ae..cf29e4e63 100644 --- a/.github/workflows/smoketest-dotnet-isolated-v4.yml +++ b/.github/workflows/smoketest-dotnet-isolated-v4.yml @@ -23,3 +23,18 @@ jobs: - name: Run V4 .NET Isolated Smoke Test run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/StartHelloCitiesTyped -NoValidation shell: pwsh + + # Test that OOM errors are recoverable + - name: Run V4 .NET OOM Test + run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/durable_HttpStartOOMOrchestrator -NoValidation + shell: pwsh + + # Test that FailFast errors are recoverable + - name: Run V4 .NET FailFast Test + run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/durable_HttpStartProcessExitOrchestrator -NoValidation + shell: pwsh + + # Test that timeout errors are recoverable + - name: Run V4 .NET FailFast Test + run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/durable_HttpStartTimeoutOrchestrator -NoValidation + shell: pwsh \ No newline at end of file diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/DotNetIsolated.sln b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/DotNetIsolated.sln new file mode 100644 index 000000000..a93cc6f6e --- /dev/null +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/DotNetIsolated.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.5.002.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DotNetIsolated", "DotNetIsolated.csproj", "{B2DBA49D-9D25-46DB-8968-15D5E83B4060}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {0954D7B4-582F-4F85-AE3E-5D503FB07DB1} + EndGlobalSection +EndGlobal diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs new file mode 100644 index 000000000..49dc8668c --- /dev/null +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs @@ -0,0 +1,160 @@ +using Microsoft.Azure.Functions.Worker; +using Microsoft.Azure.Functions.Worker.Http; +using Microsoft.DurableTask; +using Microsoft.DurableTask.Client; +using Microsoft.Extensions.Logging; +using System; + +namespace FaultOrchestrators +{ + public static class FaultyOrchestrators + { + [Function(nameof(OOMOrchestrator))] + public static async Task OOMOrchestrator( + [OrchestrationTrigger] TaskOrchestrationContext context) + { + // this orchestrator is not deterministic, on purpose. + // we use the non-determinism to force an OOM exception on only the first replay + + // check if a file named "replayEvidence" exists in the current directory. + // create it if it does not + string evidenceFile = "replayEvidence"; + bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); + if (isTheFirstReplay) + { + System.IO.File.Create(evidenceFile).Close(); + } + + // on the very first replay, OOM the process + if (isTheFirstReplay) + { + // force the process to run out of memory + List data = new List(); + + for (int i = 0; i < 10000000; i++) + { + data.Add(new byte[1024 * 1024 * 1024]); + } + } + + // assuming the orchestrator survived the OOM, delete the evidence file and return + System.IO.File.Delete(evidenceFile) + return "done!"; + } + + [Function(nameof(ProcessExitOrchestrator))] + public static async Task ProcessExitOrchestrator( + [OrchestrationTrigger] TaskOrchestrationContext context) + { + // this orchestrator is not deterministic, on purpose. + // we use the non-determinism to force a sudden process exit on only the first replay + + // check if a file named "replayEvidence" exists in the current directory. + // create it if it does not + string evidenceFile = "replayEvidence"; + bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); + if (isTheFirstReplay) + { + System.IO.File.Create(evidenceFile).Close(); + } + + // on the very first replay, OOM the process + if (isTheFirstReplay) + { + // force the process to suddenly exit + Environment.FailFast(-1); + } + + // assuming the orchestrator survived the OOM, delete the evidence file and return + System.IO.File.Delete(evidenceFile) + + return "done!"; + } + + [Function(nameof(TimeoutOrchestrator))] + public static async Task TimeoutOrchestrator( + [OrchestrationTrigger] TaskOrchestrationContext context) + { + // this orchestrator is not deterministic, on purpose. + // we use the non-determinism to force a timeout on only the first replay + + // check if a file named "replayEvidence" exists in the current directory. + // create it if it does not + string evidenceFile = "replayEvidence"; + bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); + if (isTheFirstReplay) + { + System.IO.File.Create(evidenceFile).Close(); + } + + // on the very first replay, time out the execution + if (isTheFirstReplay) + { + // force the process to timeout after a 1 minute wait + System.Threading.Thread.Sleep(TimeSpan.FromMinutes(1)); + } + + // assuming the orchestrator survived the timeout, delete the evidence file and return + System.IO.File.Delete(evidenceFile) + + return "done!"; + } + + [Function("durable_HttpStartOOMOrchestrator")] + public static async Task HttpStart( + [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req, + [DurableClient] DurableTaskClient client, + FunctionContext executionContext) + { + ILogger logger = executionContext.GetLogger("durable_HttpStartOOMOrchestrator"); + + // Function input comes from the request content. + string instanceId = await client.ScheduleNewOrchestrationInstanceAsync( + nameof(OOMOrchestrator)); + + logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId); + + // Returns an HTTP 202 response with an instance management payload. + // See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration + return await client.CreateCheckStatusResponseAsync(req, instanceId); + } + + [Function("durable_HttpStartProcessExitOrchestrator")] + public static async Task HttpStart( + [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req, + [DurableClient] DurableTaskClient client, + FunctionContext executionContext) + { + ILogger logger = executionContext.GetLogger("durable_HttpStartProcessExitOrchestrator"); + + // Function input comes from the request content. + string instanceId = await client.ScheduleNewOrchestrationInstanceAsync( + nameof(ProcessExitOrchestrator)); + + logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId); + + // Returns an HTTP 202 response with an instance management payload. + // See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration + return await client.CreateCheckStatusResponseAsync(req, instanceId); + } + + [Function("durable_HttpStartTimeoutOrchestrator")] + public static async Task HttpStart( + [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req, + [DurableClient] DurableTaskClient client, + FunctionContext executionContext) + { + ILogger logger = executionContext.GetLogger("durable_HttpStartTimeoutOrchestrator"); + + // Function input comes from the request content. + string instanceId = await client.ScheduleNewOrchestrationInstanceAsync( + nameof(TimeoutOrchestrator)); + + logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId); + + // Returns an HTTP 202 response with an instance management payload. + // See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration + return await client.CreateCheckStatusResponseAsync(req, instanceId); + } + } +} diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json index 278b52cde..e4b5cdf28 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json @@ -7,5 +7,6 @@ "excludedTypes": "Request" } } - } + }, + "functionTimeout": "00:00:30" } \ No newline at end of file From d6bb8e41e2053c93363f86de9fd43618c2be359e Mon Sep 17 00:00:00 2001 From: David Justo Date: Mon, 30 Sep 2024 10:45:05 -0700 Subject: [PATCH 04/15] fix build warning --- src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs index 8ca785998..7ac3e8003 100644 --- a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs +++ b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs @@ -257,8 +257,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync( else { // the function failed for some other reason - - string exceptionDetails = functionResult.Exception.ToString(); + string exceptionDetails = functionResult.Exception?.ToString(); this.TraceHelper.FunctionFailed( this.Options.HubName, From d333530cad2f2a12f8282d2f16498bfe6b14a1b3 Mon Sep 17 00:00:00 2001 From: David Justo Date: Tue, 1 Oct 2024 11:45:15 -0700 Subject: [PATCH 05/15] checkpoint --- .../smoketest-dotnet-isolated-v4.yml | 83 +++++++++++-- Directory.Build.targets | 37 ++++++ WebJobs.Extensions.DurableTask.sln | 3 +- eng/ci/official-build.yml | 2 + eng/templates/build.yml | 1 + release_notes.md | 8 +- .../AzureStorageDurabilityProviderFactory.cs | 1 + ...t.Azure.WebJobs.Extensions.DurableTask.xml | 18 +++ .../Options/AzureStorageOptions.cs | 14 +++ .../OutOfProcMiddleware.cs | 7 +- .../WebJobs.Extensions.DurableTask.csproj | 4 +- .../AssemblyInfo.cs | 2 +- .../Worker.Extensions.DurableTask.csproj | 6 +- .../DotNetIsolated/Dockerfile | 5 + .../DotNetIsolated/FaultyOrchestrators.cs | 71 +++++------ .../OOProcSmokeTests/DotNetIsolated/host.json | 9 ++ .../DotNetIsolated/run-smoke-tests.ps1 | 115 ++++++++++++++++++ test/SmokeTests/e2e-test.ps1 | 19 +-- tools/triageHelper/function_app.py | 2 +- 19 files changed, 334 insertions(+), 73 deletions(-) create mode 100644 Directory.Build.targets create mode 100644 test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 diff --git a/.github/workflows/smoketest-dotnet-isolated-v4.yml b/.github/workflows/smoketest-dotnet-isolated-v4.yml index cf29e4e63..474f48448 100644 --- a/.github/workflows/smoketest-dotnet-isolated-v4.yml +++ b/.github/workflows/smoketest-dotnet-isolated-v4.yml @@ -19,22 +19,79 @@ jobs: steps: - uses: actions/checkout@v2 - # Validation is blocked on https://github.com/Azure/azure-functions-host/issues/7995 - - name: Run V4 .NET Isolated Smoke Test - run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/StartHelloCitiesTyped -NoValidation + # Install .NET versions + - name: Set up .NET Core 3.1 + uses: actions/setup-dotnet@v3 + with: + dotnet-version: '3.1.x' + + - name: Set up .NET Core 2.1 + uses: actions/setup-dotnet@v3 + with: + dotnet-version: '2.1.x' + + - name: Set up .NET Core 6.x + uses: actions/setup-dotnet@v3 + with: + dotnet-version: '6.x' + + - name: Set up .NET Core 8.x + uses: actions/setup-dotnet@v3 + with: + dotnet-version: '8.x' + + # Install Azurite + - name: Set up Node.js (needed for Azurite) + uses: actions/setup-node@v3 + with: + node-version: '18.x' # Azurite requires at least Node 18 + + - name: Install Azurite + run: npm install -g azurite + + - name: Restore WebJobs extension + run: dotnet restore $solution + + - name: Build and pack WebJobs extension + run: cd ./src/WebJobs.Extensions.DurableTask && + mkdir ./out && + dotnet build -c Release WebJobs.Extensions.DurableTask.csproj --output ./out && + mkdir ~/packages && + dotnet nuget push ./out/Microsoft.Azure.WebJobs.Extensions.DurableTask.*.nupkg --source ~/packages && + dotnet nuget add source ~/packages + + - name: Build .NET Isolated Smoke Test + run: cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && + dotnet restore --verbosity normal && + dotnet build -c Release + + - name: Install core tools + run: npm i -g azure-functions-core-tools@4 --unsafe-perm true + + # Run smoke tests + # Unlike other smoke tests, the .NET isolated smoke tests run outside of a docker container, but to race conditions + # when building the smoke test app in docker, causing the build to fail. This is a temporary workaround until the + # root cause is identified and fixed. + + - name: Run smoke tests (Hello Cities) shell: pwsh + run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & + cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & + ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/StartHelloCitiesTyped - # Test that OOM errors are recoverable - - name: Run V4 .NET OOM Test - run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/durable_HttpStartOOMOrchestrator -NoValidation + - name: Run smoke tests (Process Exit) shell: pwsh + run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & + ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartProcessExitOrchestrator - # Test that FailFast errors are recoverable - - name: Run V4 .NET FailFast Test - run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/durable_HttpStartProcessExitOrchestrator -NoValidation + - name: Run smoke tests (Timeout) shell: pwsh + run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & + cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & + ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartTimeoutOrchestrator - # Test that timeout errors are recoverable - - name: Run V4 .NET FailFast Test - run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/durable_HttpStartTimeoutOrchestrator -NoValidation - shell: pwsh \ No newline at end of file + - name: Run smoke tests (OOM) + shell: pwsh + run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & + cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & + ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartOOMOrchestrator \ No newline at end of file diff --git a/Directory.Build.targets b/Directory.Build.targets new file mode 100644 index 000000000..47c2b86a2 --- /dev/null +++ b/Directory.Build.targets @@ -0,0 +1,37 @@ + + + + + + + + + false + <_TranslateUrlPattern>(https://azfunc%40dev\.azure\.com/azfunc/internal/_git|https://dev\.azure\.com/azfunc/internal/_git|https://azfunc\.visualstudio\.com/internal/_git|azfunc%40vs-ssh\.visualstudio\.com:v3/azfunc/internal|git%40ssh\.dev\.azure\.com:v3/azfunc/internal)/([^/\.]+)\.(.+) + <_TranslateUrlReplacement>https://github.com/$2/$3 + + + + + + $([System.Text.RegularExpressions.Regex]::Replace($(ScmRepositoryUrl), $(_TranslateUrlPattern), $(_TranslateUrlReplacement))) + + + + $([System.Text.RegularExpressions.Regex]::Replace(%(SourceRoot.ScmRepositoryUrl), $(_TranslateUrlPattern), $(_TranslateUrlReplacement))) + + + + + \ No newline at end of file diff --git a/WebJobs.Extensions.DurableTask.sln b/WebJobs.Extensions.DurableTask.sln index 353e83805..b710584c2 100644 --- a/WebJobs.Extensions.DurableTask.sln +++ b/WebJobs.Extensions.DurableTask.sln @@ -18,6 +18,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution .editorconfig = .editorconfig azure-pipelines-release-dotnet-isolated.yml = azure-pipelines-release-dotnet-isolated.yml azure-pipelines-release.yml = azure-pipelines-release.yml + Directory.Build.targets = Directory.Build.targets nuget.config = nuget.config README.md = README.md release_notes.md = release_notes.md @@ -94,7 +95,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PerfTests", "PerfTests", "{ EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DFPerfScenariosV4", "test\DFPerfScenarios\DFPerfScenariosV4.csproj", "{FC8AD123-F949-4D21-B817-E5A4BBF7F69B}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Worker.Extensions.DurableTask.Tests", "test\Worker.Extensions.DurableTask.Tests\Worker.Extensions.DurableTask.Tests.csproj", "{76DEC17C-BF6A-498A-8E8A-7D6CB2E03284}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Worker.Extensions.DurableTask.Tests", "test\Worker.Extensions.DurableTask.Tests\Worker.Extensions.DurableTask.Tests.csproj", "{76DEC17C-BF6A-498A-8E8A-7D6CB2E03284}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/eng/ci/official-build.yml b/eng/ci/official-build.yml index d0839ba79..e7a871026 100644 --- a/eng/ci/official-build.yml +++ b/eng/ci/official-build.yml @@ -6,6 +6,7 @@ trigger: branches: include: - main + - dev # CI only, does not trigger on PRs. pr: none @@ -19,6 +20,7 @@ schedules: branches: include: - main + - dev always: true resources: diff --git a/eng/templates/build.yml b/eng/templates/build.yml index d61357f0b..3e3e41040 100644 --- a/eng/templates/build.yml +++ b/eng/templates/build.yml @@ -46,6 +46,7 @@ jobs: solution: '**/WebJobs.Extensions.DurableTask.sln' vsVersion: "16.0" configuration: Release + msbuildArgs: /p:FileVersionRevision=$(Build.BuildId) /p:ContinuousIntegrationBuild=true # these flags make package build deterministic - template: ci/sign-files.yml@eng parameters: diff --git a/release_notes.md b/release_notes.md index ad44bb9b3..9b2804fe4 100644 --- a/release_notes.md +++ b/release_notes.md @@ -1,10 +1,10 @@ # Release Notes -## Microsoft.Azure.Functions.Worker.Extensions.DurableTask 1.2.1 +## Microsoft.Azure.Functions.Worker.Extensions.DurableTask 1.1.6 ### New Features -- Fix regression on `TerminateInstanceAsync` API causing invocations to fail with "unimplemented" exceptions (https://github.com/Azure/azure-functions-durable-extension/pull/2829). +- Support for new `AllowReplayingTerminalInstances` setting in Azure Storage backend (https://github.com/Azure/durabletask/pull/1159), settable via `host.json` ### Bug Fixes @@ -12,6 +12,10 @@ ### Dependency Updates +- Microsoft.DurableTask.Client.Grpc to 1.3.0 +- Microsoft.DurableTask.Worker.Grpc to 1.3.0 +- Microsoft.Azure.WebJobs.Extensions.DurableTask (in host process) to 2.13.6 + ## Microsoft.Azure.WebJobs.Extensions.DurableTask ### New Features diff --git a/src/WebJobs.Extensions.DurableTask/AzureStorageDurabilityProviderFactory.cs b/src/WebJobs.Extensions.DurableTask/AzureStorageDurabilityProviderFactory.cs index 0162d26b4..242bef777 100644 --- a/src/WebJobs.Extensions.DurableTask/AzureStorageDurabilityProviderFactory.cs +++ b/src/WebJobs.Extensions.DurableTask/AzureStorageDurabilityProviderFactory.cs @@ -217,6 +217,7 @@ internal AzureStorageOrchestrationServiceSettings GetAzureStorageOrchestrationSe UseSeparateQueueForEntityWorkItems = this.useSeparateQueueForEntityWorkItems, EntityMessageReorderWindowInMinutes = this.options.EntityMessageReorderWindowInMinutes, MaxEntityOperationBatchSize = this.options.MaxEntityOperationBatchSize, + AllowReplayingTerminalInstances = this.azureStorageOptions.AllowReplayingTerminalInstances, }; if (this.inConsumption) diff --git a/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml b/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml index 0088042e9..39653aa48 100644 --- a/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml +++ b/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml @@ -475,6 +475,24 @@ The result. + + + Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected. + + + Today, this method only checks for . In the future, we may want to add more cases. + Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield + a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be + handled in this method. + However, our tests reveal that OOMs are, surprisngly, caught and returned as a `OrchestratorExecutionResult` + by the isolated process, and thus need special handling. + It's unclear if all OOMs are caught by the isolated process (probably not), and also if there are other platform-level + errors that are also caught in the isolated process and returned as a `OrchestratorExecutionResult`. Let's add them + to this method as we encounter them. + + The failure details of the orchestrator. + If an OOM error is detected. + Defines convenient overloads for calling the context methods, for all the contexts. diff --git a/src/WebJobs.Extensions.DurableTask/Options/AzureStorageOptions.cs b/src/WebJobs.Extensions.DurableTask/Options/AzureStorageOptions.cs index 1667aabaf..4a6a506cb 100644 --- a/src/WebJobs.Extensions.DurableTask/Options/AzureStorageOptions.cs +++ b/src/WebJobs.Extensions.DurableTask/Options/AzureStorageOptions.cs @@ -179,6 +179,20 @@ public string TrackingStoreConnectionStringName /// A boolean indicating whether to use the table partition strategy. Defaults to false. public bool UseTablePartitionManagement { get; set; } = false; + /// + /// When false, when an orchestrator is in a terminal state (e.g. Completed, Failed, Terminated), events for that orchestrator are discarded. + /// Otherwise, events for a terminal orchestrator induce a replay. This may be used to recompute the state of the orchestrator in the "Instances Table". + /// + /// + /// Transactions across Azure Tables are not possible, so we independently update the "History table" and then the "Instances table" + /// to set the state of the orchestrator. + /// If a crash were to occur between these two updates, the state of the orchestrator in the "Instances table" would be incorrect. + /// By setting this configuration to true, you can recover from these inconsistencies by forcing a replay of the orchestrator in response + /// to a client event like a termination request or an external event, which gives the framework another opportunity to update the state of + /// the orchestrator in the "Instances table". To force a replay after enabling this configuration, just send any external event to the affected instanceId. + /// + public bool AllowReplayingTerminalInstances { get; set; } = false; + /// /// Throws an exception if the provided hub name violates any naming conventions for the storage provider. /// diff --git a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs index 7ac3e8003..d50e5657f 100644 --- a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs +++ b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs @@ -170,8 +170,7 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync( // - an out of memory exception // - a worker process exit if (functionResult.Exception is Host.FunctionTimeoutException - // see in RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling - || functionResult.Exception?.InnerException is OutOfMemoryException + || functionResult.Exception?.InnerException is OutOfMemoryException // see RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling || (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitException") ?? false)) { // TODO: the `WorkerProcessExitException` type is not exposed in our dependencies, it's part of WebJobs.Host.Script. @@ -257,7 +256,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync( else { // the function failed for some other reason - string exceptionDetails = functionResult.Exception?.ToString(); + string exceptionDetails = functionResult.Exception?.ToString() ?? "Framework-internal message: exception details could not be extracted"; this.TraceHelper.FunctionFailed( this.Options.HubName, @@ -276,7 +275,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync( orchestratorResult = OrchestratorExecutionResult.ForFailure( message: $"Function '{functionName}' failed with an unhandled exception.", - functionResult.Exception); + functionResult.Exception ?? new Exception($"Function '{functionName}' failed with an unknown unhandled exception")); } // Send the result of the orchestrator function to the DTFx dispatch pipeline. diff --git a/src/WebJobs.Extensions.DurableTask/WebJobs.Extensions.DurableTask.csproj b/src/WebJobs.Extensions.DurableTask/WebJobs.Extensions.DurableTask.csproj index e023c4d4c..c6bae32df 100644 --- a/src/WebJobs.Extensions.DurableTask/WebJobs.Extensions.DurableTask.csproj +++ b/src/WebJobs.Extensions.DurableTask/WebJobs.Extensions.DurableTask.csproj @@ -6,7 +6,7 @@ Microsoft.Azure.WebJobs.Extensions.DurableTask 2 13 - 5 + 6 $(PackageSuffix) $(MajorVersion).$(MinorVersion).$(PatchVersion) $(MajorVersion).0.0.0 @@ -114,7 +114,7 @@ - + diff --git a/src/Worker.Extensions.DurableTask/AssemblyInfo.cs b/src/Worker.Extensions.DurableTask/AssemblyInfo.cs index e7b781cf0..63fc22df6 100644 --- a/src/Worker.Extensions.DurableTask/AssemblyInfo.cs +++ b/src/Worker.Extensions.DurableTask/AssemblyInfo.cs @@ -5,5 +5,5 @@ using Microsoft.Azure.Functions.Worker.Extensions.Abstractions; // TODO: Find a way to generate this dynamically at build-time -[assembly: ExtensionInformation("Microsoft.Azure.WebJobs.Extensions.DurableTask", "2.13.5")] +[assembly: ExtensionInformation("Microsoft.Azure.WebJobs.Extensions.DurableTask", "2.13.6")] [assembly: InternalsVisibleTo("Worker.Extensions.DurableTask.Tests, PublicKey=0024000004800000940000000602000000240000525341310004000001000100cd1dabd5a893b40e75dc901fe7293db4a3caf9cd4d3e3ed6178d49cd476969abe74a9e0b7f4a0bb15edca48758155d35a4f05e6e852fff1b319d103b39ba04acbadd278c2753627c95e1f6f6582425374b92f51cca3deb0d2aab9de3ecda7753900a31f70a236f163006beefffe282888f85e3c76d1205ec7dfef7fa472a17b1")] diff --git a/src/Worker.Extensions.DurableTask/Worker.Extensions.DurableTask.csproj b/src/Worker.Extensions.DurableTask/Worker.Extensions.DurableTask.csproj index e6c954cb3..b310e80da 100644 --- a/src/Worker.Extensions.DurableTask/Worker.Extensions.DurableTask.csproj +++ b/src/Worker.Extensions.DurableTask/Worker.Extensions.DurableTask.csproj @@ -29,7 +29,7 @@ ..\..\sign.snk - 1.1.5 + 1.1.6 $(VersionPrefix).0 @@ -39,8 +39,8 @@ - - + + diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile index 51ad96b16..3b24d9213 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile @@ -25,6 +25,11 @@ RUN cd /root/test/SmokeTests/OOProcSmokeTests/DotNetIsolated && \ ls -aR /home/site/wwwroot && \ cat /home/site/wwwroot/extensions.json # debugging +RUN cat /root/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/obj/Release/net6.0/WorkerExtensions/WorkerExtensions.csproj + +RUN ls /root/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/bin/ +# .azurefunctions/function.deps.json + # Step 3: Generate the final app image to run FROM mcr.microsoft.com/azure-functions/dotnet-isolated:4-dotnet-isolated6.0 diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs index 49dc8668c..90006fd20 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs @@ -10,24 +10,22 @@ namespace FaultOrchestrators public static class FaultyOrchestrators { [Function(nameof(OOMOrchestrator))] - public static async Task OOMOrchestrator( + public static Task OOMOrchestrator( [OrchestrationTrigger] TaskOrchestrationContext context) { // this orchestrator is not deterministic, on purpose. // we use the non-determinism to force an OOM exception on only the first replay - // check if a file named "replayEvidence" exists in the current directory. - // create it if it does not - string evidenceFile = "replayEvidence"; + // check if a file named "replayEvidence" exists in source code directory, create it if it does not. + // From experience, this code runs in `/bin/output/`, so we store the file two directories above. + // We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically + // when `func host start` is re-invoked. + string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence"); bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); if (isTheFirstReplay) { System.IO.File.Create(evidenceFile).Close(); - } - - // on the very first replay, OOM the process - if (isTheFirstReplay) - { + // force the process to run out of memory List data = new List(); @@ -38,70 +36,67 @@ public static async Task OOMOrchestrator( } // assuming the orchestrator survived the OOM, delete the evidence file and return - System.IO.File.Delete(evidenceFile) - return "done!"; + System.IO.File.Delete(evidenceFile); + return Task.CompletedTask; } [Function(nameof(ProcessExitOrchestrator))] - public static async Task ProcessExitOrchestrator( + public static Task ProcessExitOrchestrator( [OrchestrationTrigger] TaskOrchestrationContext context) { // this orchestrator is not deterministic, on purpose. // we use the non-determinism to force a sudden process exit on only the first replay - // check if a file named "replayEvidence" exists in the current directory. - // create it if it does not - string evidenceFile = "replayEvidence"; + // check if a file named "replayEvidence" exists in source code directory, create it if it does not. + // From experience, this code runs in `/bin/output/`, so we store the file two directories above. + // We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically + // when `func host start` is re-invoked. + string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence"); bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); if (isTheFirstReplay) { System.IO.File.Create(evidenceFile).Close(); - } - - // on the very first replay, OOM the process - if (isTheFirstReplay) - { - // force the process to suddenly exit - Environment.FailFast(-1); + + // simulate sudden crash + Environment.FailFast("Simulating crash!"); } // assuming the orchestrator survived the OOM, delete the evidence file and return - System.IO.File.Delete(evidenceFile) + System.IO.File.Delete(evidenceFile); - return "done!"; + return Task.CompletedTask; } [Function(nameof(TimeoutOrchestrator))] - public static async Task TimeoutOrchestrator( + public static Task TimeoutOrchestrator( [OrchestrationTrigger] TaskOrchestrationContext context) { // this orchestrator is not deterministic, on purpose. // we use the non-determinism to force a timeout on only the first replay - // check if a file named "replayEvidence" exists in the current directory. - // create it if it does not - string evidenceFile = "replayEvidence"; + // check if a file named "replayEvidence" exists in source code directory, create it if it does not. + // From experience, this code runs in `/bin/output/`, so we store the file two directories above. + // We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically + // when `func host start` is re-invoked. + string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence"); bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); + if (isTheFirstReplay) { System.IO.File.Create(evidenceFile).Close(); - } - - // on the very first replay, time out the execution - if (isTheFirstReplay) - { + // force the process to timeout after a 1 minute wait System.Threading.Thread.Sleep(TimeSpan.FromMinutes(1)); } // assuming the orchestrator survived the timeout, delete the evidence file and return - System.IO.File.Delete(evidenceFile) + System.IO.File.Delete(evidenceFile); - return "done!"; + return Task.CompletedTask; } [Function("durable_HttpStartOOMOrchestrator")] - public static async Task HttpStart( + public static async Task HttpStartOOMOrchestrator( [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req, [DurableClient] DurableTaskClient client, FunctionContext executionContext) @@ -120,7 +115,7 @@ public static async Task HttpStart( } [Function("durable_HttpStartProcessExitOrchestrator")] - public static async Task HttpStart( + public static async Task HttpStartProcessExitOrchestrator( [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req, [DurableClient] DurableTaskClient client, FunctionContext executionContext) @@ -139,7 +134,7 @@ public static async Task HttpStart( } [Function("durable_HttpStartTimeoutOrchestrator")] - public static async Task HttpStart( + public static async Task HttpStartTimeoutOrchestrator( [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req, [DurableClient] DurableTaskClient client, FunctionContext executionContext) diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json index e4b5cdf28..ae6c359f7 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json @@ -8,5 +8,14 @@ } } }, + "extensions": { + "durableTask": { + "hubName": "hubbbb1113324", + "storageProvider": { + "maxQueuePollingInterval": "00:00:01", + "controlQueueVisibilityTimeout": "00:01:00" + } + } + }, "functionTimeout": "00:00:30" } \ No newline at end of file diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 new file mode 100644 index 000000000..839cdea5d --- /dev/null +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 @@ -0,0 +1,115 @@ +# This is a simple test runner to validate the .NET isolated smoke tests. +# It supercedes the usual e2e-tests.ps1 script for the .NET isolated scenario because building the snmoke test app +# on the docker image is unreliable. For more details, see: https://github.com/Azure/azure-functions-host/issues/7995 + +# This script is designed specifically to test cases where the isolated worker process experiences a platform failure: +# timeouts, OOMs, etc. For that reason, it is careful to check that the Functions Host is running and healthy at regular +# intervals. This makes these tests run more slowly than other test categories. + +param( + [Parameter(Mandatory=$true)] + [string]$HttpStartPath +) + +$retryCount = 0; +$statusUrl = $null; +$success = $false; +$haveManuallyRestartedHost = $false; + +Do { + $testIsRunning = $true; + + # Start the functions host if it's not running already. + # Then give it up to 1 minute to start up. + # This is a long wait, but from experience the CI can be slow to start up the host, especially after a platform-error. + $isFunctionsHostRunning = (Get-Process -Name func -ErrorAction SilentlyContinue) + if ($isFunctionsHostRunning -eq $null) { + Write-Host "Starting the Functions host..." -ForegroundColor Yellow + + # The '&' operator is used to run the command in the background + cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & + Write-Host "Waiting for the Functions host to start up..." -ForegroundColor Yellow + Start-Sleep -Seconds 60 + } + + + try { + # Make sure the Functions runtime is up and running + $pingUrl = "http://localhost:7071/admin/host/ping" + Write-Host "Pinging app at $pingUrl to ensure the host is healthy" -ForegroundColor Yellow + Invoke-RestMethod -Method Post -Uri "http://localhost:7071/admin/host/ping" + Write-Host "Host is healthy!" -ForegroundColor Green + + # Start orchestrator if it hasn't been started yet + if ($statusUrl -eq $null){ + $startOrchestrationUri = "http://localhost:7071/$HttpStartPath" + Write-Host "Starting a new orchestration instance via POST to $startOrchestrationUri..." -ForegroundColor Yellow + + $result = Invoke-RestMethod -Method Post -Uri $startOrchestrationUri + Write-Host "Started orchestration with instance ID '$($result.id)'!" -ForegroundColor Yellow + Write-Host "Waiting for orchestration to complete..." -ForegroundColor Yellow + + $statusUrl = $result.statusQueryGetUri + + # sleep for a bit to give the orchestrator a chance to start, + # then loop once more in case the orchestrator ran quickly, made the host unhealthy, + # and the functions host needs to be restarted + Start-Sleep -Seconds 5 + continue; + } + + # Check the orchestrator status + $result = Invoke-RestMethod -Method Get -Uri $statusUrl + $runtimeStatus = $result.runtimeStatus + Write-Host "Orchestration is $runtimeStatus" -ForegroundColor Yellow + Write-Host $result + + if ($result.runtimeStatus -eq "Completed") { + $success = $true + $testIsRunning = $false + break + } + if ($result.runtimeStatus -eq "Failed") { + $success = $false + $testIsRunning = $false + break + } + + # If the orchestrator did not complete yet, wait for a bit before checking again + Start-Sleep -Seconds 2 + $retryCount = $retryCount + 1 + + } catch { + Write-Host "An error occurred:" -ForegroundColor Red + Write-Host $_ -ForegroundColor Red + + # When testing for platform errors, we want to make sure the Functions host is healthy and ready to take requests. + # The Host can get into bad states (for example, in an OOM-inducing test) where it does not self-heal. + # For these cases, we manually restart the host to ensure it is in a good state. We only do this once per test. + if ($haveManuallyRestartedHost -eq $false) { + + # We stop the host process and wait for a bit before checking if it is running again. + Write-Host "Restarting the Functions host..." -ForegroundColor Yellow + Stop-Process -Name "func" -Force + Start-Sleep -Seconds 5 + + # Log whether the process kill succeeded + $haveManuallyRestartedHost = $true + $isFunctionsHostRunning = ((Get-Process -Name func -ErrorAction SilentlyContinue) -eq $null) + Write-Host "Host process killed: $isFunctionsHostRunning" -ForegroundColor Yellow + + # the beginning of the loop will restart the host + continue + } + + # Rethrow the original exception + throw + } + +} while (($testIsRunning -eq $true) -and ($retryCount -lt 65)) + +if ($success -eq $false) { + throw "Orchestration failed or did not compete in time! :(" +} + +Write-Host "Success!" -ForegroundColor Green \ No newline at end of file diff --git a/test/SmokeTests/e2e-test.ps1 b/test/SmokeTests/e2e-test.ps1 index e7a7aa8c1..f4ed28617 100644 --- a/test/SmokeTests/e2e-test.ps1 +++ b/test/SmokeTests/e2e-test.ps1 @@ -26,7 +26,7 @@ function Exit-OnError() { } $ErrorActionPreference = "Stop" -$AzuriteVersion = "3.26.0" +# $AzuriteVersion = "3.26.0" if ($NoSetup -eq $false) { # Build the docker image first, since that's the most critical step @@ -35,14 +35,15 @@ if ($NoSetup -eq $false) { Exit-OnError # Next, download and start the Azurite emulator Docker image - Write-Host "Pulling down the mcr.microsoft.com/azure-storage/azurite:$AzuriteVersion image..." -ForegroundColor Yellow - docker pull "mcr.microsoft.com/azure-storage/azurite:${AzuriteVersion}" + Write-Host "Pulling down the mcr.microsoft.com/azure-storage/azurite image..." -ForegroundColor Yellow + docker pull "mcr.microsoft.com/azure-storage/azurite" Exit-OnError Write-Host "Starting Azurite storage emulator using default ports..." -ForegroundColor Yellow - docker run --name 'azurite' -p 10000:10000 -p 10001:10001 -p 10002:10002 -d "mcr.microsoft.com/azure-storage/azurite:${AzuriteVersion}" + docker run --name 'azurite' -p 10000:10000 -p 10001:10001 -p 10002:10002 -d "mcr.microsoft.com/azure-storage/azurite" Exit-OnError - + # Author's note: we don't call 'Exit-OnError' here because this container may already be running + if ($SetupSQLServer -eq $true) { Write-Host "Pulling down the mcr.microsoft.com/mssql/server:$tag image..." docker pull mcr.microsoft.com/mssql/server:$tag @@ -51,7 +52,7 @@ if ($NoSetup -eq $false) { # Start the SQL Server docker container with the specified edition Write-Host "Starting SQL Server $tag $sqlpid docker container on port $port" -ForegroundColor DarkYellow docker run --name mssql-server -e 'ACCEPT_EULA=Y' -e "MSSQL_SA_PASSWORD=$pw" -e "MSSQL_PID=$sqlpid" -p ${port}:1433 -d mcr.microsoft.com/mssql/server:$tag - Exit-OnError + # Author's note: we don't call 'Exit-OnError' here because this container may already be running # Wait for SQL Server to be ready Write-Host "Waiting for SQL Server to be ready..." -ForegroundColor Yellow @@ -80,7 +81,7 @@ if ($NoSetup -eq $false) { --env 'AzureWebJobsStorage=UseDevelopmentStorage=true;DevelopmentStorageProxyUri=http://host.docker.internal' ` --env 'WEBSITE_HOSTNAME=localhost:8080' ` $ImageName - Exit-OnError + # Author's note: we don't call 'Exit-OnError' here because this container may already be running } else { Write-Host "Starting $ContainerName application container" -ForegroundColor Yellow @@ -89,7 +90,7 @@ if ($NoSetup -eq $false) { --env 'WEBSITE_HOSTNAME=localhost:8080' ` $ImageName } - Exit-OnError + # Author's note: we don't call 'Exit-OnError' here because this container may already be running } if ($sleep -gt 0) { @@ -107,6 +108,7 @@ try { $pingUrl = "http://localhost:8080/admin/host/ping" Write-Host "Pinging app at $pingUrl to ensure the host is healthy" -ForegroundColor Yellow Invoke-RestMethod -Method Post -Uri "http://localhost:8080/admin/host/ping" + Write-Host "Host is healthy!" -ForegroundColor Green Exit-OnError if ($NoValidation -eq $false) { @@ -129,6 +131,7 @@ try { if ($result.runtimeStatus -eq "Completed") { $success = $true + Write-Host $result break } diff --git a/tools/triageHelper/function_app.py b/tools/triageHelper/function_app.py index 12a6d77ff..c2f4d8aa1 100644 --- a/tools/triageHelper/function_app.py +++ b/tools/triageHelper/function_app.py @@ -12,6 +12,7 @@ "Azure/azure-functions-durable-extension", "Azure/azure-functions-durable-js", "Azure/azure-functions-durable-python", + "Azure/azure-functions-durable-powershell", powershell_worker_repo, "microsoft/durabletask-java", "microsoft/durabletask-dotnet", @@ -40,7 +41,6 @@ def get_triage_issues(repository): 'labels': label, } - payload_str = urllib.parse.urlencode(payload, safe=':+') # Define the GitHub API endpoint api_endpoint = f"https://api.github.com/repos/{repository}/issues" query_str1 = "?labels=Needs%3A%20Triage%20%3Amag%3A" From 72428b9f84b205a45a340505436a02dbb1758d36 Mon Sep 17 00:00:00 2001 From: David Justo Date: Tue, 1 Oct 2024 11:48:32 -0700 Subject: [PATCH 06/15] reset e2e tests --- test/SmokeTests/e2e-test.ps1 | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/test/SmokeTests/e2e-test.ps1 b/test/SmokeTests/e2e-test.ps1 index cdb900150..845c35eb2 100644 --- a/test/SmokeTests/e2e-test.ps1 +++ b/test/SmokeTests/e2e-test.ps1 @@ -26,7 +26,7 @@ function Exit-OnError() { } $ErrorActionPreference = "Stop" -$AzuriteVersion = "3.32.0" +$AzuriteVersion = "3.26.0" if ($NoSetup -eq $false) { # Build the docker image first, since that's the most critical step @@ -35,15 +35,14 @@ if ($NoSetup -eq $false) { Exit-OnError # Next, download and start the Azurite emulator Docker image - Write-Host "Pulling down the mcr.microsoft.com/azure-storage/azurite image..." -ForegroundColor Yellow - docker pull "mcr.microsoft.com/azure-storage/azurite" + Write-Host "Pulling down the mcr.microsoft.com/azure-storage/azurite:$AzuriteVersion image..." -ForegroundColor Yellow + docker pull "mcr.microsoft.com/azure-storage/azurite:${AzuriteVersion}" Exit-OnError Write-Host "Starting Azurite storage emulator using default ports..." -ForegroundColor Yellow - docker run --name 'azurite' -p 10000:10000 -p 10001:10001 -p 10002:10002 -d "mcr.microsoft.com/azure-storage/azurite" + docker run --name 'azurite' -p 10000:10000 -p 10001:10001 -p 10002:10002 -d "mcr.microsoft.com/azure-storage/azurite:${AzuriteVersion}" Exit-OnError - # Author's note: we don't call 'Exit-OnError' here because this container may already be running - + if ($SetupSQLServer -eq $true) { Write-Host "Pulling down the mcr.microsoft.com/mssql/server:$tag image..." docker pull mcr.microsoft.com/mssql/server:$tag @@ -52,7 +51,7 @@ if ($NoSetup -eq $false) { # Start the SQL Server docker container with the specified edition Write-Host "Starting SQL Server $tag $sqlpid docker container on port $port" -ForegroundColor DarkYellow docker run --name mssql-server -e 'ACCEPT_EULA=Y' -e "MSSQL_SA_PASSWORD=$pw" -e "MSSQL_PID=$sqlpid" -p ${port}:1433 -d mcr.microsoft.com/mssql/server:$tag - # Author's note: we don't call 'Exit-OnError' here because this container may already be running + Exit-OnError # Wait for SQL Server to be ready Write-Host "Waiting for SQL Server to be ready..." -ForegroundColor Yellow @@ -66,7 +65,7 @@ if ($NoSetup -eq $false) { # Create the database with strict binary collation Write-Host "Creating '$dbname' database with '$collation' collation" -ForegroundColor DarkYellow - docker exec -d mssql-server /opt/mssql-tools18/bin/sqlcmd -S . -U sa -P "$pw" -Q "CREATE DATABASE [$dbname] COLLATE $collation" + docker exec -d mssql-server /opt/mssql-tools/bin/sqlcmd -S . -U sa -P "$pw" -Q "CREATE DATABASE [$dbname] COLLATE $collation" Exit-OnError # Wait for database to be ready @@ -81,7 +80,7 @@ if ($NoSetup -eq $false) { --env 'AzureWebJobsStorage=UseDevelopmentStorage=true;DevelopmentStorageProxyUri=http://host.docker.internal' ` --env 'WEBSITE_HOSTNAME=localhost:8080' ` $ImageName - # Author's note: we don't call 'Exit-OnError' here because this container may already be running + Exit-OnError } else { Write-Host "Starting $ContainerName application container" -ForegroundColor Yellow @@ -90,7 +89,7 @@ if ($NoSetup -eq $false) { --env 'WEBSITE_HOSTNAME=localhost:8080' ` $ImageName } - # Author's note: we don't call 'Exit-OnError' here because this container may already be running + Exit-OnError } if ($sleep -gt 0) { @@ -108,7 +107,6 @@ try { $pingUrl = "http://localhost:8080/admin/host/ping" Write-Host "Pinging app at $pingUrl to ensure the host is healthy" -ForegroundColor Yellow Invoke-RestMethod -Method Post -Uri "http://localhost:8080/admin/host/ping" - Write-Host "Host is healthy!" -ForegroundColor Green Exit-OnError if ($NoValidation -eq $false) { @@ -131,7 +129,6 @@ try { if ($result.runtimeStatus -eq "Completed") { $success = $true - Write-Host $result break } From 3aaa919bf531bfd2a714f2f14b1dc41cfc608ad1 Mon Sep 17 00:00:00 2001 From: David Justo Date: Tue, 1 Oct 2024 11:49:13 -0700 Subject: [PATCH 07/15] reset e2e tests --- test/SmokeTests/e2e-test.ps1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/SmokeTests/e2e-test.ps1 b/test/SmokeTests/e2e-test.ps1 index 845c35eb2..5eee1e0fd 100644 --- a/test/SmokeTests/e2e-test.ps1 +++ b/test/SmokeTests/e2e-test.ps1 @@ -26,7 +26,7 @@ function Exit-OnError() { } $ErrorActionPreference = "Stop" -$AzuriteVersion = "3.26.0" +$AzuriteVersion = "3.32.0" if ($NoSetup -eq $false) { # Build the docker image first, since that's the most critical step @@ -65,7 +65,7 @@ if ($NoSetup -eq $false) { # Create the database with strict binary collation Write-Host "Creating '$dbname' database with '$collation' collation" -ForegroundColor DarkYellow - docker exec -d mssql-server /opt/mssql-tools/bin/sqlcmd -S . -U sa -P "$pw" -Q "CREATE DATABASE [$dbname] COLLATE $collation" + docker exec -d mssql-server /opt/mssql-tools18/bin/sqlcmd -S . -U sa -P "$pw" -Q "CREATE DATABASE [$dbname] COLLATE $collation" Exit-OnError # Wait for database to be ready From 079f1124316f4463b20cae3a994273d2d5677d88 Mon Sep 17 00:00:00 2001 From: David Justo Date: Tue, 1 Oct 2024 11:49:57 -0700 Subject: [PATCH 08/15] reset host.json --- test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json | 1 - 1 file changed, 1 deletion(-) diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json index ae6c359f7..0ec9c6a89 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json @@ -10,7 +10,6 @@ }, "extensions": { "durableTask": { - "hubName": "hubbbb1113324", "storageProvider": { "maxQueuePollingInterval": "00:00:01", "controlQueueVisibilityTimeout": "00:01:00" From 065fdd78c8bff8520187677fa6643d7617dee540 Mon Sep 17 00:00:00 2001 From: David Justo Date: Tue, 1 Oct 2024 11:50:36 -0700 Subject: [PATCH 09/15] reset Dockerfile --- test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile index 3b24d9213..51ad96b16 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile @@ -25,11 +25,6 @@ RUN cd /root/test/SmokeTests/OOProcSmokeTests/DotNetIsolated && \ ls -aR /home/site/wwwroot && \ cat /home/site/wwwroot/extensions.json # debugging -RUN cat /root/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/obj/Release/net6.0/WorkerExtensions/WorkerExtensions.csproj - -RUN ls /root/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/bin/ -# .azurefunctions/function.deps.json - # Step 3: Generate the final app image to run FROM mcr.microsoft.com/azure-functions/dotnet-isolated:4-dotnet-isolated6.0 From 090f8e729a0c584af3863209154f4ecd9ecd4c9c Mon Sep 17 00:00:00 2001 From: David Justo Date: Wed, 2 Oct 2024 16:49:28 -0700 Subject: [PATCH 10/15] do not throw oom, throw sessionAbortedException --- .../RemoteOrchestratorContext.cs | 14 +++++++------- ...rosoft.Azure.WebJobs.Extensions.DurableTask.xml | 4 ++-- .../OutOfProcMiddleware.cs | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs b/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs index ea3489637..07054579b 100644 --- a/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs +++ b/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs @@ -71,7 +71,7 @@ internal bool TryGetOrchestrationErrorDetails(out Exception? failure) return hasError; } - internal void TrySetResult(IEnumerable actions, string customStatus) + internal void SetResult(IEnumerable actions, string customStatus) { var result = new OrchestratorExecutionResult { @@ -79,7 +79,7 @@ internal void TrySetResult(IEnumerable actions, string custo Actions = actions, }; - this.TrySetResultInternal(result); + this.SetResultInternal(result); } // TODO: This method should be considered deprecated because SDKs should no longer be returning results as JSON. @@ -117,14 +117,14 @@ internal void SetResult(string orchestratorResponseJsonText) innerException: jsonReaderException); } - this.TrySetResultInternal(result); + this.SetResultInternal(result); } /// /// Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected. /// /// - /// Today, this method only checks for . In the future, we may want to add more cases. + /// Today, this method only checks for . In the future, we may want to add more cases. /// Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield /// a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be /// handled in this method. @@ -135,12 +135,12 @@ internal void SetResult(string orchestratorResponseJsonText) /// to this method as we encounter them. /// /// The failure details of the orchestrator. - /// If an OOM error is detected. + /// If an OOM error is detected. private void ThrowIfPlatformLevelException(FailureDetails failureDetails) { if (failureDetails.InnerFailure?.IsCausedBy() ?? false) { - throw new OutOfMemoryException(failureDetails.ErrorMessage); + throw new SessionAbortedException(failureDetails.ErrorMessage); } if (failureDetails.InnerFailure != null) @@ -149,7 +149,7 @@ private void ThrowIfPlatformLevelException(FailureDetails failureDetails) } } - private void TrySetResultInternal(OrchestratorExecutionResult result) + private void SetResultInternal(OrchestratorExecutionResult result) { // Look for an orchestration completion action to see if we need to grab the output. foreach (OrchestratorAction action in result.Actions) diff --git a/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml b/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml index ecb33a140..4461ba271 100644 --- a/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml +++ b/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml @@ -480,7 +480,7 @@ Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected. - Today, this method only checks for . In the future, we may want to add more cases. + Today, this method only checks for . In the future, we may want to add more cases. Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be handled in this method. @@ -491,7 +491,7 @@ to this method as we encounter them. The failure details of the orchestrator. - If an OOM error is detected. + If an OOM error is detected. diff --git a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs index d50e5657f..88a7612dc 100644 --- a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs +++ b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs @@ -140,7 +140,7 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync( P.OrchestratorResponse response = P.OrchestratorResponse.Parser.ParseFrom(triggerReturnValueBytes); // TrySetResult may throw if a platform-level error is encountered (like an out of memory exception). - context.TrySetResult( + context.SetResult( response.Actions.Select(ProtobufUtils.ToOrchestratorAction), response.CustomStatus); @@ -170,7 +170,7 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync( // - an out of memory exception // - a worker process exit if (functionResult.Exception is Host.FunctionTimeoutException - || functionResult.Exception?.InnerException is OutOfMemoryException // see RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling + || functionResult.Exception?.InnerException is SessionAbortedException // see RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling || (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitException") ?? false)) { // TODO: the `WorkerProcessExitException` type is not exposed in our dependencies, it's part of WebJobs.Host.Script. From a70d0146fae6771f6cfd834a374407045043a9ec Mon Sep 17 00:00:00 2001 From: David Justo Date: Wed, 2 Oct 2024 16:55:25 -0700 Subject: [PATCH 11/15] improve comments on faulty orchs --- .../DotNetIsolated/FaultyOrchestrators.cs | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs index 90006fd20..63f9f2a5b 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs @@ -33,11 +33,16 @@ public static Task OOMOrchestrator( { data.Add(new byte[1024 * 1024 * 1024]); } + + // we expect the code to never reach this statement, it should OOM. + // we throw just in case the code does not time out. This should fail the test + throw new Exception("this should never be reached"); + } + else { + // if it's not the first replay, delete the evidence file and return + System.IO.File.Delete(evidenceFile); + return Task.CompletedTask; } - - // assuming the orchestrator survived the OOM, delete the evidence file and return - System.IO.File.Delete(evidenceFile); - return Task.CompletedTask; } [Function(nameof(ProcessExitOrchestrator))] @@ -57,14 +62,14 @@ public static Task ProcessExitOrchestrator( { System.IO.File.Create(evidenceFile).Close(); - // simulate sudden crash + // force sudden crash Environment.FailFast("Simulating crash!"); } - - // assuming the orchestrator survived the OOM, delete the evidence file and return - System.IO.File.Delete(evidenceFile); - - return Task.CompletedTask; + else { + // if it's not the first replay, delete the evidence file and return + System.IO.File.Delete(evidenceFile); + return Task.CompletedTask; + } } [Function(nameof(TimeoutOrchestrator))] @@ -87,12 +92,16 @@ public static Task TimeoutOrchestrator( // force the process to timeout after a 1 minute wait System.Threading.Thread.Sleep(TimeSpan.FromMinutes(1)); + + // we expect the code to never reach this statement, it should time out. + // we throw just in case the code does not time out. This should fail the test + throw new Exception("this should never be reached"); + } + else { + // if it's not the first replay, delete the evidence file and return + System.IO.File.Delete(evidenceFile); + return Task.CompletedTask; } - - // assuming the orchestrator survived the timeout, delete the evidence file and return - System.IO.File.Delete(evidenceFile); - - return Task.CompletedTask; } [Function("durable_HttpStartOOMOrchestrator")] From 2ff4b79d571e2ec288e5e2da17db92c8370b23a1 Mon Sep 17 00:00:00 2001 From: David Justo Date: Wed, 2 Oct 2024 16:57:53 -0700 Subject: [PATCH 12/15] add comments to tester script --- .../OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 index 839cdea5d..06564e7cd 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 @@ -80,6 +80,10 @@ Do { $retryCount = $retryCount + 1 } catch { + # we expect to enter this 'catch' block if our HTTP request to the host fail. + # Some failures observed during development include: + # - The host is not running/was restarting/was killed + # - The host is running but not healthy (OOMs may cause this), so it needs to be forcibly restarted Write-Host "An error occurred:" -ForegroundColor Red Write-Host $_ -ForegroundColor Red From 68e750352b0e68a7ea47b2eeeefc0f9292c46c63 Mon Sep 17 00:00:00 2001 From: David Justo Date: Wed, 2 Oct 2024 16:58:47 -0700 Subject: [PATCH 13/15] fix typo --- .../OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 index 06564e7cd..79d679b80 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 @@ -80,7 +80,7 @@ Do { $retryCount = $retryCount + 1 } catch { - # we expect to enter this 'catch' block if our HTTP request to the host fail. + # we expect to enter this 'catch' block if any of our HTTP requests to the host fail. # Some failures observed during development include: # - The host is not running/was restarting/was killed # - The host is running but not healthy (OOMs may cause this), so it needs to be forcibly restarted From 1cdd4b9feb7846cabe07399460a1d50334d113d3 Mon Sep 17 00:00:00 2001 From: David Justo Date: Wed, 2 Oct 2024 17:07:02 -0700 Subject: [PATCH 14/15] make faultyorchestrators compile by forcing all paths to return --- .../OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs index 63f9f2a5b..8332fa436 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs @@ -64,6 +64,7 @@ public static Task ProcessExitOrchestrator( // force sudden crash Environment.FailFast("Simulating crash!"); + throw new Exception("this should never be reached"); } else { // if it's not the first replay, delete the evidence file and return From c92726846c351aeb341c80fd3f0705e6b93653c4 Mon Sep 17 00:00:00 2001 From: David Justo Date: Thu, 3 Oct 2024 10:34:33 -0700 Subject: [PATCH 15/15] remove method-level comments, which generate xmls for private methods --- .../RemoteOrchestratorContext.cs | 28 ++++++++----------- ...t.Azure.WebJobs.Extensions.DurableTask.xml | 18 ------------ 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs b/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs index 07054579b..86cb21c84 100644 --- a/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs +++ b/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs @@ -120,24 +120,20 @@ internal void SetResult(string orchestratorResponseJsonText) this.SetResultInternal(result); } - /// - /// Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected. - /// - /// - /// Today, this method only checks for . In the future, we may want to add more cases. - /// Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield - /// a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be - /// handled in this method. - /// However, our tests reveal that OOMs are, surprisngly, caught and returned as a `OrchestratorExecutionResult` - /// by the isolated process, and thus need special handling. - /// It's unclear if all OOMs are caught by the isolated process (probably not), and also if there are other platform-level - /// errors that are also caught in the isolated process and returned as a `OrchestratorExecutionResult`. Let's add them - /// to this method as we encounter them. - /// - /// The failure details of the orchestrator. - /// If an OOM error is detected. private void ThrowIfPlatformLevelException(FailureDetails failureDetails) { + // Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected. + // + // Today, this method only checks for . In the future, we may want to add more cases. + // Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield + // a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be + // handled in this method. + // However, our tests reveal that OOMs are, surprisngly, caught and returned as a `OrchestratorExecutionResult` + // by the isolated process, and thus need special handling. + // + // It's unclear if all OOMs are caught by the isolated process (probably not), and also if there are other platform-level + // errors that are also caught in the isolated process and returned as a `OrchestratorExecutionResult`. Let's add them + // to this method as we encounter them. if (failureDetails.InnerFailure?.IsCausedBy() ?? false) { throw new SessionAbortedException(failureDetails.ErrorMessage); diff --git a/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml b/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml index 4461ba271..8faf3dfff 100644 --- a/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml +++ b/src/WebJobs.Extensions.DurableTask/Microsoft.Azure.WebJobs.Extensions.DurableTask.xml @@ -475,24 +475,6 @@ The result. - - - Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected. - - - Today, this method only checks for . In the future, we may want to add more cases. - Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield - a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be - handled in this method. - However, our tests reveal that OOMs are, surprisngly, caught and returned as a `OrchestratorExecutionResult` - by the isolated process, and thus need special handling. - It's unclear if all OOMs are caught by the isolated process (probably not), and also if there are other platform-level - errors that are also caught in the isolated process and returned as a `OrchestratorExecutionResult`. Let's add them - to this method as we encounter them. - - The failure details of the orchestrator. - If an OOM error is detected. - Defines convenient overloads for calling the context methods, for all the contexts.