diff --git a/.github/workflows/smoketest-dotnet-isolated-v4.yml b/.github/workflows/smoketest-dotnet-isolated-v4.yml
index f818ff7ae..474f48448 100644
--- a/.github/workflows/smoketest-dotnet-isolated-v4.yml
+++ b/.github/workflows/smoketest-dotnet-isolated-v4.yml
@@ -19,7 +19,79 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      # Validation is blocked on https://github.com/Azure/azure-functions-host/issues/7995
-      - name: Run V4 .NET Isolated Smoke Test
-        run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/StartHelloCitiesTyped -NoValidation
+      # Install .NET versions
+      - name: Set up .NET Core 3.1
+        uses: actions/setup-dotnet@v3
+        with:
+          dotnet-version: '3.1.x'
+
+      - name: Set up .NET Core 2.1
+        uses: actions/setup-dotnet@v3
+        with:
+          dotnet-version: '2.1.x'
+
+      - name: Set up .NET 6.x
+        uses: actions/setup-dotnet@v3
+        with:
+          dotnet-version: '6.x'
+
+      - name: Set up .NET 8.x
+        uses: actions/setup-dotnet@v3
+        with:
+          dotnet-version: '8.x'
+
+      # Install Azurite
+      - name: Set up Node.js (needed for Azurite)
+        uses: actions/setup-node@v3
+        with:
+          node-version: '18.x' # Azurite requires at least Node 18
+
+      - name: Install Azurite
+        run: npm install -g azurite
+
+      - name: Restore WebJobs extension
+        run: dotnet restore $solution
+
+      - name: Build and pack WebJobs extension
+        run: cd ./src/WebJobs.Extensions.DurableTask &&
+          mkdir ./out &&
+          dotnet build -c Release WebJobs.Extensions.DurableTask.csproj --output ./out &&
+          mkdir ~/packages &&
+          dotnet nuget push ./out/Microsoft.Azure.WebJobs.Extensions.DurableTask.*.nupkg --source ~/packages &&
+          dotnet nuget add source ~/packages
+
+      - name: Build .NET Isolated Smoke Test
+        run: cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated &&
+          dotnet restore --verbosity normal &&
+          dotnet build -c Release
+
+      - name: Install core tools
+        run: npm i -g azure-functions-core-tools@4 --unsafe-perm true
+
+      # Run smoke tests
+      # Unlike other smoke tests, the .NET isolated smoke tests run outside of a Docker container, due to race conditions
+      # when building the smoke test app in Docker that cause the build to fail. This is a temporary workaround until the
+      # root cause is identified and fixed.
+
+      - name: Run smoke tests (Hello Cities)
+        shell: pwsh
+        run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
+          cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
+          ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/StartHelloCitiesTyped
+
+      - name: Run smoke tests (Process Exit)
+        shell: pwsh
+        run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
+          ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartProcessExitOrchestrator
+
+      - name: Run smoke tests (Timeout)
+        shell: pwsh
+        run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
+          cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
+          ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartTimeoutOrchestrator
+
+      - name: Run smoke tests (OOM)
+        shell: pwsh
+        run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
+          cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
+          ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartOOMOrchestrator
\ No newline at end of file
diff --git a/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs b/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs
index 9c4d6a02e..86cb21c84 100644
--- a/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs
+++ b/src/WebJobs.Extensions.DurableTask/ContextImplementations/RemoteOrchestratorContext.cs
@@ -120,6 +120,31 @@ internal void SetResult(string orchestratorResponseJsonText)
             this.SetResultInternal(result);
         }
 
+        private void ThrowIfPlatformLevelException(FailureDetails failureDetails)
+        {
+            // Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected.
+            //
+            // Today, this method only checks for out-of-memory exceptions. In the future, we may want to add more cases.
+            // Other known platform-level exceptions, like timeouts or process exits due to `Environment.FailFast`, do not yield
+            // an `OrchestratorExecutionResult` because the isolated invocation is abruptly terminated. Therefore, they don't need to be
+            // handled in this method.
+            // However, our tests reveal that OOMs are, surprisingly, caught and returned as an `OrchestratorExecutionResult`
+            // by the isolated process, and thus need special handling.
+            //
+            // It's unclear whether all OOMs are caught by the isolated process (probably not), and also whether there are other platform-level
+            // errors that are caught in the isolated process and returned as an `OrchestratorExecutionResult`. Let's add them
+            // to this method as we encounter them.
+            if (failureDetails.InnerFailure?.IsCausedBy<OutOfMemoryException>() ?? false)
+            {
+                throw new SessionAbortedException(failureDetails.ErrorMessage);
+            }
+
+            if (failureDetails.InnerFailure != null)
+            {
+                this.ThrowIfPlatformLevelException(failureDetails.InnerFailure);
+            }
+        }
+
         private void SetResultInternal(OrchestratorExecutionResult result)
         {
             // Look for an orchestration completion action to see if we need to grab the output.
@@ -133,6 +158,14 @@ private void SetResultInternal(OrchestratorExecutionResult result)
 
                 if (completeAction.OrchestrationStatus == OrchestrationStatus.Failed)
                 {
+                    // If the orchestrator failed due to a platform-level error in the isolated process,
+                    // we should re-throw that exception in the host (this process) invocation pipeline
+                    // so the invocation can be retried.
+                    if (completeAction.FailureDetails != null)
+                    {
+                        this.ThrowIfPlatformLevelException(completeAction.FailureDetails);
+                    }
+
                     string message = completeAction switch
                     {
                         { FailureDetails: { } f } => f.ErrorMessage,
diff --git a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs
index 4d514c670..88a7612dc 100644
--- a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs
+++ b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs
@@ -138,10 +138,15 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync(
                     byte[] triggerReturnValueBytes = Convert.FromBase64String(triggerReturnValue);
                     P.OrchestratorResponse response = P.OrchestratorResponse.Parser.ParseFrom(triggerReturnValueBytes);
+
+                    // SetResult may throw if a platform-level error is encountered (like an out-of-memory exception).
                     context.SetResult(
                         response.Actions.Select(ProtobufUtils.ToOrchestratorAction),
                         response.CustomStatus);
 
+                    // Here we throw if the orchestrator completed with an application-level error. When we do this,
+                    // the resulting exception will be of type `OrchestrationFailureException`, which is reserved
+                    // for application-level errors that do not need to be retried.
                     context.ThrowIfFailed();
                 },
 #pragma warning restore CS0618 // Type or member is obsolete (not intended for general public use)
@@ -159,6 +164,19 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync(
                     // Re-throw so we can abort this invocation.
                     this.HostLifetimeService.OnStopping.ThrowIfCancellationRequested();
                 }
+
+                // We abort the invocation on "platform-level errors" such as:
+                // - a timeout
+                // - an out-of-memory exception
+                // - a worker process exit
+                if (functionResult.Exception is Host.FunctionTimeoutException
+                    || functionResult.Exception?.InnerException is SessionAbortedException // see RemoteOrchestratorContext.SetResultInternal for details on OOM handling
+                    || (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitException") ?? false))
+                {
+                    // TODO: the `WorkerProcessExitException` type is not exposed in our dependencies; it's part of WebJobs.Script.
+                    // Should we add that dependency, or should it be exposed in WebJobs.Host?
+                    throw functionResult.Exception;
+                }
             }
             catch (Exception hostRuntimeException)
             {
@@ -238,8 +256,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync(
             else
             {
                 // the function failed for some other reason
-
-                string exceptionDetails = functionResult.Exception.ToString();
+                string exceptionDetails = functionResult.Exception?.ToString() ?? "Framework-internal message: exception details could not be extracted";
 
                 this.TraceHelper.FunctionFailed(
                     this.Options.HubName,
@@ -258,7 +275,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync(
 
                 orchestratorResult = OrchestratorExecutionResult.ForFailure(
                     message: $"Function '{functionName}' failed with an unhandled exception.",
-                    functionResult.Exception);
+                    functionResult.Exception ?? new Exception($"Function '{functionName}' failed with an unknown unhandled exception"));
new Exception($"Function '{functionName}' failed with an unknown unhandled exception")); } // Send the result of the orchestrator function to the DTFx dispatch pipeline. diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/DotNetIsolated.sln b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/DotNetIsolated.sln new file mode 100644 index 000000000..a93cc6f6e --- /dev/null +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/DotNetIsolated.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.5.002.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DotNetIsolated", "DotNetIsolated.csproj", "{B2DBA49D-9D25-46DB-8968-15D5E83B4060}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {0954D7B4-582F-4F85-AE3E-5D503FB07DB1} + EndGlobalSection +EndGlobal diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs new file mode 100644 index 000000000..8332fa436 --- /dev/null +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs @@ -0,0 +1,165 @@ +using Microsoft.Azure.Functions.Worker; +using Microsoft.Azure.Functions.Worker.Http; +using Microsoft.DurableTask; +using Microsoft.DurableTask.Client; +using Microsoft.Extensions.Logging; +using System; + +namespace FaultOrchestrators +{ + public static class FaultyOrchestrators + { + [Function(nameof(OOMOrchestrator))] + public static Task OOMOrchestrator( + [OrchestrationTrigger] TaskOrchestrationContext context) + { + // this orchestrator is not deterministic, on purpose. + // we use the non-determinism to force an OOM exception on only the first replay + + // check if a file named "replayEvidence" exists in source code directory, create it if it does not. + // From experience, this code runs in `/bin/output/`, so we store the file two directories above. + // We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically + // when `func host start` is re-invoked. + string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence"); + bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); + if (isTheFirstReplay) + { + System.IO.File.Create(evidenceFile).Close(); + + // force the process to run out of memory + List data = new List(); + + for (int i = 0; i < 10000000; i++) + { + data.Add(new byte[1024 * 1024 * 1024]); + } + + // we expect the code to never reach this statement, it should OOM. + // we throw just in case the code does not time out. 
+                throw new Exception("this should never be reached");
+            }
+            else {
+                // if it's not the first replay, delete the evidence file and return
+                System.IO.File.Delete(evidenceFile);
+                return Task.CompletedTask;
+            }
+        }
+
+        [Function(nameof(ProcessExitOrchestrator))]
+        public static Task ProcessExitOrchestrator(
+            [OrchestrationTrigger] TaskOrchestrationContext context)
+        {
+            // this orchestrator is not deterministic, on purpose.
+            // we use the non-determinism to force a sudden process exit on only the first replay
+
+            // check if a file named "replayEvidence" exists in the source code directory, create it if it does not.
+            // From experience, this code runs in `/bin/output/`, so we store the file two directories above.
+            // We do this because the /bin/output/ directory gets overwritten during the build process, which happens automatically
+            // when `func host start` is re-invoked.
+            string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence");
+            bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile);
+            if (isTheFirstReplay)
+            {
+                System.IO.File.Create(evidenceFile).Close();
+
+                // force sudden crash
+                Environment.FailFast("Simulating crash!");
+                throw new Exception("this should never be reached");
+            }
+            else {
+                // if it's not the first replay, delete the evidence file and return
+                System.IO.File.Delete(evidenceFile);
+                return Task.CompletedTask;
+            }
+        }
+
+        [Function(nameof(TimeoutOrchestrator))]
+        public static Task TimeoutOrchestrator(
+            [OrchestrationTrigger] TaskOrchestrationContext context)
+        {
+            // this orchestrator is not deterministic, on purpose.
+            // we use the non-determinism to force a timeout on only the first replay
+
+            // check if a file named "replayEvidence" exists in the source code directory, create it if it does not.
+            // From experience, this code runs in `/bin/output/`, so we store the file two directories above.
+            // We do this because the /bin/output/ directory gets overwritten during the build process, which happens automatically
+            // when `func host start` is re-invoked.
+            string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence");
+            bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile);
+
+            if (isTheFirstReplay)
+            {
+                System.IO.File.Create(evidenceFile).Close();
+
+                // force the process to time out after a 1 minute wait
+                System.Threading.Thread.Sleep(TimeSpan.FromMinutes(1));
+
+                // we expect the code to never reach this statement, it should time out.
+                // we throw just in case the code does not time out. This should fail the test
+                throw new Exception("this should never be reached");
+            }
+            else {
+                // if it's not the first replay, delete the evidence file and return
+                System.IO.File.Delete(evidenceFile);
+                return Task.CompletedTask;
+            }
+        }
+
+        [Function("durable_HttpStartOOMOrchestrator")]
+        public static async Task<HttpResponseData> HttpStartOOMOrchestrator(
+            [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req,
+            [DurableClient] DurableTaskClient client,
+            FunctionContext executionContext)
+        {
+            ILogger logger = executionContext.GetLogger("durable_HttpStartOOMOrchestrator");
+
+            // Function input comes from the request content.
+            string instanceId = await client.ScheduleNewOrchestrationInstanceAsync(
+                nameof(OOMOrchestrator));
+
+            logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId);
+
+            // Returns an HTTP 202 response with an instance management payload.
+            // See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration
+            return await client.CreateCheckStatusResponseAsync(req, instanceId);
+        }
+
+        [Function("durable_HttpStartProcessExitOrchestrator")]
+        public static async Task<HttpResponseData> HttpStartProcessExitOrchestrator(
+            [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req,
+            [DurableClient] DurableTaskClient client,
+            FunctionContext executionContext)
+        {
+            ILogger logger = executionContext.GetLogger("durable_HttpStartProcessExitOrchestrator");
+
+            // Function input comes from the request content.
+            string instanceId = await client.ScheduleNewOrchestrationInstanceAsync(
+                nameof(ProcessExitOrchestrator));
+
+            logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId);
+
+            // Returns an HTTP 202 response with an instance management payload.
+            // See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration
+            return await client.CreateCheckStatusResponseAsync(req, instanceId);
+        }
+
+        [Function("durable_HttpStartTimeoutOrchestrator")]
+        public static async Task<HttpResponseData> HttpStartTimeoutOrchestrator(
+            [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req,
+            [DurableClient] DurableTaskClient client,
+            FunctionContext executionContext)
+        {
+            ILogger logger = executionContext.GetLogger("durable_HttpStartTimeoutOrchestrator");
+
+            // Function input comes from the request content.
+            string instanceId = await client.ScheduleNewOrchestrationInstanceAsync(
+                nameof(TimeoutOrchestrator));
+
+            logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId);
+
+            // Returns an HTTP 202 response with an instance management payload.
+            // See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration
+            return await client.CreateCheckStatusResponseAsync(req, instanceId);
+        }
+    }
+}
diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json
index 278b52cde..0ec9c6a89 100644
--- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json
+++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json
@@ -7,5 +7,14 @@
         "excludedTypes": "Request"
       }
     }
-  }
+  },
+  "extensions": {
+    "durableTask": {
+      "storageProvider": {
+        "maxQueuePollingInterval": "00:00:01",
+        "controlQueueVisibilityTimeout": "00:01:00"
+      }
+    }
+  },
+  "functionTimeout": "00:00:30"
 }
\ No newline at end of file
diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1
new file mode 100644
index 000000000..79d679b80
--- /dev/null
+++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1
@@ -0,0 +1,119 @@
+# This is a simple test runner to validate the .NET isolated smoke tests.
+# It supersedes the usual e2e-test.ps1 script for the .NET isolated scenario because building the smoke test app
+# on the Docker image is unreliable. For more details, see: https://github.com/Azure/azure-functions-host/issues/7995
+
+# This script is designed specifically to test cases where the isolated worker process experiences a platform failure:
+# timeouts, OOMs, etc. For that reason, it is careful to check that the Functions host is running and healthy at regular
+# intervals. This makes these tests run more slowly than other test categories.
+
+param(
+    [Parameter(Mandatory=$true)]
+    [string]$HttpStartPath
+)
+
+$retryCount = 0;
+$statusUrl = $null;
+$success = $false;
+$haveManuallyRestartedHost = $false;
+
+Do {
+    $testIsRunning = $true;
+
+    # Start the Functions host if it's not running already.
+    # Then give it up to 1 minute to start up.
+    # This is a long wait, but from experience the CI can be slow to start up the host, especially after a platform error.
+    $isFunctionsHostRunning = (Get-Process -Name func -ErrorAction SilentlyContinue)
+    if ($isFunctionsHostRunning -eq $null) {
+        Write-Host "Starting the Functions host..." -ForegroundColor Yellow
+
+        # The '&' operator is used to run the command in the background
+        cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
+        Write-Host "Waiting for the Functions host to start up..." -ForegroundColor Yellow
+        Start-Sleep -Seconds 60
+    }
+
+
+    try {
+        # Make sure the Functions runtime is up and running
+        $pingUrl = "http://localhost:7071/admin/host/ping"
+        Write-Host "Pinging app at $pingUrl to ensure the host is healthy" -ForegroundColor Yellow
+        Invoke-RestMethod -Method Post -Uri "http://localhost:7071/admin/host/ping"
+        Write-Host "Host is healthy!" -ForegroundColor Green
+
+        # Start the orchestrator if it hasn't been started yet
+        if ($statusUrl -eq $null){
+            $startOrchestrationUri = "http://localhost:7071/$HttpStartPath"
+            Write-Host "Starting a new orchestration instance via POST to $startOrchestrationUri..." -ForegroundColor Yellow
+
+            $result = Invoke-RestMethod -Method Post -Uri $startOrchestrationUri
+            Write-Host "Started orchestration with instance ID '$($result.id)'!" -ForegroundColor Yellow
+            Write-Host "Waiting for orchestration to complete..." -ForegroundColor Yellow
+
+            $statusUrl = $result.statusQueryGetUri
+
+            # sleep for a bit to give the orchestrator a chance to start,
+            # then loop once more in case the orchestrator ran quickly, made the host unhealthy,
+            # and the Functions host needs to be restarted
+            Start-Sleep -Seconds 5
+            continue;
+        }
+
+        # Check the orchestrator status
+        $result = Invoke-RestMethod -Method Get -Uri $statusUrl
+        $runtimeStatus = $result.runtimeStatus
+        Write-Host "Orchestration is $runtimeStatus" -ForegroundColor Yellow
+        Write-Host $result
+
+        if ($result.runtimeStatus -eq "Completed") {
+            $success = $true
+            $testIsRunning = $false
+            break
+        }
+        if ($result.runtimeStatus -eq "Failed") {
+            $success = $false
+            $testIsRunning = $false
+            break
+        }
+
+        # If the orchestrator did not complete yet, wait for a bit before checking again
+        Start-Sleep -Seconds 2
+        $retryCount = $retryCount + 1
+
+    } catch {
+        # we expect to enter this 'catch' block if any of our HTTP requests to the host fail.
+        # Some failures observed during development include:
+        # - The host is not running/was restarting/was killed
+        # - The host is running but not healthy (OOMs may cause this), so it needs to be forcibly restarted
+        Write-Host "An error occurred:" -ForegroundColor Red
+        Write-Host $_ -ForegroundColor Red
+
+        # When testing for platform errors, we want to make sure the Functions host is healthy and ready to take requests.
+        # The host can get into bad states (for example, in an OOM-inducing test) where it does not self-heal.
+        # For these cases, we manually restart the host to ensure it is in a good state. We only do this once per test.
+        if ($haveManuallyRestartedHost -eq $false) {
+
+            # We stop the host process and wait for a bit before checking if it is running again.
+            Write-Host "Restarting the Functions host..." -ForegroundColor Yellow
+            Stop-Process -Name "func" -Force
+            Start-Sleep -Seconds 5
+
+            # Log whether the process kill succeeded
+            $haveManuallyRestartedHost = $true
+            $functionsHostStopped = ((Get-Process -Name func -ErrorAction SilentlyContinue) -eq $null)
+            Write-Host "Host process killed: $functionsHostStopped" -ForegroundColor Yellow
+
+            # the beginning of the loop will restart the host
+            continue
+        }
+
+        # Rethrow the original exception
+        throw
+    }
+
+} while (($testIsRunning -eq $true) -and ($retryCount -lt 65))
+
+if ($success -eq $false) {
+    throw "Orchestration failed or did not complete in time! :("
+}
+
+Write-Host "Success!" -ForegroundColor Green
\ No newline at end of file